In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from datetime import datetime
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this would also hide any deprecation or chained-assignment
# warnings triggered by later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Do not truncate the column list when a DataFrame is rendered.
pd.set_option('display.max_columns', None) 

In [26]:
# Load the raw trends table. Later cells rely on the columns 'Trends',
# 'Search volume', 'Started', 'Ended', 'Explore link' and 'Trend breakdown'.
df = pd.read_csv('data.csv')

In [27]:
# Drop two columns that the cells below do not reference. errors='ignore'
# keeps this cell re-runnable: df is overwritten, so on a second run the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Ended', 'Explore link'], errors='ignore')
# Parse the start timestamps so 'Started' holds datetimes instead of text.
df['Started'] = pd.to_datetime(df['Started'])

In [28]:
# Function to convert string to numeric
def convert_to_number(s):
    """Convert an abbreviated count such as ``'2M+'`` or ``'500K+'`` to an int.

    Parameters
    ----------
    s : str or number
        Text made of a number, an optional ``K``/``M``/``B`` suffix (any case)
        and an optional ``+`` sign; thousands separators and surrounding
        whitespace are tolerated. A value that is already numeric is simply
        cast to ``int``.

    Returns
    -------
    int
        The expanded count, e.g. ``'200K+'`` -> ``200000``.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number (including an empty
        string or a NaN float).
    """
    import numbers

    # Already-numeric values pass straight through, so the conversion cell can
    # be re-run on a column that has been converted before.
    if isinstance(s, numbers.Real):
        return int(s)

    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
    text = str(s).strip().replace('+', '').replace(',', '').upper()

    if text and text[-1] in multipliers:
        # round(), not int(): binary floats can land just below the intended
        # value, and int() would truncate the result to one less.
        return round(float(text[:-1]) * multipliers[text[-1]])
    return int(text)  # For plain numbers

# Convert every raw 'Search volume' label in a single pass
numeric_data = np.array(list(map(convert_to_number, df['Search volume'])))

# Overwrite the text column with its numeric equivalent
df['Search volume'] = numeric_data

In [None]:
# Combine the "Trends" and "Trend breakdown" columns for text analysis.
# Both columns are NaN-filled so that a missing value cannot turn the whole
# concatenation into NaN, which TfidfVectorizer would reject.
df['combined_text'] = df['Trends'].fillna("") + "," + df['Trend breakdown'].fillna("")

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix)

# Convert similarity to distance. Rounding error can push a similarity a hair
# above 1, so clip to keep every distance non-negative.
cosine_dist = np.clip(1 - cosine_sim, 0, None)

# Use Agglomerative Clustering to group similar topics.
# scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed the old name,
# so try the current keyword first and fall back for older installations.
clustering_params = dict(n_clusters=None, distance_threshold=0.5, linkage='average')
try:
    clustering_model = AgglomerativeClustering(metric='precomputed', **clustering_params)
except TypeError:
    clustering_model = AgglomerativeClustering(affinity='precomputed', **clustering_params)
clusters = clustering_model.fit_predict(cosine_dist)

# Add cluster labels to the dataset
df['Cluster'] = clusters

# Trend-to-cluster lookup ordered by cluster id (stored, not displayed here)
grouped_data = df[['Trends', 'Cluster']].sort_values(by='Cluster')

In [30]:
# Preview the first rows after clustering.
# NOTE(review): the saved output joins 'combined_text' with a space, while the
# cell that builds it uses "," — the output appears to predate that edit.
df.head()

Unnamed: 0,Trends,Search volume,Started,Trend breakdown,combined_text,Cluster
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai...","wicked book wicked book,elphaba,wicked musical...",5
1,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2...","moana 2 moana 2,moana 2 release date,when does...",77
2,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",who won dancing with the stars 2024 who won da...,120
3,pam bondi,1000000,2024-11-21 13:40:00+05:00,pam bondi,pam bondi pam bondi,91
4,lake effect snow warning,1000000,2024-11-27 12:20:00+05:00,"lake effect snow warning,thanksgiving,thanksgi...",lake effect snow warning lake effect snow warn...,99


In [31]:
# Per-cluster list of the trend names it contains
grouped_by_cluster = (
    df.groupby('Cluster')['Trends']
      .apply(list)
      .reset_index(name='Trends')
)

# Per-cluster sum of search volume
cluster_summary = (
    df.groupby('Cluster')['Search volume']
      .sum()
      .reset_index(name='Total Search Volume')
)

# Join both summaries on the shared cluster id and show the result
combined_df = grouped_by_cluster.merge(cluster_summary, on='Cluster')
combined_df

Unnamed: 0,Cluster,Trends,Total Search Volume
0,0,"[champions league, champions league schedule]",520000
1,1,"[kendrick lamar, gnx]",1000000
2,2,"[lizzo, lizzo weight loss]",150000
3,3,[codi alert],500
4,4,"[girona fc, girona]",1500
...,...,...,...
123,123,[regal cinemas],100000
124,124,[pecan pie],50000
125,125,[butterball turkey],500000
126,126,[iphone 17],200000


In [32]:
# Attach each cluster's trend list and total volume to every trend row.
# Both frames carry a 'Trends' column, so the merge suffixes them:
# Trends_x is the row's own trend, Trends_y the cluster's list.
df_new = df.merge(combined_df, how='inner', on='Cluster')
# Store the list as its string form (presumably so it can serve as a
# de-duplication key in a later cell, since lists are unhashable).
df_new = df_new.assign(Trends_y=df_new['Trends_y'].astype(str))
#df_new
len(df_new)

138

In [33]:
# Preview the merged frame: per-trend columns plus the per-cluster
# 'Trends_y' and 'Total Search Volume'.
df_new.head()

Unnamed: 0,Trends_x,Search volume,Started,Trend breakdown,combined_text,Cluster,Trends_y,Total Search Volume
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai...","wicked book wicked book,elphaba,wicked musical...",5,"['wicked book', 'wicked showtimes']",2100000
1,wicked showtimes,100000,2024-11-20 17:10:00+05:00,"wicked showtimes,amc theatres","wicked showtimes wicked showtimes,amc theatres",5,"['wicked book', 'wicked showtimes']",2100000
2,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2...","moana 2 moana 2,moana 2 release date,when does...",77,['moana 2'],1000000
3,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",who won dancing with the stars 2024 who won da...,120,['who won dancing with the stars 2024'],1000000
4,pam bondi,1000000,2024-11-21 13:40:00+05:00,pam bondi,pam bondi pam bondi,91,['pam bondi'],1000000


In [34]:
# Keep only the first row for each (trend list, total volume) pair
is_repeat = df_new.duplicated(subset=['Trends_y', 'Total Search Volume'])
df_new = df_new[~is_repeat]
len(df_new)

128

In [35]:
# Display the de-duplicated frame (the rendering elides the middle rows).
df_new

Unnamed: 0,Trends_x,Search volume,Started,Trend breakdown,combined_text,Cluster,Trends_y,Total Search Volume
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai...","wicked book wicked book,elphaba,wicked musical...",5,"['wicked book', 'wicked showtimes']",2100000
2,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2...","moana 2 moana 2,moana 2 release date,when does...",77,['moana 2'],1000000
3,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",who won dancing with the stars 2024 who won da...,120,['who won dancing with the stars 2024'],1000000
4,pam bondi,1000000,2024-11-21 13:40:00+05:00,pam bondi,pam bondi pam bondi,91,['pam bondi'],1000000
5,lake effect snow warning,1000000,2024-11-27 12:20:00+05:00,"lake effect snow warning,thanksgiving,thanksgi...",lake effect snow warning lake effect snow warn...,99,['lake effect snow warning'],1000000
...,...,...,...,...,...,...,...,...
133,indiana fever,1000,2024-11-27 09:00:00+05:00,indiana fever,indiana fever indiana fever,17,['indiana fever'],1000
134,keith kellogg,1000,2024-11-27 13:00:00+05:00,"keith kellogg,richard grenell","keith kellogg keith kellogg,richard grenell",8,['keith kellogg'],1000
135,codi alert,500,2024-11-27 12:30:00+05:00,codi alert,codi alert codi alert,3,['codi alert'],500
136,medrick burnett jr,500,2024-11-27 12:00:00+05:00,medrick burnett jr,medrick burnett jr medrick burnett jr,19,['medrick burnett jr'],500


In [36]:
from transformers import pipeline

# Use a zero-shot classification pipeline
# (backed by the "facebook/bart-large-mnli" checkpoint; the scoring cell calls
# it as classifier(text, candidate_labels=..., multi_label=True)).
# NOTE(review): presumably the model weights are fetched on first use — confirm
# that the runtime has network access or a warm cache.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [None]:
# Candidate labels handed to the zero-shot classifier. The scoring cell also
# creates one df_new column per entry, named exactly as written here.
# NOTE(review): the saved outputs further down show ten differently named
# category columns (e.g. "Technology and Science"), so they appear to come
# from an earlier version of this list — re-run before trusting them.
categories = [
    "Arts & Entertainment",
    "Autos & Vehicles",
    "Beauty & Fitness",
    "Books & Literature",
    "Business & Industrial",
    "Computers & Electronics",
    "Finance",
    "Food & Drink",
    "Games",
    "Health",
    "Hobbies & Leisure",
    "Home & Garden",
    "Internet & Telecom",
    "Jobs & Education",
    "Law & Government",
    "News",
    "Online Communities",
    "People & Society",
    "Pets & Animals",
    "Real Estate",
    "Reference",
    "Science",
    "Shopping",
    "Sports",
    "Travel"
]

In [38]:
# Score every row's combined text against all candidate categories.
# Scores are gathered first and written back in one assignment: a chained
# `df_new[col].iloc[j] = ...` writes to a temporary Series and is silently
# discarded when pandas Copy-on-Write is active.
category_scores = []
for index, text in enumerate(df_new['combined_text']):
    #counter for progress/debugging
    if index % 10 == 0:
        print(index)

    #running the classifier on the row's text
    res = classifier(
        text,
        candidate_labels = categories,
        multi_label = True
    )
    # Key each score by its label instead of relying on the order in which
    # the classifier returns them.
    category_scores.append(dict(zip(res['labels'], res['scores'])))

# One float column per category, in the order of `categories`. A label the
# classifier did not return is left as NaN rather than a -1 sentinel.
scores = pd.DataFrame(category_scores, columns=categories, dtype=float)
df_new = df_new.assign(**{label: scores[label].to_numpy() for label in categories})

0
10
20
30
40
50
60
70
80
90
100
110
120


In [39]:
# df_demo['max'] = df_demo[['cost_m', 'efficient_m', 'effective_m', 'ease of use_m']].max(axis=1)

In [58]:
# Highest category score per row. Columns are selected by name: a positional
# slice such as `iloc[:, 8:]` depends on column order and sweeps in the 'max'
# column itself whenever this cell is re-run.
df_new['max'] = df_new[categories].max(axis=1)

In [None]:
# Total rows, then how many have a top category score above 0.8
# (both values are echoed because every top-level expression is displayed)
df_new.shape[0]
int((df_new['max'] > 0.8).sum())

128

27

In [63]:
# Rows whose best category score is strictly above 0.8.
# NOTE(review): the saved output lists category columns that do not match the
# current `categories` list, so it appears to come from an earlier run.
df_new[df_new['max'] > 0.8]

Unnamed: 0,Trends_x,Search volume,Started,Trend breakdown,combined_text,Cluster,Trends_y,Total Search Volume,Technology and Science,Health and Wellness,Travel and Leisure,Food and Lifestyle,Education and Knowledge,Sports and Recreation,Finance and Business,Arts and Entertainment,Relationships and Society,Environment and Current Events,max
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai...","wicked book wicked book,elphaba,wicked musical...",5,"['wicked book', 'wicked showtimes']",2100000,0.005286,0.009133,0.065937,0.007739,0.087492,0.004223,0.011364,0.879352,0.042857,0.052078,0.879352
3,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",who won dancing with the stars 2024 who won da...,120,['who won dancing with the stars 2024'],1000000,0.85402,0.900018,0.953339,0.86542,0.907713,0.752964,0.88031,0.961735,0.9778,0.965705,0.9778
5,lake effect snow warning,1000000,2024-11-27 12:20:00+05:00,"lake effect snow warning,thanksgiving,thanksgi...",lake effect snow warning lake effect snow warn...,99,['lake effect snow warning'],1000000,0.038112,0.027817,0.558794,0.118383,0.228534,0.020488,0.034818,0.104366,0.277582,0.97419,0.97419
17,ellen degeneres,200000,2024-11-20 17:10:00+05:00,"ellen degeneres,ellen degeneres moving,portia ...","ellen degeneres ellen degeneres,ellen degenere...",82,['ellen degeneres'],200000,0.089019,0.13882,0.828423,0.379133,0.229511,0.174229,0.165789,0.154959,0.707028,0.789309,0.828423
22,how to get juice wrld in fortnite,200000,2024-11-25 09:00:00+05:00,"how to get juice wrld in fortnite,how to get t...",how to get juice wrld in fortnite how to get j...,111,['how to get juice wrld in fortnite'],200000,0.41522,0.295298,0.291647,0.395969,0.461821,0.052376,0.09399,0.311555,0.368358,0.883945,0.883945
23,a man on the inside,200000,2024-11-21 05:10:00+05:00,"a man on the inside,man on the inside,man on t...","a man on the inside a man on the inside,man on...",76,['a man on the inside'],200000,0.052594,0.080552,0.10328,0.03136,0.06426,0.003915,0.071144,0.868572,0.36257,0.18537,0.868572
27,blake snell,100000,2024-11-26 22:50:00+05:00,"blake snell,snell,blake snell contract,blake s...","blake snell blake snell,snell,blake snell cont...",71,['blake snell'],100000,0.088716,0.043542,0.065235,0.009716,0.117581,0.175311,0.55771,0.061419,0.833556,0.663994,0.833556
30,target black friday,100000,2024-11-21 02:40:00+05:00,"target black friday,target","target black friday target black friday,target",113,['target black friday'],100000,0.077959,0.005759,0.015263,0.012972,0.001933,0.000897,0.277932,0.047945,0.102586,0.847914,0.847914
33,lizzo,100000,2024-11-22 16:50:00+05:00,"lizzo,lil yachty","lizzo lizzo,lil yachty",2,"['lizzo', 'lizzo weight loss']",150000,0.0025,0.00182,0.951469,0.81213,0.000362,0.220595,0.007354,0.667839,0.079974,0.041031,0.951469
41,maui invitational,100000,2024-11-25 09:10:00+05:00,"maui invitational,uconn basketball,uconn men's...","maui invitational maui invitational,uconn bask...",107,['maui invitational'],100000,0.087489,0.00455,0.958245,0.002408,0.007516,0.948943,0.000881,0.74731,0.012835,0.093551,0.958245


In [43]:
# Rows whose 'max' is at least 0.8 (inclusive, unlike the strict > used above).
# NOTE(review): in the saved output 'max' equals 'Total Search Volume', so the
# filter kept low-scoring rows too — it looks like 'max' was computed over a
# column range that included the volume column in that run.
df_new[df_new['max']>=0.8]

Unnamed: 0,Trends_x,Search volume,Started,Trend breakdown,combined_text,Cluster,Trends_y,Total Search Volume,Technology and Science,Health and Wellness,Travel and Leisure,Food and Lifestyle,Education and Knowledge,Sports and Recreation,Finance and Business,Arts and Entertainment,Relationships and Society,Environment and Current Events,max
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai...","wicked book wicked book,elphaba,wicked musical...",5,"['wicked book', 'wicked showtimes']",2100000,0.005286,0.009133,0.065937,0.007739,0.087492,0.004223,0.011364,0.879352,0.042857,0.052078,2100000.0
2,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2...","moana 2 moana 2,moana 2 release date,when does...",77,['moana 2'],1000000,0.043362,0.016064,0.254693,0.115338,0.065446,0.001407,0.023147,0.518922,0.178054,0.288054,1000000.0
3,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",who won dancing with the stars 2024 who won da...,120,['who won dancing with the stars 2024'],1000000,0.854020,0.900018,0.953339,0.865420,0.907713,0.752964,0.880310,0.961735,0.977800,0.965705,1000000.0
4,pam bondi,1000000,2024-11-21 13:40:00+05:00,pam bondi,pam bondi pam bondi,91,['pam bondi'],1000000,0.135664,0.088051,0.170337,0.190012,0.063940,0.009066,0.340765,0.208433,0.312846,0.136041,1000000.0
5,lake effect snow warning,1000000,2024-11-27 12:20:00+05:00,"lake effect snow warning,thanksgiving,thanksgi...",lake effect snow warning lake effect snow warn...,99,['lake effect snow warning'],1000000,0.038112,0.027817,0.558794,0.118383,0.228534,0.020488,0.034818,0.104366,0.277582,0.974190,1000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,indiana fever,1000,2024-11-27 09:00:00+05:00,indiana fever,indiana fever indiana fever,17,['indiana fever'],1000,0.000055,0.000260,0.000542,0.000157,0.000078,0.000175,0.000122,0.000081,0.001055,0.039367,1000.0
134,keith kellogg,1000,2024-11-27 13:00:00+05:00,"keith kellogg,richard grenell","keith kellogg keith kellogg,richard grenell",8,['keith kellogg'],1000,0.450366,0.309760,0.440693,0.150663,0.353568,0.150348,0.255047,0.364686,0.531668,0.342729,1000.0
135,codi alert,500,2024-11-27 12:30:00+05:00,codi alert,codi alert codi alert,3,['codi alert'],500,0.340516,0.183729,0.238147,0.102996,0.236895,0.062233,0.212675,0.119971,0.407309,0.371419,500.0
136,medrick burnett jr,500,2024-11-27 12:00:00+05:00,medrick burnett jr,medrick burnett jr medrick burnett jr,19,['medrick burnett jr'],500,0.524757,0.384591,0.437422,0.159708,0.484181,0.137986,0.267295,0.390745,0.557926,0.413187,500.0


In [62]:
# Export the rows with 'max' >= 0.8 to pretrained.csv.
# NOTE(review): the count cell above uses a strict > 0.8 — confirm which bound
# is intended. to_csv is called with its defaults, so the DataFrame index is
# presumably written as an extra unnamed first column.
df_new[df_new['max']>=0.8].to_csv('pretrained.csv')