In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from datetime import datetime
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

import warnings
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

pd.set_option('display.max_columns', None) 

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df = df.drop(columns=['Ended', 'Explore link'])
df['Started'] = pd.to_datetime(df['Started'])

In [4]:
# Function to convert string to numeric
def convert_to_number(s):
    s = s.replace('+', '')  # Remove the '+' sign
    if 'M' in s:
        return int(float(s.replace('M', '')) * 1_000_000)
    elif 'K' in s:
        return int(float(s.replace('K', '')) * 1_000)
    else:
        return int(s)  # For plain numbers

# Apply the function to the array
numeric_data = np.array([convert_to_number(value) for value in df['Search volume']])

# Output the result
df['Search volume'] = numeric_data

In [5]:
# Combine the "Trends" and "Trend breakdown" columns for text analysis
df['combined_text'] = df['Trends'] + " " + df['Trend breakdown'].fillna("")

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix)

# Use Agglomerative Clustering to group similar topics
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=0.5, affinity='precomputed', linkage='average')
clusters = clustering_model.fit_predict(1 - cosine_sim)  # 1 - cosine similarity to convert to distance

# Add cluster labels to the dataset
df['Cluster'] = clusters

# Display the grouped data
grouped_data = df[['Trends', 'Cluster']].sort_values(by='Cluster')

In [6]:
df.head()

Unnamed: 0,Trends,Search volume,Started,Trend breakdown,combined_text,Cluster
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai...","wicked book wicked book,elphaba,wicked musical...",5
1,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2...","moana 2 moana 2,moana 2 release date,when does...",77
2,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",who won dancing with the stars 2024 who won da...,120
3,pam bondi,1000000,2024-11-21 13:40:00+05:00,pam bondi,pam bondi pam bondi,91
4,lake effect snow warning,1000000,2024-11-27 12:20:00+05:00,"lake effect snow warning,thanksgiving,thanksgi...",lake effect snow warning lake effect snow warn...,99


In [7]:
# Group the data by the Cluster column to collect trends
grouped_by_cluster = df.groupby('Cluster')['Trends'].apply(list).reset_index()
grouped_by_cluster.columns = ['Cluster', 'Trends']

# Group by cluster to calculate total search volume
cluster_summary = df.groupby('Cluster')['Search volume'].sum().reset_index()
cluster_summary.columns = ['Cluster', 'Total Search Volume']

# Merge the two DataFrames on the Cluster column
combined_df = pd.merge(grouped_by_cluster, cluster_summary, on='Cluster')
combined_df

Unnamed: 0,Cluster,Trends,Total Search Volume
0,0,"[champions league, champions league schedule]",520000
1,1,"[kendrick lamar, gnx]",1000000
2,2,"[lizzo, lizzo weight loss]",150000
3,3,[codi alert],500
4,4,"[girona fc, girona]",1500
...,...,...,...
123,123,[regal cinemas],100000
124,124,[pecan pie],50000
125,125,[butterball turkey],500000
126,126,[iphone 17],200000


In [8]:
df_new = pd.merge(df, combined_df, on='Cluster', how = 'inner')
df_new['Trends_y'] = df_new['Trends_y'].astype(str)
#df_new
len(df_new)

138

In [9]:
df_new.head()

Unnamed: 0,Trends_x,Search volume,Started,Trend breakdown,combined_text,Cluster,Trends_y,Total Search Volume
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai...","wicked book wicked book,elphaba,wicked musical...",5,"['wicked book', 'wicked showtimes']",2100000
1,wicked showtimes,100000,2024-11-20 17:10:00+05:00,"wicked showtimes,amc theatres","wicked showtimes wicked showtimes,amc theatres",5,"['wicked book', 'wicked showtimes']",2100000
2,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2...","moana 2 moana 2,moana 2 release date,when does...",77,['moana 2'],1000000
3,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",who won dancing with the stars 2024 who won da...,120,['who won dancing with the stars 2024'],1000000
4,pam bondi,1000000,2024-11-21 13:40:00+05:00,pam bondi,pam bondi pam bondi,91,['pam bondi'],1000000


In [10]:
df_new = df_new.drop_duplicates(subset=['Trends_y','Total Search Volume'])
len(df_new)

128

In [11]:
df_new

Unnamed: 0,Trends_x,Search volume,Started,Trend breakdown,combined_text,Cluster,Trends_y,Total Search Volume
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai...","wicked book wicked book,elphaba,wicked musical...",5,"['wicked book', 'wicked showtimes']",2100000
2,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2...","moana 2 moana 2,moana 2 release date,when does...",77,['moana 2'],1000000
3,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",who won dancing with the stars 2024 who won da...,120,['who won dancing with the stars 2024'],1000000
4,pam bondi,1000000,2024-11-21 13:40:00+05:00,pam bondi,pam bondi pam bondi,91,['pam bondi'],1000000
5,lake effect snow warning,1000000,2024-11-27 12:20:00+05:00,"lake effect snow warning,thanksgiving,thanksgi...",lake effect snow warning lake effect snow warn...,99,['lake effect snow warning'],1000000
...,...,...,...,...,...,...,...,...
133,indiana fever,1000,2024-11-27 09:00:00+05:00,indiana fever,indiana fever indiana fever,17,['indiana fever'],1000
134,keith kellogg,1000,2024-11-27 13:00:00+05:00,"keith kellogg,richard grenell","keith kellogg keith kellogg,richard grenell",8,['keith kellogg'],1000
135,codi alert,500,2024-11-27 12:30:00+05:00,codi alert,codi alert codi alert,3,['codi alert'],500
136,medrick burnett jr,500,2024-11-27 12:00:00+05:00,medrick burnett jr,medrick burnett jr medrick burnett jr,19,['medrick burnett jr'],500


In [12]:
from transformers import pipeline

# Use a zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [13]:
categories = [
    "Technology and Science",
    "Health and Wellness",
    "Travel and Leisure",
    "Food and Lifestyle",
    "Education and Knowledge",
    "Sports and Recreation",
    "Finance and Business",
    "Arts and Entertainment",
    "Relationships and Society",
    "Environment and Current Events"
]

In [None]:
#setting empty values for the columns
index = -1
for label in categories:
    df_new[label] = -1

for j in range(len(df_new)):
    #counter for progress/debugging
    index+=1
    if(index%10 == 0): 
        print(index)
        
    #running the classifier on the column    
    res = classifier(
        df_new.iloc[j]['combined_text'],
        candidate_labels = categories,
        multi_label = True
    )
    #setting the column values according to the output from the classifier ("_m" = multiclass)
    for i in range(len(res['labels'])):
        df_new[res['labels'][i]].iloc[j] = res['scores'][i]

0
10
20
30
40
50
60
70
80
90
100
110
120


In [15]:
# df_demo['max'] = df_demo[['cost_m', 'efficient_m', 'effective_m', 'ease of use_m']].max(axis=1)

In [20]:
df_new['max'] = df_new.iloc[:,4:].max(axis=1)

In [21]:
len(df_new)
len(df_new[df_new['max'] > 0.8])

128

128

In [24]:
df

Unnamed: 0,Trends,Search volume,Started,Trend breakdown,combined_text,Cluster,Technology and Science,Health and Wellness,Travel and Leisure,Food and Lifestyle,Education and Knowledge,Sports and Recreation,Finance and Business,Arts and Entertainment,Relationships and Society,Environment and Current Events,max
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai...","wicked book wicked book,elphaba,wicked musical...",5,0.005286,0.009133,0.065937,0.007739,0.087492,0.004223,0.011364,0.879352,0.042857,0.052078,5.0
1,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2...","moana 2 moana 2,moana 2 release date,when does...",77,0.043362,0.016064,0.254693,0.115338,0.065446,0.001407,0.023147,0.518922,0.178054,0.288054,77.0
2,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",who won dancing with the stars 2024 who won da...,120,0.854020,0.900018,0.953339,0.865420,0.907713,0.752964,0.880310,0.961735,0.977800,0.965705,120.0
3,pam bondi,1000000,2024-11-21 13:40:00+05:00,pam bondi,pam bondi pam bondi,91,0.135664,0.088051,0.170337,0.190012,0.063940,0.009066,0.340765,0.208433,0.312846,0.136041,91.0
4,lake effect snow warning,1000000,2024-11-27 12:20:00+05:00,"lake effect snow warning,thanksgiving,thanksgi...",lake effect snow warning lake effect snow warn...,99,0.038112,0.027817,0.558794,0.118383,0.228534,0.020488,0.034818,0.104366,0.277582,0.974190,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,costco black friday deals,1000,2024-11-27 00:20:00+05:00,costco black friday deals,costco black friday deals costco black friday ...,11,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,11.0
134,codi alert,500,2024-11-27 12:30:00+05:00,codi alert,codi alert codi alert,3,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,3.0
135,medrick burnett jr,500,2024-11-27 12:00:00+05:00,medrick burnett jr,medrick burnett jr medrick burnett jr,19,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,19.0
136,babylon,500,2024-11-27 12:30:00+05:00,babylon,babylon babylon,13,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,13.0


In [22]:
df_new[df_new['max']>=0.8]

Unnamed: 0,Trends_x,Search volume,Started,Trend breakdown,combined_text,Cluster,Trends_y,Total Search Volume,max
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai...","wicked book wicked book,elphaba,wicked musical...",5,"['wicked book', 'wicked showtimes']",2100000,2100000
2,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2...","moana 2 moana 2,moana 2 release date,when does...",77,['moana 2'],1000000,1000000
3,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",who won dancing with the stars 2024 who won da...,120,['who won dancing with the stars 2024'],1000000,1000000
4,pam bondi,1000000,2024-11-21 13:40:00+05:00,pam bondi,pam bondi pam bondi,91,['pam bondi'],1000000,1000000
5,lake effect snow warning,1000000,2024-11-27 12:20:00+05:00,"lake effect snow warning,thanksgiving,thanksgi...",lake effect snow warning lake effect snow warn...,99,['lake effect snow warning'],1000000,1000000
...,...,...,...,...,...,...,...,...,...
133,indiana fever,1000,2024-11-27 09:00:00+05:00,indiana fever,indiana fever indiana fever,17,['indiana fever'],1000,1000
134,keith kellogg,1000,2024-11-27 13:00:00+05:00,"keith kellogg,richard grenell","keith kellogg keith kellogg,richard grenell",8,['keith kellogg'],1000,1000
135,codi alert,500,2024-11-27 12:30:00+05:00,codi alert,codi alert codi alert,3,['codi alert'],500,500
136,medrick burnett jr,500,2024-11-27 12:00:00+05:00,medrick burnett jr,medrick burnett jr medrick burnett jr,19,['medrick burnett jr'],500,500


In [19]:
df.to_csv('pretrained.csv')