In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from datetime import datetime
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

import warnings
# Install a global "ignore" filter: no warning of any category is shown for the
# rest of the session.
# NOTE(review): this presumably also hides pandas / scikit-learn deprecation and
# chained-assignment warnings raised by later cells - confirm that is intended.
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None) 

In [None]:
# Read the raw export from the working directory into the main frame.
df = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Remove the two columns this notebook never reads. errors='ignore' keeps the
# cell re-runnable: `df` is overwritten in place, so on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['Ended', 'Explore link'], errors='ignore')
# Parse the 'Started' column into pandas datetime values.
df['Started'] = pd.to_datetime(df['Started'])

In [None]:
# Function to convert string to numeric
def convert_to_number(s):
    """Convert an abbreviated count label such as "200K+" or "1M+" to an int.

    Parameters
    ----------
    s : str or number
        A label made of a number, an optional K / M / B magnitude suffix
        (case-insensitive) and optional '+' signs, thousands separators or
        surrounding whitespace. A value that is already numeric is passed
        through ``int()`` so the conversion can be applied twice safely.

    Returns
    -------
    int
        The expanded integer value.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed (or a numeric input is NaN).
    """
    # Already-converted values (int, numpy integer, float) have no string
    # methods; accept them so re-running the conversion does not crash.
    if not isinstance(s, str):
        return int(s)

    text = s.replace('+', '').replace(',', '').strip().upper()

    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
    for suffix, factor in multipliers.items():
        if text.endswith(suffix):
            # round(), not int(): the float product can land just below the
            # exact value (8.2 * 1_000_000 == 8199999.999999999) and int()
            # would truncate it to one less than intended.
            return round(float(text[:-1]) * factor)

    return int(text)  # For plain numbers

# Run the converter over every 'Search volume' entry and collect the results
# in a NumPy array.
numeric_data = np.array(list(map(convert_to_number, df['Search volume'])))

# Overwrite the text column with its numeric counterpart.
df['Search volume'] = numeric_data

In [None]:
# Combine the "Trends" and "Trend breakdown" columns for text analysis.
# Both sides are NaN-filled: a missing value on either side would otherwise turn
# the whole concatenation into NaN, which TfidfVectorizer cannot tokenize.
df['combined_text'] = df['Trends'].fillna("") + " " + df['Trend breakdown'].fillna("")

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# Calculate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix)

# Convert similarity to a distance matrix. Floating-point round-off can leave
# tiny negative entries, and a row with no vocabulary terms has similarity 0
# with itself, so clip at zero and force an exact-zero diagonal.
distance_matrix = np.clip(1 - cosine_sim, 0, None)
np.fill_diagonal(distance_matrix, 0)

# Use Agglomerative Clustering to group similar topics.
# `metric` is the current name of the former `affinity` argument (renamed in
# scikit-learn 1.2, old name removed in 1.4). Rows whose average pairwise
# distance is below the threshold end up in the same cluster.
clustering_model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0.5,
    metric='precomputed',
    linkage='average',
)
clusters = clustering_model.fit_predict(distance_matrix)

# Add cluster labels to the dataset
df['Cluster'] = clusters

# Keep a cluster-sorted view of the trends for inspection (not displayed here)
grouped_data = df[['Trends', 'Cluster']].sort_values(by='Cluster')

In [None]:
# Peek at the first five rows of the working frame.
df.head(n=5)

In [None]:
# Collect, for every cluster, the list of trend names that belong to it.
grouped_by_cluster = (
    df.groupby('Cluster')['Trends']
    .apply(list)
    .reset_index(name='Trends')
)

# Sum the search volume of all trends inside each cluster.
cluster_summary = (
    df.groupby('Cluster')['Search volume']
    .sum()
    .reset_index(name='Total Search Volume')
)

# Join both per-cluster tables on the cluster id and show the result.
combined_df = grouped_by_cluster.merge(cluster_summary, on='Cluster')
combined_df

In [None]:
# Attach the per-cluster trend list and total volume to every row. Both frames
# have a 'Trends' column, so the merge yields 'Trends_x' and 'Trends_y'.
df_new = df.merge(combined_df, how='inner', on='Cluster')
# Store the list-valued 'Trends_y' column as its string representation.
df_new = df_new.assign(Trends_y=df_new['Trends_y'].astype(str))
len(df_new)

In [None]:
# Peek at the first five rows of the merged frame.
df_new.head(n=5)

In [None]:
# Keep only the first row for each (trend list, total volume) combination.
is_repeat = df_new.duplicated(subset=['Trends_y', 'Total Search Volume'])
df_new = df_new.loc[~is_repeat]
len(df_new)

In [None]:
# Display df_new in full (every row, not just a head() preview).
df_new

In [None]:
from transformers import pipeline

# Build a zero-shot classification pipeline from the
# "facebook/bart-large-mnli" checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
)

In [None]:
# Candidate labels handed to the zero-shot classifier; the same strings are
# also used as the names of the per-category score columns added to `df`.
categories = [
    "Technology and Science",
    "Health and Wellness",
    "Travel and Leisure",
    "Food and Lifestyle",
    "Education and Knowledge",
    "Sports and Recreation",
    "Finance and Business",
    "Arts and Entertainment",
    "Relationships and Society",
    "Environment and Current Events"
]

In [None]:
# Score every row's combined text against all candidate categories and store
# one score column per category on `df`.
#
# The scores are gathered in a plain list first and written to the frame once
# per column afterwards. Writing cell by cell with `df[col].iloc[j] = value` is
# chained assignment: it may modify a temporary copy and is a silent no-op when
# pandas Copy-on-Write is active, which would leave every score at -1.
category_scores = []
for j, text in enumerate(df['combined_text']):
    # counter for progress/debugging
    if j % 10 == 0:
        print(j)

    # running the classifier on the text; multi_label=True scores each label
    # independently instead of normalising the scores across labels
    res = classifier(
        text,
        candidate_labels=categories,
        multi_label=True,
    )
    # the classifier returns labels sorted by score, so key the scores by label
    category_scores.append(dict(zip(res['labels'], res['scores'])))

# One float column per category; -1 remains the sentinel for a missing score.
score_frame = pd.DataFrame(category_scores, columns=categories).fillna(-1)
for label in categories:
    # positional assignment (row j of the scores -> row j of df), independent
    # of whatever index `df` carries
    df[label] = score_frame[label].to_numpy()

In [None]:
# df_demo['max'] = df_demo[['cost_m', 'efficient_m', 'effective_m', 'ease of use_m']].max(axis=1)

In [None]:
# Highest category score per row. Select the score columns by name: the
# positional slice df.iloc[:, 4:] also covered 'combined_text' (strings) and
# 'Cluster' (an integer label), so the maximum either failed on mixed types or
# was dominated by the cluster id, and a re-run would include 'max' itself.
df['max'] = df[categories].max(axis=1)

In [None]:
# Total row count, followed by the number of rows whose 'max' value is above 0.8.
len(df)
len(df.loc[df['max'] > 0.8])

In [None]:
# Show the rows whose 'max' value is at least 0.8.
df.loc[df['max'] >= 0.8]

In [None]:
# Write the frame (index included) to a CSV file in the working directory.
df.to_csv(path_or_buf='pretrained.csv')