In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from datetime import datetime
import seaborn as sns

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# pandas warnings (e.g. about chained assignment) raised by later cells.
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None) 

In [2]:
# Load the raw trends export. Later cells rely on the 'Trends', 'Search volume',
# 'Started', 'Ended' and 'Explore link' columns being present in this file.
df = pd.read_csv('data.csv')

In [3]:
# Discard the columns the analysis does not use, then parse 'Started' into datetimes.
df = (
    df
    .drop(columns=['Ended', 'Explore link'])
    .assign(Started=lambda frame: pd.to_datetime(frame['Started']))
)

In [4]:
# Function to convert string to numeric
def convert_to_number(s):
    """Convert an abbreviated count such as '2M+', '500K+' or '100' to an int.

    Args:
        s: The value to convert. Normally a string; any other value is passed
            through ``str()`` first, so entries that are already integers
            (for example when this cell is re-run on a converted column)
            are accepted as well.

    Returns:
        int: The count with the 'K' (thousand), 'M' (million) or
        'B' (billion) multiplier applied.

    Raises:
        ValueError: If the text left after normalisation is not a valid number.
    """
    # Normalise the text: trim whitespace, drop the '+' sign and thousands
    # separators, and upper-case it so 'k' / 'm' suffixes are recognised too.
    text = str(s).strip().replace('+', '').replace(',', '').upper()

    multipliers = (('B', 1_000_000_000), ('M', 1_000_000), ('K', 1_000))
    for suffix, factor in multipliers:
        if suffix in text:
            # round() rather than int(): int() truncates, so binary floating-point
            # error would turn e.g. '1.001K' (1000.9999999999999) into 1000.
            return round(float(text.replace(suffix, '')) * factor)

    return int(text)  # For plain numbers

# Convert every 'Search volume' entry to an integer count ...
numeric_data = np.array(list(map(convert_to_number, df['Search volume'])))

# ... and store the numeric values back in the same column.
df['Search volume'] = numeric_data

In [5]:
# Preview the first rows to check the result of the cleaning steps above.
df.head()

Unnamed: 0,Trends,Search volume,Started,Trend breakdown
0,wicked book,2000000,2024-11-20 22:00:00+05:00,"wicked book,elphaba,wicked musical,wicked trai..."
1,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2..."
2,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi..."
3,pam bondi,1000000,2024-11-21 13:40:00+05:00,pam bondi
4,lake effect snow warning,1000000,2024-11-27 12:20:00+05:00,"lake effect snow warning,thanksgiving,thanksgi..."


In [6]:
from transformers import pipeline

# Build a zero-shot classification pipeline backed by the
# facebook/bart-large-mnli checkpoint. `classifier` is called once per trend
# in the scoring loop further down.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [7]:
# Candidate labels handed to the zero-shot classifier. Each label also becomes
# the name of a score column added to `df` by the scoring loop.
categories = [
    "Technology and Science",
    "Health and Wellness",
    "Travel and Leisure",
    "Food and Lifestyle",
    "Education and Knowledge",
    "Sports and Recreation",
    "Finance and Business",
    "Arts and Entertainment",
    "Relationships and Society",
    "Environment and Current Events"
]

In [8]:
# Score every trend against every category with the zero-shot classifier.
#
# One column per category is created up front. The placeholder is the float
# -1.0 (not the int -1) so the columns already have the dtype of the scores
# that are written into them.
for label in categories:
    df[label] = -1.0

# Positional lookup of each category column, computed once outside the loop.
column_position = {label: df.columns.get_loc(label) for label in categories}

for row_position, trend in enumerate(df['Trends']):
    # Lightweight progress indicator: print every 10th row number.
    if row_position % 10 == 0:
        print(row_position)

    # Run the classifier on the trend text.
    res = classifier(
        trend,
        candidate_labels=categories,
        multi_label=True
    )

    # Scores are matched to their column by label name, not by position in the
    # result. Writing through df.iat updates `df` itself; the chained form
    # df[col].iloc[row] = value may assign to a temporary copy and leave `df`
    # unchanged.
    for label, score in zip(res['labels'], res['scores']):
        df.iat[row_position, column_position[label]] = score

0
10
20
30
40
50
60
70
80
90
100
110
120
130


In [9]:
# df_demo['max'] = df_demo[['cost_m', 'efficient_m', 'effective_m', 'ease of use_m']].max(axis=1)

In [10]:
# Highest category score per trend. The category columns are selected by name
# instead of by position (iloc[:, 4:]), so the result does not depend on how
# many columns precede them and does not pick up 'max' itself on a re-run.
df['max'] = df[categories].max(axis=1)

In [11]:
# Total number of trends, followed by the number whose best category score is
# strictly above 0.8. Both values are shown because ast_node_interactivity is
# set to 'all'.
# NOTE(review): this count uses a strict `>` while the listing cell below
# filters with `>=` - confirm which threshold is intended.
len(df)
len(df[df['max'] > 0.8])

138

27

In [14]:
# List the trends whose best category score is at least 0.8.
df[df['max']>=0.8]

Unnamed: 0,Trends,Search volume,Started,Trend breakdown,Technology and Science,Health and Wellness,Travel and Leisure,Food and Lifestyle,Education and Knowledge,Sports and Recreation,Finance and Business,Arts and Entertainment,Relationships and Society,Environment and Current Events,max
1,moana 2,1000000,2024-11-22 07:20:00+05:00,"moana 2,moana 2 release date,when does moana 2...",0.248187,0.112772,0.232687,0.183295,0.084218,0.023727,0.058333,0.863986,0.199458,0.209053,0.863986
2,who won dancing with the stars 2024,1000000,2024-11-25 22:00:00+05:00,"who won dancing with the stars 2024,dancing wi...",0.059512,0.012218,0.03244,0.000518,0.005181,0.016132,0.005418,0.804821,0.054749,0.047658,0.804821
4,lake effect snow warning,1000000,2024-11-27 12:20:00+05:00,"lake effect snow warning,thanksgiving,thanksgi...",0.00268,0.004861,0.095617,0.001321,0.003681,0.001316,0.009115,0.000791,0.065345,0.958014,0.958014
5,butterball turkey,500000,2024-11-24 08:20:00+05:00,"butterball turkey,butterball turkey recall,but...",0.000692,0.00139,0.047149,0.882402,0.000289,0.027688,0.008625,0.098987,0.022402,0.016866,0.882402
13,ellen degeneres,200000,2024-11-20 17:10:00+05:00,"ellen degeneres,ellen degeneres moving,portia ...",0.42346,0.211851,0.267165,0.354418,0.967954,0.046834,0.24647,0.228367,0.492851,0.692686,0.967954
28,wicked showtimes,100000,2024-11-20 17:10:00+05:00,"wicked showtimes,amc theatres",0.302545,0.188074,0.339622,0.02303,0.067401,0.074423,0.067329,0.933368,0.36716,0.15898,0.933368
38,maui invitational,100000,2024-11-25 09:10:00+05:00,"maui invitational,uconn basketball,uconn men's...",0.34262,0.05241,0.952833,0.369487,0.017221,0.906881,0.035402,0.853587,0.13932,0.165071,0.952833
42,natalie harp,50000,2024-11-25 13:10:00+05:00,natalie harp,0.073936,0.15181,0.188407,0.118528,0.034574,0.020555,0.097181,0.814627,0.380115,0.18643,0.814627
47,pecan pie,50000,2024-11-21 15:00:00+05:00,pecan pie,0.000181,0.001642,0.004626,0.962455,0.000439,0.000177,0.000362,0.001527,0.004,0.002449,0.962455
48,macy's thanksgiving day parade,50000,2024-11-25 07:10:00+05:00,"macy's thanksgiving day parade,thanksgiving pa...",0.001432,0.000927,0.00843,0.459689,0.000656,0.092036,0.000643,0.868431,0.011811,0.007848,0.868431


In [12]:
# Persist the scored frame. index=False is not passed, so the row index is
# written as the first, unnamed column of the CSV.
df.to_csv('pretrained.csv')