In [1]:
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import pandas as pd
from top2vec import Top2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

# Location of the joined user/video transcript table, relative to the notebook.
joined_csv_path = './Data/User_Videos_joined.csv'

# Load the table once; later cells clean it and model it per user.
total_df = pd.read_csv(joined_csv_path)



  from .autonotebook import tqdm as notebook_tqdm
2023-11-11 18:43:07.324660: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:


## One row per (user, video): `text_concat` is the video's transcript and
## `start_max` is the total number of seconds for that video.
total_df


Unnamed: 0,video_id,uid,text_concat,start_max,count
0,1QAqxiO8VHM,DtdDe6l482ONxFGKUPIZfJs4U4P2,Take the graph of y = sin(x) and transform \ni...,1394.520,256.0
1,lg5wznn3IBE,DtdDe6l482ONxFGKUPIZfJs4U4P2,"this is the prestigious Walnut cup and,if you ...",1387.679,632.0
2,yTe-haq1ZUc,DtdDe6l482ONxFGKUPIZfJs4U4P2,"elon musk recently gave a warning to,mark zuck...",513.360,239.0
3,tBQY1MC7Pf8,DtdDe6l482ONxFGKUPIZfJs4U4P2,"Russia continues to threaten Elon Musk,but thi...",589.200,285.0
4,HxAoQSkMHcw,DtdDe6l482ONxFGKUPIZfJs4U4P2,"and so where specifically will you be in,terms...",918.000,334.0
...,...,...,...,...,...
1288,7ns4J-DgVyA,UdZxcsQWpDO5BLH4rNerJl9Y7Fz1,"so I'm sure a lot of you are absolutely,shocke...",1337.340,565.0
1289,Hue46YwRvSc,UdZxcsQWpDO5BLH4rNerJl9Y7Fz1,A few people in the comments sections of my\nr...,1035.032,213.0
1290,wSLoSUGbGaA,UdZxcsQWpDO5BLH4rNerJl9Y7Fz1,"foreign,[Music],makes I mean these are the pri...",2722.079,1251.0
1291,e0plyXEc1Ak,UdZxcsQWpDO5BLH4rNerJl9Y7Fz1,The governing council\n decided the following:...,4641.570,1290.0


In [5]:
# Drop exact duplicate rows, then keep only rows that actually have a
# transcript: `text_concat` is the text the per-user topic models are trained
# on, so rows without it cannot be used.
total_df = total_df.drop_duplicates()
total_df = total_df[total_df['text_concat'].notna()]



In [None]:
# Train one Top2Vec model per user and flatten the results into three record
# lists that the next cell turns into DataFrames / CSV files:
#   topic_terms   - one record per (user, topic, term) with the term's weight
#   topic_info_ls - one record per (user, topic): 2-D plot coordinates, total
#                   minutes of video in the topic and the topic's first term
#   topic_vds_ls  - one record per (user, video) with the assigned topic
# The lists are re-created here so re-running this cell does not append twice.
topic_terms = []
topic_info_ls = []
topic_vds_ls = []

for uid in total_df['uid'].unique():
    # .copy() gives an independent frame, so adding the 'cluster' column below
    # does not write into a slice of total_df (avoids SettingWithCopyWarning).
    df = total_df[total_df['uid'] == uid].copy()

    # NOTE(review): no random seed is set for the model, so topic assignments
    # may differ between runs - confirm whether reproducibility matters here.
    model = Top2Vec(list(df['text_concat']), embedding_model='universal-sentence-encoder-multilingual')

    # topics[0][i] holds the terms of topic i, topics[1][i] the matching weights.
    topics = model.get_topics()
    topic_words, word_scores = topics[0], topics[1]
    n_topics = len(topic_words)

    # One record per term of every topic.
    for cluster, (terms, weights) in enumerate(zip(topic_words, word_scores)):
        topic_terms.extend(
            {'cluster': cluster, 'term': term, 'weight': weight, 'uid': uid}
            for term, weight in zip(terms, weights)
        )

    # The first term of each topic is used as that topic's label.
    most_seperating_words = [terms[0] for terms in topic_words]

    # Similarity of every word vector to every topic vector:
    # shape (number of words) x (number of topics).
    topic_word = cosine_similarity(model.word_vectors, model.topic_vectors)

    # PCA treats each topic as a feature, so components_[k][i] is topic i's
    # loading on component k; those loadings are used as plot coordinates.
    # PCA rejects n_components > min(n_samples, n_features), which would happen
    # for a user with a single topic, so cap it and pad the missing axis with 0.
    pca = PCA(n_components=min(2, *topic_word.shape))
    pca.fit(topic_word)
    xs = pca.components_[0]
    ys = pca.components_[1] if len(pca.components_) > 1 else [0.0] * n_topics

    topic_info_df = pd.DataFrame([
        {'cluster': cluster, 'x': xs[cluster], 'y': ys[cluster], 'uid': uid}
        for cluster in range(n_topics)
    ])

    # model.doc_top is assigned row by row as each document's topic.
    df['cluster'] = model.doc_top
    topic_videos_df = df[['video_id', 'start_max', 'cluster', 'uid']]

    # Total video length per topic, converted from seconds to minutes.
    topic_minutes = topic_videos_df.groupby('cluster')['start_max'].sum() / 60

    # A topic with no assigned video gets 0 minutes instead of raising KeyError.
    topic_info_df['total_minutes'] = topic_info_df['cluster'].map(topic_minutes).fillna(0.0)
    topic_info_df['most_seperating_word'] = most_seperating_words

    # Accumulate this user's rows as plain dict records.
    topic_info_ls.extend(topic_info_df.to_dict('records'))
    topic_vds_ls.extend(topic_videos_df.to_dict('records'))

In [None]:
# Materialise the accumulated record lists as DataFrames.
Topic_Info_df = pd.DataFrame(topic_info_ls)
Videos_With_Cluster_df = pd.DataFrame(topic_vds_ls)
topic_terms_df = pd.DataFrame(topic_terms)

# Write each frame to its CSV file (without the index column), in the same
# order as before: topic info, videos with cluster, topic terms.
for frame, csv_name in (
    (Topic_Info_df, 'topic_info.csv'),
    (Videos_With_Cluster_df, 'videos_with_cluster.csv'),
    (topic_terms_df, 'topic_terms.csv'),
):
    frame.to_csv(csv_name, index=False)
