# Topic Modeling

In [1]:
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import decomposition
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
import gensim
from sklearn.metrics.pairwise import cosine_similarity
from statistics import mean
import warnings
from tqdm import tqdm

warnings.filterwarnings('ignore')

#venv m2

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1108)>


In [2]:
path = '../data_cleaning/data/youtube_dislike_dataset.csv' #raw download
clean_transcripts = pd.read_csv(path)
clean_transcripts.head(1)
#splitting the dislikes dataframe by the median 
clean_transcripts['dislikes'].median()
below_median = clean_transcripts[clean_transcripts['dislikes'] < 796]
above_median = clean_transcripts[clean_transcripts['dislikes'] >= 796]
print(len(above_median))
print(len(below_median))
#Create Final Text Column for Machine Learning and Remove Stop Words
below_median["all_text"] = below_median["title"] + below_median["description"]
above_median["all_text"] = above_median["title"] + above_median["description"]
stop = stopwords.words('english')
below_median['all_text'] = below_median['all_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
above_median['all_text'] = above_median['all_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

below_median['title'] = below_median['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
above_median['title'] = above_median['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

18712
18710


In [3]:
clean_transcripts_list = below_median['title'].tolist()
documents_train = clean_transcripts_list
print(len(documents_train))

18710


## LDA

In [4]:
def compute_lda(n_topics,documents_train):
    tfidf_vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_documents = tfidf_vectorizer.fit_transform(documents_train)
    tf_vectorizer = CountVectorizer(stop_words="english")
    tf_documents = tf_vectorizer.fit_transform(documents_train)
    tf_feature_names = tf_vectorizer.get_feature_names_out()

    # This cell will take a couple of minutes to run...
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
    lda.fit(tf_documents)
    topic_models = lda.components_
    num_top_words = 8
    return lda, tf_feature_names, num_top_words
    #display_topics(lda, tf_feature_names, num_top_words)

In [5]:
def display_topics(model, feature_names, no_top_words):
    topic_to_terms = {}
    for topic_idx, topic in enumerate(model.components_):
        term_list = [
            feature_names[i] for i in topic.argsort()[: -no_top_words - 1 : -1]
        ]
        #print("topic %d:" % (topic_idx), term_list)
        topic_to_terms[topic_idx] = term_list
    return topic_to_terms

In [6]:
pretrained_path = '../../../word2vec/GoogleNews-vectors-negative300.bin'
Word2VecModel = KeyedVectors.load_word2vec_format(pretrained_path, binary = True)

In [7]:
#https://www.kaggle.com/code/nkitgupta/text-representations
def topical_coherence(items, w2vmodel):

    result = []
    for item in items:
        try:
            if w2vmodel==Word2VecModel:
                result.append(w2vmodel[item[0]])
            elif w2vmodel==glove_embeddings:
                result.append(w2vmodel[item[0]])
        except KeyError:
            pass
    if len(result) == 0:
        return 0

    matrix_sim = cosine_similarity(result)
    np.fill_diagonal(matrix_sim, 0)
    return np.mean(matrix_sim)

def answer_coherence_a(w2vmodel):
    a = topical_coherence(['train', 'car', 'bicycle', 'bus', 'vehicle', 'transport'], w2vmodel)
    b = topical_coherence(['scsi', 'drive', 'computer', 'storage', 'megabyte'], w2vmodel)
    c = topical_coherence(['introduction', 'pickle', 'guard', 'red', 'valiant'], w2vmodel)

    return a, b, c
print("These values reveal that good coherences are roughly above 0.3 ")
print(answer_coherence_a(w2vmodel=Word2VecModel))

(0.40483522, 0.31330317, 0.2883513)


In [50]:
def mean_coherence_per_lda(test_found_topics, min_coherence=0.33):
    coherence_per_lda = []
    remaining_topics = []
    num_removed_topics = 0
    for key in test_found_topics.keys():
        terms = test_found_topics[key]
        val = topical_coherence(terms, w2vmodel=Word2VecModel)
        if val>min_coherence:
            coherence_per_lda.append(val)
            remaining_topics.append(terms)
        else:
            num_removed_topics += 1
    return mean(coherence_per_lda),remaining_topics,num_removed_topics

In [66]:
def best_lda_per_df(documents_train):
    max_topics = None
    max_lda = None
    max_tf_feature_names = None
    max_num_top_words = None
    max_coherence = 0
    best_topics = None
    final_num_removed_topics = None
    results = []
    for i in tqdm(range(5, 15, 1)):
        lda, tf_feature_names, num_top_words = compute_lda(i,documents_train)
        test_found_topics = display_topics(lda, tf_feature_names, num_top_words)
        coherence, remaining_topics,num_removed_topcs = mean_coherence_per_lda(test_found_topics)
        results.append(coherence)
        if coherence>max_coherence:
            max_coherence = coherence
            max_lda = lda
            max_tf_feature_names = tf_feature_names
            max_num_top_words = num_top_words
            max_topics = i
            best_topics = test_found_topics
            final_num_removed_topics = num_removed_topcs
    print(f"best number topics: {max_topics}")
    print(results)
    print(max_coherence)
    print("The values for the best average coherence model are")
    print(display_topics(max_lda, max_tf_feature_names, num_top_words))
    print(f"Number of final topics removed: {final_num_removed_topics}")
    return remaining_topics, max_topics
    #19 was best  

In [67]:
documents_train_below = below_median["title"]
remaining_topics_below, max_topics_below = best_lda_per_df(documents_train_below)

100%|██████████| 10/10 [06:37<00:00, 39.72s/it]

best number topics: 14
[0.36841208, 0.35977232, 0.35831177, 0.35342705, 0.36817968, 0.36395422, 0.3481807, 0.37460297, 0.3514898, 0.38780206]
0.38780206
The values for the best average coherence model are
{0: ['vs', 'ufc', 'fifa', 'team', 'fight', 'pregnant', 'celtic', '22'], 1: ['nba', 'tv', '2021', 'episode', 'lakers', 'game', 'hermitcraft', 'ep'], 2: ['scenes', '2020', 'minecraft', 'live', 'chunkz', 'watch', 'day', 'fans'], 3: ['fortnite', '2021', 'la', 'en', 'new', 'episode', 'les', 'vlog'], 4: ['trailer', 'official', 'season', 'ft', 'little', '000', 'tour', 'announcement'], 5: ['video', 'official', 'music', 'feat', 'audio', 'ft', 'ep', 'lil'], 6: ['man', 'news', 'utd', 'transfer', 'united', 'goldbridge', 'manchester', 'reaction'], 7: ['highlights', 'league', 'vs', 'premier', 'city', 'united', 'arsenal', 'chelsea'], 8: ['2020', 'highlights', 'vs', 'world', 'cup', 'week', 'game', 'nfl'], 9: ['new', 'update', 'car', '2021', 'results', 'national', 'draw', 'lottery'], 10: ['live', '202




In [68]:
documents_train_above = above_median["title"]
remaining_topics_above, max_topics_above= best_lda_per_df(documents_train_above)

100%|██████████| 10/10 [05:47<00:00, 34.71s/it]

best number topics: 14
[0.37262762, 0.3813132, 0.37289613, 0.36515358, 0.371236, 0.36844745, 0.38009697, 0.35895538, 0.36470556, 0.38535935]
0.38535935
The values for the best average coherence model are
{0: ['vs', 'highlights', '2020', 'week', 'news', '2021', 'nfl', 'super'], 1: ['vs', 'iphone', 'sidemen', '12', 'tv', 'pro', 'la', '19'], 2: ['minecraft', '100', 'days', 'vs', '24', 'man', 'dream', 'paul'], 3: ['2021', 'ep', 'highlights', 'final', 'ka', 'episode', 'league', 'taarak'], 4: ['000', 'en', 'le', 'les', 'challenge', 'wins', 'la', 'ft'], 5: ['shorts', 'fortnite', 'tiktok', 'season', 'vs', 'new', 'challenge', 'hacks'], 6: ['trailer', 'official', 'theory', 'film', 'reveal', 'food', 'teaser', 'don'], 7: ['night', 'friday', 'funkin', 'mod', 'live', 'hot', 'tour', 'animation'], 8: ['video', 'official', 'music', 'ft', 'feat', 'lil', 'audio', 'oficial'], 9: ['game', '2021', '2020', 'highlights', 'bts', 'life', 'squid', 'clip'], 10: ['minecraft', 'new', 'star', 'update', 'day', 'tik',




In [74]:
def create_findings_table(remaining_topics):
    df = pd.DataFrame()
    # Iterate over each inner list
    for i, inner_list in enumerate(remaining_topics):
        # Create a new column with column name as 'Column_i'
        df['Topic{}'.format(i)] = inner_list
    # Display the DataFrame
    return df

In [77]:
below_table = create_findings_table(remaining_topics_below)
below_table

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7
0,trailer,man,highlights,2020,new,live,house,best
1,official,news,league,highlights,update,2021,new,day
2,season,utd,vs,vs,car,quiz,tour,life
3,ft,transfer,premier,world,2021,virtual,year,2020
4,little,united,city,cup,results,pub,van,baby
5,000,goldbridge,united,week,national,la,home,reveal
6,tour,manchester,arsenal,game,draw,date,moving,nba
7,announcement,reaction,chelsea,nfl,lottery,day,tom,build


In [78]:
above_table = create_findings_table(remaining_topics_above)
above_table

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8
0,minecraft,000,shorts,trailer,night,video,game,mv,free
1,100,en,fortnite,official,friday,official,2021,teaser,got
2,days,le,tiktok,theory,funkin,music,2020,2020,people
3,vs,les,season,film,mod,ft,highlights,bad,season
4,24,challenge,vs,reveal,live,feat,bts,boss,war
5,man,wins,new,food,hot,lil,life,try,unboxing
6,dream,la,challenge,teaser,tour,audio,squid,2021,snl
7,paul,ft,hacks,don,animation,oficial,clip,movie,cold


In [95]:
above_list = above_table.values.flatten().tolist()
below_list = below_table.values.flatten().tolist()

In [2]:
#Venn Diagram
from matplotlib_venn import venn2

plt.figure(figsize=(4,4))
set1 = set(above_list)
set2 = set(below_list)

venn2([set1, set2], ('Set1', 'Set2'))
plt.show()

ModuleNotFoundError: No module named 'matplotlib_venn'

In [3]:
from matplotlib_venn_wordcloud import venn2_wordcloud
#https://github.com/paulbrodersen/matplotlib_venn_wordcloud

ModuleNotFoundError: No module named 'matplotlib_venn_wordcloud'

In [None]:
topical_coherence(test_list, w2vmodel)

Visualizing Results

Using Title+Description in All_Text
[0.43928024, 0.44796124, 0.45181724, 0.44953543, 0.44237196, 0.46059352, 0.45885643, 0.43762824, 0.4357811, 0.42841437, 0.4201536, 0.42084923, 0.42975327, 0.4204068, 0.41796622, 0.40867746, 0.4156867, 0.41626924, 0.41916725]
The values for the best average coherence model are
{0: ['com', 'https', 'http', 'ly', 'www', 'bit', 'instagram', 'twitter'], 1: ['https', 'com', 'www', 'twitter', 'instagram', 'youtube', 'http', 'facebook'], 2: ['https', 'com', 'www', 'youtube', 'instagram', 'twitter', 'channel', 'http'], 3: ['https', 'und', 'die', 'com', 'www', 'auf', 'der', 'http'], 4: ['com', 'https', 'que', 'da', 'like', 'official', 'se', 'www'], 5: ['https', 'amzn', 'ly', 'bit', 'la', 'et', 'le', 'les']}

In [12]:
import numpy as np
max_index = np.argmax(results)
print(max_index)
print(results[max_index])

0
0.3125137


[0.43928024, 0.44796124, 0.45181724, 0.44953543, 0.44237196, 0.46059352, 0.45885643, 0.43762824, 0.4357811, 0.42841437, 0.4201536, 0.42084923, 0.42975327, 0.4204068, 0.41796622, 0.40867746, 0.4156867, 0.41626924, 0.41916725]