# Topic Modeling

This trains an LSI, LDA, and NMF to come up with groupings of terms that should be in the same topics.
These algorithms do not give topic names other than topic0 topic1 etc. ChatGPT was asked to name the topics.

These algorithms can be used to label a new document with the most likely topics by probability that correspond to it
ie: predict_topics_lda

In [16]:
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import decomposition
from sklearn.decomposition import LatentDirichletAllocation

In [17]:
#path = '/Users/gabrielalon/Desktop/Milestone2/siads_696_milestoneII_project/src/data_cleaning/data/video_transcripts_clean.csv'
path = '../data_cleaning/data/video_transcripts_clean.csv'

clean_transcripts = pd.read_csv(path)
clean_transcripts.head(2)

Unnamed: 0.1,Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views,Is_Generated,Transcript_Blob,Transcript_Clean,Title_Clean
0,0,Apple Pay Is Killing the Physical Wallet After...,wAZZ-UWGVHI,2022-08-23,tech,3407.0,672.0,135612.0,True,[Music] this is your tech news briefing for t...,music tech news brief tuesday august 23rd im z...,appl pay kill physic wallet eight year tech ne...
1,1,The most EXPENSIVE thing I own.,b3x28s61q3c,2022-08-24,tech,76779.0,4306.0,1758063.0,True,this is my car this is my wife's car and this...,car wife car right power suppli tester bought ...,expens thing


In [18]:
clean_transcripts_list = clean_transcripts['Title'].tolist()
documents_train = clean_transcripts_list
print(len(documents_train))

1881


## LSI

In [19]:
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1), min_df=2, max_df=0.95, stop_words="english"
)  # default English stopwords

tfidf_documents = tfidf_vectorizer.fit_transform(documents_train)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

n_topics = 5
lsi = TruncatedSVD(n_components=n_topics, random_state=0)

# This is the matrix U_k:  num_term_features x num_topics
reduced_term_matrix = lsi.fit_transform(np.transpose(tfidf_documents))

# and this is the matrix V_k^T  num_topics x num_documents
reduced_document_matrix = lsi.components_

# these are the the values along the diagonal of matrix \Sigma.
singular_values = lsi.singular_values_

#In Progress Applying LSI to label videos as topics
    # Missing


## LDA

In [20]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_documents = tfidf_vectorizer.fit_transform(documents_train)

# get_feature_names() deprecated
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

tf_vectorizer = CountVectorizer(stop_words="english")
tf_documents = tf_vectorizer.fit_transform(documents_train)

# get_feature_names() deprecated
# tf_feature_names = tf_vectorizer.get_feature_names()
tf_feature_names = tf_vectorizer.get_feature_names_out()

print(tf_documents.shape)

# This cell will take a couple of minutes to run...
n_topics = 10

lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
lda.fit(tf_documents)
topic_models = lda.components_

num_top_words = 8


def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        term_list = [
            feature_names[i] for i in topic.argsort()[: -no_top_words - 1 : -1]
        ]
        print("topic %d:" % (topic_idx), term_list)

display_topics(lda, tf_feature_names, num_top_words)

(1881, 4590)
topic 0: ['39', 'amp', 'mukbang', 'cube', 'biology', 'rubik', 'asmr', 'eating']
topic 1: ['xbox', '39', '000', 'video', '100', 'vs', 'animals', 'trolling']
topic 2: ['apple', 'iphone', '2022', '14', '39', 'new', 'amp', 'games']
topic 3: ['quot', 'sat', 'literature', 'reaction', 'review', 'score', 'bed', 'movie']
topic 4: ['game', '2022', '39', 'development', 'amp', 'questions', 'new', 'answers']
topic 5: ['nintendo', 'switch', 'business', 'interview', 'food', 'job', 'movies', 'best']
topic 6: ['lofi', 'music', 'sports', 'hip', 'hop', 'chill', 'beats', 'study']
topic 7: ['science', 'learning', 'data', 'machine', 'computer', 'course', 'beginners', 'tutorial']
topic 8: ['2022', 'best', 'music', 'songs', 'marvel', 'food', 'studios', 'gaming']
topic 9: ['sat', 'world', '39', 'things', 'google', 'trolling', 'tips', 'study']


# Applying LDA to  Predict Topics on a New Document

In [21]:
new_document = "My nintendo switch"
print(f"The Top topics for '{new_document}':")
def predict_topics_lda(doc, num_topics=3):
    new_document_vector = tf_vectorizer.transform([new_document])
    topic_probabilities = lda.transform(new_document_vector)
   # print(topic_probabilities)
    flat_values = np.ravel(topic_probabilities)
    descending_indexes = np.argsort(flat_values)[::-1]
    top_n_topics_sorted = descending_indexes[:num_topics]
    #print(descending_indexes[0])
    return list(top_n_topics_sorted)
predict_topics_lda(new_document)

The Top topics for 'My nintendo switch':


[5, 9, 2]

In [22]:
#CHATGPT
topic_dictionary = {
    "ASMR and Rubik's Cube Science": ['39', 'amp', 'mukbang', 'cube', 'biology', 'rubik', 'asmr', 'eating'],
    "Gaming Encounters and Pranks": ['xbox', '39', '000', 'video', '100', 'vs', 'animals', 'trolling'],
    "Latest Apple Tech and Games": ['apple', 'iphone', '2022', '14', '39', 'new', 'amp', 'games'],
    "Movie Reviews and Literature": ['quot', 'sat', 'literature', 'reaction', 'review', 'score', 'bed', 'movie'],
    "Game Development Insights": ['game', '2022', '39', 'development', 'amp', 'questions', 'new', 'answers'],
    "Nintendo Business and Entertainment": ['nintendo', 'switch', 'business', 'interview', 'food', 'job', 'movies', 'best'],
    "Lo-fi Music for Study and Sports": ['lofi', 'music', 'sports', 'hip', 'hop', 'chill', 'beats', 'study'],
    "Intro to Machine Learning": ['science', 'learning', 'data', 'machine', 'computer', 'course', 'beginners', 'tutorial'],
    "Best of 2022: Music, Movies, and Gaming": ['2022', 'best', 'music', 'songs', 'marvel', 'food', 'studios', 'gaming'],
    "Worldly Study Tips and Tricks": ['sat', 'world', '39', 'things', 'google', 'trolling', 'tips', 'study']
}
print(topic_dictionary.keys())


dict_keys(["ASMR and Rubik's Cube Science", 'Gaming Encounters and Pranks', 'Latest Apple Tech and Games', 'Movie Reviews and Literature', 'Game Development Insights', 'Nintendo Business and Entertainment', 'Lo-fi Music for Study and Sports', 'Intro to Machine Learning', 'Best of 2022: Music, Movies, and Gaming', 'Worldly Study Tips and Tricks'])


In [23]:
# Use transform on the original document-term matrix to get the document weights per topic

lda_output = lda.transform(tf_documents)

#print("LDA transform output:\n", lda_output)

best_document_per_topic = np.argsort(lda_output, axis=0)[::-1]
for topic_index in range(0, 10):
    best_index = best_document_per_topic[0, topic_index]
    print(
        "Highest topic",
        topic_index,
        "weight is document",
        best_index,
        ":",
        documents_train[best_index][0:20],
    )

Highest topic 0 weight is document 857 : ASMR MUKBANG 편의점 핵불닭
Highest topic 1 weight is document 1668 : THE ROCK quiere DC V
Highest topic 2 weight is document 1612 : LEAKED Gameplay for 
Highest topic 3 weight is document 639 : NLE Choppa on Beef w
Highest topic 4 weight is document 1201 : MÌNH SINH TỒN 100 NG
Highest topic 5 weight is document 588 : ZOMBIE GIRL ESCAPE P
Highest topic 6 weight is document 1112 : Chạnh Lòng Thương Cô
Highest topic 7 weight is document 174 : Sports Car Giant Tes
Highest topic 8 weight is document 66 : Breaking News: Jhark
Highest topic 9 weight is document 95 : Free Fire | Bất Ngờ 


## NMF Topics

In [24]:
n_topics = 10
X = tfidf_documents

nmf = decomposition.NMF(n_components=n_topics, random_state=0, init="nndsvd")
W = nmf.fit_transform(X)
H = nmf.components_

top = 8
topic_index_max = n_topics

for topic_index in range(0, topic_index_max):
    top_indices = np.argsort(H[topic_index, :])[::-1]
    top_terms = []
    for term_index in top_indices[0:top]:
        top_terms.append(tfidf_feature_names[term_index])
    print("topic ", topic_index, top_terms)

topic  0 ['science', 'data', 'computer', 'course', 'learn', 'beginners', 'scientist', 'minutes']
topic  1 ['learning', 'machine', 'tutorial', 'learn', 'python', 'beginners', 'course', 'deep']
topic  2 ['lofi', 'hip', 'hop', 'chill', 'beats', 'study', 'music', 'mix']
topic  3 ['39', 'cube', 'rubik', 'solve', 'world', 'google', 'apple', 'cubes']
topic  4 ['2022', 'movies', 'best', 'action', 'new', 'songs', 'edm', 'music']
topic  5 ['game', 'development', 'years', 'unity', 'learning', 'indie', 'developer', 'xbox']
topic  6 ['questions', 'answers', 'tech', 'wired', 'twitter', 'support', 'interview', 'biology']
topic  7 ['official', 'video', 'music', 'bed', 'marvel', 'studios', 'business', 'disney']
topic  8 ['amp', 'asmr', 'mukbang', 'food', 'eating', '먹방', 'noodles', '만든']
topic  9 ['quot', 'sports', 'nintendo', 'switch', 'moments', 'history', 'craziest', 'literature']


In [25]:
#CHATGPT generated
topic_dictionary = {
    "Introductory Data Science": ['science', 'data', 'computer', 'course', 'learn', 'beginners', 'scientist', 'minutes'],
    "Machine Learning Basics": ['learning', 'machine', 'tutorial', 'learn', 'python', 'beginners', 'course', 'deep'],
    "Lo-fi Hip Hop Music": ['lofi', 'hip', 'hop', 'chill', 'beats', 'study', 'music', 'mix'],
    "Rubik's Cube and Tech Giants": ['39', 'cube', 'rubik', 'solve', 'world', 'google', 'apple', 'cubes'],
    "2022 Media and Entertainment": ['2022', 'movies', 'best', 'action', 'new', 'songs', 'edm', 'music'],
    "Game Development Journey": ['game', 'development', 'years', 'unity', 'learning', 'indie', 'developer', 'xbox'],
    "Tech & Biology Q&A": ['questions', 'answers', 'tech', 'wired', 'twitter', 'support', 'interview', 'biology'],
    "Music Videos and Business Media": ['official', 'video', 'music', 'bed', 'marvel', 'studios', 'business', 'disney'],
    "ASMR & Mukbang Delights": ['amp', 'asmr', 'mukbang', 'food', 'eating', '먹방', 'noodles', '만든'],
    "Sports and Literature Moments": ['quot', 'sports', 'nintendo', 'switch', 'moments', 'history', 'craziest', 'literature']
}
print(topic_dictionary.keys())

dict_keys(['Introductory Data Science', 'Machine Learning Basics', 'Lo-fi Hip Hop Music', "Rubik's Cube and Tech Giants", '2022 Media and Entertainment', 'Game Development Journey', 'Tech & Biology Q&A', 'Music Videos and Business Media', 'ASMR & Mukbang Delights', 'Sports and Literature Moments'])


In [29]:
from gensim.models import Word2Vec, KeyedVectors
pretrained_path = '../../../word2vec/GoogleNews-vectors-negative300.bin'
Word2VecModel = KeyedVectors.load_word2vec_format(pretrained_path, binary = True)

In [32]:
import numpy as np
import gensim
from sklearn.metrics.pairwise import cosine_similarity

#https://www.kaggle.com/code/nkitgupta/text-representations
def topical_coherence(items, w2vmodel):

    result = []
    for item in items:
        try:
            if w2vmodel==Word2VecModel:
                result.append(w2vmodel[item[0]])
            elif w2vmodel==glove_embeddings:
                result.append(w2vmodel[item[0]])
        except KeyError:
            pass
    if len(result) == 0:
        return 0

    matrix_sim = cosine_similarity(result)
    np.fill_diagonal(matrix_sim, 0)
    return np.mean(matrix_sim)

def answer_coherence_a(w2vmodel):
    a = topical_coherence(['train', 'car', 'bicycle', 'bus', 'vehicle', 'transport'], w2vmodel)
    b = topical_coherence(['scsi', 'drive', 'computer', 'storage', 'megabyte'], w2vmodel)
    c = topical_coherence(['introduction', 'pickle', 'guard', 'red', 'valiant'], w2vmodel)

    return a, b, c
print(answer_coherence_a(w2vmodel=Word2VecModel))

(0.40483522, 0.31330317, 0.2883513)


### Predicting Average Cluster Coherence