In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import pyLDAvis
import pyLDAvis.lda_model
import pyLDAvis.gensim_models

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

from spacy.lang.en.stop_words import STOP_WORDS as stopwords

from collections import Counter, defaultdict

# Load the spaCy "en_core_web_sm" pipeline once, up front, and bind it to `nlp`.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Read the already cleaned and preprocessed dataset from disk into `df`.
df = pd.read_csv(
    filepath_or_buffer='../data/reddit_preprocessed.csv',
)

In [3]:
# This function comes from the BTAP repo.

def display_topics(model, features, no_top_words=5):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; it is iterated as one weight
            vector per topic.
        features: Indexable sequence mapping a position in a topic's weight
            vector to the term printed for it.
        no_top_words: Maximum number of terms printed per topic. If the
            topic has fewer weights than this, all of them are printed
            instead of raising ``IndexError``. Zero or negative values print
            only the topic header.

    Each term is printed with its weight expressed as an absolute percentage
    of the sum of the topic's weights.
    """
    # Negative counts printed nothing with the original range(); keep that
    # behaviour rather than letting a negative slice bound drop the tail.
    limit = max(no_top_words, 0)
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1]  # indices from largest weight to smallest
        print("\nTopic %02d" % topic)
        # Slicing caps the count at the number of available terms.
        for idx in largest[:limit]:
            print("  %s (%2.2f)" % (features[idx], abs(words[idx]*100.0/total)))

In [4]:
# Bag-of-words (raw term counts) representation of the pre-processed text.
# Terms must appear in at least 5 documents and in at most 70% of them.
count_text_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.7,
)
count_text_vectors = count_text_vectorizer.fit_transform(
    df["processed_text"]
)

In [5]:
# TF-IDF representation of the pre-processed text.
# Terms must appear in at least 5 documents and in at most 70% of them.
tfidf_text_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.7,
)
tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(
    df['processed_text']
)

In [None]:
# Non-negative Matrix Factorization (NMF): two topics fitted on the TF-IDF vectors.
nmf_text_model = NMF(
    n_components=2,
    random_state=509,
)
# W is the value returned by fit_transform; H is the fitted model's components_.
W_text_matrix = nmf_text_model.fit_transform(X=tfidf_text_vectors)
H_text_matrix = nmf_text_model.components_

display_topics(
    model=nmf_text_model,
    features=tfidf_text_vectorizer.get_feature_names_out(),
)


Topic 00
  study (2.58)
  new (2.43)
  people (1.22)
  use (1.17)
  risk (0.91)

Topic 01
  trump (3.26)
  tech (3.22)
  china (2.00)
  power (1.36)
  job (1.22)


In [7]:
# Latent Semantic Analysis (LSA): two-component truncated SVD fitted on the TF-IDF vectors.
lsa_text_model = TruncatedSVD(
    n_components=2,
    random_state=509,
)
# W is the value returned by fit_transform; H is the fitted model's components_.
W_svd_para_matrix = lsa_text_model.fit_transform(X=tfidf_text_vectors)
H_svd_para_matrix = lsa_text_model.components_

display_topics(
    model=lsa_text_model,
    features=tfidf_text_vectorizer.get_feature_names_out(),
)


Topic 00
  new (2.09)
  study (2.01)
  use (1.20)
  people (0.96)
  risk (0.71)

Topic 01
  trump (6.90)
  tech (6.80)
  china (4.53)
  power (2.80)
  google (2.51)


In [8]:
# Latent Dirichlet Allocation (LDA) model: two topics fitted on the raw
# term-count vectors (not the TF-IDF vectors used for NMF and LSA).
lda_text_model = LatentDirichletAllocation(n_components = 2, random_state=509)
W_lda_text_matrix = lda_text_model.fit_transform(count_text_vectors)
H_lda_text_matrix = lda_text_model.components_

# Label the topics with the vocabulary of the vectorizer that produced the
# training matrix. The TF-IDF vectorizer's vocabulary was used here before;
# that only lines up while both vectorizers share identical settings and
# input, and would silently mislabel terms if either one is changed.
display_topics(lda_text_model, count_text_vectorizer.get_feature_names_out())


Topic 00
  study (2.47)
  new (2.24)
  people (1.57)
  use (1.33)
  risk (1.17)

Topic 01
  new (1.72)
  study (1.35)
  trump (1.19)
  scientist (0.95)
  researcher (0.93)
