In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import pyLDAvis
import pyLDAvis.lda_model
import pyLDAvis.gensim_models

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation

from spacy.lang.en.stop_words import STOP_WORDS as stopwords

from collections import Counter, defaultdict

# Load spaCy's small English pipeline once, up front; this requires the
# `en_core_web_sm` model package to be installed in the kernel's environment.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Read the already-cleaned, preprocessed Reddit data into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='../data/reddit_preprocessed.csv',
)

In [3]:
# This function comes from the BTAP repo; it is hardened here so that asking
# for more top words than a topic has terms no longer raises IndexError.

def display_topics(model, features, no_top_words=5):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        ``components_`` is iterated row by row; each row is treated as one
        topic holding one weight per term.
    features : sequence of str
        Term labels, indexed by the column positions of ``components_``.
    no_top_words : int, default 5
        Maximum number of terms printed per topic. If a topic has fewer
        terms than this, all of its terms are printed; values <= 0 print
        only the topic headers.

    Output
    ------
    For each topic, prints a ``Topic NN`` header followed by one
    ``  term (share)`` line per term, where ``share`` is the absolute value
    of the term's weight as a percentage of the topic's summed weight.
    Nothing is returned.
    """
    # Negative counts would turn the slice below into "all but the last k";
    # clamp to zero to keep the original "print nothing" behaviour.
    limit = max(no_top_words, 0)
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order: heaviest first
        print("\nTopic %02d" % topic)
        # Slicing (rather than indexing 0..no_top_words-1) stops cleanly when
        # the topic has fewer terms than requested.
        for term_idx in largest[:limit]:
            # abs() keeps shares readable for models with signed weights.
            print("  %s (%2.2f)" % (features[term_idx], abs(words[term_idx]*100.0/total)))

In [4]:
# Bag-of-words counts: drop English stop words, terms seen in fewer than
# 5 documents, and terms present in more than 70% of documents.
count_text_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.7,
)
count_text_vectors = count_text_vectorizer.fit_transform(
    raw_documents=df["processed_text"],
)

In [5]:
# TF-IDF weights with the same vocabulary filters as the count vectorizer:
# English stop words removed, min_df=5 documents, max_df=70% of documents.
tfidf_text_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.7,
)
tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(
    raw_documents=df['processed_text'],
)

In [6]:
# NMF on the TF-IDF matrix: two topics, fixed seed for repeatable results.
nmf_text_model = NMF(
    n_components=2,
    random_state=509,
)
# W holds the transformed documents returned by the fit;
# H is the fitted model's components_ (one row per topic).
W_text_matrix = nmf_text_model.fit_transform(X=tfidf_text_vectors)
H_text_matrix = nmf_text_model.components_

# Label topic columns with the vocabulary of the vectorizer that built the input.
display_topics(
    model=nmf_text_model,
    features=tfidf_text_vectorizer.get_feature_names_out(),
)


Topic 00
  ai (13.91)
  job (2.43)
  model (2.19)
  use (1.74)
  google (1.69)

Topic 01
  study (2.27)
  new (2.18)
  people (1.10)
  use (0.98)
  risk (0.81)


In [7]:
# LSA (truncated SVD) on the TF-IDF matrix: two components, fixed seed.
lsa_text_model = TruncatedSVD(
    n_components=2,
    random_state=509,
)
# W holds the transformed documents returned by the fit;
# H is the fitted model's components_ (one row per component).
W_svd_para_matrix = lsa_text_model.fit_transform(X=tfidf_text_vectors)
H_svd_para_matrix = lsa_text_model.components_

# Label component columns with the TF-IDF vectorizer's vocabulary.
display_topics(
    model=lsa_text_model,
    features=tfidf_text_vectorizer.get_feature_names_out(),
)


Topic 00
  ai (3.94)
  new (1.86)
  study (1.58)
  use (1.31)
  model (0.84)

Topic 01
  ai (6.75)
  job (1.22)
  google (0.74)
  ceo (0.66)
  model (0.63)


In [8]:
# LDA on raw term counts: two topics, fixed seed for repeatable results.
lda_text_model = LatentDirichletAllocation(n_components = 2, random_state=509)
# W holds the transformed documents returned by the fit;
# H is the fitted model's components_ (one row per topic).
W_lda_text_matrix = lda_text_model.fit_transform(count_text_vectors)
H_lda_text_matrix = lda_text_model.components_

# The model was fitted on count_text_vectors, so its columns must be labelled
# with the count vectorizer's vocabulary. Using the TF-IDF vectorizer's names
# only matches while both vectorizers share identical settings, and would
# silently mislabel terms if either one is changed.
display_topics(lda_text_model, count_text_vectorizer.get_feature_names_out())


Topic 00
  ai (1.68)
  new (1.59)
  trump (1.32)
  use (1.11)
  tech (0.97)

Topic 01
  study (3.14)
  new (2.20)
  ai (1.85)
  people (1.39)
  risk (1.04)
