In [1]:
import os
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
import pyLDAvis.lda_model
import spacy
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from tqdm.auto import tqdm

# Load the spaCy pipeline named "en_core_web_sm" and bind it to `nlp`.
# NOTE(review): `nlp` is not referenced in the cells visible here; presumably
# later cells use it -- confirm before removing.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Load the cleaned and preprocessed data.
# The CSV location defaults to the original local path, but can be overridden
# with the REDDIT_PREPROCESSED_CSV environment variable so the notebook runs on
# other machines without editing this cell.
DATA_PATH = os.environ.get(
    "REDDIT_PREPROCESSED_CSV",
    '/Users/mtc/ADS/ADS 509/reddit-text-mining-project/data/reddit_preprocessed.csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
# This function comes from the BTAP repo.

def display_topics(model, features, no_top_words=5):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of 1-D numeric
            arrays (one row of term weights per topic).
        features: Sequence indexable by integer position that maps a column
            index of ``components_`` to its term.
        no_top_words: Maximum number of terms to print per topic. Values
            larger than the number of available terms are clamped, so a small
            vocabulary no longer raises ``IndexError``.

    Returns:
        None. For each topic, prints a header line followed by one line per
        term with that term's share of the topic's total weight, in percent.
    """
    for topic, words in enumerate(model.components_):
        total = words.sum()
        # Never ask for more terms than the topic row actually holds
        # (and treat negative requests as "print nothing", like range() did).
        top_n = max(0, min(no_top_words, len(words)))
        largest = words.argsort()[::-1][:top_n]  # indices, heaviest weight first
        print("\nTopic %02d" % topic)
        for idx in largest:
            # abs() so a negative ratio is shown as a magnitude.
            print("  %s (%2.2f)" % (features[idx], abs(words[idx] * 100.0 / total)))

In [4]:
# Raw term counts over the preprocessed text. Terms appearing in fewer than
# 5 documents (min_df) or in more than 70% of documents (max_df) are dropped,
# along with spaCy's English stop words.
count_text_vectorizer = CountVectorizer(
    stop_words=list(stopwords),
    min_df=5,
    max_df=0.7,
)
count_text_vectors = count_text_vectorizer.fit_transform(df["processed_text"])



In [5]:
# TF-IDF weighted document-term matrix over the preprocessed text, using the
# same stop-word list and document-frequency cut-offs (min_df=5, max_df=0.7)
# as the count vectorizer.
tfidf_text_vectorizer = TfidfVectorizer(
    stop_words=list(stopwords),
    min_df=5,
    max_df=0.7,
)
tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(df['processed_text'])

In [6]:
# Two-topic NMF on the TF-IDF matrix; random_state is fixed so reruns match.
nmf_text_model = NMF(n_components=2, random_state=509)
# W is what fit_transform returns for the documents; H is the fitted
# components_ (one row of term weights per topic).
W_text_matrix = nmf_text_model.fit_transform(tfidf_text_vectors)
H_text_matrix = nmf_text_model.components_

# Show the top terms per topic, labelled with the TF-IDF vocabulary the model
# was trained on.
display_topics(
    nmf_text_model,
    features=tfidf_text_vectorizer.get_feature_names_out(),
)


Topic 00
  study (2.42)
  new (2.11)
  find (2.04)
  people (1.09)
  use (0.93)

Topic 01
  ai (13.15)
  job (2.32)
  model (2.08)
  use (1.69)
  google (1.61)


In [8]:
# Two-component truncated SVD (LSA) on the same TF-IDF matrix, seeded for
# repeatable results.
lsa_text_model = TruncatedSVD(n_components=2, random_state=509)
# fit_transform output for the documents, and the fitted components_
# (one row of term weights per component).
W_svd_para_matrix = lsa_text_model.fit_transform(tfidf_text_vectors)
H_svd_para_matrix = lsa_text_model.components_

# Print the heaviest terms of each component using the TF-IDF vocabulary.
display_topics(
    lsa_text_model,
    features=tfidf_text_vectorizer.get_feature_names_out(),
)


Topic 00
  ai (3.34)
  new (1.84)
  study (1.73)
  find (1.43)
  use (1.22)

Topic 01
  ai (10.54)
  job (1.94)
  google (1.22)
  model (1.15)
  ceo (1.03)


In [9]:
# Two-topic LDA fitted on the raw count matrix (not TF-IDF); random_state is
# fixed so reruns match.
lda_text_model = LatentDirichletAllocation(n_components=2, random_state=509)
# fit_transform output for the documents, and the fitted components_
# (one row of term weights per topic).
W_lda_text_matrix = lda_text_model.fit_transform(count_text_vectors)
H_lda_text_matrix = lda_text_model.components_

# Label topics with the vocabulary of the vectorizer that produced the
# training matrix. The TF-IDF vectorizer's vocabulary only lines up with it
# while both vectorizers share identical settings; using it here would
# silently mislabel terms if either configuration changed.
display_topics(lda_text_model, count_text_vectorizer.get_feature_names_out())


Topic 00
  ai (4.05)
  new (1.82)
  trump (1.20)
  use (1.11)
  tech (0.88)

Topic 01
  study (3.21)
  find (2.45)
  new (1.99)
  people (1.44)
  risk (1.08)
