# Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
import sys
sys.path.insert(0, '../src')
from cleaner import clean_text
%load_ext autoreload
%autoreload 2

# Functions

In [3]:
def show_topics(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` holds one topic's per-feature weights.
    feature_names : sequence of str
        Term for each feature index, aligned with the columns of
        ``model.components_``.
    n_top_words : int
        How many of the largest-weight terms to list per topic.

    Prints one ``Topic #<idx>: term term ...`` line per topic, terms in
    descending weight order, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so step backwards from the end to take the
        # n_top_words largest weights, largest first.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[idx] for idx in top_indices]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()

# Data

In [4]:
# Load the comments-with-sentiment CSV from the project's data directory.
df = pd.read_csv('../data/all_comments_with_sentiment.csv')

In [5]:
# Run every comment body through clean_text (imported from ../src/cleaner)
# and drop the entries that come back missing, as one chained expression
# so no Series is mutated in place.
cleaned = df['body'].apply(clean_text).dropna()

nan
nan
nan
nan
nan


# Vectorize & Fit

In [6]:
# Baseline TF-IDF vectorizer using the library defaults (no arguments passed).
tfidfvectorizer = TfidfVectorizer()

In [7]:
# Fit the vectorizer on the cleaned comments and transform them in one pass.
tf_vec = tfidfvectorizer.fit_transform(cleaned)

In [8]:
# NMF hyperparameters: topic count and a fixed seed so the factorization
# is repeatable between runs.
number_of_topics = 10
random_seed = 99

nmfmodel = NMF(
    n_components=number_of_topics, max_iter=2000, random_state=random_seed
)

In [9]:
# Fit the NMF model on the TF-IDF matrix; as the last expression in the cell,
# the returned estimator is echoed as the cell output.
nmfmodel.fit(tf_vec)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=2000,
    n_components=10, random_state=99, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

# Check the topics

In [14]:
top_n_words = 20
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() when it exists and fall back
# to the old method so the cell still runs on older installs.
if hasattr(tfidfvectorizer, "get_feature_names_out"):
    tf_feature_names = tfidfvectorizer.get_feature_names_out()
else:
    tf_feature_names = tfidfvectorizer.get_feature_names()
show_topics(nmfmodel, tf_feature_names, top_n_words)

Topic #0: villager day time game move want first new think sure house go people play thing back way still every got
Topic #1: thank much oh awesome okay ok ah lt know great omg amazing god appreciate help good haha try sharing helpful
Topic #2: thanks much ok oh awesome know okay info cool ah help try good tip got lot sharing great haha see
Topic #3: dm code dodo please send sent looking pm friend interested shop open anyone bring want tip qr sure message chat
Topic #4: love would much come visit cute omg idea amazing see absolutely design favorite great adorable hi wow id lt join
Topic #5: yes please omg oh ah lol say need haha move pls course unfortunately exactly believe message pm god random right
Topic #6: island come fruit visit need looking anyone cherry flower nook open peach orange shop apple pear want sell native fish
Topic #7: like look lol would know good nice really oh cute great make feel cool amazing something awesome yeah thing sound
Topic #8: one got make need recipe f

# Downsize vectorizer and topics

In [31]:
# Second vectorizer with the vocabulary capped at 10,000 features, to shrink
# the matrix handed to NMF.
tfidfvectorizer2 = TfidfVectorizer(max_features=10000)

In [32]:
# Re-vectorize the same cleaned comments with the capped vocabulary, then
# display the matrix dimensions as the cell output.
tf_vec2 = tfidfvectorizer2.fit_transform(cleaned)
tf_vec2.shape

(2021736, 10000)

In [37]:
# Same topic count and seed as the first model, so the two runs differ only
# in the vectorizer that produced their input.
number_of_topics = 10
random_seed = 99

nmfmodel2 = NMF(
    n_components=number_of_topics, max_iter=2000, random_state=random_seed
)

In [None]:
# Fit the second NMF model on the reduced-vocabulary TF-IDF matrix.
nmfmodel2.fit(tf_vec2)

In [None]:
top_n_words = 20
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() when it exists and fall back
# to the old method so the cell still runs on older installs.
if hasattr(tfidfvectorizer2, "get_feature_names_out"):
    tf_feature_names2 = tfidfvectorizer2.get_feature_names_out()
else:
    tf_feature_names2 = tfidfvectorizer2.get_feature_names()
show_topics(nmfmodel2, tf_feature_names2, top_n_words)