# Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [3]:
import sys
sys.path.insert(0, '../src')
from cleaner import clean_text
%load_ext autoreload
%autoreload 2

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict
  from collections import Counter, Iterable


In [13]:
def show_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterating ``components_`` must yield one weight vector per topic;
        each vector needs ``argsort`` and integer indexing.
    feature_names : sequence
        Vocabulary terms, addressed by the same positions as the weights.
    no_top_words : int
        Number of leading words to list for each topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, ``"Topic <k> words"`` and
        ``"Topic <k> weights"``, with rows ordered from the heaviest word
        down. Weights are strings formatted to one decimal place.
    """
    columns = {}
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so a reversed slice yields the largest weights first.
        top_positions = weights.argsort()[:-no_top_words - 1:-1]
        columns["Topic %d words" % topic_no] = [
            '{}'.format(feature_names[pos]) for pos in top_positions
        ]
        columns["Topic %d weights" % topic_no] = [
            '{:.1f}'.format(weights[pos]) for pos in top_positions
        ]
    return pd.DataFrame(columns)

# Load data

In [4]:
# Load the comments table; its 'body' column is the text cleaned in the next cell.
df = pd.read_csv('../data/all_comments_with_sentiment.csv')

In [5]:
# Clean every comment body, then discard the entries whose cleaned value is missing.
# Chained (rather than in-place) so re-running the cell always starts from df['body'].
all_cleaned = df['body'].apply(clean_text).dropna()

nan
nan
nan
nan
nan


# Vectorization

In [8]:
# First pass on a 100k-comment subset: the recorded count-matrix shape for this
# section is (100000, 1000), and the 500k run has its own section further down.
# Seeded (same value as the LDA random_seed) so the subset is reproducible.
sample1 = all_cleaned.sample(100000, random_state=99)

In [9]:
# Bag-of-words counter whose vocabulary is capped at 1000 terms.
countvectorizer = CountVectorizer(max_features=1000)

In [10]:
# Learn the vocabulary from sample1 and build its document-term count matrix.
count_vec = countvectorizer.fit_transform(sample1)
# Trailing expression: shows (n_documents, n_terms) as the cell output.
count_vec.shape

(100000, 1000)

# Model

In [11]:
number_of_topics = 10
random_seed = 99
# 10-topic LDA using the online learning method, seeded for repeatable fits.
ldamodel = LatentDirichletAllocation(
    random_state=random_seed,
    learning_offset=50.0,
    learning_method='online',
    max_iter=50,
    n_components=number_of_topics,
)

In [12]:
# Fit the topic model on the 1000-term count matrix; the estimator repr is the cell output.
ldamodel.fit(count_vec)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=50,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=99, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [14]:
top_n_words = 10
# scikit-learn >= 1.0 provides get_feature_names_out(); the older
# get_feature_names() was deprecated there and removed in 1.2. Prefer the new
# accessor when it exists and fall back so older installs keep working.
if hasattr(countvectorizer, "get_feature_names_out"):
    ct_feature_names = countvectorizer.get_feature_names_out()
else:
    ct_feature_names = countvectorizer.get_feature_names()
# Trailing expression: renders the per-topic word/weight table.
show_topics(ldamodel, ct_feature_names, top_n_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,like,4278.3,game,7407.9,think,3904.8,villager,6604.2,fruit,3282.2,code,4567.6,get,3459.7,thank,3391.3,make,3092.5,thanks,3168.2
1,look,2941.2,time,5285.5,new,3443.8,love,4386.7,island,3197.8,dm,3685.6,item,2786.3,nook,2870.8,got,2530.8,oh,1866.5
2,know,2167.6,day,2909.2,get,2937.2,move,2649.1,flower,3071.9,dodo,3007.1,bell,2677.8,one,2244.0,could,2286.7,good,1786.2
3,yes,2068.3,like,2880.9,way,2885.6,island,2540.7,come,2795.1,please,2616.8,house,2544.3,day,2107.6,itch,1818.9,nice,1551.2
4,sell,1704.5,see,2802.9,much,2589.4,lol,2051.4,sure,2627.8,open,2328.3,back,2307.4,fish,1811.2,stuff,1623.5,maybe,1514.1
5,well,1556.4,people,2416.3,like,2549.0,though,1961.5,visit,2003.9,take,2311.1,need,2209.6,bug,1502.5,lot,1578.5,cute,1478.5
6,actually,1493.2,thing,2333.8,even,2454.0,would,1731.9,tree,1978.5,looking,2250.1,recipe,1918.7,put,1494.3,today,1560.9,another,1256.8
7,really,1478.9,play,2242.8,time,2079.8,one,1369.0,orange,1469.3,friend,1851.9,diy,1803.9,trying,1448.0,made,1467.3,seen,1102.9
8,egg,1133.4,still,1975.8,one,2020.5,first,1357.7,cherry,1399.4,island,1609.2,furniture,1569.7,get,1441.2,design,1390.1,idea,1084.1
9,spawn,1094.0,animal,1883.1,yeah,1830.0,great,1321.8,also,1298.2,leave,1528.4,know,1459.4,wait,1330.2,cool,1296.3,player,1035.1


In [15]:
# Prepare the pyLDAvis visualization data from the fitted model, its count matrix and the vectorizer.
p = pyLDAvis.sklearn.prepare(ldamodel, count_vec, countvectorizer)

In [17]:
# pyLDAvis.save_html(p, 'ldavis_100k.html')

# Increase sample size

In [18]:
# Larger 500k-comment subset. Seeded so the sample, and the topics fitted on it,
# can be regenerated after a kernel restart.
sample2 = all_cleaned.sample(500000, random_state=99)

In [19]:
# Separate bag-of-words counter for the larger sample, also capped at 1000 terms.
countvectorizer2 = CountVectorizer(max_features=1000)

In [20]:
# Learn the vocabulary from sample2 and build its document-term count matrix.
count_vec2 = countvectorizer2.fit_transform(sample2)
# Trailing expression: shows (n_documents, n_terms) as the cell output.
count_vec2.shape

(500000, 1000)

In [21]:
number_of_topics = 10
random_seed = 99
# Same LDA configuration as the first run, to be fitted on the larger sample.
ldamodel2 = LatentDirichletAllocation(
    random_state=random_seed,
    learning_offset=50.0,
    learning_method='online',
    max_iter=50,
    n_components=number_of_topics,
)

In [22]:
# Fit the second topic model on the 500k-sample count matrix; the estimator repr is the cell output.
ldamodel2.fit(count_vec2)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=50,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=99, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [23]:
top_n_words = 10
# scikit-learn >= 1.0 provides get_feature_names_out(); the older
# get_feature_names() was deprecated there and removed in 1.2. Prefer the new
# accessor when it exists and fall back so older installs keep working.
if hasattr(countvectorizer2, "get_feature_names_out"):
    ct_feature_names2 = countvectorizer2.get_feature_names_out()
else:
    ct_feature_names2 = countvectorizer2.get_feature_names()
# Trailing expression: renders the per-topic word/weight table.
show_topics(ldamodel2, ct_feature_names2, top_n_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,thank,16822.9,code,22553.0,know,23187.6,island,33699.7,really,11826.5,day,34097.6,love,21360.9,like,22639.8,one,18628.7,nook,13975.3
1,thanks,16054.3,looking,18491.5,people,21171.8,see,14123.4,fish,11040.6,time,28285.5,got,13053.0,lol,15033.1,villager,17667.5,move,13266.9
2,much,11643.8,dm,17959.1,make,16715.9,villager,10994.0,give,9589.8,game,19795.9,recipe,7927.1,new,12781.6,get,16771.7,right,11967.6
3,yes,10521.8,fruit,16708.0,someone,11708.5,one,10779.1,buy,9394.8,first,16051.5,made,7221.4,thing,11166.0,like,16695.9,need,10626.7
4,good,8705.4,island,16585.5,play,11032.6,get,10633.6,go,8829.9,back,11525.4,nice,7176.0,something,10684.3,look,14711.2,though,9944.2
5,star,7152.1,dodo,14516.3,let,10050.1,house,10069.1,two,8591.9,one,10711.0,design,6719.9,cute,7438.8,would,13886.2,going,8710.7
6,still,6787.6,please,12752.6,game,10039.0,find,10028.4,bug,7436.4,animal,9526.5,cool,6160.8,game,7411.9,tree,13433.4,get,8347.5
7,wait,6666.5,flower,11869.3,every,9375.4,say,9019.7,help,7295.5,oh,9268.0,egg,5774.6,pretty,7344.0,think,10634.9,island,8067.5
8,haha,5883.1,come,11819.6,feel,9364.9,like,8840.7,make,6705.1,crossing,7401.6,name,5433.8,playing,6801.3,way,9296.1,item,7904.2
9,work,5657.6,anyone,11613.9,time,8451.9,use,8722.4,rock,6201.4,actually,7289.8,diy,4704.4,far,4888.1,itch,9186.1,sure,7113.3


In [24]:
# Prepare the pyLDAvis visualization data for the model fitted on the larger sample.
p2 = pyLDAvis.sklearn.prepare(ldamodel2, count_vec2, countvectorizer2)

In [25]:
# Save the prepared visualization as HTML; the relative path resolves against the working directory.
pyLDAvis.save_html(p2, 'ldavis_500k.html')