# Importing Library

In [187]:
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time

In [188]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [189]:
import sys
sys.path.insert(0, '../src')
from cleaner import clean_text
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Define handy functions

In [173]:
def show_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [205]:
def show_topics2(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

# Importing Data

In [2]:
df = pd.read_csv('../data/all_comments_with_sentiment.csv')

# Prelaunch Focus

## Cleaning the text

In [135]:
period_start = '2020-03-13' #inclusive
period_stop = '2020-03-20' #exclusive

df_pre_pos = df[(df['date'] < period_stop) & (df['date'] >= period_start) & (df['sentiment'] == 'pos')]
df_pre_neu = df[(df['date'] < period_stop) & (df['date'] >= period_start) & (df['sentiment'] == 'neu')]
df_pre_neg = df[(df['date'] < period_stop) & (df['date'] >= period_start) & (df['sentiment'] == 'neg')]
df_pre_pos.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)
df_pre_neu.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)
df_pre_neg.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)

In [136]:
df_pre_pos.loc[:,'cleaned'] = df_pre_pos['body'].apply(clean_text)
df_pre_pos.dropna(axis=0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [137]:
df_pre_neu.loc[:,'cleaned'] = df_pre_neu['body'].apply(clean_text)
df_pre_neu.dropna(axis=0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [138]:
df_pre_neg.loc[:,'cleaned'] = df_pre_neg['body'].apply(clean_text)
df_pre_neg.dropna(axis=0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [141]:
# check the dataframes
df_pre_pos

Unnamed: 0,datetime,author,body,date,sent_score,sentiment,cleaned
1,2020-03-13 14:00:21,soundwave145,"Doom,Final fantasy 7,Resident evil 3,Persona 5...",2020-03-13,0.0772,pos,doom final fantasy resident evil persona royal...
2,2020-03-13 14:00:22,frescapades,Dang! Just went to pick one up and bc there wa...,2020-03-13,0.2003,pos,dang went pick one bc barcode website anything...
4,2020-03-13 14:00:43,pearlescentsheep,SW-1421-4584-3913 — Probably going with Pearl...,2020-03-13,0.1007,pos,probably going pearl still deciding island nam...
5,2020-03-13 14:00:45,derekscardigan,The adding and removing insurance tip was key....,2020-03-13,0.4199,pos,adding removing insurance tip key thank
8,2020-03-13 14:00:58,AnonymousSplash,That sounds adorable! I think we have similar ...,2020-03-13,0.9095,pos,sound adorable think similar mindset trying de...
...,...,...,...,...,...,...,...
93720,2020-03-19 23:59:46,Smiles-Bite,It isn't great in the least. Our virus count i...,2020-03-19,0.3235,pos,great least virus count skyrocketing still pro...
93722,2020-03-19 23:59:47,Flux85,You do realize a major part of the game in thi...,2020-03-19,0.1154,pos,realize major part game version resides server...
93723,2020-03-19 23:59:54,Benson2500,Of course you would. I 100% believe you lmao,2020-03-19,0.5994,pos,course would believe lmao
93724,2020-03-19 23:59:54,BloomyC,Really cute! I would buy from you.,2020-03-19,0.5551,pos,really cute would buy


## Vectorization

In [192]:
tfidfvectorizer = TfidfVectorizer(
#     max_df = 0.99,
#     min_df = 0.01,
#     max_features = 
)

In [193]:
tf_vec = tfidfvectorizer.fit_transform(df_pre_neg['cleaned'])

In [194]:
tf_vec.shape

(10670, 10782)

In [177]:
countvectorizer = CountVectorizer(
#     max_df=0.95,
#     min_df=2,
#     max_features=n_features,
)

In [178]:
count_vec = countvectorizer.fit_transform(df_pre_neg['cleaned'])

## Fitting LDA model

In [170]:
number_of_topics = 10
random_seed = 99
ldamodel = LatentDirichletAllocation(
    n_components=number_of_topics,
    max_iter=50,
    learning_method='online',
    learning_offset=50.,
    random_state=random_seed)

In [179]:
ldamodel.fit(count_vec)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=50,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=99, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [182]:
top_n_words = 10
ct_feature_names = countvectorizer.get_feature_names()
show_topics(ldamodel,ct_feature_names,top_n_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,get,684.6,oh,245.5,played,140.3,im,175.4,save,73.6,digital,411.4,game,1030.8,damn,170.6,animal,494.1,die,210.6
1,game,655.5,hate,189.4,never,105.8,love,129.6,as,57.7,copy,389.3,one,835.4,like,104.8,crossing,450.9,sorry,180.4
2,itch,492.1,pick,187.3,moved,103.3,fruit,95.5,meant,50.6,physical,251.4,like,647.4,look,56.1,post,214.8,man,123.4
3,order,462.2,worry,104.0,villager,97.9,bob,72.6,true,50.0,two,152.0,time,636.3,cant,51.4,shit,179.3,fuck,103.5
4,store,447.1,dont,99.8,name,82.2,sadly,70.5,axe,43.7,bad,136.6,new,581.0,eye,50.5,doom,120.7,op,55.7
5,got,392.2,suck,66.0,read,80.7,lazy,67.3,japanese,40.0,want,91.0,people,561.3,color,50.2,miss,95.1,ich,37.8
6,day,365.9,yet,62.1,code,76.2,always,66.8,favourite,34.6,scared,79.4,get,558.9,design,49.7,see,72.6,actual,37.8
7,release,362.1,essential,60.5,question,66.9,sure,66.2,dizzy,34.1,problem,76.5,would,493.7,idk,47.1,please,58.0,da,34.1
8,still,344.9,keep,60.1,cute,57.7,hard,65.1,rock,32.5,really,71.9,know,474.5,send,46.6,information,56.0,image,33.9
9,mine,316.0,non,54.9,house,57.0,mad,64.1,personality,31.6,gyroids,69.0,really,469.4,bitch,46.6,removed,48.7,cherry,33.6


In [197]:
p = pyLDAvis.sklearn.prepare(ldamodel, count_vec, countvectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [198]:
pyLDAvis.save_html(p, 'lda_pre_neg.html')

## NMF model

In [229]:
number_of_topics = 10
random_seed = 99
nmfmodel = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel.fit(tf_vec)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=2000,
    n_components=10, random_state=99, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [230]:
top_n_words = 10
tf_feature_names = tfidfvectorizer.get_feature_names()
show_topics2(nmfmodel,tf_feature_names,top_n_words)

Topic #0: get time like people really bad know want think way
Topic #1: sorry im man hear mean suck omg right meant sold
Topic #2: mine got th still say delayed ordered amazon yet shipped
Topic #3: problem thank yeah enjoy bud lt see fun need make
Topic #4: oh damn shit god yeah hell suck well hot sad
Topic #5: game animal crossing played doom never play stop first release
Topic #6: new sad villager leaf town horizon moved favorite first away
Topic #7: one itch island ac name per hacked console hard know
Topic #8: digital cancel copy order physical buy pre go get cancelled
Topic #9: would die cry ride goldie doubt slider marshal hard pay



# Launch focus

In [209]:
period_start = '2020-03-20' #inclusive
period_stop = '2020-04-01' #exclusive

df_launch = df[(df['date'] < period_stop) & (df['date'] >= period_start)]
df_launch.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)

df_launch.loc[:,'cleaned'] = df_launch['body'].apply(clean_text)
df_launch.dropna(axis=0,inplace=True)

nan
nan


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [211]:
# df_launch.to_csv('../data/launch_cleaned_all_sentiment.csv')

# Fitting NMF Model

In [213]:
tfidfvectorizer_launch = TfidfVectorizer(
#     max_df = 0.99,
#     min_df = 0.01,
#     max_features = 
)

In [222]:
sentiment_focus = 'pos'
filtered_frame = df_launch[df_launch['sentiment'] == sentiment_focus]['cleaned']

tf_vec_launch = tfidfvectorizer_launch.fit_transform(filtered_frame)
tf_vec_launch.shape

(238242, 43316)

In [223]:
number_of_topics = 10
random_seed = 99
nmfmodel_launch = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_launch.fit(tf_vec_launch)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=2000,
    n_components=10, random_state=99, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [224]:
top_n_words = 10
tf_feature_names_launch = tfidfvectorizer_launch.get_feature_names()
show_topics2(nmfmodel_launch,tf_feature_names_launch,top_n_words)

Topic #0: thank much awesome oh okay ok ah great amazing know
Topic #1: code dodo please looking fruit dm peach orange cherry apple
Topic #2: thanks ok much awesome oh okay cool know info ah
Topic #3: yes please omg move day believe ah one unfortunately oh
Topic #4: like island get one game time day villager know want
Topic #5: love would much visit omg come amazing see great design
Topic #6: lol oh yeah got ok mine omg thought know need
Topic #7: nice look oh really job would wow great work good
Topic #8: friend code request sent add play best send new itch
Topic #9: cute awesome super omg look really oh great idea job



In [225]:
sentiment_focus = 'neu'
filtered_frame = df_launch[df_launch['sentiment'] == sentiment_focus]['cleaned']

tf_vec_launch = tfidfvectorizer_launch.fit_transform(filtered_frame)
tf_vec_launch.shape

(163143, 30733)

In [226]:
number_of_topics = 10
random_seed = 99
nmfmodel_launch = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_launch.fit(tf_vec_launch)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=2000,
    n_components=10, random_state=99, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [227]:
top_n_words = 10
tf_feature_names_launch = tfidfvectorizer_launch.get_feature_names()
show_topics2(nmfmodel_launch,tf_feature_names_launch,top_n_words)

Topic #0: code dodo dm qr pm send looking open new hemisphere
Topic #1: island come fruit open visit nook hemisphere fish southern mile
Topic #2: added open gate name back pop ya bring hi add
Topic #3: cherry peach pear apple orange trade bring looking coconut fruit
Topic #4: get recipe nook mile able game villager house trying ladder
Topic #5: got today mine pear balloon recipe one first yesterday iron
Topic #6: way make know game go think find headed work bringing
Topic #7: day one remindme time think next first villager new game
Topic #8: sent request dm pm message chat amp link check downloadable
Topic #9: need still fruit iron know open anything omg shop space



In [231]:
sentiment_focus = 'neg'
filtered_frame = df_launch[df_launch['sentiment'] == sentiment_focus]['cleaned']

tf_vec_launch = tfidfvectorizer_launch.fit_transform(filtered_frame)
tf_vec_launch.shape

(73427, 23125)

In [232]:
number_of_topics = 10
random_seed = 99
nmfmodel_launch = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_launch.fit(tf_vec_launch)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=2000,
    n_components=10, random_state=99, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [233]:
top_n_words = 10
tf_feature_names_launch = tfidfvectorizer_launch.get_feature_names()
show_topics2(nmfmodel_launch,tf_feature_names_launch,top_n_words)

Topic #0: time game like people think day thing get really know
Topic #1: sorry loss know sure im new closed mean hear internet
Topic #2: problem lol know fun else exact anyone yeah see fix
Topic #3: island tarantula mystery nook mile fruit ticket native bamboo flower
Topic #4: tree get axe fruit rock shake shovel hit stone flimsy
Topic #5: bad want feel luck lol really like go make bot
Topic #6: oh damn shit god fuck know thanks thank hell suck
Topic #7: code dodo drop need peach cherry orange apple fruit pear
Topic #8: one got day caught first fish per catch bait two
Topic #9: villager move first house get plot campsite leave nook invite



# Bunny day focus

In [234]:
period_start = '2020-04-01' #inclusive
period_stop = '2020-04-13' #exclusive

df_bunny = df[(df['date'] < period_stop) & (df['date'] >= period_start)]
df_bunny.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)

df_bunny.loc[:,'cleaned'] = df_bunny['body'].apply(clean_text)
df_bunny.dropna(axis=0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


nan


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [235]:
df_bunny.to_csv('../data/bunny_cleaned_all_sentiment.csv')

# NMF - Bunny day

In [238]:
tfidfvectorizer_bunny = TfidfVectorizer(
#     max_df = 0.99,
#     min_df = 0.01,
#     max_features = 
)

In [254]:
sentiment_focus = 'neu'
filtered_frame = df_bunny[df_bunny['sentiment'] == sentiment_focus]['cleaned']

tf_vec_bunny = tfidfvectorizer_bunny.fit_transform(filtered_frame)
tf_vec_bunny.shape

(151479, 27319)

In [257]:
number_of_topics = 20
random_seed = 99
nmfmodel_bunny = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_bunny.fit(tf_vec_bunny)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=2000,
    n_components=20, random_state=99, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [258]:
top_n_words = 20
tf_feature_names_bunny = tfidfvectorizer_bunny.get_feature_names()
show_topics2(nmfmodel_bunny,tf_feature_names_bunny,top_n_words)

Topic #0: recipe cherry blossom diy balloon extra wand trade petal craft give bonsai sakura lantern flooring wall make anyone branch bunny
Topic #1: dm looking ed check dodo selling anyone hey price make sending celeste let turnip tip offer send sent cant trade
Topic #2: code dodo send qr creator give new message bring design work open able sister hey post shop use visit working
Topic #3: get recipe star rid trying wait able way balloon could point item back even net many isabelle rock trophy wallpaper
Topic #4: island mystery visit tarantula go flick spawn find bamboo bug someone found people fish rock fruit name hemisphere celeste native
Topic #5: one make first per could found seen another new give two craft balloon find buy thought also gotten different would
Topic #6: need many craft omg anything star space much really iron life crafted make item extra material thing still least flower
Topic #7: day bunny next remindme th last every event first wait per april zipper item two ago b