# Importing Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [None]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [2]:
import sys
sys.path.insert(0, '../src')
from cleaner import clean_text
%load_ext autoreload
%autoreload 2

# Define handy functions

In [None]:
def show_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [14]:
def show_topics2(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

# Importing Data

In [3]:
df = pd.read_csv('../data/all_comments_with_sentiment.csv')

# Prelaunch Focus

## Cleaning the text

In [4]:
period_start = '2020-03-13' #inclusive
period_stop = '2020-03-20' #exclusive

df_pre_pos = df[(df['date'] < period_stop) & (df['date'] >= period_start) & (df['sentiment'] == 'pos')]
df_pre_neu = df[(df['date'] < period_stop) & (df['date'] >= period_start) & (df['sentiment'] == 'neu')]
df_pre_neg = df[(df['date'] < period_stop) & (df['date'] >= period_start) & (df['sentiment'] == 'neg')]
df_pre_pos.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)
df_pre_neu.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)
df_pre_neg.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
df_pre_pos.loc[:,'cleaned'] = df_pre_pos['body'].apply(clean_text)
df_pre_pos.dropna(axis=0,inplace=True)

In [None]:
df_pre_neu.loc[:,'cleaned'] = df_pre_neu['body'].apply(clean_text)
df_pre_neu.dropna(axis=0,inplace=True)

In [5]:
df_pre_neg.loc[:,'cleaned'] = df_pre_neg['body'].apply(clean_text)
df_pre_neg.dropna(axis=0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# check the dataframes
df_pre_pos

## Vectorization

In [25]:
tfidfvectorizer = TfidfVectorizer(
#      max_features = 1000
)

In [26]:
tf_vec = tfidfvectorizer.fit_transform(df_pre_neg['cleaned'])

In [27]:
tf_vec.shape

(10670, 10782)

In [None]:
countvectorizer = CountVectorizer(
#     max_df=0.95,
#     min_df=2,
#     max_features=n_features,
)

In [None]:
count_vec = countvectorizer.fit_transform(df_pre_neg['cleaned'])

## Pre-launch LDA

In [None]:
number_of_topics = 10
random_seed = 99
ldamodel = LatentDirichletAllocation(
    n_components=number_of_topics,
    max_iter=50,
    learning_method='online',
    learning_offset=50.,
    random_state=random_seed)

In [None]:
ldamodel.fit(count_vec)

In [None]:
top_n_words = 10
ct_feature_names = countvectorizer.get_feature_names()
show_topics(ldamodel,ct_feature_names,top_n_words)

In [None]:
p = pyLDAvis.sklearn.prepare(ldamodel, count_vec, countvectorizer)

In [None]:
# pyLDAvis.save_html(p, 'lda_pre_neg.html')

## NMF model

In [35]:
number_of_topics = 8
random_seed = 99
nmfmodel = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel.fit(tf_vec)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=2000,
    n_components=8, random_state=99, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [36]:
top_n_words = 15
tf_feature_names = tfidfvectorizer.get_feature_names()
show_topics2(nmfmodel,tf_feature_names,top_n_words)

Topic #0: time like get would really new bad think know villager want people day town make
Topic #1: sorry im man hear mean suck right really omg meant sold know understand loss link
Topic #2: order digital cancel copy got mine pre get ordered physical amazon th cancelled delayed store
Topic #3: problem thank yeah enjoy bud lt see fun need make afford add opposite adult hope
Topic #4: oh damn shit sad god yeah suck hell well hot lol know bad holy gosh
Topic #5: game stop people play release early nintendo store come ac day getting even playing week
Topic #6: animal crossing played doom never new horizon first spoil isabelle eternal play story itch forget
Topic #7: one itch island ac per yet hacked console know got store even hard able buy



# Launch focus

In [None]:
period_start = '2020-03-20' #inclusive
period_stop = '2020-04-01' #exclusive

df_launch = df[(df['date'] < period_stop) & (df['date'] >= period_start)]
df_launch.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)

df_launch.loc[:,'cleaned'] = df_launch['body'].apply(clean_text)
df_launch.dropna(axis=0,inplace=True)

In [None]:
# df_launch.to_csv('../data/launch_cleaned_all_sentiment.csv')

In [None]:
tfidfvectorizer_launch = TfidfVectorizer(
#     max_df = 0.99,
#     min_df = 0.01,
#     max_features = 
)

### Launch - Positive

In [None]:
sentiment_focus = 'pos'
filtered_frame = df_launch[df_launch['sentiment'] == sentiment_focus]['cleaned']

tf_vec_launch = tfidfvectorizer_launch.fit_transform(filtered_frame)
tf_vec_launch.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_launch = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_launch.fit(tf_vec_launch)

In [None]:
top_n_words = 10
tf_feature_names_launch = tfidfvectorizer_launch.get_feature_names()
show_topics2(nmfmodel_launch,tf_feature_names_launch,top_n_words)

### Launch - Neutral

In [None]:
sentiment_focus = 'neu'
filtered_frame = df_launch[df_launch['sentiment'] == sentiment_focus]['cleaned']

tf_vec_launch = tfidfvectorizer_launch.fit_transform(filtered_frame)
tf_vec_launch.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_launch = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_launch.fit(tf_vec_launch)

In [None]:
top_n_words = 10
tf_feature_names_launch = tfidfvectorizer_launch.get_feature_names()
show_topics2(nmfmodel_launch,tf_feature_names_launch,top_n_words)

### Launch - Negative

In [None]:
sentiment_focus = 'neg'
filtered_frame = df_launch[df_launch['sentiment'] == sentiment_focus]['cleaned']

tf_vec_launch = tfidfvectorizer_launch.fit_transform(filtered_frame)
tf_vec_launch.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_launch = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_launch.fit(tf_vec_launch)

In [None]:
top_n_words = 20
tf_feature_names_launch = tfidfvectorizer_launch.get_feature_names()
show_topics2(nmfmodel_launch,tf_feature_names_launch,top_n_words)

# Bunny day focus

In [None]:
period_start = '2020-04-01' #inclusive
period_stop = '2020-04-13' #exclusive

df_bunny = df[(df['date'] < period_stop) & (df['date'] >= period_start)]
df_bunny.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)

df_bunny.loc[:,'cleaned'] = df_bunny['body'].apply(clean_text)
df_bunny.dropna(axis=0,inplace=True)

In [None]:
# df_bunny.to_csv('../data/bunny_cleaned_all_sentiment.csv')

# NMF - Bunny day

In [None]:
# df_bunny = pd.read_csv('../data/bunny_cleaned_all_sentiment.csv')
# df_bunny.dropna(axis=0, inplace=True)

In [None]:
tfidfvectorizer_bunny = TfidfVectorizer(
#     max_df = 0.99,
#     min_df = 0.01,
#     max_features = 
)

### bunny day - positive

In [None]:
sentiment_focus = 'pos'
filtered_frame = df_bunny[df_bunny['sentiment'] == sentiment_focus]['cleaned']

tf_vec_bunny = tfidfvectorizer_bunny.fit_transform(filtered_frame)
tf_vec_bunny.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_bunny = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_bunny.fit(tf_vec_bunny)

In [None]:
top_n_words = 10
tf_feature_names_bunny = tfidfvectorizer_bunny.get_feature_names()
show_topics2(nmfmodel_bunny,tf_feature_names_bunny,top_n_words)

### bunny day - neutral

In [None]:
sentiment_focus = 'neu'
filtered_frame = df_bunny[df_bunny['sentiment'] == sentiment_focus]['cleaned']

tf_vec_bunny = tfidfvectorizer_bunny.fit_transform(filtered_frame)
tf_vec_bunny.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_bunny = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_bunny.fit(tf_vec_bunny)

In [None]:
top_n_words = 10
tf_feature_names_bunny = tfidfvectorizer_bunny.get_feature_names()
show_topics2(nmfmodel_bunny,tf_feature_names_bunny,top_n_words)

### Bunny day - negative

In [None]:
sentiment_focus = 'neg'
filtered_frame = df_bunny[df_bunny['sentiment'] == sentiment_focus]['cleaned']

tf_vec_bunny = tfidfvectorizer_bunny.fit_transform(filtered_frame)
tf_vec_bunny.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_bunny = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_bunny.fit(tf_vec_bunny)

In [None]:
top_n_words = 20
tf_feature_names_bunny = tfidfvectorizer_bunny.get_feature_names()
show_topics2(nmfmodel_bunny,tf_feature_names_bunny,top_n_words)

# Post Bunny-day

In [None]:
period_start = '2020-04-13' #inclusive
period_stop = '2020-04-23' #exclusive

df_postbunny = df[(df['date'] < period_stop) & (df['date'] >= period_start)]
df_postbunny.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)

df_postbunny.loc[:,'cleaned'] = df_postbunny['body'].apply(clean_text)
df_postbunny.dropna(axis=0,inplace=True)

In [None]:
tfidfvectorizer_postbunny = TfidfVectorizer(
#     max_df = 0.99,
#     min_df = 0.01,
#     max_features = 
)

### Post-bunny - positive

In [None]:
sentiment_focus = 'pos'
filtered_frame = df_postbunny[df_postbunny['sentiment'] == sentiment_focus]['cleaned']

tf_vec_postbunny = tfidfvectorizer_postbunny.fit_transform(filtered_frame)
tf_vec_postbunny.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_postbunny = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_postbunny.fit(tf_vec_postbunny)

In [None]:
top_n_words = 10
tf_feature_names_postbunny = tfidfvectorizer_postbunny.get_feature_names()
show_topics2(nmfmodel_postbunny,tf_feature_names_postbunny,top_n_words)

### Post-bunny - neutral

In [None]:
sentiment_focus = 'neu'
filtered_frame = df_postbunny[df_postbunny['sentiment'] == sentiment_focus]['cleaned']

tf_vec_postbunny = tfidfvectorizer_postbunny.fit_transform(filtered_frame)
tf_vec_postbunny.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_postbunny = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_postbunny.fit(tf_vec_postbunny)

In [None]:
top_n_words = 10
tf_feature_names_postbunny = tfidfvectorizer_postbunny.get_feature_names()
show_topics2(nmfmodel_postbunny,tf_feature_names_postbunny,top_n_words)

### Post-bunny - Negative

In [None]:
sentiment_focus = 'neg'
filtered_frame = df_postbunny[df_postbunny['sentiment'] == sentiment_focus]['cleaned']

tf_vec_postbunny = tfidfvectorizer_postbunny.fit_transform(filtered_frame)
tf_vec_postbunny.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_postbunny = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_postbunny.fit(tf_vec_postbunny)

In [None]:
top_n_words = 15
tf_feature_names_postbunny = tfidfvectorizer_postbunny.get_feature_names()
show_topics2(nmfmodel_postbunny,tf_feature_names_postbunny,top_n_words)

# Spring update

In [None]:
period_start = '2020-04-23' #inclusive
period_stop = '2020-05-01' #exclusive

df_spring = df[(df['date'] < period_stop) & (df['date'] >= period_start)]
df_spring.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)

df_spring.loc[:,'cleaned'] = df_spring['body'].apply(clean_text)
df_spring.dropna(axis=0,inplace=True)

In [None]:
tfidfvectorizer_spring = TfidfVectorizer(
#     max_df = 0.99,
#     min_df = 0.01,
#     max_features = 
)

### Spring - positive

In [None]:
sentiment_focus = 'pos'
filtered_frame = df_spring[df_spring['sentiment'] == sentiment_focus]['cleaned']

tf_vec_spring = tfidfvectorizer_spring.fit_transform(filtered_frame)
tf_vec_spring.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_spring = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_spring.fit(tf_vec_spring)

In [None]:
top_n_words = 10
tf_feature_names_spring = tfidfvectorizer_spring.get_feature_names()
show_topics2(nmfmodel_spring,tf_feature_names_spring,top_n_words)

### Spring - Neutral

In [None]:
sentiment_focus = 'neu'
filtered_frame = df_spring[df_spring['sentiment'] == sentiment_focus]['cleaned']

tf_vec_spring = tfidfvectorizer_spring.fit_transform(filtered_frame)
tf_vec_spring.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_spring = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_spring.fit(tf_vec_spring)

In [None]:
top_n_words = 10
tf_feature_names_spring = tfidfvectorizer_spring.get_feature_names()
show_topics2(nmfmodel_spring,tf_feature_names_spring,top_n_words)

### Spring - negative

In [None]:
sentiment_focus = 'neg'
filtered_frame = df_spring[df_spring['sentiment'] == sentiment_focus]['cleaned']

tf_vec_spring = tfidfvectorizer_spring.fit_transform(filtered_frame)
tf_vec_spring.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_spring = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_spring.fit(tf_vec_spring)

In [None]:
top_n_words = 10
tf_feature_names_spring = tfidfvectorizer_spring.get_feature_names()
show_topics2(nmfmodel_spring,tf_feature_names_spring,top_n_words)

# Post May-day

In [None]:
period_start = '2020-05-01' #inclusive
period_stop = '2020-05-09' #exclusive

df_may = df[(df['date'] < period_stop) & (df['date'] >= period_start)]
df_may.drop(columns=['Unnamed: 0','id','subreddit'],inplace=True)

df_may.loc[:,'cleaned'] = df_may['body'].apply(clean_text)
df_may.dropna(axis=0,inplace=True)

In [None]:
tfidfvectorizer_may = TfidfVectorizer(
#     max_df = 0.99,
#     min_df = 0.01,
#     max_features = 
)

### Post May-day - Positive

In [None]:
sentiment_focus = 'pos'
filtered_frame = df_may[df_may['sentiment'] == sentiment_focus]['cleaned']

tf_vec_may = tfidfvectorizer_may.fit_transform(filtered_frame)
tf_vec_may.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_may = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_may.fit(tf_vec_may)

In [None]:
top_n_words = 10
tf_feature_names_may = tfidfvectorizer_may.get_feature_names()
show_topics2(nmfmodel_may,tf_feature_names_may,top_n_words)

### Post may-day - neutral

In [None]:
sentiment_focus = 'neu'
filtered_frame = df_may[df_may['sentiment'] == sentiment_focus]['cleaned']

tf_vec_may = tfidfvectorizer_may.fit_transform(filtered_frame)
tf_vec_may.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_may = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_may.fit(tf_vec_may)

In [None]:
top_n_words = 10
tf_feature_names_may = tfidfvectorizer_may.get_feature_names()
show_topics2(nmfmodel_may,tf_feature_names_may,top_n_words)

### Post may-day - negative

In [None]:
sentiment_focus = 'neg'
filtered_frame = df_may[df_may['sentiment'] == sentiment_focus]['cleaned']

tf_vec_may = tfidfvectorizer_may.fit_transform(filtered_frame)
tf_vec_may.shape

In [None]:
number_of_topics = 10
random_seed = 99
nmfmodel_may = NMF(
    n_components=number_of_topics,
    max_iter=2000,
    random_state=random_seed)
nmfmodel_may.fit(tf_vec_may)

In [None]:
top_n_words = 10
tf_feature_names_may = tfidfvectorizer_may.get_feature_names()
show_topics2(nmfmodel_may,tf_feature_names_may,top_n_words)