In [199]:
import pickle as pckl

# personal script
import tokenize_review

import numpy as np

from sklearn.base import TransformerMixin
# from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


from nltk.corpus import stopwords

from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [362]:
# adjustable parameters
num_features = 1000
num_topics = 10
num_top_words = 10
num_top_reviews = 3

df = pckl.load(open('SSR_sorted_data_0122.pckl','rb'))
reviews = df.review

In [363]:
ws_tokenizer = WhitespaceTokenizer()
lancaster = LancasterStemmer()
porter = PorterStemmer()
snowball = SnowballStemmer('english')

try:
    stopwords = set(stopwords.words('english'))
except:
    nltk.download('stopwords')
    stopwords = set(stopwords.words('english'))

In [364]:
def cleanText(text):
    
    # import a dictionary of English contractions from another file
    from contractions import contractions_dict
    contraction_dict = contractions_dict

    # replace the contractions with their expanded form
    for contraction, expansion in contraction_dict.items():
        text = text.replace(contraction.lower(),expansion.lower())
    
    # get rid of newlines
    symbols = ['\'', '\"', '.', ',', '[', ']', '(', ')', '?', '!', '@', '$', '#', '&', '%']
    
    text = text.strip().replace('\n', ' ').replace('\r', ' ').replace('-',' ')
    
    for symbol in symbols:
        text = text.replace(symbol, '')

    # lowercase
    text = text.lower()

    return text

In [365]:
def gen_tokens(review, *args):

    ws_tokenized = ws_tokenizer.tokenize(review)
    #print(type(ws_tokenized),ws_tokenized[0])

    cleaned_tokens = []

    for token in ws_tokenized:
        if token not in stopwords:
            cleaned_tokens.append(token)
            
    #print(type(cleaned_tokens),cleaned_tokens[0])

    stemmed_tokens = []

    try:
        method = args[0]
    except:
        method = 'lancaster'

    if method == 'lancaster':
        for token in cleaned_tokens:
            #stemmed_tokens.append(lancaster.stem(token))
            stemmed_tokens.append(lancaster.stem(token.lower().strip()))

    elif method == 'porter':
        for token in cleaned_tokens:
            stemmed_tokens.append(porter.stem(token.lower().strip()))

    elif method == 'snowball':
        for token in cleaned_tokens:
            stemmed_tokens.append(snowball.stem(token.lower().strip()))

    stemmed_text = ' '.join(stemmed_tokens)

    return stemmed_text

In [366]:
def clean_and_tokenize(reviews):
    cleaned_reviews = []

    for review in reviews:
        review_tokens = []
        cleaned_text = cleanText(review)
        cleaned_reviews.append(gen_tokens(cleaned_text))

In [367]:
vectorizer = TfidfVectorizer(stop_words='english', 
                             max_features=num_features,
                             ngram_range=(1,2), 
                             max_df=0.9, min_df=3)

# use NMF model with the Frobenius norm
implement_nmf = NMF(n_components=num_topics,
                    random_state=1,
                    solver='mu',
                    beta_loss='frobenius')

In [368]:
transform = vectorizer.fit_transform(cleaned_reviews)
transform_array = transform.toarray()

In [369]:
# Non-negative matrix factorization (NMF) implementation W*H = original matrix
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

W = implement_nmf.fit_transform(transform_array)
H = implement_nmf.components_

20

In [282]:
#num_top_words = 5

num_features = 1000
num_topics = 20
num_top_words = 5 # words per topic
num_top_reviews = 10

#### Sentiment analysis with _vaderSentiment_ package

In [297]:
def display_topics(vectorizer, clf, W, df, num_top_words, num_top_reviews):
    ''' Print out topics discovered by a model '''
    
    # get list of feature names
    feature_names = vectorizer.get_feature_names()
    
    # get vader sentiment analyzer
    analyzer = SentimentIntensityAnalyzer()    
    
    # list of topics and reviews to return
    topics, sentiments, reviews = [], [], []
    
    # loop over all the topics
    for topic_id, topic in enumerate(clf.components_):
        
        sentiment_sum = 0
        
        # grab the list of words describing the topic
        word_list = []
        for i in topic.argsort()[:-num_top_words - 1:-1]:
            word_list.append(feature_names[i])
        
        # split words in case there are some bigrams and get unique set
        split_list = []
        for word in word_list:
            for split in word.split():
                split_list.append(split)
        topic_words = list(set(split_list))
        
        # append topic words as a single string
        topics.append(' '.join([word for word in topic_words]))
        
        # print topic number and topic words
        print('Topic #%02d: %s' % (topic_id+1, topics[-1]))

        # loop over reviews for each topic
        top_doc_indices = np.argsort( W[:,topic_id] )[::-1][0:num_top_reviews]
        
        for doc_index in top_doc_indices:
            
            # check that the review contains one of the topic words
            if any(word in df['cleaned_reviews'].iloc[doc_index].lower() for word in topic_words):
                
                # sentiment analysis
                vader = analyzer.polarity_scores(df['cleaned_reviews'].iloc[doc_index])
                
                # append current review to the list 
                reviews.append(df.iloc[doc_index].to_dict())
                reviews[-1]['topic']       = topic_id
                reviews[-1]['topic_words'] = ' '.join([word for word in topic_words])
                reviews[-1]['sentiment']   = vader['compound']
                
                sentiment_sum += vader['compound']
                
                #print('User %20s on %s with rating %s' % (df['reviewerName'].iloc[doc_index][:20], 
                                     #df['reviewTime'].iloc[doc_index], df['overall'].iloc[doc_index]))            
                print("VADER: %f" % vader['compound'])
                #print(reviews[-1]['reviewText'])
            else:
                print("WARNING: TOPIC NOT IN DOCUMENT")
                            
            print()
        
        sentiments.append(sentiment_sum)
        
    return topics, sentiments, reviews

In [298]:
topics, sentiments, assc_reviews = display_topics(vectorizer, implement_nmf, W, df, num_top_words, num_top_reviews)

#(vectorizer, clf, W, df, n_top_words, n_top_documents)

Topic #01: level gam puzzl mech new
VADER: 0.927400

VADER: 0.994200

VADER: 0.989300

VADER: 0.970500

VADER: 0.968600

VADER: 0.982600

VADER: 0.799200

VADER: 0.985600

VADER: 0.938200

VADER: 0.972600

Topic #02: ev iv play gam
VADER: 0.340000

VADER: 0.340000

VADER: 0.765000

VADER: 0.765000

VADER: 0.340000

VADER: 0.765000

VADER: 0.765000

VADER: 0.765000

VADER: 0.802000

VADER: 0.765000

Topic #03: rol steph saus
VADER: 0.000000

VADER: 0.000000

VADER: 0.765000

VADER: -0.381800

VADER: 0.000000

VADER: 0.000000

VADER: -0.542300

VADER: -0.025800

VADER: 0.726900

VADER: -0.296000

Topic #04: play good puzzl gam
VADER: 0.440400

VADER: 0.440400

VADER: 0.440400

VADER: 0.440400

VADER: 0.440400

VADER: 0.440400

VADER: 0.440400

VADER: 0.440400

VADER: 0.648600

VADER: 0.440400

Topic #05: hurt fuck brain 10
VADER: -0.510600

VADER: 0.000000

VADER: -0.542300

VADER: 0.000000

VADER: -0.765000

VADER: 0.000000

VADER: 0.000000

VADER: 0.765000

VADER: 0.000000

VADER: 0.88

In [307]:
sum(int(x) for x in sentiments)
sentiments

[9.528199999999998,
 6.412,
 0.24600000000000005,
 4.6122,
 -0.1695000000000002,
 2.8968,
 3.9502,
 2.8036,
 0.6669999999999999,
 3.4929,
 -2.7748000000000004,
 1.6460000000000001,
 5.9178,
 2.2262,
 4.6622,
 4.4008,
 2.4697000000000005,
 0.5276000000000001,
 2.9536,
 2.7962]

In [159]:
# error = sqrt of sum of abs(matrix difference), i.e., how well did the refactorization work?
reconstruction_error = implement_nmf.reconstruction_err_

In [371]:
window_size = 100

#upvote_window = np.diff(df['upvotes'], n=1)
#zeros_upvote_window = np.zeros(window_size)

#full_upvote_window = 

In [379]:
upvote_window = []
percent_window = []

for i in range(len(df)):
    if i >= window_size - 1:
        upvote_period = df['upvotes'].iloc[i] - df['upvotes'].iloc[i+1-window_size]
        upvote_window.append(upvote_period)
        percent_window.append(upvote_period/window_size)
    else:
        upvote_window.append(df['upvotes'].iloc[i])
        percent_window.append(df['upvotes'].iloc[i]/df['total_votes'].iloc[i])

In [381]:
df.head()

Unnamed: 0,review,upvoted,comment_upvotes,comment_funny_votes,early_review,time_of_review,minutes_played,playtime_2weeks,last_played,games_owned,author_reviews,purchased,free,upvotes,total_votes,percent_upvotes,cleaned_reviews
0,Do not be fooled.\nThis game will invade your ...,True,71,41,False,2016-04-18 07:48:44,330,0,2018-11-26 07:57:48,6501,132,False,False,1,1,1.0,fool gam invad subconscy play hour burn many s...
1,One of the most challenging puzzle games out t...,True,34,20,False,2016-04-18 08:43:07,245,0,2016-04-18 08:48:29,361,10,False,False,2,2,1.0,on challeng puzzl gam nev unfair mech push rol...
2,You have to play the game to understand it and...,True,14,0,False,2016-04-18 11:00:45,1445,0,2016-05-26 17:03:53,67,1,True,False,3,3,1.0,play gam understand solv puzzl real satisfy ev...
3,"This game, huh? What a package! \n\nRight from...",True,10,2,False,2016-04-18 12:27:56,400,0,2017-04-24 12:00:53,365,68,False,False,4,4,1.0,gam huh pack right word go gam lov op menu mak...
4,I bet you are wondering what this game is actu...,True,15,6,False,2016-04-18 12:38:16,4109,0,2017-02-15 17:05:17,493,3,False,False,5,5,1.0,bet wond gam act lik play wheth wor money revi...


In [354]:
# data = df.reset_index(drop=True)

In [359]:
#ckl.dump(data,open('SSR_sorted_data_0122.pckl','wb'))

Unnamed: 0,review,upvoted,comment_upvotes,comment_funny_votes,early_review,time_of_review,minutes_played,playtime_2weeks,last_played,games_owned,author_reviews,purchased,free,upvotes,total_votes,percent_upvotes,cleaned_reviews
0,Do not be fooled.\nThis game will invade your ...,True,71,41,False,2016-04-18 07:48:44,330,0,2018-11-26 07:57:48,6501,132,False,False,1,1,1.0,fool gam invad subconscy play hour burn many s...
1,One of the most challenging puzzle games out t...,True,34,20,False,2016-04-18 08:43:07,245,0,2016-04-18 08:48:29,361,10,False,False,2,2,1.0,on challeng puzzl gam nev unfair mech push rol...
2,You have to play the game to understand it and...,True,14,0,False,2016-04-18 11:00:45,1445,0,2016-05-26 17:03:53,67,1,True,False,3,3,1.0,play gam understand solv puzzl real satisfy ev...
3,"This game, huh? What a package! \n\nRight from...",True,10,2,False,2016-04-18 12:27:56,400,0,2017-04-24 12:00:53,365,68,False,False,4,4,1.0,gam huh pack right word go gam lov op menu mak...
4,I bet you are wondering what this game is actu...,True,15,6,False,2016-04-18 12:38:16,4109,0,2017-02-15 17:05:17,493,3,False,False,5,5,1.0,bet wond gam act lik play wheth wor money revi...


In [141]:
cleaned_reviews = []

for review in reviews:
    review_tokens = []
    cleaned_text = cleanText(review)
    cleaned_reviews.append(gen_tokens(cleaned_text))

In [206]:
df['cleaned_reviews'] = cleaned_reviews

In [309]:
df['cleaned_reviews'].iloc[0]

'fool gam invad subconscy play hour burn many saus curs comput swear level imposs rag quit stil think day lat start gam attempt level fin solv yel triumph smil start process next level absolv bril sil frust weird amaz fun puzzl gam stop think also mak excel convers start poss convers end wel peopl walk away strange look fac youl brand weirdo profess lov rol saus around ground mind know tru tru beauty steph saus rol walk away smil eag get back gam solv on nef puzzl think week steph saus rol'

In [140]:
single_review = reviews[0]


cleaned_text = cleanText(single_review)
#cleaned_text

In [33]:
from nltk.tokenize import RegexpTokenizer
re_tokenizer = RegexpTokenizer(r'\w+')

regex_tokenized = re_tokenizer.tokenize(single_review)

In [37]:
regex_tokenized

['do',
 'not',
 'be',
 'fooled',
 'this',
 'game',
 'will',
 'invade',
 'your',
 'subconscious',
 'you',
 'will',
 'play',
 'for',
 'hours',
 'you',
 'will',
 'burn',
 'many',
 'sausages',
 'you',
 'will',
 'curse',
 'at',
 'your',
 'computer',
 'you',
 'will',
 'swear',
 'that',
 'level',
 'is',
 'impossible',
 'you',
 'will',
 'rage',
 'quit',
 'you',
 'will',
 'still',
 'be',
 'thinking',
 'about',
 'it',
 'days',
 'later',
 'you',
 'will',
 'start',
 'the',
 'game',
 'again',
 'and',
 'attempt',
 'a',
 'level',
 'again',
 'you',
 'will',
 'finally',
 'solve',
 'it',
 'you',
 'will',
 'yell',
 'in',
 'triumph',
 'and',
 'smile',
 'then',
 'you',
 'll',
 'start',
 'the',
 'process',
 'again',
 'on',
 'the',
 'next',
 'level',
 'it',
 's',
 'absolutely',
 'brilliant',
 'and',
 'silly',
 'and',
 'frustrating',
 'and',
 'weird',
 'it',
 'is',
 'an',
 'amazing',
 'and',
 'fun',
 'puzzle',
 'game',
 'that',
 'you',
 'll',
 'not',
 'stop',
 'thinking',
 'about',
 'it',
 's',
 'will',
 'als

In [8]:
single_review = reviews.iloc[0].lower()

In [26]:
from nltk.tokenize import WhitespaceTokenizer
ws_tokenizer = WhitespaceTokenizer()

In [32]:
ws_tokenized = ws_tokenizer.tokenize(single_review)

In [4]:
from nltk.corpus import stopwords

# nltk.download('stopwords') # if stopwords haven't been used before

stopwords = set(stopwords.words('english'))

In [55]:
ws_cleaned_tokens = []

for token in ws_tokenized:
    if token not in stopwords:
        ws_cleaned_tokens.append(token)

In [3]:
'''ws_tokenized = tokenize_review.tokenize(single_review)
stemmed_text = tokenize_review.full_review_stemmed(ws_tokenized)

## stemmed tokens or stemmed text for vectorization?
#stemmed_tokens = tokenize_review.stem_tokens(ws_tokenized)'''

NameError: name 'single_review' is not defined

In [28]:
class CleanTextTransformer(TransformerMixin):
    """
    Convert text to cleaned text
    """

    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

In [5]:
stemmed_reviews = []

for review in reviews:
    ws_tokenized = tokenize_review.tokenize(review)
    stemmed_text = tokenize_review.full_review_stemmed(ws_tokenized)
    
    stemmed_reviews.append(stemmed_text)

In [20]:
df['stemmed_review'] = stemmed_reviews

In [27]:
'''
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(stemmed_reviews)
print(vectorizer.get_feature_names(), len(vectorizer.get_feature_names()))
'''



In [46]:
def tokenize(review):

    ws_tokenized = ws_tokenizer.tokenize(review)

    cleaned_tokens = []

    for token in ws_tokenized:
        if token not in stopwords:
            cleaned_tokens.append(token)

    return cleaned_tokens

def stem_tokens(cleaned_tokens, *args):

    stemmed_tokens = []

    try:
        method = args[0]
    except:
        method = 'lancaster'

    if method == 'lancaster':
        for token in cleaned_tokens:
            stemmed_tokens.append(lancaster.stem(token))

    elif method == 'porter':
        for token in cleaned_tokens:
            stemmed_tokens.append(porter.stem(token))

    elif method == 'snowball':
        for token in cleaned_tokens:
            stemmed_tokens.append(snowball.stem(token))

    return stemmed_tokens

def full_review_stemmed(cleaned_tokens, *args):

    stemmed_tokens = []

    try:
        method = args[0]
    except:
        method = 'lancaster'

    if method == 'lancaster':
        for token in cleaned_tokens:
            stemmed_tokens.append(lancaster.stem(token))

    elif method == 'porter':
        for token in cleaned_tokens:
            stemmed_tokens.append(porter.stem(token))

    elif method == 'snowball':
        for token in cleaned_tokens:
            stemmed_tokens.append(snowball.stem(token))

    stemmed_text = ' '.join(stemmed_tokens)

    return stemmed_text

AttributeError: sortby not found