In [1]:
%config IPCompleter.greedy=True

In [3]:
##Importing all the necessary directories

import pandas as pd
import numpy as np
import re

import gensim
from gensim import models
from gensim import corpora
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_short
from gensim.utils import lemmatize
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import regexp_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

In [5]:
# open pickle df
import pickle
with open('df2.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file) 
df.drop(['index'], axis =1, inplace = True)
df.head()# Pickle file for later use

Unnamed: 0,game,steam_purchase,received_for_free,written_during_early_access,voted_up,review,clean_words,clean_text
0,80360,True,False,False,True,Things are really heating up :),"[thing, heating]",thing heating
1,80360,True,False,False,True,Pure awesomeness! The soundtrack is so good in...,"[pure, awesomeness, soundtrack, good, support,...",pure awesomeness soundtrack good support perfe...
2,80360,True,False,False,True,As the other parts of the Blackwell series: I ...,"[blackwell, series, love, game, investigation,...",blackwell series love game investigation funny...
3,80360,True,False,False,True,The art style has completely changed from the ...,"[art, style, completely, change, game, bit, pi...",art style completely change game bit pixel art...
4,80360,True,False,False,True,If you haven't played the other three adventur...,"[haven, played, adventure, blackwell, series, ...",haven played adventure blackwell series play g...


In [6]:
my_dict = Dictionary(df.clean_words)
#my_dict.filter_extremes(no_below=5, no_above=0.90)

# Count Vectorization
dtm = [my_dict.doc2bow(doc) for doc in df.clean_words]

# TF-IDF Vectorization
tfidf_vectorizer = TfidfModel(dtm)
tfidf = tfidf_vectorizer[dtm]


In [127]:
from gensim.models import LsiModel, CoherenceModel, LdaModel


lsi_tfidf = LsiModel(corpus=tfidf, id2word=my_dict, num_topics=5)
lda_tfidf = LdaModel(corpus=tfidf, id2word=my_dict, num_topics=5)

In [128]:
# Get dominant topic and corresponding keywords for each article

def getKeywordsFromDominantTopic(model, corpus, texts): 
    # Init output
    topickeyword_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                topickeyword_df = topickeyword_df.append(pd.Series([topic_keywords]), ignore_index=True)
            else:
                break
    return(topickeyword_df)

In [129]:
df['LSI TF-IDF Keywords'] = getKeywordsFromDominantTopic(model=lsi_tfidf, corpus=tfidf, texts=df.review)
df['LDA TF-IDF Keywords'] = getKeywordsFromDominantTopic(model=lda_tfidf, corpus=tfidf, texts=df.review)

In [130]:
# Combine all the keywords
df['merged-keywords'] = df['LSI TF-IDF Keywords'] + ', ' + df['LDA TF-IDF Keywords'] 

In [None]:
# Get 5 most common keywords across the four groups of keywords
from collections import Counter 
for i in range(len(df)):
    words = df.loc[i, 'merged-keywords']
    most_occur = Counter(str(words).split()).most_common(5) 
    df.loc[i, 'Top 5 Freq Words'] = ' '.join([word[0] for word in most_occur])

df[['review', 'Top 5 Freq Words']].head()

In [None]:
lsi_tfidf.print_topics()

In [7]:
from gensim.models import Word2Vec
num_features = 100
model = Word2Vec(df['clean_words'], size=num_features, window=8, min_count=2, 
    sample=1e-3, sg=1, iter=5, workers=8)
vocab = set(model.wv.index2word)
len(vocab)

9474

In [8]:
df = df[df['game'] == '487120']

In [28]:
df.reset_index()

Unnamed: 0,index,game,steam_purchase,received_for_free,written_during_early_access,voted_up,review,clean_words,clean_text
0,6267,487120,True,False,False,True,Yes,[yes],yes
1,6268,487120,True,False,False,False,"It would a cool game if Ark, Rust, etc didn't ...","[cool, game, ark, rust, exist]",cool game ark rust exist
2,6269,487120,True,False,False,True,Me and a few friends grabbed this looking for ...,"[friend, grabbed, look, new, play, man, surpri...",friend grabbed look new play man surprised gam...
3,6270,487120,True,False,False,True,I havent played much of this wonderful game bu...,"[havent, played, wonderful, game, enjoy, secon...",havent played wonderful game enjoy second play...
4,6271,487120,True,False,False,True,cool game lots to do,"[cool, game, lot]",cool game lot
...,...,...,...,...,...,...,...,...,...
2925,9192,487120,True,False,False,True,(3 hours of game play from release day July 26...,"[hour, game, play, release, day, july, origina...",hour game play release day july originally wri...
2926,9193,487120,True,False,False,True,my lady said I should write a review when I ge...,"[lady, write, review, hour, write, spite]",lady write review hour write spite
2927,9194,487120,True,False,True,True,Best game i ever played!!! Love it!!!,"[best, game, played, love]",best game played love
2928,9195,487120,True,False,True,True,"The game needs optimising, however as a standa...","[game, need, optimise, standalone, game, enjoy...",game need optimise standalone game enjoyable t...


In [115]:
def get_sim(word1, sent):
    sent_sim = 0
    nwords = 0
    for word2 in sent:
        if(word2 in vocab) and (model.wv.similarity(w1 = word1, w2 = word2) >0.5 or model.wv.similarity(w1 = word1, w2 = word2) < -0.5):
            sent_sim = sent_sim + model.wv.similarity(w1 = word1, w2 = word2)
            nwords = nwords + 1
        if nwords == 0:
            return 0
        else:
            return round(sent_sim/nwords,3)

# Function to extract features from text data using the custom similarity function (get_sim)
# The features range between (-1 to 1)
def feature_extraction(df):
    df = df.reset_index()
    for i in range(len(df)):
        df.loc[i,"gameplay_sim"] = get_sim("gameplay",df.loc[i,"clean_words"])
        df.loc[i,"sound_design_sim"] = get_sim("music",df.loc[i,"clean_words"])
        df.loc[i,"ambience_sim"] = get_sim("ambience",df.loc[i,"clean_words"])
        df.loc[i,"story_sim"] = get_sim("story",df.loc[i,"clean_words"])
        df.loc[i,"animation_sim"] = get_sim("animation",df.loc[i,"clean_words"])
        df.loc[i,"humor_sim"] = get_sim("humor",df.loc[i,"clean_words"])
        df.loc[i,"puzzle_sim"] = get_sim("puzzle",df.loc[i,"clean_words"])
        df.loc[i,"character_sim"] = get_sim("character",df.loc[i,"clean_words"])
        df.loc[i,"difficult_sim"] = get_sim("difficult",df.loc[i,"clean_words"])
        df.loc[i,"sentiment_polarity"] = vader.polarity_scores(df.loc[i,"review"])['compound']
    return df

In [116]:
x2 = feature_extraction(df[['clean_words','review','voted_up']]) 


In [117]:
x2

Unnamed: 0,index,clean_words,review,voted_up,gameplay_sim,sound_design_sim,ambience_sim,story_sim,animation_sim,humor_sim,puzzle_sim,character_sim,difficult_sim,sentiment_polarity
0,6267,[yes],Yes,True,0.601,0.0,0.597,0.0,0.0,0.000,0.0,0.0,0.556,0.4019
1,6268,"[cool, game, ark, rust, exist]","It would a cool game if Ark, Rust, etc didn't ...",False,0.514,0.0,0.587,0.0,0.0,0.000,0.0,0.0,0.000,0.3182
2,6269,"[friend, grabbed, look, new, play, man, surpri...",Me and a few friends grabbed this looking for ...,True,0.000,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.9222
3,6270,"[havent, played, wonderful, game, enjoy, secon...",I havent played much of this wonderful game bu...,True,0.000,0.0,0.630,0.0,0.0,0.000,0.0,0.0,0.582,0.9905
4,6271,"[cool, game, lot]",cool game lots to do,True,0.514,0.0,0.587,0.0,0.0,0.000,0.0,0.0,0.000,0.3182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,9192,"[hour, game, play, release, day, july, origina...",(3 hours of game play from release day July 26...,True,0.000,0.0,0.000,0.0,0.0,0.000,0.0,0.0,0.000,0.9882
2926,9193,"[lady, write, review, hour, write, spite]",my lady said I should write a review when I ge...,True,0.000,0.0,0.676,0.0,0.0,0.518,0.0,0.0,0.000,-0.5267
2927,9194,"[best, game, played, love]",Best game i ever played!!! Love it!!!,True,0.000,0.0,0.590,0.0,0.0,0.521,0.0,0.0,0.000,0.9180
2928,9195,"[game, need, optimise, standalone, game, enjoy...","The game needs optimising, however as a standa...",True,0.633,0.0,0.582,0.0,0.0,0.000,0.0,0.0,0.000,0.9464


“Valence Aware Dictionary and sEntiment Reasoner” is another popular rule-based library for sentiment analysis. Like TextBlob, it uses a sentiment lexicon that contains intensity measures for each word based on human-annotated labels. A key difference however, is that VADER was designed with a focus on social media texts. This means that it puts a lot of emphasis on rules that capture the essence of text typically seen on social media — for example, short sentences with emojis, repetitive vocabulary and copious use of punctuation (such as exclamation marks). Below are some examples of the sentiment intensity scores output by VADER.

In [118]:
# Classification model function 
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
def classification_model(classifier, avg_train_features, train_y):
    classifier.fit(avg_train_features, train_y) 
    classifier.fit(avg_train_features, train_y)
    train_predict_y = classifier.predict(avg_train_features)
    print(" Train data stats: ")
    print(confusion_matrix(train_y, train_predict_y))
    print(classification_report(train_y, train_predict_y))
    classifier_cv_scores = cross_val_score(classifier, avg_train_features, train_y, cv=5)
    print('CV Accuracy (5-fold):', classifier_cv_scores)
    classifier_cv_mean_score = np.mean(classifier_cv_scores)
    print('Mean CV Accuracy:', classifier_cv_mean_score)
    
    #print(" Test data stats: ")
    #predict_test_score = classifier.score(avg_test_features, test_y)
    #test_predict_y = classifier.predict(avg_test_features)
    #print('Test Accuracy:', predict_test_score)
    #print(test_predict_y.shape)
    #print(confusion_matrix(test_y, test_predict_y))
    #print(classification_report(test_y, test_predict_y))
    return

In [119]:
x2.drop(['index','review','clean_words'], axis = 1, inplace = True)

In [120]:
# Encoding y variable (topic)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
x2["voted_up"] = encoder.fit_transform(x2["voted_up"])
encoder_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print(encoder_mapping)

{False: 0, True: 1}


In [121]:
x2.dropna(how ='any',inplace = True)

In [122]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)

classification_model(lr, x2.drop(["voted_up"], axis = 1), x2.voted_up)
import statsmodels.api as sm
logit_model=sm.Logit(x2.voted_up, x2.drop(["voted_up"], axis = 1))
result=logit_model.fit()
print(result.summary2())

 Train data stats: 
[[ 619  433]
 [ 216 1644]]
              precision    recall  f1-score   support

           0       0.74      0.59      0.66      1052
           1       0.79      0.88      0.84      1860

    accuracy                           0.78      2912
   macro avg       0.77      0.74      0.75      2912
weighted avg       0.77      0.78      0.77      2912

CV Accuracy (5-fold): [0.80445969 0.77015437 0.7766323  0.77835052 0.75429553]
Mean CV Accuracy: 0.7767784831391134
Optimization terminated successfully.
         Current function value: 0.493733
         Iterations 6
                          Results: Logit
Model:              Logit            Pseudo R-squared: 0.245      
Dependent Variable: voted_up         AIC:              2895.4996  
Date:               2020-04-23 23:26 BIC:              2955.2655  
No. Observations:   2912             Log-Likelihood:   -1437.7    
Df Model:           9                LL-Null:          -1904.9    
Df Residuals:       2902        

In [123]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)

classification_model(lr, x2.drop(["voted_up","sentiment_polarity"], axis = 1), x2.voted_up)
import statsmodels.api as sm
logit_model=sm.Logit(x2.voted_up, x2.drop(["voted_up","sentiment_polarity"], axis = 1))
result=logit_model.fit()
print(result.summary2())

 Train data stats: 
[[ 136  916]
 [ 115 1745]]
              precision    recall  f1-score   support

           0       0.54      0.13      0.21      1052
           1       0.66      0.94      0.77      1860

    accuracy                           0.65      2912
   macro avg       0.60      0.53      0.49      2912
weighted avg       0.61      0.65      0.57      2912

CV Accuracy (5-fold): [0.6483705  0.61921098 0.64089347 0.6580756  0.64089347]
Mean CV Accuracy: 0.6414888036167943
Optimization terminated successfully.
         Current function value: 0.623735
         Iterations 6
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.046     
Dependent Variable: voted_up         AIC:              3650.6352 
Date:               2020-04-23 23:26 BIC:              3704.4246 
No. Observations:   2912             Log-Likelihood:   -1816.3   
Df Model:           8                LL-Null:          -1904.9   
Df Residuals:       2903             L

In [124]:
from sklearn.ensemble import RandomForestClassifier
rfc2 = RandomForestClassifier(n_estimators= 100)
classification_model(rfc2, x2.drop(["voted_up","sentiment_polarity"], axis = 1), x2.voted_up)
#rfc2.feature_importances_
feature_importances = pd.DataFrame(rfc2.feature_importances_,
                                   index = x2.drop(["voted_up","sentiment_polarity"], axis = 1).columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

 Train data stats: 
[[ 383  669]
 [  82 1778]]
              precision    recall  f1-score   support

           0       0.82      0.36      0.50      1052
           1       0.73      0.96      0.83      1860

    accuracy                           0.74      2912
   macro avg       0.78      0.66      0.67      2912
weighted avg       0.76      0.74      0.71      2912

CV Accuracy (5-fold): [0.67409949 0.62778731 0.65635739 0.68041237 0.68900344]
Mean CV Accuracy: 0.6655319976658237


Unnamed: 0,importance
ambience_sim,0.314027
gameplay_sim,0.165782
difficult_sim,0.15279
animation_sim,0.096124
story_sim,0.09016
humor_sim,0.072856
sound_design_sim,0.05713
puzzle_sim,0.031529
character_sim,0.019602


In [125]:
from sklearn.ensemble import RandomForestClassifier
rfc2 = RandomForestClassifier(n_estimators= 100)
classification_model(rfc2, x2.drop(["voted_up"], axis = 1), x2.voted_up)
#rfc2.feature_importances_
feature_importances = pd.DataFrame(rfc2.feature_importances_,
                                   index = x2.drop(["voted_up"], axis = 1).columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

 Train data stats: 
[[ 987   65]
 [  11 1849]]
              precision    recall  f1-score   support

           0       0.99      0.94      0.96      1052
           1       0.97      0.99      0.98      1860

    accuracy                           0.97      2912
   macro avg       0.98      0.97      0.97      2912
weighted avg       0.97      0.97      0.97      2912

CV Accuracy (5-fold): [0.76157804 0.71698113 0.75601375 0.75257732 0.72852234]
Mean CV Accuracy: 0.743134515746848


Unnamed: 0,importance
sentiment_polarity,0.677864
ambience_sim,0.106011
gameplay_sim,0.058322
difficult_sim,0.047318
animation_sim,0.028429
story_sim,0.027158
humor_sim,0.022591
sound_design_sim,0.017716
character_sim,0.007347
puzzle_sim,0.007244
