In [1]:
# In this notebook:

# utilize TFIDF to create sparse word vector
# conduct sentiment analysis on Amazon game reviews

In [2]:
import numpy as np
import pandas as pd
from pandas import pivot_table
import matplotlib.pyplot as plt

In [3]:
gedit = pd.read_csv('amazon_games_clean.csv',index_col=0)

print(f'Dataset has {gedit.shape[0]} samples')

gedit.head()

Dataset has 98144 samples


Unnamed: 0,asin,helppercent,overall,cleansum,cleantxt,cleanboth
0,700099867,0.666667,1,pay unlock content dont think,installing game struggle games windows live ch...,pay unlock content dont think installing game ...
1,700099867,0.7,3,awesome game crash frequently,got version instead ps version turned mistake ...,awesome game crash frequently got version inst...
2,700099867,1.0,4,dirt,dirt xbox okay game started playing games lapt...,dirt dirt xbox okay game started playing games...
3,700099867,0.846154,5,step dirt terrific,loved playing dirt thought graphics good purch...,step dirt terrific loved playing dirt thought ...
4,700099867,1.0,2,couldnt get one work,still havent figured one everything instructed...,couldnt get one work still havent figured one ...


In [4]:
print(f"There are {len(gedit['asin'].unique())} unique games")

There are 10441 unique games


In [5]:
for i in range(10):
    print(gedit['asin'].unique()[i])
# brief look at the unique identifiers

0700099867
6050036071
7100027950
7293000936
8176503290
907843905X
9625990674
9861019731
9882155456
B000003SQQ


In [132]:
vc_asin = gedit['asin'].value_counts()
vc_asin = vc_asin[vc_asin >= 20]

print(f'Unique games with 20+ reviews: {len(vc_asin)}')

Unique games with 20+ reviews: 1044


In [7]:
# removing games with less than 20 reviews
# this is to keep the word vectors manageable
gedit = gedit[gedit['asin'].isin(list(vc_asin.index))]

print(f'Dataset size of 20+ reviews games: {gedit.shape[0]}')
print(f"Dataset unique word size is: {len(pd.Series(' '.join(gedit['cleanboth']).split()).value_counts())}")

Dataset size of 20+ reviews games: 42471
Dataset unique word size is: 63464


## TFIDF

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [82]:
tf = TfidfVectorizer(max_features=80000, lowercase=True, analyzer='word',
                        stop_words = 'english', ngram_range=(1,1))

tfvec = tf.fit_transform(gedit['cleanboth'])

tfvec

<42471x63255 sparse matrix of type '<class 'numpy.float64'>'
	with 4473159 stored elements in Compressed Sparse Row format>

## Sentiment Analysis

In [83]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [84]:
(x_train, x_test, y_train, y_test) = train_test_split(tfvec, gedit['overall'], test_size=0.25)

print(f'x_train: {x_train.shape}\ny_train: {y_train.shape}')
print(f'x_test: {x_test.shape}\ny_test: {y_test.shape}')

x_train: (31853, 63255)
y_train: (31853,)
x_test: (10618, 63255)
y_test: (10618,)


In [141]:
for c in [0.01, 0.25, 0.5, 1.0, 1.5, 2.0, 5.0]:
    
    lreg = LogisticRegression(C=c)
    lreg.fit(x_train, y_train)
    print (f'Logistic Regression Accuracy: C={c} {accuracy_score(y_test, lreg.predict(x_test))}')

Logistic Regression Accuracy: C=0.01 0.5550009417969486
Logistic Regression Accuracy: C=0.25 0.6166886419288001
Logistic Regression Accuracy: C=0.5 0.6343002448672066
Logistic Regression Accuracy: C=1.0 0.6465436051987191
Logistic Regression Accuracy: C=1.5 0.6507816914673197
Logistic Regression Accuracy: C=2.0 0.6505933320776041
Logistic Regression Accuracy: C=5.0 0.6451309097758523


In [142]:
mnb = MultinomialNB()
mnb = mnb.fit(x_train, y_train)

print (f'Naive Bayes Accuracy: {accuracy_score(y_test, mnb.predict(x_test))}')

Naive Bayes Accuracy: 0.5557543793558108


## Test predictions

In [143]:
test_phrase = ['There are no words to describe the experience that is Zelda Ocerina of Time, simply amazing!',
               'Zombicide is an unpolished piece of crap made to cheat fans out of their money',
               'Rampage is a cheesy game with some great gems in it',
               'This game is so so so good, will play again!',
               'Do not waste your money on this trash heap',
               'Some fun levels, but UI is pretty clunky and video angle is terrible']

In [144]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

def clean_test_phrase(x):
    x = x.lower()
    new_str = ''
    for c in x:
        if (c.isalpha() == True) or (c == ' '):
            new_str += c
    new_str = " ".join(w for w in str(new_str).split() if w not in stop)
                       
    return new_str

In [145]:
test_clean = []

for i in test_phrase:
    test_clean.append(clean_test_phrase(i))
print(test_clean)

['words describe experience zelda ocerina time simply amazing', 'zombicide unpolished piece crap made cheat fans money', 'rampage cheesy game great gems', 'game good play', 'waste money trash heap', 'fun levels ui pretty clunky video angle terrible']


In [146]:
test_vec = tf.transform(test_clean)

test_vec

<6x63255 sparse matrix of type '<class 'numpy.float64'>'
	with 32 stored elements in Compressed Sparse Row format>

In [151]:
# we see pretty good prediction with lreg
lreg = LogisticRegression(C=1.5)
lreg = lreg.fit(x_train, y_train)
test_lreg = lreg.predict(test_vec)

for phrase, pred in zip(test_phrase, test_lreg):
    print(f'Logistic regression prediction: {pred}  {phrase}')

Logistic regression prediction: 5  There are no words to describe the experience that is Zelda Ocerina of Time, simply amazing!
Logistic regression prediction: 1  Zombicide is an unpolished piece of crap made to cheat fans out of their money
Logistic regression prediction: 5  Rampage is a cheesy game with some great gems in it
Logistic regression prediction: 4  This game is so so so good, will play again!
Logistic regression prediction: 1  Do not waste your money on this trash heap
Logistic regression prediction: 3  Some fun levels, but UI is pretty clunky and video angle is terrible


In [152]:
# we see ineffective prediction with Naive Bayes
test_mnb = mnb.predict(test_vec)

for phrase, pred in zip(test_phrase, test_mnb):
    print(f'Naive Bayes prediction: {pred}  {phrase}')

Naive Bayes prediction: 5  There are no words to describe the experience that is Zelda Ocerina of Time, simply amazing!
Naive Bayes prediction: 5  Zombicide is an unpolished piece of crap made to cheat fans out of their money
Naive Bayes prediction: 5  Rampage is a cheesy game with some great gems in it
Naive Bayes prediction: 5  This game is so so so good, will play again!
Naive Bayes prediction: 5  Do not waste your money on this trash heap
Naive Bayes prediction: 5  Some fun levels, but UI is pretty clunky and video angle is terrible


In [150]:
# Naive Bayes' mispredictions may be influenced by '5' representing >50% of the ratings
# Naive Bayes is a probabilistic regression
gedit['overall'].value_counts()

5    23593
4     9204
3     4727
1     2659
2     2288
Name: overall, dtype: int64

In [154]:
# this concludes pt 2!