In [1]:
# In this notebook:

# Premise: Game titles are not labeled in the dataset, and identifying a
# title by simply reading the reviews is also difficult,
# so I use TF-IDF (1-grams and 2-grams) to try to guess each game's title.

# Result: The highest-scoring TF-IDF 2-grams were effective at identifying game titles.

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Show up to 100 characters per cell so the long review-text columns stay readable.
pd.options.display.max_colwidth = 100

In [4]:
# Load the cleaned review dataset; the file's first column becomes the row index.
gorig = pd.read_csv(
    'amazon_games_clean.csv',
    index_col=0,
)

# Report how many reviews were loaded, then preview the first five rows.
print(f'Dataset has {gorig.shape[0]} samples')
gorig.head(5)

Dataset has 98144 samples


Unnamed: 0,asin,helppercent,overall,cleansum,cleantxt,cleanboth
0,700099867,0.666667,1,pay unlock content dont think,instal game wa struggle game window live championship race car unlock buy addon game pay nearly ...,pay unlock content dont think instal game wa struggle game window live championship race car unl...
1,700099867,0.7,3,awesome game crash frequently,get version instead p version turn mistake console version game look percent good pc version dea...,awesome game crash frequently get version instead p version turn mistake console version game lo...
2,700099867,1.0,4,dirt,dirt xbox wa okay game start play game laptop buy new game build collection game fun play much b...,dirt dirt xbox wa okay game start play game laptop buy new game build collection game fun play m...
3,700099867,0.846154,5,step dirt terrific,love play dirt think graphic good purchase dirt addition otherand graphic absolutely gorgeous li...,step dirt terrific love play dirt think graphic good purchase dirt addition otherand graphic abs...
4,700099867,1.0,2,couldnt get one work,still havent figure one everything instruct game never instal strange since dont like rate somet...,couldnt get one work still havent figure one everything instruct game never instal strange since...


In [5]:
# info() reports the non-null count per column: cleansum (97082) and
# cleantxt (98110) fall short of the 98144 rows, so both contain nulls;
# cleanboth is fully populated.
gorig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98144 entries, 0 to 98143
Data columns (total 6 columns):
asin           98144 non-null object
helppercent    98144 non-null float64
overall        98144 non-null int64
cleansum       97082 non-null object
cleantxt       98110 non-null object
cleanboth      98144 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 5.2+ MB


In [6]:
# Replace missing review text with the constant placeholder 'game'
# (a plain constant fill, not a forward fill).
# Only cleansum and cleantxt contain nulls. The TF-IDF step reads cleanboth,
# which has none, so the placeholder never reaches the vectorizers.
# Filling column by column keeps the string out of the numeric columns, and
# fillna returns a new frame instead of mutating gorig in place.
gorig = gorig.fillna({'cleansum': 'game', 'cleantxt': 'game'})

In [7]:
# Number of reviews per product id (asin).
reviews_per_asin = gorig['asin'].value_counts()

# Keep the products whose review count lies in the inclusive range 50..52.
vc_asin = reviews_per_asin[reviews_per_asin.between(50, 52)]

print(f'Unique games with 50~52 reviews: {len(vc_asin)}')

Unique games with 50~52 reviews: 34


In [8]:
# Keep only the reviews of the games selected in vc_asin (50~52 reviews each);
# this gives us a good population for visualization.
# The filter and the index reset are chained, so gedit is a fresh frame with a
# 0..n-1 index rather than a filtered slice mutated in place.
gedit = gorig[gorig['asin'].isin(vc_asin.index)].reset_index(drop=True)

# Distinct whitespace-separated tokens across the combined text of every review.
unique_words = set(' '.join(gedit['cleanboth']).split())

print(f'Dataset size of 50~52 reviews games: {gedit.shape[0]}')
print(f"Dataset unique word size is: {len(unique_words)}")

Dataset size of 50~52 reviews games: 1728
Dataset unique word size is: 14262


## TFIDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Two word-level TF-IDF models over the combined review text: tf1 scores
# single words, tf2 scores two-word phrases. Both lowercase the text, drop
# English stop words and cap the vocabulary size.
common_opts = {'lowercase': True, 'analyzer': 'word', 'stop_words': 'english'}

tf1 = TfidfVectorizer(ngram_range=(1, 1), max_features=50000, **common_opts)
tf2 = TfidfVectorizer(ngram_range=(2, 2), max_features=500000, **common_opts)

# Each vectorizer learns its own vocabulary from the same text column.
review_text = gedit['cleanboth']
tfvec1 = tf1.fit_transform(review_text)
tfvec2 = tf2.fit_transform(review_text)

print(f'TFIDF 1 ngrams shape: {tfvec1.shape}')
print(f'TFIDF 2 ngrams shape: {tfvec2.shape}')

TFIDF 1 ngrams shape: (1728, 14071)
TFIDF 2 ngrams shape: (1728, 165299)


## Guessing game title

In [11]:
from collections import Counter

In [12]:
# For every game, take each review's highest-TFIDF word (1 ngrams) and
# report the 3 words that win most often across that game's reviews.
# we see ok results that help us deduce the game title
vocab_lookup = {col: term for term, col in tf1.vocabulary_.items()}

# unique() yields asins in first-appearance order, so the listing is the same
# on every run; iterating a set of strings is not stable across kernel restarts.
for asin in gedit['asin'].unique():
    # Positional row numbers, so the lookup into tfvec1 does not depend on
    # gedit's index labels matching row positions.
    rows = np.flatnonzero((gedit['asin'] == asin).values)
    asin_tfidf = Counter(
        vocab_lookup[np.argmax(row)]
        for row in tfvec1[rows]
        # A review with no in-vocabulary term has an all-zero row; its argmax
        # would be column 0 and credit an unrelated term, so skip it.
        if row.nnz
    )
    print(f'Asin: {asin}  1-ngrams Top 3: {asin_tfidf.most_common(3)}')

Asin: B0047THYWC  1-ngrams Top 3: [('origin', 8), ('dragon', 7), ('da', 5)]
Asin: B0076ZQSFI  1-ngrams Top 3: [('gta', 8), ('crime', 3), ('mission', 3)]
Asin: B00DC7G2W8  1-ngrams Top 3: [('kart', 13), ('wii', 6), ('battle', 3)]
Asin: B00006IR62  1-ngrams Top 3: [('war', 5), ('jedi', 5), ('game', 4)]
Asin: B000TI836G  1-ngrams Top 3: [('ops', 4), ('game', 3), ('moh', 3)]
Asin: B0000C6EB4  1-ngrams Top 3: [('honor', 3), ('mohaa', 2), ('ww', 2)]
Asin: B000B69E9G  1-ngrams Top 3: [('resident', 7), ('version', 4), ('leon', 4)]
Asin: B00004YRQ9  1-ngrams Top 3: [('controller', 23), ('analog', 3), ('shock', 2)]
Asin: B00005QEFD  1-ngrams Top 3: [('gamecube', 15), ('gc', 6), ('nintendo', 5)]
Asin: B005C2D2MO  1-ngrams Top 3: [('dishonor', 3), ('game', 3), ('stealth', 3)]
Asin: B000M17AVO  1-ngrams Top 3: [('remote', 26), ('button', 7), ('movie', 3)]
Asin: B003O6JKLC  1-ngrams Top 3: [('xbox', 10), ('gb', 8), ('console', 2)]
Asin: B004PAGJOC  1-ngrams Top 3: [('zombie', 20), ('game', 4), ('isl

In [13]:
# For every game, take each review's highest-TFIDF phrase (2 ngrams) and
# report the 3 phrases that win most often across that game's reviews.
# we see great results that often guess the title in the first suggestion
vocab_lookup = {col: term for term, col in tf2.vocabulary_.items()}

# unique() yields asins in first-appearance order, so the listing is the same
# on every run; iterating a set of strings is not stable across kernel restarts.
for asin in gedit['asin'].unique():
    # Positional row numbers, so the lookup into tfvec2 does not depend on
    # gedit's index labels matching row positions.
    rows = np.flatnonzero((gedit['asin'] == asin).values)
    asin_tfidf = Counter(
        vocab_lookup[np.argmax(row)]
        for row in tfvec2[rows]
        # A review with no in-vocabulary phrase has an all-zero row; its argmax
        # would be column 0 and credit an unrelated phrase, so skip it.
        if row.nnz
    )
    print(f'Asin: {asin}  2-ngrams Top 3: {asin_tfidf.most_common(3)}')

Asin: B0047THYWC  2-ngrams Top 3: [('dragon age', 15), ('wa bite', 1), ('believe bother', 1)]
Asin: B0076ZQSFI  2-ngrams Top 3: [('hong kong', 5), ('true crime', 3), ('sleep dog', 3)]
Asin: B00DC7G2W8  2-ngrams Top 3: [('mario kart', 17), ('battle mode', 2), ('blue shell', 2)]
Asin: B00006IR62  2-ngrams Top 3: [('star war', 16), ('adventureyou choose', 1), ('act review', 1)]
Asin: B000TI836G  2-ngrams Top 3: [('black ops', 4), ('single player', 3), ('medal honor', 2)]
Asin: B0000C6EB4  2-ngrams Top 3: [('medal honor', 5), ('person shooter', 2), ('ability player', 1)]
Asin: B000B69E9G  2-ngrams Top 3: [('resident evil', 11), ('gamecube version', 2), ('biggest bait', 1)]
Asin: B00004YRQ9  2-ngrams Top 3: [('dual shock', 5), ('great controller', 2), ('double edge', 1)]
Asin: B00005QEFD  2-ngrams Top 3: [('memory card', 3), ('nintendo ha', 2), ('game boy', 2)]
Asin: B005C2D2MO  2-ngrams Top 3: [('play style', 2), ('conclusion finish', 1), ('feel connect', 1)]
Asin: B000M17AVO  2-ngrams Top

## Verification

In [14]:
# A correct guess: this game is likely Dragon Age.
# When the top term's count is far above the 2nd and 3rd, the guess tends to be good.
is_dragon_age = gedit['asin'] == 'B0047THYWC'
gedit.loc[is_dragon_age].head(15)

Unnamed: 0,asin,helppercent,overall,cleansum,cleantxt,cleanboth
1119,B0047THYWC,0.714286,1,every step forward bioware take two step backwards,everyone go mass effect despite game dumbed mass title come second next dragon put game terrible...,every step forward bioware take two step backwards everyone go mass effect despite game dumbed m...
1120,B0047THYWC,0.833333,3,bad origin,overall like game however cant help compare dragon age origin im afraid pale comparison dao wa o...,bad origin overall like game however cant help compare dragon age origin im afraid pale comparis...
1121,B0047THYWC,0.615385,1,disappoint,cant believe even bother spend money expect first game apparently didnt learn youre fence wait g...,disappoint cant believe even bother spend money expect first game apparently didnt learn youre f...
1122,B0047THYWC,0.6,3,take rp rpg,dragon age classic rpg pretty much element could think make game worthwhile intrigue instance gi...,take rp rpg dragon age classic rpg pretty much element could think make game worthwhile intrigue...
1123,B0047THYWC,1.0,5,go annoy first absolutely love,find helpful list pc game enjoy heck begin find nod head say game good title review might helpfu...,go annoy first absolutely love find helpful list pc game enjoy heck begin find nod head say game...
1124,B0047THYWC,0.625,3,lazy sloppy sequel,dragon age ii decent game next predecessor though shameful product slap face series gamers gener...,lazy sloppy sequel dragon age ii decent game next predecessor though shameful product slap face ...
1125,B0047THYWC,0.780488,3,rpg pc go console,agree review already game disappointment especially terrific leadin dragon age play rpgs since f...,rpg pc go console agree review already game disappointment especially terrific leadin dragon age...
1126,B0047THYWC,1.0,3,fairly decent game,overall seem like average decent game however two significant problem battle scene first often h...,fairly decent game overall seem like average decent game however two significant problem battle ...
1127,B0047THYWC,0.583333,4,new approach help hurt,dragon age doe really need introduction biowares fantasy mass effect counterpart make good appea...,new approach help hurt dragon age doe really need introduction biowares fantasy mass effect coun...
1128,B0047THYWC,0.932914,2,time mediocre usually painfully average handful good moment,tldr crowd da isnt bad game per se rush often schizophrenic kind game want fan origin particular...,time mediocre usually painfully average handful good moment tldr crowd da isnt bad game per se r...


In [15]:
# A weak guess: this game is likely Sleeping Dogs.
# When the 1st/2nd/3rd top-term counts are low and close together, the guess tends to be bad.
# The 1-ngram run actually guessed GTA: reviews that compare against another game can mislead the guess.
is_sleeping_dogs = gedit['asin'] == 'B0076ZQSFI'
gedit.loc[is_sleeping_dogs].head(15)

Unnamed: 0,asin,helppercent,overall,cleansum,cleantxt,cleanboth
1373,B0076ZQSFI,0.923077,5,sleep game,remember true crime street hong kong wa gonna come devs decide gut last year hear nothing else g...,sleep game remember true crime street hong kong wa gonna come devs decide gut last year hear not...
1374,B0076ZQSFI,1.0,5,fav sandbox game ever,dont make statement lightly perhaps wa even know game exist saw p simply put blow away doe every...,fav sandbox game ever dont make statement lightly perhaps wa even know game exist saw p simply p...
1375,B0076ZQSFI,1.0,5,fun,didnt know expect first hear game sleep dog wtf wa edge get game eventually free playstation plu...,fun didnt know expect first hear game sleep dog wtf wa edge get game eventually free playstation...
1376,B0076ZQSFI,0.5,5,one best game ever,game ridiculously game lack nowadays game ha top notch everything control scenery graphic color ...,one best game ever game ridiculously game lack nowadays game ha top notch everything control sce...
1377,B0076ZQSFI,0.5,5,prefer gta,blunt direct essentially hong kong version gta tweak notable difference instead gun sleep dog ha...,prefer gta blunt direct essentially hong kong version gta tweak notable difference instead gun s...
1378,B0076ZQSFI,0.5,5,best game wait gta v,sleep dog ha best open world game square enix ha ever make enjoy everything game ha great story ...,best game wait gta v sleep dog ha best open world game square enix ha ever make enjoy everything...
1379,B0076ZQSFI,1.0,5,totally underrate,look game play gta v come well find first square enix make game good sign square enix generally ...,totally underrate look game play gta v come well find first square enix make game good sign squa...
1380,B0076ZQSFI,1.0,4,gta meet jackie chan,trailer look awesome buy still havent get past half american im hard time relate character altho...,gta meet jackie chan trailer look awesome buy still havent get past half american im hard time r...
1381,B0076ZQSFI,0.5,5,vote dollar,great game want company invest make new product like annual update cod madden whathaveyou vote d...,vote dollar great game want company invest make new product like annual update cod madden whatha...
1382,B0076ZQSFI,0.920635,5,far east take gta saint row,game square enix sleeper game nobody ha hear may slip radar spiritual successor true crime serie...,far east take gta saint row game square enix sleeper game nobody ha hear may slip radar spiritua...


In [16]:
# this concludes pt 03