# Topic Models for Amazon video game reviews
Project 3

In [117]:
# necessary imports for analysis, loading in data

import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

import pycld2
tqdm.pandas()

In [5]:
# Load the raw review dump; lines=True tells pandas the file holds one JSON
# object per line rather than a single JSON document.
df=pd.read_json('Video_Games_5.json',lines=True)

In [6]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [7]:
len(df)

497577

# Putting together our cleaner dataframe

In [8]:
# Keep only the columns used in the analysis.
unused_columns = ['reviewTime', 'reviewerName', 'unixReviewTime', 'vote', 'image', 'style']
df = df.drop(columns=unused_columns)

In [9]:
df.head()

Unnamed: 0,overall,verified,reviewerID,asin,reviewText,summary
0,5,True,A1HP7NVNPFMA4N,700026657,"This game is a bit hard to get the hang of, bu...",but when you do it's great.
1,4,False,A1JGAP0185YJI6,700026657,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it"
2,3,True,A1YJWEXHQBWK2B,700026657,ok game.,Three Stars
3,2,True,A2204E1TH211HT,700026657,"found the game a bit too complicated, not what...",Two Stars
4,5,True,A2RF5B5H74JLPE,700026657,"great game, I love it and have played it since...",love this game


In [194]:
df.verified.value_counts()

True     332645
False    164932
Name: verified, dtype: int64

The 'verified' column flags reviews left by customers who bought the item through Amazon. Approximately 67% of the reviews come from such verified purchases. For now I'll keep the column, since the dataset has few other "distinguishing" columns.

While this is mildly interesting in itself, this does get away a bit from the goal of the project, so you could argue that the column could be removed for this level of analysis.

In [10]:
# (total rows, rows with a non-null value) for each of the two free-text columns
tuple(
    (len(df), int(df[column].notna().sum()))
    for column in ('reviewText', 'summary')
)

((497577, 497419), (497577, 497468))

There are 158 rows with no review text and, less crucially, 109 rows with no summary. The four other columns have no missing entries, so we can simply drop every row of the dataframe that contains a NaN value.

In [11]:
# Discard every row that has a missing value in any of the remaining columns,
# then show how many rows are left.
df = df.dropna(axis=0, how='any')
df.shape[0]

497316

Unfortunately, there isn't a database of video games with asin numbers (and the corresponding titles) for me to join with this new dataframe I've constructed, so I'll make sure to note those names in descriptions like these when necessary.

In [24]:
# Renumber the rows after the dropna() above. Because drop=True is not passed,
# the previous row labels are kept as a new 'index' column (and running this
# cell a second time additionally produces a 'level_0' column).
df=df.reset_index()

In [25]:
df

Unnamed: 0,level_0,index,overall,verified,reviewerID,asin,reviewText,summary
0,0,0,5,True,A1HP7NVNPFMA4N,0700026657,"This game is a bit hard to get the hang of, bu...",but when you do it's great.
1,1,1,4,False,A1JGAP0185YJI6,0700026657,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it"
2,2,2,3,True,A1YJWEXHQBWK2B,0700026657,ok game.,Three Stars
3,3,3,2,True,A2204E1TH211HT,0700026657,"found the game a bit too complicated, not what...",Two Stars
4,4,4,5,True,A2RF5B5H74JLPE,0700026657,"great game, I love it and have played it since...",love this game
...,...,...,...,...,...,...,...,...
497311,497311,497572,4,True,AVECM71LSZLC5,B01HGPUTCA,not OEM but good replacement parts,Four Stars
497312,497312,497573,3,True,A1RS06313BL6WN,B01HH6JEOC,Okay stuff.,Three Stars
497313,497313,497574,3,True,ACIZ77IGIX2JL,B01HH6JEOC,This does add some kids room things that are v...,Only buy on sale.
497314,497314,497575,4,False,A34GG58TJ1A3SH,B01HIZF7XE,I think I originally began playing Bioshock se...,"It's Okay, Nothing Profound"


In [116]:
# Remove the helper columns left behind by reset_index(). 'level_0' only
# exists when the reset cell was executed twice, so missing labels are ignored
# instead of raising KeyError on a clean Restart & Run All.
df = df.drop(columns=['level_0', 'index'], errors='ignore')

In [38]:
# Persist the cleaned reviews; index=False leaves the row index out of the file.
df.to_parquet('videogames.parquet',index=False)

# Trying to find multi-word expressions (MWEs)

In [98]:
import spacy
from spacy.matcher import Matcher

# The term pattern below only looks at fine-grained TAG values, so the
# pipeline components listed in `exclude` are left out when loading the model.
nlp = spacy.load(
    'en_core_web_sm',
    exclude=['parser', 'ner', 'lemmatizer', 'attribute_ruler'],
)

# Candidate term (tags presumably follow the Penn Treebank scheme):
#   first token : JJ / NN / NNP
#   middle      : any run (possibly empty) of JJ / NN / IN / HYPH / NNP
#   last token  : NN / NNP
term_pattern = [
    {'TAG': {'IN': ['JJ', 'NN', 'NNP']}},
    {'TAG': {'IN': ['JJ', 'NN', 'IN', 'HYPH', 'NNP']}, 'OP': '*'},
    {'TAG': {'IN': ['NN', 'NNP']}},
]

matcher = Matcher(nlp.vocab)
matcher.add('Term', [term_pattern])


def get_candidates(text):
    """Return every span matched by `matcher` in `text`.

    Each match is returned as a tuple holding the normalized form
    (`norm_`) of every token in the span.
    """
    parsed = nlp(text)
    found = []
    for span in matcher(parsed, as_spans=True):
        found.append(tuple(token.norm_ for token in span))
    return found

In [27]:
from dask.distributed import Client

# Connect to an already-running dask scheduler and display the client summary.
# NOTE(review): the address is hard-coded, so a scheduler must be listening on
# exactly this host/port for the cell to work — confirm before re-running.
client = Client("tcp://127.0.0.1:33105")
client

0,1
Client  Scheduler: tcp://127.0.0.1:33105  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [99]:
import dask.bag as db
import dask.dataframe as dd

# Work on a reproducible sample of 100,000 reviews, split into 50 partitions.
sampled_reviews = df['reviewText'].sample(100000, random_state=19)
texts = dd.from_pandas(sampled_reviews, npartitions=50).to_bag()

# Lazy pipeline: extract candidates per review, flatten them into one
# collection and count how often each candidate occurs.
graph = texts.map(get_candidates).flatten().frequencies()

In [100]:
# Column dtypes of the cleaned frame.
df.dtypes

overall        int64
verified        bool
reviewerID    object
asin          object
reviewText    object
summary       object
dtype: object

# Finding candidate terms, computing c-values

In [103]:
%%time

candidates = graph.compute()

CPU times: user 5.37 s, sys: 639 ms, total: 6.01 s
Wall time: 4min 46s


In [104]:
candidates[:30]

[(('avid', 'gamer'), 55),
 (('insult', 'to', 'gaming'), 1),
 (('first', 'campaign'), 9),
 (('real', 'tutorial'), 5),
 (('dirt', 'tour'), 1),
 (('tour', 'mode'), 26),
 (('dirt', 'tour', 'mode'), 1),
 (('excellent', 'gameplay'), 47),
 (('cool', 'obstacle'), 1),
 (('same', 'difficulty'), 19),
 (('difficulty', 'spike'), 12),
 (('same', 'difficulty', 'spike'), 1),
 (('spike', 'dirt'), 1),
 (('difficulty', 'spike', 'dirt'), 1),
 (('same', 'difficulty', 'spike', 'dirt'), 1),
 (('casual', 'difficulty'), 3),
 (('big', 'time'), 118),
 (('time', 'on', 'medium'), 1),
 (('big', 'time', 'on', 'medium'), 1),
 (('custom', 'difficulty'), 1),
 (('dead', 'zone'), 34),
 (('loose', 'ground'), 1),
 (('smooth', 'as', 'silk'), 19),
 (('good', 'game'), 2930),
 (('1st', 'shipment'), 1),
 (('2nd', 'shipment'), 1),
 (('fake', 'one'), 1),
 (('wrong', 'key'), 8),
 (('fake', 'dvd'), 1),
 (('great', 'game'), 5526)]

These are some unordered candidate terms that have been extracted, along with their frequencies. 

In [105]:
from nltk import ngrams


def get_subterms(term):
    """Yield the n-grams of `term` for every n from len(term) - 1 down to 2.

    Longer sub-terms are produced before shorter ones; terms with fewer
    than three tokens yield nothing.
    """
    for size in reversed(range(2, len(term))):
        for gram in ngrams(term, size):
            yield gram

In [106]:
from collections import Counter, defaultdict
from math import log2

# Bucket the candidate frequencies by term length: freqs[n][term] -> count.
freqs = defaultdict(Counter)
for candidate, count in candidates:
    bucket = freqs[len(candidate)]
    bucket[candidate] += count


def c_value(F, theta):
    """Score candidate terms with a (simplified) C-value and keep the best.

    Parameters
    ----------
    F : mapping of int -> Counter
        ``F[k][term]`` is the frequency of the ``k``-token candidate
        ``term`` (a tuple of tokens).
    theta : float
        Acceptance threshold; a term is kept only when its score is
        strictly greater than ``theta``.

    Returns
    -------
    collections.Counter
        Maps each accepted term to
        ``log2(k) * (freq - mean freq of the accepted longer terms containing it)``.
    """
    termhood = Counter()
    # longer[t] collects the frequencies of accepted longer terms that contain t.
    longer = defaultdict(list)

    # Longest candidates first, so that every term's discount is complete
    # by the time the term itself is scored.
    for k in sorted(F, reverse=True):
        for term, freq in F[k].items():
            containing = longer.get(term)
            discount = sum(containing) / len(containing) if containing else 0
            c = log2(k) * (freq - discount)
            if c > theta:
                termhood[term] = c
                # set(): a sub-term that occurs several times inside `term`
                # must still count this longer term only once.
                for subterm in set(get_subterms(term)):
                    # .get(): never insert empty buckets into a defaultdict
                    # and never raise KeyError when a length is absent.
                    if subterm in F.get(len(subterm), ()):
                        longer[subterm].append(freq)
    return termhood

Here we've defined a calculation for c-values to be used on our candidate terms, which is how we'll be ranking them. This requires having a threshold (theta) specified.

In [107]:
freqs[4].most_common(6)

[(('co', '-', 'op', 'mode'), 119),
 (('single', '-', 'player', 'campaign'), 94),
 (('player', 'co', '-', 'op'), 90),
 (('breath', 'of', 'fresh', 'air'), 89),
 (('div', 'id="video', '-', 'block'), 83),
 (('top', '-', 'mini', 'video'), 83)]

These are candidate terms like the ones shown above, now restricted to a single sequence length and sorted by descending frequency (in this case, the 6 most frequent candidates of length 4).

In [112]:
terms = c_value(freqs, theta=300)

# Using theta to find c-values and candidate terms

In [113]:
# Columns: c-value, raw frequency, term text — for the 30 best-scoring terms.
for term, score in terms.most_common(30):
    count = freqs[len(term)][term]
    print(f'{score:8.2f} {count:5d} {" ".join(term)}')

 5526.00  5526 great game
 2930.00  2930 good game
 2917.00  2917 game play
 2709.69  1167 data - hook="product - link
 2566.00  2566 video game
 2551.79  1610 lot of fun
 2518.00  2518 final fantasy
 2433.00  2433 fun game
 2269.67  1432 call of duty
 2205.33  2487 single player
 2139.70  1350 class="a - link
 2015.00  2015 first game
 1887.69  1191 co - op
 1629.00  1629 long time
 1534.00  1534 replay value
 1504.00  1504 first time
 1455.00  1455 resident evil
 1359.00  1359 story line
 1218.84   769 god of war
 1209.00  1209 much fun
 1179.00  1179 wii u
 1168.00  1168 first person
 1129.00  1129 only thing
 1102.00  1302 super mario
 1030.00  1030 same time
 1011.00  1011 little bit
  973.00   973 story mode
  935.00   935 mario kart
  933.54   589 grand theft auto
  933.00   933 open world


These are the top-ranked candidate terms, ordered by c-value. Depending on the size of your dataset, theta generally has less influence on these top values than on the bottom set of candidate terms (below).

In [114]:
# Print c-value, raw frequency and text for the 30 lowest-scoring accepted
# terms (`tail` is provided by the cytoolz star import).
for t, c in tail(30, terms.most_common()):
    print(f'{c:8.2f} {freqs[len(t)][t]:5d} {" ".join(t)}')
    
# Observation from an earlier run with theta=20 (the output shown here uses
# the theta=300 set in the cell above): I was under the impression that one way
# to tell that theta is too low is a bottom set that is sparse in technical terms.
# At theta=20, however, the bottom set still held many proper nouns
# (liberty city stories, mass effect andromeda, test drive unlimited, lego harry potter, etc.)
# and many domain-specific terms (auto-attack, vita memory card, xbox one kinect).
# My guess is that this particular dataset is what makes the difference.

  349.00   349 full price
  342.00   342 great buy
  342.00   342 3rd person
  339.00   339 max payne
  336.01   212 world of warcraft
  336.01   212 waste of money
  333.00   537 game boy
  333.00   333 person shooter
  331.26   209 top - notch
  328.00   328 first place
  327.00   553 player mode
  327.00   327 nice touch
  324.00   324 career mode
  324.00   324 ninja gaiden
  323.33   204 game boy advance
  323.00   323 good time
  323.00   323 star rating
  321.75   203 open - world
  319.00   319 final boss
  319.00   319 mini -
  318.00   318 mario party
  318.00   318 gaming mouse
  316.99   200 super mario bros.
  314.00   314 modern warfare
  313.00   313 character development
  308.00   308 great addition
  308.00   308 online multiplayer
  304.00   304 hard time
  304.00   304 vice city
  302.00   302 control scheme


The values directly above represent the lowest 30 candidate terms, also ordered by c-value. Theta tends to have more noticeable effects for this side of terms.

# Final candidate terms for use in subsequent notebooks

In [115]:
# Write one accepted term per line, tokens joined by single spaces.
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# default encoding (which may fail on, or mangle, non-ASCII tokens).
with open('vidgame-terms.txt', 'w', encoding='utf-8') as f:
    for t in terms:
        print(' '.join(t), file=f)