# Loading tokenizing data for the LDA topic model

In [1]:
import pandas as pd
import numpy as np
from cytoolz import *
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
df = pd.read_parquet('videogames.parquet')

In [3]:
import tomotopy as tp

mdl = tp.LDAModel.load('vidgame-topics.bin')

# Finding games with the most 1-star ratings

In [5]:
df[df['overall']==1]['asin'].value_counts().head(20)

B00178630A    486
B000ZKA0J6    144
B000X9FV5M    127
B00W8FYFBA    118
B00CRN9UWC    107
B007FTE2VW    107
B00ZQB28XK    106
B00140P9BA     90
B00QXJFE08     83
B00BGA9Y3W     79
B0016BVY7U     78
B00269QLJ2     74
B00J48MUS4     70
B005WWZUQ0     67
B00KSQHX1K     64
B01EZAA2ZI     63
B001BNFQKO     60
B00CMQTVUA     59
B00166N6SA     51
B00VU4J8YY     50
Name: asin, dtype: int64

# Corresponding game titles of asin numbers
Diablo III, StarCraft II, Far Cry 2, Star Wars: Battlefront


SimCity, SimCity (Limited Edition), No Man's Sky, Mass Effect


Street Fighter V, Destiny, Command & Conquer: Red Alert 3, Call of Duty: Modern Warfare 2


Assassin's Creed Unity, Battlefield 3, Halo: The Master Chief Collection, Call of Duty: Infinite Warfare, Grand Theft Auto IV

In [9]:
# StarCraft II
subdf1 = df[df['asin']=='B000ZKA0J6']
subdf1['overall'].value_counts()

5    324
4    150
1    144
3     80
2     76
Name: overall, dtype: int64

In [10]:
# Diablo III
subdf2 = df[df['asin']=='B00178630A']
subdf2['overall'].value_counts()

1    486
5    369
2    194
4    186
3    146
Name: overall, dtype: int64

Interestingly, for StarCraft II having the second-most 1-star reviews, most of its reviews are actually 5 stars. For Diablo III, however, almost 500 reviews gave a 1-star rating while simultaneously being given 5 stars for almost 400 reviews. Diablo III is definitely considered devisive.

In [11]:
# Far Cry 2
subdf3 = df[df['asin']=='B000X9FV5M']
subdf3['overall'].value_counts()

1    127
5     88
4     64
2     42
3     26
Name: overall, dtype: int64

In [12]:
# Star Wars: Battlefront
subdf4 = df[df['asin']=='B00W8FYFBA']
subdf4['overall'].value_counts()

1    118
5    106
3     54
2     53
4     44
Name: overall, dtype: int64

# More tokenizing

In [13]:
from tokenizer import MWETokenizer

tokenizer = MWETokenizer(open('vidgame-terms.txt'))

In [16]:
# Tokenizing Diablo III
subdf2['tokens'] = subdf2['reviewText'].progress_apply(tokenizer.tokenize)

  0%|          | 0/1381 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf2['tokens'] = subdf2['reviewText'].progress_apply(tokenizer.tokenize)


# Applying our topic model

In [17]:
subdf2['docs'] = [mdl.make_doc(words=toks) for toks in subdf2['tokens']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf2['docs'] = [mdl.make_doc(words=toks) for toks in subdf2['tokens']]


In [19]:
topic_dist, ll = mdl.infer(subdf2['docs'])

# Model interpretation, most common topics for reviews

In [64]:
subdf2['reviewText'].iloc[15]

'A great successor to Diablo 2. An updated skill system makes it easy to customize your character and set skills and attacks to suit your play style. The graphics are smooth and pretty. The game has got a pretty good story, with fun, addicting gameplay. But when you beat the game is when the game really gets going, with an endless amount of end game content to keep improving your character, getting higher ranks and better gear as you progress up through the many difficulties.\nHighly recommended to any action rpg fans, and really anyone who enjoys killing hordes of monsters.'

In [76]:
subdf2['docs'].iloc[15].get_topics(top_n=5)

[(53, 0.23067273199558258),
 (45, 0.18530260026454926),
 (9, 0.1675429493188858),
 (35, 0.14477653801441193),
 (42, 0.10566515475511551)]

In [66]:
mdl.get_topic_words(53)

[('recommend', 0.05351521447300911),
 ('great_game', 0.0413653664290905),
 ('highly', 0.0326935276389122),
 ('awesome', 0.0236738882958889),
 ('fan', 0.019291594624519348),
 ('must', 0.01785401813685894),
 ('anyone', 0.016903361305594444),
 ('series', 0.016903361305594444),
 ('good_game', 0.01609182544052601),
 ('buy', 0.015488970093429089)]

In [67]:
mdl.get_topic_words(45)

[('diablo', 0.038367945700883865),
 ('level', 0.023443665355443954),
 ('mmo', 0.022387079894542694),
 ('blizzard', 0.020802199840545654),
 ('character', 0.019745614379644394),
 ('pvp', 0.016840001568198204),
 ('wow', 0.01677396520972252),
 ('players', 0.015057012438774109),
 ('quests', 0.013141949661076069),
 ('class', 0.013009876012802124)]

In [68]:
mdl.get_topic_words(9)

[('short', 0.01773695833981037),
 ('boring', 0.015998441725969315),
 ('bit', 0.01574450172483921),
 ('repetitive', 0.013634842820465565),
 ('hours', 0.01347857154905796),
 ('overall', 0.012853487394750118),
 ('think', 0.012052598409354687),
 ('lot', 0.011915861628949642),
 ('game_play', 0.011642387136816978),
 ('gets', 0.011036837473511696)]

In [69]:
mdl.get_topic_words(35)

[('weapons', 0.028286008164286613),
 ('level', 0.020107002928853035),
 ('items', 0.018885832279920578),
 ('enemies', 0.018317846581339836),
 ('armor', 0.015307518653571606),
 ('character', 0.013887552544474602),
 ('weapon', 0.013859152793884277),
 ('system', 0.010309237986803055),
 ('upgrade', 0.010167241096496582),
 ('skills', 0.009854848496615887)]

In [70]:
mdl.get_topic_words(42)

[('each', 0.020343095064163208),
 ('different', 0.015239209868013859),
 ('player', 0.012614700011909008),
 ('own', 0.011622949503362179),
 ('two', 0.010752145200967789),
 ('choose', 0.010002285242080688),
 ('players', 0.008599321357905865),
 ('allows', 0.007692232728004456),
 ('three', 0.00726892426609993),
 ('both', 0.007256830111145973)]

With review 16, the topics most associated with it are a mix of praise, criticism, suggestions, and descriptions. 

In [73]:
subdf2['topics'] = [list(map(first, d.get_topics(3))) for d in subdf2['docs']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf2['topics'] = [list(map(first, d.get_topics(3))) for d in subdf2['docs']]


In [74]:
subdf2['topics']

138493    [43, 45, 89]
138494     [6, 89, 30]
138495     [43, 5, 47]
138496    [89, 30, 43]
138497     [45, 0, 43]
              ...     
162488     [12, 87, 6]
162489    [12, 45, 27]
472007     [6, 89, 48]
472008     [6, 89, 30]
472009     [6, 89, 30]
Name: topics, Length: 1381, dtype: object

As for the most-common topics, these are the top 3 topics associated with each review in ascending order.

In [77]:
from collections import Counter

In [78]:
topic_freq = Counter(concat(subdf2['topics']))
print(f'Top Freq Words')
for t, c in topic_freq.most_common(20):
    print(f'{t:3d} {c:4d}', ', '.join(map(first, mdl.get_topic_words(t))))

Top Freq Words
 45  768 diablo, level, mmo, blizzard, character, pvp, wow, players, quests, class
 30  426 know, we, see, think, say, let, why, going, us, those
 89  413 lot, though, things, think, bit, say, nice, actually, something, few
 43  284 however, being, though, quite, simply, experience, rather, may, fact, less
  0  254 did, bought, years, before, few, since, came, ago, started, day
 79  244 bad, nothing, why, terrible, money, boring, stupid, ever, worst, made
 12  164 windows, steam, download, computer, install, pc, work, software, internet, disk
 71  144 down, again, times, around, every, start, right, see, going, before
  9  138 short, boring, bit, repetitive, hours, overall, think, lot, game_play, gets
 55  112 $, price, worth, buy, money, 20, pay, bought, dlc, 60
  6  101 works, product, excellent, came, item, perfect, thanks, condition, thank, fast
 53   94 recommend, great_game, highly, awesome, fan, must, anyone, series, good_game, buy
 50   84 issues, problems, bugs,

Of the top 15 topics (meaning topic x occurred in y amount of this game's reviews on Amazon), topics that recommend paying attention to are 79, 9, and 50, though in general this is for all of its reviews combined regardless of the rating.

In [79]:
# top topics of 1-star reviews
topic_freq = Counter(concat(subdf2[subdf2['overall']==1]['topics']))
print(f'Top Freq Words')
for t, c in topic_freq.most_common(20):
    print(f'{t:3d} {c:4d}', ', '.join(map(first, mdl.get_topic_words(t))))

Top Freq Words
 45  276 diablo, level, mmo, blizzard, character, pvp, wow, players, quests, class
 79  154 bad, nothing, why, terrible, money, boring, stupid, ever, worst, made
 30  152 know, we, see, think, say, let, why, going, us, those
 12  110 windows, steam, download, computer, install, pc, work, software, internet, disk
 43  102 however, being, though, quite, simply, experience, rather, may, fact, less
  0   92 did, bought, years, before, few, since, came, ago, started, day
 71   78 down, again, times, around, every, start, right, see, going, before
 89   74 lot, though, things, think, bit, say, nice, actually, something, few
 55   56 $, price, worth, buy, money, 20, pay, bought, dlc, 60
 27   38 online, multiplayer, mode, player, players, friends, co-op, single_player, campaign, team
  5   38 work, product, amazon, bought, did, buy, problem, return, working, replacement
  9   28 short, boring, bit, repetitive, hours, overall, think, lot, game_play, gets
 62   28 series, fans, p

In [80]:
# top topics for 5-star reviews
topic_freq = Counter(concat(subdf2[subdf2['overall']==5]['topics']))
print(f'Top Freq Words')
for t, c in topic_freq.most_common(20):
    print(f'{t:3d} {c:4d}', ', '.join(map(first, mdl.get_topic_words(t))))

Top Freq Words
 89  177 lot, though, things, think, bit, say, nice, actually, something, few
 30  142 know, we, see, think, say, let, why, going, us, those
 45  138 diablo, level, mmo, blizzard, character, pvp, wow, players, quests, class
  6   83 works, product, excellent, came, item, perfect, thanks, condition, thank, fast
  0   78 did, bought, years, before, few, since, came, ago, started, day
 53   54 recommend, great_game, highly, awesome, fan, must, anyone, series, good_game, buy
 43   44 however, being, though, quite, simply, experience, rather, may, fact, less
  9   30 short, boring, bit, repetitive, hours, overall, think, lot, game_play, gets
 18   28 ever, every, amazing, never, buy, awesome, say, made, must, hours
 46   26 stars, 5, 4, star, give, review, 1, reviews, gave, five
 50   22 issues, problems, bugs, times, problem, glitches, save, patch, issue, fix
 62   20 series, fans, previous, fan, we, title, franchise, since, years, titles
 48   19 easy, learn, hard, learning