In [1]:
# In this notebook:

# attempt Topic Modeling via LDA
# Topic Modeling automatically identifies topics in text objects
# Topics == repeating pattern of co-occuring terms in a corpus

# Future research:
# Consider adding TFIDF technique to identify important words
# Kullback Leibler Divergence Score said to help obtain optimal number of topics

In [2]:
helpful_url = 'https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/'

In [3]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords 

In [4]:
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 100)

In [5]:
gorig = pd.read_csv('amazon_games_clean.csv',index_col=0)

print(f'Dataset has {gorig.shape[0]} samples')

gorig.head()

Dataset has 98144 samples


Unnamed: 0,asin,helppercent,overall,cleansum,cleantxt,cleanboth
0,700099867,0.666667,1,pay unlock content dont think,instal game wa struggle game window live championship race car unlock buy addon game pay nearly ...,pay unlock content dont think instal game wa struggle game window live championship race car unl...
1,700099867,0.7,3,awesome game crash frequently,get version instead p version turn mistake console version game look percent good pc version dea...,awesome game crash frequently get version instead p version turn mistake console version game lo...
2,700099867,1.0,4,dirt,dirt xbox wa okay game start play game laptop buy new game build collection game fun play much b...,dirt dirt xbox wa okay game start play game laptop buy new game build collection game fun play m...
3,700099867,0.846154,5,step dirt terrific,love play dirt think graphic good purchase dirt addition otherand graphic absolutely gorgeous li...,step dirt terrific love play dirt think graphic good purchase dirt addition otherand graphic abs...
4,700099867,1.0,2,couldnt get one work,still havent figure one everything instruct game never instal strange since dont like rate somet...,couldnt get one work still havent figure one everything instruct game never instal strange since...


In [6]:
gorig.dropna(inplace=True)
gorig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97048 entries, 0 to 98143
Data columns (total 6 columns):
asin           97048 non-null object
helppercent    97048 non-null float64
overall        97048 non-null int64
cleansum       97048 non-null object
cleantxt       97048 non-null object
cleanboth      97048 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 5.2+ MB


In [7]:
vc_asin = gorig['asin'].value_counts()
print(f'Count of total unique games: {len(vc_asin)}')
vc_asin = vc_asin[(vc_asin >= 50) & (vc_asin <= 55)]
print(f'Count of unique games with 50~55 reviews: {len(vc_asin)}')

Count of total unique games: 10430
Count of unique games with 50~55 reviews: 44


In [8]:
# USE THIS TO TOGGLE DATASET SIZE
gedit = gorig[gorig['asin'].isin(list(vc_asin.index))]
gedit.reset_index(drop=True, inplace=True)

print(f'Dataset size game with 50~55 reviews: {gedit.shape[0]}')
print(f"Dataset unique word size is: {len(pd.Series(' '.join(gedit['cleanboth']).split()).value_counts())}")

Dataset size game with 50~55 reviews: 2273
Dataset unique word size is: 16558


## Remove frequent and rare words for better clustering

In [9]:
# non-distinguishing but frequent words like 'game' can affect clustering
both_words = pd.Series(' '.join(gedit['cleanboth']).split()).value_counts()

both_words[:15]

game      14508
play       4331
get        3925
like       3643
wa         3615
one        3181
time       2652
make       2573
ha         2143
go         2053
really     2009
great      1826
good       1812
much       1735
use        1670
dtype: int64

In [10]:
freq_words = both_words[(both_words >= 1300)]
rare_words = both_words[(both_words <= 3)]

print(f'Freq word size: {len(freq_words)}')
print(f'Rare word size: {len(rare_words)}')

Freq word size: 29
Rare word size: 10519


In [21]:
# removing sentiment words by choice as they skew clusters
sentiment_words = ['good','bad','great','terrible','love','hate','awesome','really','much','best','worst']

In [22]:
stop_words = set(stopwords.words('english'))

gedit['cleanboth'] = gedit['cleanboth'].apply(lambda x: " ".join(x for x in str(x).split() if x not in freq_words))
gedit['cleanboth'] = gedit['cleanboth'].apply(lambda x: " ".join(x for x in str(x).split() if x not in rare_words))
gedit['cleanboth'] = gedit['cleanboth'].apply(lambda x: " ".join(x for x in str(x).split() if x not in stop_words))
gedit['cleanboth'] = gedit['cleanboth'].apply(lambda x: " ".join(x for x in str(x).split() if x not in sentiment_words))
gedit['cleanboth'] = gedit['cleanboth'].apply(lambda x: " ".join(x for x in str(x).split() if len(x) > 1))

print(f"Remaining word size: {len(pd.Series(' '.join(gedit['cleanboth']).split()).value_counts())}")

gedit.head()

Remaining word size: 5988


Unnamed: 0,asin,helppercent,overall,cleansum,cleantxt,cleanboth
0,B00000DMB3,1.0,5,love game,love game love game love game get ocarina time year since ha come still rock even comparison wha...,ocarina year since come still rock comparison whats ive playstation couple year ocarina realize ...
1,B00000DMB3,1.0,5,true classic,game blow away didnt expect ocarina meet standard play previous zelda game usually go stale pred...,true classic blow away didnt expect ocarina meet standard previous zelda usually stale predictab...
2,B00000DMB3,0.666667,5,timeless classic,im gamers call zelda veteran ive play almost zelda game manage beat one one love zelda game chal...,timeless classic im gamers call zelda veteran ive almost zelda manage beat zelda challenge diffe...
3,B00000DMB3,1.0,5,one n game must,follow lineage zelda early day classic bite system onward snes day quota link pastquot great gam...,must follow zelda early day classic bite system snes day quota link pastquot every second never ...
4,B00000DMB3,0.833333,5,legendary,play game zelda game ever play wa link past couldnt understand well decide play hype wa sure gla...,legendary zelda ever link past couldnt understand decide hype sure glad ever full adventure wond...


## Topic Modeling

In [23]:
import gensim
from gensim import corpora

In [24]:
word_matrix = [cell.split() for cell in gedit['cleanboth']]

word_matrix[0][:10]

['ocarina',
 'year',
 'since',
 'come',
 'still',
 'rock',
 'comparison',
 'whats',
 'ive',
 'playstation']

In [25]:
g_dict = corpora.Dictionary(word_matrix)
term_matrix = [g_dict.doc2bow(row) for row in word_matrix]

term_matrix[0][:10]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 2),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1)]

In [28]:
g_lda = gensim.models.ldamodel.LdaModel
g_model = g_lda(term_matrix, num_topics=10, id2word = g_dict, passes=50, random_state=None)

In [29]:
# Topic Modeling output
print(g_model.print_topics(num_topics=-1, num_words=4))

[(0, '0.014*"mario" + 0.011*"level" + 0.009*"enemy" + 0.007*"galaxy"'), (1, '0.017*"controller" + 0.014*"xbox" + 0.008*"battery" + 0.008*"silent"'), (2, '0.013*"multiplayer" + 0.012*"player" + 0.012*"map" + 0.010*"campaign"'), (3, '0.027*"headset" + 0.020*"sound" + 0.013*"headphone" + 0.009*"wireless"'), (4, '0.023*"war" + 0.021*"god" + 0.009*"kratos" + 0.009*"wii"'), (5, '0.011*"sims" + 0.008*"want" + 0.007*"system" + 0.006*"see"'), (6, '0.020*"ac" + 0.018*"ship" + 0.015*"assassin" + 0.013*"creed"'), (7, '0.027*"persona" + 0.007*"story" + 0.007*"social" + 0.006*"golden"'), (8, '0.016*"vita" + 0.013*"version" + 0.012*"amaze" + 0.010*"psp"'), (9, '0.016*"story" + 0.010*"end" + 0.009*"feel" + 0.008*"quest"')]


In [18]:
# Future research

# Explore additional LdaModel functions / outputs
# Understand topic weights
# Improve topic selection thru Kullback Leibler Divergence Score
# Tie print_topics to raw review data

In [19]:
# this concludes pt 5