In [122]:
import os
import re
import fnmatch
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def vectorize(df):
    nFeatures=10000
    tf_vectorizer = CountVectorizer(max_df=0.85, min_df=2,
                                max_features=nFeatures,
                                stop_words='english', lowercase=True)
    tf = tf_vectorizer.fit_transform(df)
    features = tf_vectorizer.get_feature_names()
    return (tf, features)


def runLDA(n, iters, wordMat):
    lda = LatentDirichletAllocation(n_topics=n, max_iter=5,
                                learning_method='batch',
                                learning_offset=50.,
                                random_state=0)
    lda=lda.fit(wordMat)
    return lda
    
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [123]:
reviews=[]
for file in os.listdir('dataset/reviews/ps3'):
    with open('dataset/reviews/ps3/'+file, 'r') as ip:
        data=ip.read()
        review=re.findall(r':::Review:::(.*?):::User Reviews:::', data, re.DOTALL)
        review=str(review)
        reviews.append(review)

n=10
iters=5
nWords=10

wordMat, features=vectorize(reviews)
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Topic #0:
combat sonic level don attack ve look story enemies use
Topic #1:
skate new games vegas city little tricks fun online makes
Topic #2:
racing mode race online games new vehicle units nbsp player
Topic #3:
racer ridge online race series car cars mode tracks singstar
Topic #4:
characters enemies action mode attacks character fight fighting new story
Topic #5:
racing race car games cars events mode 360 gameplay races
Topic #6:
players player mode year ball new online games team great
Topic #7:
player far version online games series new makes playstation action
Topic #8:
new action campaign multiplayer enemies enemy guitar fun player games
Topic #9:
games look man new team points bad players spider don



In [124]:
df=pd.DataFrame(columns=['Name', 'Publisher', 'GameSpotScore', 'Review'])

i=0
for dirpath, dirs, files in os.walk('dataset/reviews'):   
    for file in fnmatch.filter(files, '*.txt'):
        with open(os.path.join(dirpath, file), 'r') as ip:
            data=ip.read()
            name=re.findall(r':::Game Name:::(.*?)-----', data, re.DOTALL)[0].strip()
            review=re.findall(r':::Review:::(.*?)-----',data, re.DOTALL)[0].strip()
            scores=re.findall(r':::Scores:::(.*?)-----',data, re.DOTALL)[0]
            addition=re.findall(r':::Addition:::(.*?)-----',data, re.DOTALL)[0]
            gsScore=re.findall(r'GameSpot Score:(.*?)\n', scores)[0]
            try:
                pub=re.findall(r'Publisher:(.*?)\n', addition)[0]
            except:
                pub=''
            df.loc[i]=[name, pub, gsScore, review]
            i+=1

In [125]:
df2=df[['Name', 'Publisher']].groupby(['Publisher']).count()
df2=df2.sort_values(['Name'], ascending=False).head(10)
topPubs= (list(df2.axes[0]))
topPubReviewsDf=df[df['Publisher'].isin(topPubs)]

n=10
iters=5
nWords=10
wordMat, features=vectorize(topPubReviewsDf['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Topic #0:
games enemies characters good fun puzzles story play make adventure
Topic #1:
play players new player games mode team year ball fifa
Topic #2:
new combat weapons character good characters multiplayer games missions play
Topic #3:
units strategy new games real campaign play battle enemy good
Topic #4:
mode new play way nbsp make imgelem good year fun
Topic #5:
new sims characters levels use character make way look ve
Topic #6:
golf flight tiger play ball new course tour woods games
Topic #7:
action enemies good way play gameplay make isn little enemy
Topic #8:
action version mode good pc multiplayer quake missions play single
Topic #9:
racing car cars race mode version good games way make



In [126]:
df['GameSpotScore'] =pd.to_numeric(df['GameSpotScore'])
topDf=df[df['GameSpotScore']>=8]

print(len(topDf))
n=10
iters=5
nWords=10
wordMat, features=vectorize(topDf['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

1157
Topic #0:
ll just nhl play good new make ve action games
Topic #1:
racing car race cars ll track real new driving mode
Topic #2:
units strategy new games battle ll combat war real play
Topic #3:
ll new multiplayer games missions just action good weapons single
Topic #4:
games new just ll amp play action players multiplayer ve
Topic #5:
nba ll prince live play new make just monkey games
Topic #6:
new character ll characters playing games world just players role
Topic #7:
ll missions mission just new play weapons games story team
Topic #8:
ll tony hawk mode games tricks new just make way
Topic #9:
play players player mode ll games new ball online just



In [127]:
df['GameSpotScore'] =pd.to_numeric(df['GameSpotScore'])
botDf=df[df['GameSpotScore']<=3]

n=10
iters=5
nWords=10
wordMat, features=vectorize(botDf['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Topic #0:
mortyr nazis time mirage attempting futuristic kill world unoriginal hate
Topic #1:
sonic time level big make graphics ll way actually gameplay
Topic #2:
blitz nfl emergency 2002 heroes city missions series single order
Topic #3:
ll games characters time play don way isn look mode
Topic #4:
ll ambulance way crazy problems driver screen sniper make life
Topic #5:
ll play bad time make games level good way look
Topic #6:
games command iii ships ll time computer screen ship way
Topic #7:
combat ll larry bust office time box outlaw missions make
Topic #8:
ll play world events games way weapons enemies don time
Topic #9:
units time unit real ll way control mission games events

