In [1]:
import os
import re
import fnmatch
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

def vectorize(df):
    nFeatures=1000
    tf_vectorizer = CountVectorizer(max_df=0.85, min_df=0.2,
                                max_features=nFeatures,
                                stop_words='english', lowercase=True)
    tf = tf_vectorizer.fit_transform(df)
    features = tf_vectorizer.get_feature_names()
    return (tf, features)

def runLDA(n, iters, wordMat):
    print('Number of inputs:', np.shape(wordMat)[0], '\n')
    lda = LatentDirichletAllocation(n_topics=n, max_iter=5,
                                learning_method='batch',
                                learning_offset=50.,
                                random_state=0)
    lda=lda.fit(wordMat)
    return lda
    
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx+1))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
    
def print_topics(components, pred, features):
    pred=list(pred[0])
    topic_index=pred.index(max(pred))
    topic=components[topic_index]
    print(" ".join([features[i] for i in topic.argsort()[:-10 - 1:-1]]))
    print()
    
def predLDA(x, wordMat, features, model):
    print(x[['Name', 'Genre', 'Console']])
    r=wordMat[x.index.get_values()[0]]
    topics=lda.transform(r)
    print_topics(lda.components_, topics, features)

def getSimilarGames(gameIndex, df, wordMat, lda): 
    mat1=wordMat.todense()
    mat2=np.linalg.pinv(lda.components_)
    x=mat1*mat2
    cos=cosine_similarity(x)
    game1=cos[gameIndex]
    probs_sorted=sorted(game1, reverse=True)

    ipGame=df.iloc[gameIndex]  
    i=0
    sim_games=[]
    lstGames=set()
    lstGames.add(ipGame[0])
    print('Input Game:')
    print(df.iloc[gameIndex], '\n')
    while(len(lstGames)<=5):
        index=int(np.where(game1==probs_sorted[i])[0][0])
        currentGame=df.iloc[index]
        if currentGame[0] not in lstGames:
            sim_games.append(df.iloc[index])
            lstGames.add(currentGame[0])
        i+=1
    
    print('Most similar games:')
    for game in sim_games:
        print(game)
        print()  

### Creating a pandas dataframe out of our dataset of reviews 

In [2]:
df=pd.DataFrame(columns=['Name', 'Publisher', 'GameSpotScore', 'Review', 'Console', 'Genre'])

i=0
for dirpath, dirs, files in os.walk('dataset/reviews'):   
    for file in fnmatch.filter(files, '*.txt'):
        with open(os.path.join(dirpath, file), 'r') as ip:
            data=ip.read()
            name=re.findall(r':::Game Name:::(.*?)-----', data, re.DOTALL)[0].strip()
            review=re.findall(r':::Review:::(.*?)-----',data, re.DOTALL)[0].strip()
            scores=re.findall(r':::Scores:::(.*?)-----',data, re.DOTALL)[0]
            addition=re.findall(r':::Addition:::(.*?)-----',data, re.DOTALL)[0]
            gsScore=re.findall(r'GameSpot Score:(.*?)\n', scores)[0]
            try:
                pub=re.findall(r'Publisher:(.*?)\n', addition)[0]
            except:
                pub=''
            try:
                genre=re.findall(r'Genre:(.*?)\n', addition)[0]
            except:
                genre=''
            console=dirpath.strip('dataset/reviews/')
            df.loc[i]=[name, pub, gsScore, review, console, genre]
            i+=1

###  Running LDA to get topics. Choosing number of topics = 10 and word per topic = 10

In [3]:
n=10
iters=5
nWords=10
wordMat, features=vectorize(df['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Number of inputs: 4303 

Topic 1:
ll new just make need world different way don look
Topic 2:
mode screen ll just play player multiplayer single modes make
Topic 3:
players play player online new games mode team year series
Topic 4:
games just pc play version good mode ll real really
Topic 5:
new battle combat battles war strategy campaign real map turn
Topic 6:
ll enemies story just action characters good use make way
Topic 7:
ll control strategy real team games enemy good different play
Topic 8:
missions mission campaign just games multiplayer ll enemy good war
Topic 9:
weapons ll enemies levels level multiplayer action enemy player single
Topic 10:
character characters ll games just playing story make levels way



### Running lda only on top 10 publishers having maximum game releases

In [4]:
df2=df[['Name', 'Publisher']].groupby(['Publisher']).count()
df2=df2.sort_values(['Name'], ascending=False).head(10)
topPubs= (list(df2.axes[0]))
topPubReviewsDf=df[df['Publisher'].isin(topPubs)]

n=10
iters=5
nWords=10
wordMat, features=vectorize(topPubReviewsDf['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Number of inputs: 1465 

Topic 1:
enemies action level good enemy fight make bad way pretty
Topic 2:
combat battle war missions battles enemy campaign characters enemies action
Topic 3:
play games good gameplay screen make 3d graphics action new
Topic 4:
games character new characters world life make way experience series
Topic 5:
version xbox 360 pc way versions look story good play
Topic 6:
strategy real games new building make different play interface need
Topic 7:
players play new player mode year team online games series
Topic 8:
games player play players team multiplayer online mode single campaign
Topic 9:
speed mode need track events different good games little way
Topic 10:
new original missions weapons mission levels multiplayer good level single



### Running for games that have a GameSpot Rating of 8 or above

In [5]:
df['GameSpotScore'] =pd.to_numeric(df['GameSpotScore'])
topDf=df[df['GameSpotScore']>=8]

n=10
iters=5
nWords=10
wordMat, features=vectorize(topDf['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Number of inputs: 1108 

Topic 1:
levels mode level ll new play games screen just moves
Topic 2:
ll action enemies weapons story just way use games good
Topic 3:
character ll characters playing games just new role combat different
Topic 4:
strategy battle war games real ll play battles different campaign
Topic 5:
players play player team mode ll year online games just
Topic 6:
missions mission combat campaign multiplayer enemy war ll games single
Topic 7:
new original players multiplayer player play features campaign just great
Topic 8:
ll just track good speed make real really need way
Topic 9:
ii original sequel day new battle different players graphics mode
Topic 10:
play ll online players games world mode new just make



### Running for games that have a GameSpot Rating of 3 or below

In [6]:
df['GameSpotScore'] =pd.to_numeric(df['GameSpotScore'])
botDf=df[df['GameSpotScore']<=3]
# print((botDf))
n=10
iters=5
nWords=10
wordMat, features=vectorize(botDf['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Number of inputs: 133 

Topic 1:
mode ll player screen play make attack sound gameplay battle
Topic 2:
missions way ll mission time start feel combat place trying
Topic 3:
jump fun games running possible ll simple clear series good
Topic 4:
way time level mission control broken isn play bad make
Topic 5:
games combat look ll way pc life bad completely work
Topic 6:
ll time make way really actually doesn right problems good
Topic 7:
time games play real way missions computer strategy players don
Topic 8:
ll real life experience fun control level time character high
Topic 9:
ll enemies world weapons play shooter don levels good person
Topic 10:
ll characters action level bad time enemies character levels play



### Filtering dataset based on Genres and performing LDA on top 5 genres

In [7]:
genres=df[['Name']].groupby(df['Genre']).count()
genres=genres.sort_values(['Name'], ascending=False).head(5)
genres=list(genres.axes[0])

for genre in genres:
    df_genre=df[(df['Genre'] == genre)]
    print(genre)
    n=5
    iters=5
    nWords=10
    wordMat, features=vectorize(df_genre['Review'])
    lda=runLDA(n, iters, wordMat)
    print_top_words(lda, features, nWords)

 Action
Number of inputs: 355 

Topic 1:
ll time action like just play games good new make
Topic 2:
new missions like mission gameplay just make graphics level levels
Topic 3:
ll enemies like just time combat enemy level good way
Topic 4:
levels level time ll mode screen play characters player like
Topic 5:
games play like just ll characters gameplay playing isn make

 Adventure
Number of inputs: 300 

Topic 1:
puzzles adventure myst games just story puzzle time little make
Topic 2:
ll dark case characters new adventure items order story gameplay
Topic 3:
ll puzzles time just adventure character play bad characters make
Topic 4:
ll just characters time adventure story games make good way
Topic 5:
video ll games time just clues doesn hard click end

 Strategy
Number of inputs: 296 

Topic 1:
units enemy mission missions unit real 3d command strategy combat
Topic 2:
battles battle play great fun games good place campaign want
Topic 3:
units war games strategy real unit combat scenarios g

### Below we are trying to find similar games based on a given game. We have taken the cosine between vectors as a measure of similarity to find top 5 games with maximum similarity 

In [8]:
wordMat, features=vectorize(df['Review'])
lda=runLDA(n, iters, wordMat)

Number of inputs: 4303 



In [9]:
gameIndex=3525
getSimilarGames(gameIndex, df, wordMat, lda)

Input Game:
Name                                           Grand Theft Auto IV
Publisher                                           Rockstar Games
GameSpotScore                                                   10
Review           A superb single-player story mode and online s...
Console                                                        ps3
Genre                                      Modern Action Adventure
Name: 3525, dtype: object 

Most similar games:
Name                                 Red Orchestra: Ostfront 41-45
Publisher                                           Valve Software
GameSpotScore                                                  7.9
Review           Red Orchestra is an enjoyable, team-based onli...
Console                                                         PC
Genre                                     Historic First-Person...
Name: 2407, dtype: object

Name                                      Full Auto 2: Battlelines
Publisher                                 

In [10]:
gameIndex=3515
getSimilarGames(gameIndex, df, wordMat, lda)

Input Game:
Name                                                FIFA Soccer 08
Publisher                                                EA Sports
GameSpotScore                                                  8.5
Review           The first FIFA for the PS3 is an impressive de...
Console                                                        ps3
Genre                                                   Soccer Sim
Name: 3515, dtype: object 

Most similar games:
Name                                                   NBA Live 99
Publisher                                                EA Sports
GameSpotScore                                                  8.9
Review           When you strip away all the great new graphics...
Console                                                         PC
Genre                                               Basketball Sim
Name: 2108, dtype: object

Name                                                    Top Spin 3
Publisher                                 

In [11]:
gameIndex=4236
getSimilarGames(gameIndex, df, wordMat, lda)

Input Game:
Name                                           The Incredible Hulk
Publisher                                                     Sega
GameSpotScore                                                  5.5
Review           You won't like it when this simplistic, repeti...
Console                                                    Xbox360
Genre                                                       Action
Name: 4236, dtype: object 

Most similar games:
Name                                             Warriors Orochi 2
Publisher                                                     Koei
GameSpotScore                                                    4
Review           Warriors Orochi 2 elevates hack-and-slash acti...
Console                                                    Xbox360
Genre                                                  Beat-'Em-Up
Name: 4288, dtype: object

Name                             Lionheart: Legacy of the Crusader
Publisher                                 

In [12]:
gameIndex=3687
getSimilarGames(gameIndex, df, wordMat, lda)

Input Game:
Name                                    WWE SmackDown vs. Raw 2009
Publisher                                                      THQ
GameSpotScore                                                  7.5
Review           SmackDown vs. Raw 2009 successfully returns to...
Console                                                        ps3
Genre                                                    Wrestling
Name: 3687, dtype: object 

Most similar games:
Name                                        Mario Strikers Charged
Publisher                                                 Nintendo
GameSpotScore                                                  7.5
Review           Mario's soccer sequel is a markedly more enjoy...
Console                                                          W
Genre                                                       Soccer
Name: 3793, dtype: object

Name                              Tom Clancy's Rainbow Six Vegas 2
Publisher                                 

In [13]:
gameIndex=3485
getSimilarGames(gameIndex, df, wordMat, lda)

Input Game:
Name                                                Call of Duty 3
Publisher                                               Activision
GameSpotScore                                                  8.2
Review           Though it plays like many other WWII-based fir...
Console                                                        ps3
Genre                                     Historic First-Person...
Name: 3485, dtype: object 

Most similar games:
Name                                    50 Cent: Blood on the Sand
Publisher                                                      THQ
GameSpotScore                                                    7
Review           Over-the-top thug swagger and solid gameplay h...
Console                                                    Xbox360
Genre                                               Modern Shooter
Name: 3939, dtype: object

Name                                                  Saints Row 2
Publisher                                 