In [1]:
import os
import re
import fnmatch
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
import string

def vectorize(df):
    nFeatures=1000
    tf_vectorizer = CountVectorizer(max_df=0.85, min_df=0.2,
                                max_features=nFeatures,
                                stop_words='english')
    tf = tf_vectorizer.fit_transform(df)
    features = tf_vectorizer.get_feature_names()
    return (tf, features)

def runLDA(n, iters, wordMat):
    print('Number of inputs:', np.shape(wordMat)[0], '\n')
    lda = LatentDirichletAllocation(n_topics=n, max_iter=5,
                                learning_method='batch',
                                learning_offset=50.,
                                random_state=0)
    lda=lda.fit(wordMat)
    return lda
    
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx+1))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
    
def print_topics(components, pred, features):
    pred=list(pred[0])
    topic_index=pred.index(max(pred))
    topic=components[topic_index]
    print(" ".join([features[i] for i in topic.argsort()[:-10 - 1:-1]]))
    print()
    
def predLDA(x, wordMat, features, model):
    print(x[['Name', 'Genre', 'Console']])
    r=wordMat[x.index.get_values()[0]]
    topics=lda.transform(r)
    print_topics(lda.components_, topics, features)

def getSimilarGames(gameIndex, df, wordMat, lda): 
    mat1=wordMat.todense()
    mat2=np.linalg.pinv(lda.components_)
    x=mat1*mat2
    cos=cosine_similarity(x)
    game1=cos[gameIndex]
    probs_sorted=sorted(game1, reverse=True)

    ipGame=df.iloc[gameIndex]  
    i=0
    sim_games=[]
    lstGames=set()
    lstGames.add(ipGame[0])
    print('Input Game:')
    print(df.iloc[gameIndex], '\n')
    while(len(lstGames)<=5):
        index=int(np.where(game1==probs_sorted[i])[0][0])
        currentGame=df.iloc[index]
        if currentGame[0] not in lstGames:
            sim_games.append(df.iloc[index])
            lstGames.add(currentGame[0])
        i+=1
    
    print('Most similar games:')
    for game in sim_games:
        print(game)
        print()  

### Creating a pandas dataframe out of our dataset of reviews 

In [2]:
df=pd.DataFrame(columns=['Name', 'Publisher', 'GameSpotScore', 'Review', 'Console', 'Genre'])

i=0
for dirpath, dirs, files in os.walk('dataset/reviews'):   
    for file in fnmatch.filter(files, '*.txt'):
        with open(os.path.join(dirpath, file), 'r') as ip:
            data=ip.read()
            name=re.findall(r':::Game Name:::(.*?)-----', data, re.DOTALL)[0].strip()
            review=re.findall(r':::Review:::(.*?)-----',data, re.DOTALL)[0].strip()
            scores=re.findall(r':::Scores:::(.*?)-----',data, re.DOTALL)[0]
            addition=re.findall(r':::Addition:::(.*?)-----',data, re.DOTALL)[0]
            gsScore=re.findall(r'GameSpot Score:(.*?)\n', scores)[0]
            review = review.lower()
            tVar = str.maketrans('', '', string.punctuation)
            review = review.translate(tVar)
            try:
                pub=re.findall(r'Publisher:(.*?)\n', addition)[0]
            except:
                pub=''
            try:
                genre=re.findall(r'Genre:(.*?)\n', addition)[0]
            except:
                genre=''
            console=dirpath.strip('dataset/reviews/')
            df.loc[i]=[name, pub, gsScore, review, console, genre]
            i+=1

###  Running LDA to get topics. Choosing number of topics = 10 and word per topic = 10

In [3]:
n=25
iters=5
nWords=10
wordMat, features=vectorize(df['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Number of inputs: 8278 

Topic 1:
weapons missions weapon levels action graphics use different effects variety
Topic 2:
3d graphics engine gameplay series mode multiplayer players missions play
Topic 3:
youre play youll theres way isnt really fun lot good
Topic 4:
combat multiplayer weapons singleplayer action weapon different enemies pretty youll
Topic 5:
new missions youll multiplayer original features players great good play
Topic 6:
original new gameplay mode make play map experience theres makes
Topic 7:
youll youre good youve way make great new action dont
Topic 8:
new mode years play players online year player youre youll
Topic 9:
youll enemy youre challenge play level multiplayer xbox enemies bad
Topic 10:
world new series youll make theres way youre previous look
Topic 11:
players player team play youll online gameplay youre mode playing
Topic 12:
levels level youll enemies way youre make action use jump
Topic 13:
battle battles combat youll map attack different new enemy play

### Running lda only on top 10 publishers having maximum game releases

In [4]:
df2=df[['Name', 'Publisher']].groupby(['Publisher']).count()
df2=df2.sort_values(['Name'], ascending=False).head(10)
topPubs= (list(df2.axes[0]))
topPubReviewsDf=df[df['Publisher'].isin(topPubs)]

n=10
iters=5
nWords=10
wordMat, features=vectorize(topPubReviewsDf['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Number of inputs: 3017 

Topic 1:
characters enemies character story action fight attacks good attack fighting
Topic 2:
new missions youre combat mission enemy weapons make good original
Topic 3:
youre good pretty version missions action multiplayer enemy pc xbox
Topic 4:
mode years new year online xbox youre play players version
Topic 5:
youre levels version characters enemies use make way story level
Topic 6:
players play mode player new team youre sports teams series
Topic 7:
youre mode series different speed really need theres good new
Topic 8:
mode players gameplay play good player youre make solid modes
Topic 9:
levels level mode play version original multiplayer gameplay classic console
Topic 10:
team players new play online characters different world character youre



### Running for games that have a GameSpot Rating of 8 or above

In [5]:
df['GameSpotScore'] =pd.to_numeric(df['GameSpotScore'])
topDf=df[df['GameSpotScore']>=8]

n=10
iters=5
nWords=10
wordMat, features=vectorize(topDf['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Number of inputs: 2067 

Topic 1:
combat campaign strategy battle enemy missions multiplayer battles mission different
Topic 2:
version pc xbox youre missions good versions great action need
Topic 3:
characters story character youre make different good great way theres
Topic 4:
players player mode years new team year control make right
Topic 5:
players youre mode online team player teams make different sports
Topic 6:
new original make players youre features theres great better world
Topic 7:
mode new really series good years way great youre sports
Topic 8:
characters character players fighting mode team online different skills gameplay
Topic 9:
mode series new youre different tracks speed track online way
Topic 10:
action weapons enemies missions youre good way use different multiplayer



### Running for games that have a GameSpot Rating of 4 or below

In [14]:
df['GameSpotScore'] =pd.to_numeric(df['GameSpotScore'])
botDf=df[df['GameSpotScore']<=4]
n=10
iters=5
nWords=10
wordMat, features=vectorize(botDf['Review'])
lda=runLDA(n, iters, wordMat)
print_top_words(lda, features, nWords)

Number of inputs: 590 

Topic 1:
bad play theres action enemy youll pretty characters character look
Topic 2:
youll enemies combat characters action attack time youre level levels
Topic 3:
youre theres way actually going time big little new things
Topic 4:
mode play youll time youre button fun players gameplay different
Topic 5:
levels world mode time level youll gameplay make sound play
Topic 6:
level isnt gameplay fun little youll way really thats play
Topic 7:
youll time play theres players youre thats way bad look
Topic 8:
missions mission youre enemy way combat enemies isnt time make
Topic 9:
good youre make doesnt dont sound lot youll need hard
Topic 10:
version original screen graphics new youll little use need make



### Filtering dataset based on Genres and performing LDA on top 5 genres

In [7]:
genres=df[['Name']].groupby(df['Genre']).count()
genres=genres.sort_values(['Name'], ascending=False).head(5)
genres=list(genres.axes[0])

for genre in genres:
    df_genre=df[(df['Genre'] == genre)]
    print(genre)
    n=5
    iters=5
    nWords=10
    wordMat, features=vectorize(df_genre['Review'])
    lda=runLDA(n, iters, wordMat)
    print_top_words(lda, features, nWords)

 Action
Number of inputs: 492 

Topic 1:
games play like players just multiplayer way graphics new time
Topic 2:
levels games enemies youll level like action character boss characters
Topic 3:
mode new youre youll like play theres fun just battle
Topic 4:
youll just youre like time action games make play characters
Topic 5:
level levels time gameplay simple good make games pretty youll

 Adventure
Number of inputs: 341 

Topic 1:
adventure puzzles time just youre make way youll characters great
Topic 2:
time just make play way bad youre fun isnt level
Topic 3:
youll time just youre characters new theres story series adventure
Topic 4:
puzzles adventure characters youll story just good puzzle little make
Topic 5:
just characters good youre gameplay make really story actually time

 Strategy
Number of inputs: 322 

Topic 1:
units strategy missions realtime mission 3d enemy unit new good
Topic 2:
youll just youre time make play theres need new different
Topic 3:
units war unit combat stra

### Below we are trying to find similar games based on a given game. We have taken the cosine between vectors as a measure of similarity to find top 5 games with maximum similarity 

In [8]:
n=25
iters=5
wordMat, features=vectorize(df['Review'])
lda=runLDA(n, iters, wordMat)

Number of inputs: 8278 



In [9]:
gameIndex=745
getSimilarGames(gameIndex, df, wordMat, lda)

Input Game:
Name                                     Pokemon LeafGreen Version
Publisher                                                 Nintendo
GameSpotScore                                                  8.4
Review           both new and longtime pokmon players will find...
Console                                              GameBoyAdvanc
Genre                                                 Role-Playing
Name: 745, dtype: object 

Most similar games:
Name                                       Pokemon FireRed Version
Publisher                                                 Nintendo
GameSpotScore                                                  8.4
Review           both new and longtime pokmon players will find...
Console                                              GameBoyAdvanc
Genre                                                 Role-Playing
Name: 744, dtype: object

Name                          PQ2: Practical Intelligence Quotient
Publisher                                   

In [10]:
gameIndex=2447
getSimilarGames(gameIndex, df, wordMat, lda)

Input Game:
Name                                 Grand Theft Auto: San Andreas
Publisher                                           Rockstar Games
GameSpotScore                                                    9
Review           grand theft auto san andreas is a stupendous t...
Console                                                         PC
Genre                                      Modern Action Adventure
Name: 2447, dtype: object 

Most similar games:
Name                                           Hitman: Blood Money
Publisher                                        Eidos Interactive
GameSpotScore                                                  8.2
Review           while the underlying stealth action is mostly ...
Console                                                         PC
Genre                                      Modern Action Adventure
Name: 2566, dtype: object

Name                                         Thief: Deadly Shadows
Publisher                                 

In [11]:
gameIndex=1751
getSimilarGames(gameIndex, df, wordMat, lda)

Input Game:
Name                                                  Call of Duty
Publisher                                               Activision
GameSpotScore                                                    9
Review           most anyone who plays games would more than li...
Console                                                         PC
Genre                                     Historic First-Person...
Name: 1751, dtype: object 

Most similar games:
Name                                                    25 to Life
Publisher                                        Eidos Interactive
GameSpotScore                                                  5.6
Review           25 to life is a lifelessly generic shooter tha...
Console                                                         PC
Genre                                               Modern Shooter
Name: 1399, dtype: object

Name                                         Tom Clancy's Politika
Publisher                                 

In [12]:
gameIndex=4763
getSimilarGames(gameIndex, df, wordMat, lda)

Input Game:
Name                                              FIFA Soccer 2005
Publisher                                                EA Sports
GameSpotScore                                                  8.8
Review           this is ea sports best soccer game to date on ...
Console                                                        PS2
Genre                                                   Soccer Sim
Name: 4763, dtype: object 

Most similar games:
Name                                                FIFA Soccer 06
Publisher                                                EA Sports
GameSpotScore                                                    9
Review           fifa 06 adds plenty of new features that help ...
Console                                                        PS2
Genre                                                   Soccer Sim
Name: 4758, dtype: object

Name                                 World Soccer Winning Eleven 9
Publisher                                 

In [13]:
gameIndex=7844
getSimilarGames(gameIndex, df, wordMat, lda)

Input Game:
Name                                            Forza Motorsport 2
Publisher                                   Microsoft Game Studios
GameSpotScore                                                  9.2
Review           forza 2 delivers on nearly every aspect youd w...
Console                                                    Xbox360
Genre                                           GT / Street Racing
Name: 7844, dtype: object 

Most similar games:
Name                                              Forza Motorsport
Publisher                                   Microsoft Game Studios
GameSpotScore                                                  9.2
Review           forza motorsport is a stylish and challenging ...
Console                                                       Xbox
Genre                                           GT / Street Racing
Name: 7152, dtype: object

Name                                       Gran Turismo 5 Prologue
Publisher                                 