In [95]:
import os
import re
import fnmatch
import string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

def tfIdf(df):
    nFeatures=1000
    tf_vectorizer = TfidfVectorizer(max_df=0.85, min_df=0.1,
                                max_features=nFeatures,
                                stop_words='english', lowercase=True)
    tf = tf_vectorizer.fit_transform(df)
    features = tf_vectorizer.get_feature_names()
    return (tf, features)

def runLSA(n, iters, wordMat):
    print('Number of inputs:', np.shape(wordMat)[0], '\n')
    lsa = TruncatedSVD(n_components=n, n_iter=5,
                                random_state=0)
    model=lsa.fit(wordMat)
    lsaTransform=lsa.transform(wordMat)
    return (model, lsaTransform)
    
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx+1))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
    
def print_topics(components, pred, features):
    pred=list(pred[0])
    topic_index=pred.index(max(pred))
    topic=components[topic_index]
    print(" ".join([features[i] for i in topic.argsort()[:-10 - 1:-1]]))
    print()
    
def predLDA(x, wordMat, features, model):
    print(x[['Name', 'Genre', 'Console']])
    r=wordMat[x.index.get_values()[0]]
    topics=lda.transform(r)
    print_topics(lda.components_, topics, features)

def getSimilarGames(gameIndex, df, wordMat, lsa): 
#     mat1=wordMat.todense()
#     mat2=np.linalg.pinv(lsa[0].components_)
#     x=mat1*mat2
#     cos=cosine_similarity(x)
    cos=cosine_similarity(lsa[1])
    game1=cos[gameIndex]
    probs_sorted=sorted(game1, reverse=True)

    ipGame=df.iloc[gameIndex]  
    i=0
    sim_games=[]
    lstGames=set()
    lstGames.add(ipGame[0])
    print('Input Game:')
    print(df.iloc[gameIndex], '\n')
    while(len(lstGames)<=5):
        index=int(np.where(game1==probs_sorted[i])[0][0])
        currentGame=df.iloc[index]
        if currentGame[0] not in lstGames:
            sim_games.append(df.iloc[index])
            lstGames.add(currentGame[0])
        i+=1
    
    print('Most similar games:')
    for game in sim_games:
        print(game)
        print()  
    

In [2]:
df=pd.DataFrame(columns=['Name', 'Publisher', 'GameSpotScore', 'Review', 'Console', 'Genre'])

i=0
for dirpath, dirs, files in os.walk('dataset/reviews'):   
    for file in fnmatch.filter(files, '*.txt'):
        with open(os.path.join(dirpath, file), 'r') as ip:
            data=ip.read()
            name=re.findall(r':::Game Name:::(.*?)-----', data, re.DOTALL)[0].strip()
            review=re.findall(r':::Review:::(.*?)-----',data, re.DOTALL)[0].strip()
            scores=re.findall(r':::Scores:::(.*?)-----',data, re.DOTALL)[0]
            addition=re.findall(r':::Addition:::(.*?)-----',data, re.DOTALL)[0]
            gsScore=re.findall(r'GameSpot Score:(.*?)\n', scores)[0]
            review = review.lower()
            tVar = str.maketrans('', '', string.punctuation)
            review = review.translate(tVar)
            try:
                pub=re.findall(r'Publisher:(.*?)\n', addition)[0]
            except:
                pub=''
            try:
                genre=re.findall(r'Genre:(.*?)\n', addition)[0]
            except:
                genre=''
            console=dirpath.strip('dataset/reviews/')
            df.loc[i]=[name, pub, gsScore, review, console, genre]
            i+=1

In [96]:
n=25
iters=5
nWords=10
wordMat, features=tfIdf(df['Review'])


781
Number of inputs: 8278 

Topic 1:
youll youre mode play new characters players good enemies theres
Topic 2:
racing cars race car races mode tracks track driving ball
Topic 3:
ball players player team teams play sports online mode season
Topic 4:
missions campaign war mission strategy ii enemy multiplayer maps map
Topic 5:
characters battle new strategy battles ii character magic fighting world
Topic 6:
puzzles puzzle adventure interface new build building world war computer
Topic 7:
arcade original ii version new xbox mode puzzle classic online
Topic 8:
missions city version xbox characters new pc adventure versions story
Topic 9:
ii enemies team original ball new levels players adventure weapons
Topic 10:
ball ii arcade missions original classic collection minigames screen fighting
Topic 11:
puzzles ii puzzle fighting adventure war mode characters team xbox
Topic 12:
missions mission mode characters puzzle series puzzles new story 3d
Topic 13:
ii puzzle youll city mode ball cars m

In [107]:
lsa=runLSA(n, iters, wordMat)
test=features.index('youll')
print_top_words(lsa[0], features, nWords)

Number of inputs: 8278 

Topic 1:
youll youre mode play new characters players good enemies theres
Topic 2:
racing cars race car races mode tracks track driving ball
Topic 3:
ball players player team teams play sports online mode season
Topic 4:
missions campaign war mission strategy ii enemy multiplayer maps map
Topic 5:
characters battle new strategy battles ii character magic fighting world
Topic 6:
puzzles puzzle adventure interface new build building world war computer
Topic 7:
arcade original ii version new xbox mode puzzle classic online
Topic 8:
missions city version xbox characters new pc adventure versions story
Topic 9:
ii enemies team original ball new levels players adventure weapons
Topic 10:
ball ii arcade missions original classic collection minigames screen fighting
Topic 11:
puzzles ii puzzle fighting adventure war mode characters team xbox
Topic 12:
missions mission mode characters puzzle series puzzles new story 3d
Topic 13:
ii puzzle youll city mode ball cars missi

In [88]:
gameIndex=745
getSimilarGames(gameIndex, df, wordMat, lsa)

Input Game:
Name                                     Pokemon LeafGreen Version
Publisher                                                 Nintendo
GameSpotScore                                                  8.4
Review           both new and longtime pokmon players will find...
Console                                              GameBoyAdvanc
Genre                                                 Role-Playing
Name: 745, dtype: object 

Most similar games:
Name                                       Pokemon FireRed Version
Publisher                                                 Nintendo
GameSpotScore                                                  8.4
Review           both new and longtime pokmon players will find...
Console                                              GameBoyAdvanc
Genre                                                 Role-Playing
Name: 744, dtype: object

Name                                          Pokemon Ruby Version
Publisher                                   

In [89]:
gameIndex=2447
getSimilarGames(gameIndex, df, wordMat, lsa)

Input Game:
Name                                 Grand Theft Auto: San Andreas
Publisher                                           Rockstar Games
GameSpotScore                                                    9
Review           grand theft auto san andreas is a stupendous t...
Console                                                         PC
Genre                                      Modern Action Adventure
Name: 2447, dtype: object 

Most similar games:
Name                                  Grand Theft Auto Double Pack
Publisher                                           Rockstar Games
GameSpotScore                                                  9.6
Review           if youre an xbox owner who either hasnt played...
Console                                                       Xbox
Genre                                      Modern Action Adventure
Name: 7175, dtype: object

Name                                    Bully: Scholarship Edition
Publisher                                 

In [90]:
gameIndex=1751
getSimilarGames(gameIndex, df, wordMat, lsa)

Input Game:
Name                                                  Call of Duty
Publisher                                               Activision
GameSpotScore                                                    9
Review           most anyone who plays games would more than li...
Console                                                         PC
Genre                                     Historic First-Person...
Name: 1751, dtype: object 

Most similar games:
Name                                          The Sum of All Fears
Publisher                                                  Ubisoft
GameSpotScore                                                  7.2
Review           while its clear that the sum of all fears was ...
Console                                                         PC
Genre                                      Modern Tactical Shooter
Name: 3977, dtype: object

Name                                          Killzone: Liberation
Publisher                                 

In [91]:
gameIndex=4763
getSimilarGames(gameIndex, df, wordMat, lsa)

Input Game:
Name                                              FIFA Soccer 2005
Publisher                                                EA Sports
GameSpotScore                                                  8.8
Review           this is ea sports best soccer game to date on ...
Console                                                        PS2
Genre                                                   Soccer Sim
Name: 4763, dtype: object 

Most similar games:
Name                                                FIFA 07 Soccer
Publisher                                                EA Sports
GameSpotScore                                                  8.4
Review           fifa 07 plays a fastpaced and realistic game o...
Console                                                        PS2
Genre                                                   Soccer Sim
Name: 4756, dtype: object

Name                                                FIFA Soccer 06
Publisher                                 

In [92]:
gameIndex=7844
getSimilarGames(gameIndex, df, wordMat, lsa)

Input Game:
Name                                            Forza Motorsport 2
Publisher                                   Microsoft Game Studios
GameSpotScore                                                  9.2
Review           forza 2 delivers on nearly every aspect youd w...
Console                                                    Xbox360
Genre                                           GT / Street Racing
Name: 7844, dtype: object 

Most similar games:
Name                                                Crash 'N' Burn
Publisher                                        Eidos Interactive
GameSpotScore                                                  6.7
Review           crash n burn plays decently but it will leave ...
Console                                                        PS2
Genre                                                       Racing
Name: 4592, dtype: object

Name                                                    MotorStorm
Publisher                                 