#### All the imports required in the file and general functions

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from operator import itemgetter
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from os import listdir
import os
import string
import errno
import pandas as pd
import re

# NOTE(review): `genres` is not referenced in any of the visible cells — confirm it is still needed.
genres = {}
# Root folder of the corpus: it is listed for sub-folders, and each sub-folder is listed for review files.
directory_path = "Gamespot-Cleaned/test/"
# Marker whose presence in a line starts the review section of a game file.
review_identification = ":::Review:::"
# NOTE(review): not used in the visible cells — presumably an output location for cleaned files; confirm.
destination_path  = "Gamespot-Cleaned/Cleaned_Data/" 

# Separator line that ends the review section of a game file.
break_cond = "-------------------------------------------------"

def get_files(filepath):
    """Return the names of the entries located directly inside ``filepath``.

    The names are returned as given by ``os.listdir`` (a list, in no
    particular order, without the directory prefix).
    """
    return listdir(filepath)

def make_sure_path_exists(path):
    """Create ``path`` (and any missing parent directories) if it is absent.

    An ``OSError`` whose errno is ``EEXIST`` is ignored; every other
    ``OSError`` raised by ``os.makedirs`` is propagated to the caller.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_present = err.errno == errno.EEXIST
        if not already_present:
            raise


#### Get data from the dataset 

In [2]:
# Parallel accumulators filled by _get_data(): _data[k] is the cleaned review
# text of the file whose path is stored in _docs[k].
_data = []
_docs = []

def read_content_1(filepath, review_marker=None, end_marker=None):
    """Extract the normalised review text from a single game file.

    The file is scanned line by line. Once a line containing
    ``review_marker`` is seen, that line and the one right after it are
    skipped, and every following line is lower-cased, stripped of ASCII
    punctuation and collected until a line equal to ``end_marker`` is met.

    Parameters
    ----------
    filepath : str
        Path of the file to read (opened in text mode with the platform
        default encoding).
    review_marker : str, optional
        Text that starts the review section. Defaults to the module-level
        ``review_identification``.
    end_marker : str, optional
        Line that terminates the review section. Defaults to the
        module-level ``break_cond``.

    Returns
    -------
    str
        The collected lines, each wrapped in one leading and one trailing
        space, concatenated together. Empty when no review text is found.
    """
    if review_marker is None:
        review_marker = review_identification
    if end_marker is None:
        end_marker = break_cond

    # Built once instead of once per line: deletes every ASCII punctuation character.
    punctuation_table = str.maketrans('', '', string.punctuation)
    pieces = []
    in_review = False
    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip(" ").strip('\n')
            if review_marker in line:
                in_review = True
                line = ""
                # Skip the line following the marker. The default value keeps a
                # marker sitting on the very last line from raising StopIteration.
                next(f, None)
            if not in_review:
                continue
            if line.strip() == end_marker:
                break
            line = line.lower().translate(punctuation_table)
            if line.strip("\n") != "":
                pieces.append(" " + line + " ")
    return "".join(pieces)
    
def _get_data():
    """Read every review file below ``directory_path`` into the global lists.

    Each entry of ``directory_path`` is treated as a folder; its name is
    printed, then each file inside it is cleaned with ``read_content_1``.
    The cleaned text is appended to ``_data`` and the file path (built with
    ``/`` separators) to ``_docs``, so both lists stay index-aligned.
    """
    for folder in get_files(directory_path):
        print(folder)
        folder_path = directory_path + folder
        for file_name in get_files(folder_path):
            file_path = folder_path + "/" + file_name
            _data.append(read_content_1(file_path))
            _docs.append(file_path)


# Load the corpus, report how many documents were read, then pair every
# document path with its position in _docs for later lookups by file name.
_get_data()
print("Number of Docs :" + str(len(_data)))
list_index = [(game, position) for position, game in enumerate(_docs)]

DS
PC
PS3
Wii
Xbox360
Number of Docs :4303


#### Perform LSA on the data from the dataset, then calculate the cosine similarity scores between the documents.

In [3]:
def _perform_Tfidf(docs):
    """Build the TF-IDF matrix of ``docs``.

    Parameters
    ----------
    docs : iterable of str
        Raw documents. They are not lower-cased again here
        (``lowercase=False``); English stop words are removed and only
        unigrams are kept.

    Returns
    -------
    tuple
        ``(X, terms)`` where ``X`` is the document-term matrix returned by
        ``TfidfVectorizer.fit_transform`` and ``terms`` is the list of
        feature names, one per column of ``X``.
    """
    vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, ngram_range=(1, 1), lowercase=False)
    X = vectorizer.fit_transform(docs)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer its replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out().tolist()
    else:
        terms = vectorizer.get_feature_names()
    return X, terms

def _perform_LSA(X, n_components=20, n_iter=100, random_state=None):
    """Reduce ``X`` with truncated SVD (latent semantic analysis).

    Parameters
    ----------
    X : matrix accepted by ``TruncatedSVD``
        Data to fit and then project.
    n_components : int, default 20
        Number of components kept by the decomposition.
    n_iter : int, default 100
        Number of iterations passed to ``TruncatedSVD``.
    random_state : optional
        Forwarded unchanged to ``TruncatedSVD``. The default ``None`` keeps
        the previous (unseeded) behaviour; pass an int to make runs
        repeatable.

    Returns
    -------
    The result of ``TruncatedSVD.transform(X)`` after fitting on ``X``.
    """
    lsa = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=random_state)
    # fit() returns the estimator itself, so its result is not kept.
    lsa.fit(X)
    return lsa.transform(X)
    
def _run_2():
    """Run the TF-IDF -> LSA -> cosine-similarity pipeline on ``_data``.

    Prints the shape of the TF-IDF matrix and returns the pairwise cosine
    similarity computed between the LSA representations of the documents.
    """
    tfidf_matrix, _terms = _perform_Tfidf(_data)
    print(tfidf_matrix.shape)
    return cosine_similarity(_perform_LSA(tfidf_matrix))

# Pairwise document similarities, computed once and indexed by the lookup cells below.
score = _run_2()

(4303, 66847)


#### Get the details of a game as a single-row DataFrame.

In [4]:
def _optional_field(label, text):
    """Return what follows ``label:`` on its line in ``text``, or '' if absent."""
    found = re.search(label + r':(.*?)\n', text)
    return found.group(1) if found else ''


def print_data(path):
    """Parse one game file and return its details as a one-row DataFrame.

    Despite its name this function prints nothing; callers print the row.

    Parameters
    ----------
    path : str
        ``/``-separated path of the game file. The name of the folder that
        directly contains the file is reported as the console.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``Game Name``,
        ``Publisher``, ``GameSpotScore``, ``Console`` and ``Genre``.
        ``Publisher`` and ``Genre`` are empty strings when the file has no
        such line.

    Raises
    ------
    IndexError
        If the ``:::Game Name:::``, ``:::Scores:::`` or ``:::Addition:::``
        section, or the ``GameSpot Score:`` line, is missing, or if
        ``path`` contains no ``/``.
    """
    # Read everything first so the file handle is released before parsing.
    with open(path, 'r') as ip:
        data = ip.read()

    name = re.findall(r':::Game Name:::(.*?)-----', data, re.DOTALL)[0].strip()
    scores = re.findall(r':::Scores:::(.*?)-----', data, re.DOTALL)[0]
    addition = re.findall(r':::Addition:::(.*?)-----', data, re.DOTALL)[0]
    gsScore = re.findall(r'GameSpot Score:(.*?)\n', scores)[0]
    # Optional fields: only a missing line is tolerated, unlike the former
    # bare ``except:`` which also swallowed unrelated errors.
    pub = _optional_field('Publisher', addition)
    genre = _optional_field('Genre', addition)
    console = path.split('/')[-2]

    return pd.DataFrame(
        [[name, pub, gsScore, console, genre]],
        columns=['Game Name', 'Publisher', 'GameSpotScore', 'Console', 'Genre'],
    )

       

### Each cell below prints the 5 games most similar to a given game, based on the cosine similarity scores calculated above.

In [5]:
def _get_results_(name, top_n=5):
    """Print the details of the games most similar to ``name``.

    Parameters
    ----------
    name : str
        File name (last ``/``-separated component of a path in ``_docs``)
        of the game to look up, e.g. ``"GrandTheftAutoIV.txt"``.
    top_n : int, default 5
        Maximum number of similar games to print.

    Notes
    -----
    Documents are ranked by their value in the ``score`` column of the
    queried document, highest first. Documents sharing the queried file
    name are skipped, and a file name is printed at most once. If several
    paths end with ``name``, the last one in ``list_index`` is used as the
    query. Returns ``None``; results are printed.
    """
    index = -1
    for doc_path, doc_index in list_index:
        if doc_path.split('/')[-1] == name:
            index = doc_index  # no break on purpose: the last match wins, as before
    if index == -1:
        # Previously an unknown name produced no output at all.
        print("No document named '" + name + "' was found.")
        return

    ranked = sorted(zip(_docs, score[:, index]), key=itemgetter(1), reverse=True)
    seen = set()
    for doc_path, _similarity in ranked:
        if len(seen) >= top_n:
            break
        file_name = doc_path.split('/')[-1]
        if file_name == name or file_name in seen:
            continue
        seen.add(file_name)
        print(print_data(doc_path).loc[0])
        print("\n")
# Example query: games most similar to the GrandTheftAutoIV.txt review.
_get_results_("GrandTheftAutoIV.txt")

Game Name                      Saints Row
Publisher                             THQ
GameSpotScore                         8.3
Console                           Xbox360
Genre             Modern Action Adventure
Name: 0, dtype: object


Game Name        True Crime: New York City
Publisher                            Aspyr
GameSpotScore                          5.2
Console                                 PC
Genre              Modern Action Adventure
Name: 0, dtype: object


Game Name                      Crackdown
Publisher         Microsoft Game Studios
GameSpotScore                        7.8
Console                          Xbox360
Genre                     Sci-Fi Shooter
Name: 0, dtype: object


Game Name              Grand Theft Auto 2
Publisher                  Rockstar Games
GameSpotScore                         6.8
Console                                PC
Genre             Modern Action Adventure
Name: 0, dtype: object


Game Name            Starsky & Hutch
Publisher              

In [7]:
# Reuses `_get_results_` from the first example cell of this section;
# re-declaring an identical copy in every cell only shadowed that definition.
_get_results_("FIFASoccer08.txt")

Game Name          UEFA EURO 2008
Publisher         Electronic Arts
GameSpotScore                   8
Console                       PS3
Genre                  Soccer Sim
Name: 0, dtype: object


Game Name        FIFA 07 Soccer
Publisher             EA Sports
GameSpotScore               8.5
Console                      PC
Genre                Soccer Sim
Name: 0, dtype: object


Game Name        FIFA Soccer 2005
Publisher               EA Sports
GameSpotScore                 8.9
Console                        PC
Genre                  Soccer Sim
Name: 0, dtype: object


Game Name        FIFA Soccer 06
Publisher             EA Sports
GameSpotScore               8.9
Console                      PC
Genre                Soccer Sim
Name: 0, dtype: object


Game Name        FIFA Soccer 2004
Publisher               EA Sports
GameSpotScore                 7.8
Console                        PC
Genre                  Soccer Sim
Name: 0, dtype: object




In [8]:
# Reuses `_get_results_` from the first example cell of this section;
# re-declaring an identical copy in every cell only shadowed that definition.
_get_results_("TheIncredibleHulk.txt")

Game Name                     The Godfather
Publisher                   Electronic Arts
GameSpotScore                           8.1
Console                                  PC
Genre             Historic Action Adventure
Name: 0, dtype: object


Game Name        Grand Theft Auto: Vice City
Publisher                     Rockstar Games
GameSpotScore                            9.3
Console                                   PC
Genre                Modern Action Adventure
Name: 0, dtype: object


Game Name        The Godfather: The Don's Edition
Publisher                         Electronic Arts
GameSpotScore                                 7.6
Console                                       PS3
Genre                   Historic Action Adventure
Name: 0, dtype: object


Game Name        Spider-Man: Web of Shadows
Publisher                        Activision
GameSpotScore                             8
Console                             Xbox360
Genre                                Action
Name: 0, d

In [9]:
# Reuses `_get_results_` from the first example cell of this section;
# re-declaring an identical copy in every cell only shadowed that definition.
_get_results_("WWESmackDownvs.Raw2009.txt")

Game Name        Legends of Wrestlemania
Publisher                            THQ
GameSpotScore                          5
Console                          Xbox360
Genre                          Wrestling
Name: 0, dtype: object


Game Name        Street Fighter IV
Publisher                   Capcom
GameSpotScore                    9
Console                        PS3
Genre                  3D Fighting
Name: 0, dtype: object


Game Name        Fight Night Round 3
Publisher                  EA Sports
GameSpotScore                    8.2
Console                          PS3
Genre                         Boxing
Name: 0, dtype: object


Game Name        Virtua Fighter 5 Online
Publisher                           Sega
GameSpotScore                        8.5
Console                          Xbox360
Genre                        3D Fighting
Name: 0, dtype: object


Game Name        Mario Strikers Charged
Publisher                      Nintendo
GameSpotScore                       7.5
Console   

In [10]:
# Reuses `_get_results_` from the first example cell of this section;
# re-declaring an identical copy in every cell only shadowed that definition.
_get_results_("CallofDuty3.txt")

Game Name         Medal of Honor: Airborne
Publisher                  Electronic Arts
GameSpotScore                            7
Console                                PS3
Genre             Historic First-Person...
Name: 0, dtype: object


Game Name                     Call of Duty
Publisher                       Activision
GameSpotScore                            9
Console                                 PC
Genre             Historic First-Person...
Name: 0, dtype: object


Game Name        Brothers in Arms: Earned in Blood
Publisher                                  Ubisoft
GameSpotScore                                  8.7
Console                                         PC
Genre                     Historic First-Person...
Name: 0, dtype: object


Game Name                      Sniper Elite
Publisher                             Namco
GameSpotScore                           7.6
Console                                  PC
Genre             Historic Tactical Shooter
Name: 0, dtype: obje