In [9]:
from slda.topic_models import SLDA
from functools import partial
from sklearn.metrics import (mean_squared_error)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import fnmatch
import re
import random
from scipy import sparse

def vectorize(df):
    nFeatures=1000
    tf_vectorizer = CountVectorizer(max_df=0.85, min_df=0.2,
                                max_features=nFeatures,
                                stop_words='english', lowercase=True)
    tf = tf_vectorizer.fit_transform(df)
    features = tf_vectorizer.get_feature_names()
    return (tf, features)

def sldaPred(model, wordMat, n_iter, d):
    burn_in = max(n_iter - 100, int(n_iter / 2))
    model.loglikelihoods[:].mean()
    eta_pred = model.eta[:].mean(axis=0)

    thetas_test_slda = model.transform(wordMat)
    y_slda = [np.dot(eta_pred, thetas_test_slda[i]) for i in range(d)]
    return y_slda

def runSLDA(df, wordMat, features):
#     wordMat, features=vectorize(df['Review'])
    scr=df['GameSpotScore']
    scr=list(scr)
    scr=np.array(scr)

    K=10
    D=len(df)
    V=len(features)
    nu2 = 10
    sigma2 = 1

    alpha = np.ones(K)
    np.random.seed(42)
    thetas = np.random.dirichlet(alpha, size=D)
    np.random.seed(42)
    eta = np.random.normal(scale=nu2, size=K)


    _K = 10
    _alpha = alpha
    _beta = np.repeat(0.01, V)
    _mu = 0
    _nu2 = nu2
    _sigma2 = sigma2
    n_iter = 200
    slda = SLDA(_K, _alpha, _beta, _mu, _nu2, _sigma2, n_iter, seed=42)
    slda.fit(wordMat, scr)
    
    res={}
    res['model']=slda
    res['wordMat']=wordMat
    res['features']=features
    return res

def subWordMatrix(wordMat, indices):
    subWordMat=np.empty((0, np.shape(wordMat)[1]))
    for index in indices:
        vec=wordMat[index]
        subWordMat=np.append(subWordMat, vec, axis=0)
    return subWordMat

def calcSSE(A,B):
    sse=0
    for a, b in zip(A, B):
        a=round(float(a),2)
        b=round(float(b),2)
        sse+=(a-b)**2
    return sse

In [5]:
df=pd.DataFrame(columns=['Name', 'Publisher', 'GameSpotScore', 'Review', 'Console', 'Genre'])

i=0
for dirpath, dirs, files in os.walk('dataset/reviews'):   
    for file in fnmatch.filter(files, '*.txt'):
        with open(os.path.join(dirpath, file), 'r') as ip:
            data=ip.read()
            name=re.findall(r':::Game Name:::(.*?)-----', data, re.DOTALL)[0].strip()
            review=re.findall(r':::Review:::(.*?)-----',data, re.DOTALL)[0].strip()
            scores=re.findall(r':::Scores:::(.*?)-----',data, re.DOTALL)[0]
            addition=re.findall(r':::Addition:::(.*?)-----',data, re.DOTALL)[0]
            gsScore=re.findall(r'GameSpot Score:(.*?)\n', scores)[0]
            try:
                pub=re.findall(r'Publisher:(.*?)\n', addition)[0]
            except:
                pub=''
            try:
                genre=re.findall(r'Genre:(.*?)\n', addition)[0]
            except:
                genre=''
            console=dirpath.strip('dataset/reviews/')
            df.loc[i]=[name, pub, gsScore, review, console, genre]
            i+=1

In [10]:
df['GameSpotScore'] =pd.to_numeric(df['GameSpotScore'])
wordMat, features=vectorize(df['Review'])
slda=runSLDA(df, wordMat, features)

2017-04-15 08:28:06.251269 start iterations
2017-04-15 08:28:07.987354 0:00:01.736085 elapsed, iter   10, LL -751579.0513, 22.86% change from last
2017-04-15 08:28:09.602606 0:00:03.351337 elapsed, iter   20, LL -647199.2320, 13.89% change from last
2017-04-15 08:28:11.210899 0:00:04.959630 elapsed, iter   30, LL -616476.0055, 4.75% change from last
2017-04-15 08:28:12.826905 0:00:06.575636 elapsed, iter   40, LL -599361.2020, 2.78% change from last
2017-04-15 08:28:14.419454 0:00:08.168185 elapsed, iter   50, LL -586913.4028, 2.08% change from last
2017-04-15 08:28:16.020308 0:00:09.769039 elapsed, iter   60, LL -580260.4417, 1.13% change from last
2017-04-15 08:28:17.662520 0:00:11.411251 elapsed, iter   70, LL -573728.1130, 1.13% change from last
2017-04-15 08:28:19.263176 0:00:13.011907 elapsed, iter   80, LL -569200.8962, 0.79% change from last
2017-04-15 08:28:20.827434 0:00:14.576165 elapsed, iter   90, LL -564501.4282, 0.83% change from last
2017-04-15 08:28:22.443123 0:00:16.1

In [13]:
game1=df[(df['Console']=='DS')].ix[np.random.choice(df[(df['Console']=='DS')].index, 1)]
game2=df[(df['Console']=='ps3')].ix[np.random.choice(df[(df['Console']=='ps3')].index, 1)]
game3=df[(df['Console']=='Xbox360')].ix[np.random.choice(df[(df['Console']=='Xbox360')].index, 1)]

wordMat1=wordMat[game1.index.get_values()[0]]
wordMat2=wordMat[game2.index.get_values()[0]]
wordMat3=wordMat[game3.index.get_values()[0]]


y1=sldaPred(slda['model'], wordMat1, 200, 1)
y2=sldaPred(slda['model'], wordMat2, 200, 1)
y3=sldaPred(slda['model'], wordMat3, 200, 1)

print(game1['Name'].values[0], 'Original Rating:',game1['GameSpotScore'].values[0], 'Predicted Rating:', y1[0])
print(game2['Name'].values[0], 'Original Rating:',game2['GameSpotScore'].values[0], 'Predicted Rating:', y2[0])
print(game3['Name'].values[0], 'Original Rating:',game3['GameSpotScore'].values[0], 'Predicted Rating:', y3[0])

Bee Movie Game Original Rating: 5.0 Predicted Rating: 6.00848121919
Major League Baseball 2K8 Original Rating: 6.5 Predicted Rating: 6.80154818114
College Hoops 2K8 Original Rating: 8.0 Predicted Rating: 7.59118860931


In [14]:
n=len(df)
wordMat, features=vectorize(df['Review'])
n_train=int(n*0.9)
n_test=n-n_train

wordMat=wordMat.todense()
testRows=random.sample((list(df.index)), n_test)
test_df=df.ix[testRows]
train_df=df.drop(testRows)

subWordMatTest=subWordMatrix(wordMat, test_df.index)
subWordMatTest=sparse.csr_matrix(subWordMatTest, dtype='int')
print(type (subWordMatTest))
subWordMatTrain=subWordMatrix(wordMat, train_df.index)
subWordMatTrain=sparse.csr_matrix(subWordMatTrain, dtype='int')

train_slda=runSLDA(train_df, subWordMatTrain, features)

<class 'scipy.sparse.csr.csr_matrix'>
2017-04-15 08:29:22.205658 start iterations
2017-04-15 08:29:23.818999 0:00:01.613341 elapsed, iter   10, LL -691628.3489, 21.02% change from last
2017-04-15 08:29:25.271995 0:00:03.066337 elapsed, iter   20, LL -594200.1460, 14.09% change from last
2017-04-15 08:29:26.939151 0:00:04.733493 elapsed, iter   30, LL -555353.4772, 6.54% change from last
2017-04-15 08:29:28.537765 0:00:06.332107 elapsed, iter   40, LL -536124.1218, 3.46% change from last
2017-04-15 08:29:29.997866 0:00:07.792208 elapsed, iter   50, LL -525642.3712, 1.96% change from last
2017-04-15 08:29:31.579668 0:00:09.374010 elapsed, iter   60, LL -516470.5110, 1.74% change from last
2017-04-15 08:29:33.049222 0:00:10.843564 elapsed, iter   70, LL -510056.9860, 1.24% change from last
2017-04-15 08:29:34.863795 0:00:12.658137 elapsed, iter   80, LL -505492.6640, 0.89% change from last
2017-04-15 08:29:36.275550 0:00:14.069892 elapsed, iter   90, LL -501739.1733, 0.74% change from las

In [15]:
y=sldaPred(train_slda['model'], subWordMatTest, 200, len(test_df))
i=0
for name, score in zip(test_df['Name'], test_df['GameSpotScore']):
    print(name, '---', 'Rating:', score, '---', 'Predicted Rating:', round(y[i], 1))
    i+=1

Dirt --- Rating: 8.3 --- Predicted Rating: 7.0
Tomb Raider: Legend --- Rating: 7.8 --- Predicted Rating: 6.6
Namco Museum DS --- Rating: 6.5 --- Predicted Rating: 6.6
Dino Master --- Rating: 2.4 --- Predicted Rating: 5.9
Rockstar Games presents Table Tennis --- Rating: 6.5 --- Predicted Rating: 6.1
Elf Bowling 1 & 2 --- Rating: 1.4 --- Predicted Rating: 6.1
Sid Meier's Civilization Revolution --- Rating: 8.5 --- Predicted Rating: 7.3
NBA Live 06 --- Rating: 6.4 --- Predicted Rating: 7.0
Final Fantasy Crystal Chronicles: Ring of Fates --- Rating: 7.5 --- Predicted Rating: 6.3
Blazing Angels 2: Secret Missions of WWII --- Rating: 7.0 --- Predicted Rating: 6.5
Hour of Victory --- Rating: 2.0 --- Predicted Rating: 6.2
Orcs & Elves --- Rating: 7.0 --- Predicted Rating: 6.1
Tom Clancy's Ghost Recon Advanced Warfighter 2 --- Rating: 8.7 --- Predicted Rating: 7.6
Skate --- Rating: 7.0 --- Predicted Rating: 6.2
Broken Sword: Shadows of the Templars (Director's Cut) --- Rating: 8.0 --- Predicted

In [16]:
sse=calcSSE(test_df['GameSpotScore'], y)
print(sse)

301.83089999999976
