In [25]:
from slda.topic_models import SLDA
from functools import partial
from sklearn.metrics import (mean_squared_error)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import fnmatch
import re

def vectorize(df):
    """Build a bag-of-words count matrix from a collection of documents.

    Parameters
    ----------
    df : iterable of str
        The raw documents (the notebook passes the ``Review`` column).

    Returns
    -------
    tuple
        ``(tf, features)`` where ``tf`` is the document-term count matrix
        produced by ``CountVectorizer.fit_transform`` and ``features`` is a
        list with the vocabulary term for each column of ``tf``.
    """
    nFeatures = 1000
    # Terms are dropped when they occur in more than 85% or fewer than 20%
    # of the documents; at most nFeatures terms are kept.
    tf_vectorizer = CountVectorizer(max_df=0.85, min_df=0.2,
                                    max_features=nFeatures,
                                    stop_words='english', lowercase=True)
    tf = tf_vectorizer.fit_transform(df)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists but keep working on older releases.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        # The new API returns an ndarray; convert so callers still get a list.
        features = tf_vectorizer.get_feature_names_out().tolist()
    else:
        features = tf_vectorizer.get_feature_names()
    return (tf, features)

def sldaPred(model, wordMat, n_iter):
    """Predict a response for every document in ``wordMat`` with a fitted model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``eta`` (sliced along its first axis here, so
        presumably one row per sampling iteration -- confirm against the
        library) and ``transform(wordMat)``, which must return one topic
        vector per document.
    wordMat : matrix
        Document-term counts, one row per document to score.
    n_iter : int
        Number of iterations the model was fitted with; used only to pick
        the burn-in cut-off.

    Returns
    -------
    list
        One predicted value per row of ``wordMat`` (previously only the
        first row was ever scored).
    """
    # Discard the early samples: keep the last 100 iterations, or the second
    # half of the chain when fewer than 200 iterations were run.
    burn_in = max(n_iter - 100, int(n_iter / 2))
    eta_pred = model.eta[burn_in:].mean(axis=0)

    thetas_test_slda = model.transform(wordMat)
    # Prediction is the dot product of the averaged coefficients with each
    # document's topic vector.
    y_slda = [np.dot(eta_pred, theta) for theta in thetas_test_slda]
    return y_slda

In [2]:
# Load every review file under REVIEWS_DIR into a single DataFrame `df`.
REVIEWS_DIR = 'dataset/reviews'
REVIEW_COLUMNS = ['Name', 'Publisher', 'GameSpotScore', 'Review', 'Console', 'Genre']


def extract_section(data, title):
    """Return the text between ``:::<title>:::`` and the next ``-----``.

    Raises IndexError when the section is absent, as the inline code did.
    """
    return re.findall(r':::' + title + r':::(.*?)-----', data, re.DOTALL)[0]


def extract_field(text, label):
    """Return what follows ``<label>:`` up to the end of that line, or ''."""
    matches = re.findall(label + r':(.*?)\n', text)
    return matches[0] if matches else ''


def console_from_dir(dirpath, root=REVIEWS_DIR):
    """Return ``dirpath`` relative to ``root`` ('' for the root itself).

    ``str.strip('dataset/reviews/')`` removes any of those *characters* from
    both ends rather than the prefix, so a folder such as 'Wii' came out as
    'W'. A real relative path avoids that.
    """
    relative = os.path.relpath(dirpath, root)
    return '' if relative == os.curdir else relative


# Collect rows in a list and build the frame once; assigning df.loc[i] per
# file re-allocates the frame on every insertion.
records = []
for dirpath, dirs, files in os.walk(REVIEWS_DIR):
    for file in fnmatch.filter(files, '*.txt'):
        with open(os.path.join(dirpath, file), 'r') as ip:
            data = ip.read()
        name = extract_section(data, 'Game Name').strip()
        review = extract_section(data, 'Review').strip()
        scores = extract_section(data, 'Scores')
        addition = extract_section(data, 'Addition')
        # A missing score is still an error: it is the regression target.
        gsScore = re.findall(r'GameSpot Score:(.*?)\n', scores)[0]
        # Publisher and genre are optional and default to ''.
        pub = extract_field(addition, 'Publisher')
        genre = extract_field(addition, 'Genre')
        console = console_from_dir(dirpath)
        records.append([name, pub, gsScore, review, console, genre])

df = pd.DataFrame(records, columns=REVIEW_COLUMNS)

In [3]:
# Convert the scraped score strings to numbers and vectorize the review text.
df['GameSpotScore'] = pd.to_numeric(df['GameSpotScore'])
wordMat, features = vectorize(df['Review'])

# Response variable as a NumPy array, one score per review.
res = np.array(df['GameSpotScore'].tolist())
print(wordMat.shape)
print(res.shape)

# Problem dimensions: topics, documents, vocabulary size.
K = 10
D = len(df)
V = len(features)
nu2 = 10
sigma2 = 1

# Seeded draws kept in their original order; neither `thetas` nor `eta` is
# passed to the model below.
alpha = np.ones(K)
np.random.seed(42)
thetas = np.random.dirichlet(alpha, size=D)
np.random.seed(42)
eta = np.random.normal(scale=nu2, size=K)

# Arguments handed positionally to the SLDA constructor.
_K = 10
_alpha = alpha
_beta = np.full(V, 0.01)
_mu = 0
_nu2 = nu2
_sigma2 = sigma2
n_iter = 200
slda = SLDA(_K, _alpha, _beta, _mu, _nu2, _sigma2, n_iter, seed=42)

# Fit on the count matrix against the scores, then show the fitted phi.
slda.fit(wordMat, res)
print(slda.phi)

(1284, 357)
(1284,)
2017-04-15 04:47:23.645557 start iterations
2017-04-15 04:47:25.698296 0:00:02.052739 elapsed, iter   10, LL -751579.0513, 22.86% change from last
2017-04-15 04:47:27.716015 0:00:04.070458 elapsed, iter   20, LL -647199.2320, 13.89% change from last
2017-04-15 04:47:29.655565 0:00:06.010008 elapsed, iter   30, LL -616476.0055, 4.75% change from last
2017-04-15 04:47:31.719098 0:00:08.073541 elapsed, iter   40, LL -599361.2020, 2.78% change from last
2017-04-15 04:47:33.716963 0:00:10.071406 elapsed, iter   50, LL -586913.4028, 2.08% change from last
2017-04-15 04:47:35.767637 0:00:12.122080 elapsed, iter   60, LL -580260.4417, 1.13% change from last
2017-04-15 04:47:37.871395 0:00:14.225838 elapsed, iter   70, LL -573728.1130, 1.13% change from last
2017-04-15 04:47:39.850078 0:00:16.204521 elapsed, iter   80, LL -569200.8962, 0.79% change from last
2017-04-15 04:47:41.754955 0:00:18.109398 elapsed, iter   90, LL -564501.4282, 0.83% change from last
2017-04-15 04:47

In [17]:
# Scratch check: stacking a (1, 2) array under a (2, 2) array along axis 0.
arr1 = np.array([[1, 2], [1, 1]])
arr2 = np.array([[1, 1]])
for _arr in (arr1, arr2):
    print(_arr.shape)
arr3 = np.concatenate((arr1, arr2), axis=0)
print(arr3)

(2, 2)
(1, 2)
[[1 2]
 [1 1]
 [1 1]]


In [47]:
def pick_random_game(frame, console):
    """Return a one-row DataFrame holding a random game for ``console``.

    Uses ``.loc`` because ``.ix`` was removed in pandas 1.0; the values drawn
    are index labels, so label-based lookup is the matching replacement.
    """
    candidates = frame[frame['Console'] == console]
    return candidates.loc[np.random.choice(candidates.index, 1)]


# One random game per console; draw order is unchanged (DS, ps3, Xbox360).
game1 = pick_random_game(df, 'DS')
game2 = pick_random_game(df, 'ps3')
game3 = pick_random_game(df, 'Xbox360')

# Index.get_values() was removed in pandas 1.0; index the Index directly.
# NOTE(review): the index label is used as a row position in wordMat, which
# assumes df still has its original 0..n-1 index -- confirm.
wordMat1 = wordMat[game1.index[0]]
wordMat2 = wordMat[game2.index[0]]
wordMat3 = wordMat[game3.index[0]]

y1 = sldaPred(slda, wordMat1, 200)
y2 = sldaPred(slda, wordMat2, 200)
y3 = sldaPred(slda, wordMat3, 200)

for _game, _pred in ((game1, y1), (game2, y2), (game3, y3)):
    print(_game['Name'].values[0], 'Original Rating:', _game['GameSpotScore'].values[0], 'Predicted Rating:', _pred[0])

Looney Tunes: Cartoon Conductor Original Rating: 6.5 Predicted Rating: 7.03179018896
MotoGP 08 Original Rating: 7.0 Predicted Rating: 7.79764327106
Ninja Gaiden II Original Rating: 8.0 Predicted Rating: 6.27744116472
