In [1]:
import numpy as np

from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.factorization import explicit, implicit
from spotlight.evaluation import mrr_score, rmse_score
from collections import Counter
import pandas as pd
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import numpy as np

  from ._conv import register_converters as _register_converters


In [7]:
from sklearn.utils import shuffle

# Load the review DataFrame. The file carries an .h5 extension but is read as a
# pickle. NOTE(review): unpickling can execute arbitrary code -- only load this
# file if it comes from a trusted source.
df = pd.read_pickle('reviews_Sports_and_Outdoors_5.h5')

# Shuffle the rows with a fixed seed so that the row order (and everything
# derived from it) is the same on every Restart & Run All.
df = shuffle(df, random_state=42)

In [8]:
# Show the column labels of the loaded frame. The call form of print works on
# both Python 2 and Python 3 and matches the print(...) calls used further down.
print(df.columns)

Index([u'reviewerID', u'asin', u'reviewerName', u'helpful', u'unixReviewTime',
       u'reviewText', u'overall', u'reviewTime', u'summary', u'prediction'],
      dtype='object')


In [9]:

# Distinct values of the 'asin' and 'reviewerID' columns. The "movie" wording in
# the variable names is kept as-is; the values come from df['asin'].
movie_list = list(set(df['asin'].values))
user_list = list(set(df['reviewerID'].values))

# Map each reviewer id to its position in user_list.
UserIndices = {reviewer: position for position, reviewer in enumerate(user_list)}

# Map each 'asin' value to its position in movie_list.
MovieIndices = {asin: position for position, asin in enumerate(movie_list)}

In [10]:
def adjust_score(score, margin=0.5):
    """Shrink ``score`` toward zero by ``margin``, clamping at zero.

    Positive scores are lowered by ``margin``; zero and negative scores are
    raised by ``margin``. A score whose magnitude does not exceed ``margin``
    becomes 0.0, so the adjustment never flips the sign of the score.

    Args:
        score: Numeric score to adjust.
        margin: Non-negative amount removed from the score's magnitude.
            Defaults to 0.5, the value that used to be hard-coded here.

    Returns:
        The adjusted score; exactly 0.0 (a float) when the score is clamped.
    """
    if score > 0:
        # Never let a positive score drop below zero.
        return max(0.0, score - margin)
    # Never let a non-positive score rise above zero.
    return min(0.0, score + margin)
    
# Adjust every value of the 'prediction' column. Series.apply walks the single
# column directly instead of building a full row object per record the way
# df.iterrows() does.
df['corrected_sent_score'] = df['prediction'].apply(adjust_score)

In [11]:
# Show the column labels again, now including 'corrected_sent_score'. The call
# form of print works on both Python 2 and Python 3.
print(df.columns)

Index([u'reviewerID', u'asin', u'reviewerName', u'helpful', u'unixReviewTime',
       u'reviewText', u'overall', u'reviewTime', u'summary', u'prediction',
       u'corrected_sent_score'],
      dtype='object')


In [13]:
# Build three parallel lists, one entry per review row, in row order.
# Iterating the columns directly replaces the repeated df.iloc[i][...] row
# lookups, which construct a whole row object several times per record.

# Integer index of each row's reviewer / 'asin' value (KeyError if an id is
# missing from the lookup tables, exactly as with direct dict indexing).
user_ids = [UserIndices[reviewer] for reviewer in df['reviewerID']]
item_ids = [MovieIndices[asin] for asin in df['asin']]

# Rating used for training: the 'overall' value plus the adjusted score.
ratings = [float(overall) + correction
           for overall, correction in zip(df['overall'], df['corrected_sent_score'])]

In [15]:
from spotlight.interactions import Interactions

# Convert the Python lists into typed arrays: 32-bit integer ids and
# 32-bit float ratings.
user_ids_list = np.asarray(user_ids, dtype=np.int32)
item_ids_list = np.asarray(item_ids, dtype=np.int32)
ratings_list = np.asarray(ratings, dtype=np.float32)

# Bundle the three aligned arrays into a single Interactions dataset.
dataset = Interactions(user_ids_list, item_ids_list, ratings=ratings_list)

In [16]:
# --- Experiment configuration -------------------------------------------------
RANDOM_SEED = 42      # seeds the train/test split and each model's RandomState
LATENT_DIM = 32       # passed to the models as embedding_dim
NUM_EPOCHS = 10       # passed to the models as n_iter
BATCH_SIZE = 256      # passed to the models as batch_size
L2 = 1e-6             # passed to the models as l2
LEARNING_RATE = 1e-3  # passed to the models as learning_rate

# Split the interactions into train and test sets using a seeded generator.
split_rng = np.random.RandomState(RANDOM_SEED)
train, test = random_train_test_split(dataset, random_state=split_rng)

In [17]:

def _shared_model_kwargs():
    """Return the hyperparameters that both factorization models share.

    A new RandomState seeded with RANDOM_SEED is created on every call, so
    each model receives its own generator rather than sharing one.
    """
    return dict(embedding_dim=LATENT_DIM,
                n_iter=NUM_EPOCHS,
                learning_rate=LEARNING_RATE,
                batch_size=BATCH_SIZE,
                l2=L2,
                random_state=np.random.RandomState(RANDOM_SEED))


# The two models differ only in model class and loss function.
explicit_model = explicit.ExplicitFactorizationModel(loss='regression',
                                                     **_shared_model_kwargs())
implicit_model = implicit.ImplicitFactorizationModel(loss='bpr',
                                                     **_shared_model_kwargs())

In [34]:
# Train the implicit factorization model (constructed with loss='bpr') on the
# training split.
implicit_model.fit(train)

In [35]:
# Average the MRR values returned for the test split; the training split is
# passed as `train` alongside it.
implicit_mrr = mrr_score(implicit_model, test, train=train).mean()
print('Implicit MRR: {:.2f}'.format(implicit_mrr))

Implicit MRR: 0.02


In [18]:
# Train the explicit factorization model (constructed with loss='regression')
# on the training split.
explicit_model.fit(train)

In [19]:
# Mean MRR of the *explicit* model on the test split. The label previously read
# "Implicit MRR" (copied from the implicit-model cell), which mislabelled the
# number being reported.
# NOTE(review): rmse_score is imported but not used in the cells shown; consider
# also reporting RMSE for the regression-loss model.
print('Explicit MRR: {:.2f}'.format(mrr_score(explicit_model, test, train=train).mean()))

Implicit MRR: 0.01
