In [1]:
from __future__ import division
from __future__ import print_function 
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one (IPython's default is "last_expr"). Cells below rely on this to show
# several results at once, e.g. both a frame's shape and its head.
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
import numpy as np 

from sklearn.preprocessing import LabelEncoder
from yarst.utils import TrainTestSplit
from yarst.models import SGD,ALS,UserBasedCollaborativeFiltering
from yarst.metrics import mae,mse
# Seed NumPy's global random generator so the task can be repeated
seed_set = 1234
np.random.seed(seed=seed_set)


## Import Data

In [2]:
# Read the ratings table from disk and preview its first five rows
df_rating = pd.read_csv('data.csv')
df_rating.head()

Unnamed: 0,user,movie_title,rating,genres,timestamp
0,1,Dangerous Minds (1995),2.5,Drama,1260759144
1,7,Dangerous Minds (1995),3.0,Drama,851868750
2,31,Dangerous Minds (1995),4.0,Drama,1273541953
3,32,Dangerous Minds (1995),4.0,Drama,834828440
4,36,Dangerous Minds (1995),3.0,Drama,847057202


In [3]:
# Process data: encode raw user and movie identifiers as integer ids.
# LabelEncoder.fit_transform maps the distinct values of a column to 0..n_classes-1.

# Each column gets its own fitted encoder so that either mapping can be
# reversed later with inverse_transform.
user_encoder = LabelEncoder()
df_rating['user_id'] = user_encoder.fit_transform(df_rating['user'].values)

movie_encoder = LabelEncoder()
# Fit movie_encoder here: re-fitting user_encoder on the titles would replace
# its user classes and leave movie_encoder unfitted.
df_rating['movie_id'] = movie_encoder.fit_transform(df_rating['movie_title'].values)

# Keep a single row per (user, movie) pair
df_rating = df_rating.drop_duplicates(['user_id','movie_id'])
df_rating.shape
df_rating.head(5)

(100003, 7)

Unnamed: 0,user,movie_title,rating,genres,timestamp,user_id,movie_id
0,1,Dangerous Minds (1995),2.5,Drama,1260759144,0,1976
1,7,Dangerous Minds (1995),3.0,Drama,851868750,6,1976
2,31,Dangerous Minds (1995),4.0,Drama,1273541953,30,1976
3,32,Dangerous Minds (1995),4.0,Drama,834828440,31,1976
4,36,Dangerous Minds (1995),3.0,Drama,847057202,35,1976


In [4]:
# Split the ratings into a training frame and a test frame.
# TrainTestSplit parameters:
#   split_count: int. Number of user-item-ratings per user to move from training to test set.
#   threshold:   int. Minimum number of items that the test set has to hold.
#   fraction:    float. Fraction of users to split off some of their interactions into test set.
#                If None, then all users are considered.

# Tells TrainTestSplit which columns hold the user id, the item id and the rating
x_col = {
    'user': 'user_id',
    'item': 'movie_id',
    'rate': 'rating',
}
df_train, df_test = TrainTestSplit(
    df_rating,
    x_col,
    split_count=100,
    threshold=200,
    fraction=None,
    verbose=1,
)

# Drop repeated (user, movie) pairs from each side of the split
df_train = df_train.drop_duplicates(subset=['user_id', 'movie_id'])
df_test = df_test.drop_duplicates(subset=['user_id', 'movie_id'])


557 unique users in train
Train sparsity: 1.82%
137 unique users in test
Test sparsity: 2.43%


## Build Model (SGD)

In [6]:
# Count distinct users and movies in the full rating table (before the train/test split)
num_user = df_rating.user_id.nunique()
num_item = df_rating.movie_id.nunique()

print("total number of users: {}".format(num_user))
print("total number of movies: {}".format(num_item))

total number of users: 671
total number of movies: 9064


In [8]:
# Training inputs from df_train: user ids and item ids as features, ratings as the target
x = dict(user=df_train["user_id"].values, item=df_train["movie_id"].values)
y = df_train["rating"].values

# Build the SGD model with 20 hidden factors, sized by the user and movie counts,
# then fit it on the training data
sgd = SGD(num_hidden_factor=20, num_user=num_user, num_item=num_item)
sgd.train(x, y)

100%|██████████| 10/10 [00:19<00:00,  1.93s/it]


In [13]:
# Evaluate the SGD model: predict ratings for the test pairs and compare with the real ones
y_true = df_test["rating"].values
y_pred = sgd.predict(
    dict(user=df_test["user_id"].values, item=df_test["movie_id"].values)
)

# mae / mse are instantiated first, then called on (truth, prediction)
print("mean_absolute_error: {}".format(mae()(y_true, np.array(y_pred))))
print("mean_squared_error: {}".format(mse()(y_true, np.array(y_pred))))

mean_absolute_error: 0.748679824892
mean_squared_error: 0.904119602473


## Build Model (User-Based CF)

In [5]:
# Training inputs from df_train: user ids and item ids as features, ratings as the target
y = df_train["rating"].values
x = dict(user=df_train["user_id"].values, item=df_train["movie_id"].values)

# User-based collaborative filtering model, constructed with no arguments and fit on the training data
ubcf = UserBasedCollaborativeFiltering()
ubcf.train(x, y)



In [6]:
# Evaluate the user-based CF model: predict ratings for the test pairs and compare with the real ones
y_true = df_test["rating"].values
y_pred = ubcf.predict(
    dict(user=df_test["user_id"].values, item=df_test["movie_id"].values)
)

# mae / mse are instantiated first, then called on (truth, prediction)
print("mean_absolute_error: {}".format(mae()(y_true, np.array(y_pred))))
print("mean_squared_error: {}".format(mse()(y_true, np.array(y_pred))))

mean_absolute_error: 1.79276970464
mean_squared_error: 5.30892301291
