In [1]:
import pandas as pd
import numpy as np

In [7]:
# Load the pipe-separated user table, supplying the column names explicitly.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'data/ml-100k/u.user',
    sep='|',
    names=u_cols,
)

In [9]:
# Both rating splits are tab-separated and share the same four columns.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base, ratings_test = (
    pd.read_csv(path, sep='\t', names=r_cols)
    for path in ('data/ml-100k/ua.base', 'data/ml-100k/ua.test')
)

In [18]:
# Preview the training ratings frame (columns as named in r_cols).
ratings_base

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
90565,943,1047,2,875502146
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275


In [19]:
# Work on plain NumPy arrays so rows/columns can be indexed positionally.
rate_train, rate_test = ratings_base.values, ratings_test.values


In [26]:
# Column names for u.item: five descriptive fields followed by 19 genre
# columns ('unknown' through 'Western').
# The genre names must not carry stray whitespace, otherwise lookups such as
# items['Horror'] raise KeyError.
i_cols = ['movie id', 'movie title', 'release date',
          'video release date', 'IMDb URL', 'unknown',
          'Action', 'Adventure', 'Animation', 'Children\'s',
          'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
# The item file is pipe-separated and decoded as cp1252.
items = pd.read_csv('data/ml-100k/u.item', sep='|', names=i_cols, encoding='cp1252')

In [45]:
# Preview the item table (descriptive fields followed by the genre columns).
items

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# NumPy array of the whole item table; columns keep the order given in i_cols.
X0 = items.values

In [31]:
# Keep only the last 19 columns: i_cols lists 5 descriptive fields followed by
# 19 genre columns ('unknown' ... 'Western'), so this slice is the genre block.
X_train_counts = X0[:, -19:]

In [39]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the per-movie genre columns with TF-IDF (smoothed IDF, L2-normalised
# rows) and densify the result so it can be row-indexed with item ids later.
transformer = TfidfTransformer(norm='l2', smooth_idf=True)
X = (
    transformer
    .fit_transform(X_train_counts.tolist())
    .toarray()
)

In [47]:
# Inspect the training ratings array built from ratings_base.
rate_train

array([[        1,         1,         5, 874965758],
       [        1,         2,         3, 876893171],
       [        1,         3,         4, 878542960],
       ...,
       [      943,      1188,         3, 888640250],
       [      943,      1228,         3, 888640275],
       [      943,      1330,         3, 888692465]], dtype=int64)

In [120]:
def get_ietms_rates_by_user(rate_matrix,user_id):
    """Collect the items one user rated together with the ratings given.

    Parameters
    ----------
    rate_matrix : 2-D NumPy array whose column 0 holds user ids, column 1
        item ids and column 2 ratings.
    user_id : value compared against column 0 to select that user's rows.

    Returns
    -------
    (item_ids, scores) : item ids from column 1 shifted down by one (callers
        in this notebook use them as row indices into the feature matrix),
        and the matching values from column 2. Both are empty when no row
        matches.
    """
    user_rows = np.where(rate_matrix[:, 0] == user_id)[0]
    selected = rate_matrix[user_rows, :]
    return selected[:, 1] - 1, selected[:, 2]

# Train

In [51]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

In [119]:
# Problem sizes: number of users, number of items, and feature dimension of X.
n_users, n_items = users.shape[0], items.shape[0]
d = X.shape[1]

In [121]:
# Fit one Ridge regressor per user id (1..n_users) on the feature rows of the
# items that user rated in the training split; models are keyed by user id.
store_model = {}
for n in range(1, n_users + 1):
    ids, score = get_ietms_rates_by_user(rate_train, n)
    X_train = X[ids, :]
    model = Ridge(alpha=0.01, fit_intercept=True)
    model.fit(X_train, score)
    store_model[n] = model

In [130]:
def evaluate(rate_matrix):
    """Predict every rating in ``rate_matrix`` with the per-user models.

    For each user id in 1..n_users, the rows of ``rate_matrix`` belonging to
    that user are looked up, the user's model from ``store_model`` predicts a
    rating for each rated item, and predictions and true ratings are appended
    to two parallel lists.

    Parameters
    ----------
    rate_matrix : 2-D array in the layout expected by
        ``get_ietms_rates_by_user`` (user id, item id, rating, ...).

    Returns
    -------
    (y_pred, y_true) : two lists of equal length, ordered by user id.
        Users with no rows in ``rate_matrix`` contribute nothing.
    """
    y_pred = []
    y_true = []
    for n in range(1, n_users + 1):
        ids, score = get_ietms_rates_by_user(rate_matrix, n)
        if len(ids) == 0:
            # predict() refuses a feature matrix with zero samples, so a user
            # without ratings in this split is skipped instead of crashing.
            continue
        y_pred.extend(store_model[n].predict(X[ids, :]))
        y_true.extend(score)
    return y_pred, y_true

In [96]:
# Spot-check one user: compare the stored model's predictions with the true
# test ratings. Arrays are printed with two digits after the decimal point.
np.set_printoptions(precision=2)
n = 10
ids, scores = get_ietms_rates_by_user(rate_test, n)
y_pred = store_model[n].predict(X[ids, :])
print('Rated movies ids :', ids)
print('True ratings :', scores)
print('Predicted ratings:', y_pred)

Rated movies ids : [  6  15  99 174 284 460 485 487 503 610]
True ratings : [4 4 5 3 5 3 4 5 5 5]
Predicted ratings: [4.37 3.94 4.44 4.29 4.17 4.17 3.94 4.36 4.46 4.79]


In [101]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error for the single user inspected above. The square root
# is taken explicitly because the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6. The variable keeps its original name
# `mse`, but the value is an RMSE.
mse = np.sqrt(mean_squared_error(scores, y_pred))

In [118]:
# Despite its name, `mse` holds a root-mean-squared error: the square root is
# already applied where the value is computed.
mse

0.7023717541306221

In [131]:
# Run the per-user models over both splits; each call returns parallel lists
# of predictions and true ratings.
y_pred_train,y_true_train = evaluate(rate_train)
y_pred_test,y_true_test = evaluate(rate_test)

## RMSE for training

In [132]:
# RMSE over all training ratings. Arguments follow the (y_true, y_pred) order
# of the scikit-learn signature, and the square root is taken explicitly
# because `squared=False` is no longer accepted by current scikit-learn.
np.sqrt(mean_squared_error(y_true_train, y_pred_train))

0.908980456282672

## RMSE for test

In [133]:
# RMSE over all test ratings. Arguments follow the (y_true, y_pred) order of
# the scikit-learn signature, and the square root is taken explicitly because
# `squared=False` is no longer accepted by current scikit-learn.
np.sqrt(mean_squared_error(y_true_test, y_pred_test))

1.2703282700393044