In [5]:
from __future__ import print_function
import numpy as np
import pandas as pd
import os

# Absolute path of this notebook; only used to locate the folder it lives in.
file_path = '/workspaces/Machine-Learning-Model/Recommender-system/main.ipynb'
file_location = os.path.dirname(file_path)

print('File location:', file_location)
# BEGIN: Check other files in the folder
folder_path = file_location
files_in_folder = os.listdir(folder_path)

# os.listdir() yields bare entry names, so the comparison has to use the
# notebook's base name; a full absolute path is never a substring of a bare
# entry name, which made the original filter always come back empty.
file_name = os.path.basename(file_path)
files_containing_file = [file for file in files_in_folder if file_name in file]

print('Files containing the specified file:')
for file in files_containing_file:
    print(file)
# END: Check other files in the folder


File location: /workspaces/Machine-Learning-Model/Recommender-system
Files containing the specified file:


In [7]:

# Folder that holds the ml-100k data files read in this cell.
data_dir = '/workspaces/Machine-Learning-Model/Recommender-system/Data/ml-100k'

# One name per field. 'sex' and 'occupation' must be two separate names; they
# were previously fused into the single string 'sex, occupation', which left
# only four names and mislabelled every column after 'age'.
# NOTE(review): assumes u.user has five '|'-separated fields -- confirm.
u_cols = ['user', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(os.path.join(data_dir, 'u.user'), sep='|', names=u_cols, encoding='latin-1')
n_users = users.shape[0]
print('Number of users:', n_users)

# Rating files are tab-separated; both splits share the same column layout.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv(os.path.join(data_dir, 'ub.base'), sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv(os.path.join(data_dir, 'ub.test'), sep='\t', names=r_cols, encoding='latin-1')

# Plain arrays (one row per rating, columns in r_cols order) for the cells below.
rate_train = ratings_base.values
rate_test = ratings_test.values

print('Number of traing rates:', rate_train.shape[0])
print('Number of test rates:', rate_test.shape[0])

Number of users: 943
Number of traing rates: 90570
Number of test rates: 9430


In [8]:
# Column names for u.item: five descriptive fields followed by 19 genre flags.
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

items = pd.read_csv(
    '/workspaces/Machine-Learning-Model/Recommender-system/Data/ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

# Only the trailing 19 columns (the genre flags) are used as item features.
X0 = items.values
X_train_counts = X0[:, -19:]

n_items = len(items)
print('Number of items:', n_items)



Number of items: 1682


In [9]:
# Preview the first five training rows (columns follow r_cols order).
print(rate_train[:5])

[[        1         1         5 874965758]
 [        1         2         3 876893171]
 [        1         3         4 878542960]
 [        1         4         3 876893119]
 [        1         5         3 889751712]]


In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the genre columns with TF-IDF and L2-normalise each item's row.
transformer = TfidfTransformer(norm='l2', smooth_idf=True)
tfidf_sparse = transformer.fit_transform(X_train_counts.tolist())
# Densify so the item features can be indexed and multiplied as a plain array.
X = tfidf_sparse.toarray()
print(X[:5])

[[0.         0.         0.         0.74066017 0.57387209 0.34941857
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.53676706 0.65097024 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.53676706 0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         1.         0.
  0.        ]
 [0.         0.71065158 0.         0.         0.         0.5397592
  0.         0.         0.45125862 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.735504   0.         0.36318585 0.         0.         0.
  0.         0.         0.         0.         0.57195272 0.
  0.        ]]


In [12]:
def get_items_rated_by_user(rate_matrix, user_id):
    """
    Look up everything one user has rated.

    rate_matrix: 2-D array whose columns 0, 1, 2 hold the user id, the item
                 id and the score (ids in the matrix start from 1).
    user_id:     0-based user index.

    return (item_ids, scores), where item_ids are 0-based item indices and
    scores are the matching entries of column 2.
    """
    # ids stored in the matrix are 1-based, python indices are 0-based,
    # hence the +1 on the user side and the -1 on the item side
    rated_by_user = rate_matrix[:, 0] == user_id + 1
    item_ids = rate_matrix[rated_by_user, 1] - 1
    scores = rate_matrix[rated_by_user, 2]
    return (item_ids, scores)



In [13]:
from sklearn.linear_model import Ridge
from sklearn import linear_model
d = X.shape[1] # data dimension
# One column of weights and one intercept per user.
W = np.zeros((d, n_users))
b = np.zeros(n_users)
for n in range(n_users):
    # Fit this user's model only on the items the user rated in training.
    ids, scores = get_items_rated_by_user(rate_train, n)
    Xhat = X[ids, :]
    model = Ridge(alpha=0.01, fit_intercept=True).fit(Xhat, scores)
    W[:, n], b[n] = model.coef_, model.intercept_


In [14]:
# Predicted score for every (item, user) pair: rows follow X, columns follow W.
Yhat = np.dot(X, W) + b

In [18]:
# Spot-check one user: compare held-out scores with the model's predictions.
n = 10
np.set_printoptions(precision=2) # 2 digits after .
ids, scores = get_items_rated_by_user(rate_test, n)
predictions = Yhat[ids, n]
rmse_n = np.sqrt(np.mean((predictions - scores)**2))
print('Rate test:', ids)
print('Scores:', scores)
print('Predictions:', predictions)
print('Error:', rmse_n)

Rate test: [189 229 311 516 560 659 713 719 740 745]
Scores: [3 4 4 2 2 3 4 1 5 4]
Predictions: [3.47 3.13 4.15 4.16 3.9  4.01 3.83 3.56 3.14 3.31]
Error: 1.4449976697815012


In [19]:
def evaluate(Yhat, rates, W, b):
    """
    Root-mean-square error of the predictions over a set of ratings.

    Yhat:  2-D array of predicted scores, indexed as Yhat[item, user] with
           0-based indices.
    rates: 2-D array with one rating per row; columns 0, 1, 2 hold the
           1-based user id, the 1-based item id and the true score.
    W, b:  unused; accepted only so existing call sites keep working.

    Returns sqrt(mean((truth - prediction)**2)) over all rows of `rates`,
    or nan when `rates` has no rows.
    """
    rates = np.asarray(rates)
    if rates.size == 0:
        # No ratings to score against; mirror the 0/0 outcome explicitly.
        return np.nan
    # Shift the stored 1-based ids to 0-based indices into Yhat.
    user_idx = rates[:, 0].astype(np.intp) - 1
    item_idx = rates[:, 1].astype(np.intp) - 1
    # Score every rating in one pass instead of rescanning `rates` once per
    # user; this also removes the dependency on a module-level user count.
    e = rates[:, 2] - Yhat[item_idx, user_idx]
    return np.sqrt(np.mean(e * e))
# Report the error on both splits with the same predictions.
for label, rates in (('RMSE for training:', rate_train),
                     ('RMSE for test    :', rate_test)):
    print(label, evaluate(Yhat, rates, W, b))

RMSE for training: 0.9073946349009766
RMSE for test    : 1.2532350678563369
