In [315]:
import pandas as pd 

# Reading user file:
u_cols =  ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
 encoding='latin-1')

In [316]:
#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
 encoding='latin-1')

#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,
 encoding='latin-1')

n_users = users.shape[0]
n_items = items.shape[0]

In [317]:
ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_base.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [318]:
rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

In [319]:
import numpy as np
def get_items_rated_by_user(rates, user_id):
    """
    return (item_ids, scores)
    """
    y = rates[:,0] # all users
    ids = np.where(y == user_id +1)[0] # item indices rated by user_id
    scores = rates[ids, 2]
    item_ids = rates[ids, 1] - 1 # index starts from 0 
    return (item_ids, scores)
    

In [320]:
X0 = items.as_matrix()
X_train_counts = X0[:, 5:]
# print X0

In [321]:
#tfidf
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X_train_counts.tolist()).toarray()
# tfidf = X_train_counts

In [322]:
from sklearn.linear_model import Ridge

d = tfidf.shape[1] # data dimension

#bias trick
Xbar = np.concatenate((tfidf, np.ones((n_items, 1))), axis = 1)
W = np.zeros((d + 1, n_users))

for n in range(n_users):    
    ids, scores = get_items_rated_by_user(rate_train, n)
    clf = Ridge(alpha=.001)
    Xhat = Xbar[ids, :]
    clf.fit(Xhat, scores) 
    W[:, n] = clf.coef_


[5 3 5 4 4 4 4 4 4 3 4 5 4 1 3 4 5 5 3 5 4 4 5 2 3 4 5 2 5 4 4 2 5 5 4 3 1
 3 5 3 1 4 4 2 4 5 4 4 5 5 2 5 4 4 5 4 5 4 5 2 3 5 4 4 3 4 4 4 1 3 2 1 2 5
 3 4 5 3 4 5 2 3 4 3 4 1 2 2 1 1 2 4 4 3 2 1 2 2 3 4 4 4 2 1 1 2 4 5 5 5 5
 4 4 4 1 4 4 3 2 4 4 1 2 2 5 3 4 5 4 4 1 5 3 1 4 4 2 4 3 2 3 3 4 4 3 2 4 5
 1 5 2 2 3 2 4 3 3 3]


In [323]:
# predicted scores
Yhat = W.T.dot(Xbar.T)

In [324]:
print Yhat[1, :10]

[ 4.28269834  3.49810493  1.67314687  6.55191428  4.04046711  4.07617154
  2.94292545  7.41634765  4.07617154  2.75639798]


In [325]:
print Xbar.shape

(1682, 20)


In [326]:
# evaluate 

# rate test 
ids0, scores = get_items_rated_by_user(rate_test, 0)
ids1, scores1 = get_items_rated_by_user(rate_train, 0)

In [330]:
n = 2
ids, scores = get_items_rated_by_user(rate_train, 10)
Yhat[n, ids]
print(scores, Yhat[ n, ids])

(array([4, 5, 2, 2, 5, 4, 3, 3, 5, 3, 3, 3, 3, 4, 4, 3, 3, 4, 2, 3, 3, 4, 4,
       5, 4, 3, 2, 3, 4, 2, 4, 4, 3, 2, 3, 3, 4, 4, 3, 5, 3, 3, 2, 4, 3, 4,
       4, 5, 4, 3, 4, 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 5, 3, 1, 5, 3, 5,
       5, 3, 4, 3, 4, 4, 4, 5, 1, 5, 4, 4, 5, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3,
       3, 4, 3, 3, 5, 4, 4, 5, 3, 2, 4, 4, 4, 3, 2, 3, 3, 4, 2, 2, 4, 3, 4,
       4, 4, 2, 3, 3, 5, 2, 4, 3, 4, 3, 5, 3, 3, 4, 4, 4, 4, 5, 2, 5, 4, 3,
       3, 2, 5, 3, 1, 3, 3, 3, 3, 3, 3, 4, 3, 4, 4, 3, 3, 4, 4, 3, 3, 5, 3,
       2, 4, 5, 4, 3, 1, 5, 5, 2, 4]), array([ 1.54561049,  1.86651435,  1.62530574,  1.62530574,  1.86651435,
        1.93529482,  3.21512875,  1.68338939,  1.4080381 ,  3.60091222,
        1.93062729,  1.68338939,  1.68338939,  2.48865825,  1.37913049,
        1.86651435,  1.4080381 ,  2.67296566,  1.86651435,  1.86651435,
        1.90427918,  1.7625979 ,  0.70581066,  1.7625979 ,  1.86651435,
        1.7625979 ,  1.37896926,  0.87453136,  1.81784035,  0.997827