In [43]:
import pandas as pd 
#Reading user file:
u_cols =  ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
 encoding='latin-1')

n_users = users.shape[0]
print('Number of users:', users)

Number of users:      user_id  age sex     occupation zip_code
0          1   24   M     technician    85711
1          2   53   F          other    94043
2          3   23   M         writer    32067
3          4   24   M     technician    43537
4          5   33   F          other    15213
..       ...  ...  ..            ...      ...
938      939   26   F        student    33319
939      940   32   M  administrator    02215
940      941   20   M        student    97229
941      942   48   F      librarian    78209
942      943   22   M        student    77841

[943 rows x 5 columns]


In [53]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')

rating_test = ratings_test.drop(columns = "rating") 
rate_train = ratings_base.values
rate_test = ratings_test.values

print(ratings_test)


      user_id  movie_id  rating  unix_timestamp
0           1        20       4       887431883
1           1        33       4       878542699
2           1        61       4       878542420
3           1       117       3       874965739
4           1       155       2       878542201
...       ...       ...     ...             ...
9425      943       232       4       888639867
9426      943       356       4       888639598
9427      943       570       1       888640125
9428      943       808       4       888639868
9429      943      1067       2       875501756

[9430 rows x 4 columns]


In [52]:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,
 encoding='latin-1')

n_items = items.shape[0]
print ('Number of items:', items)



Number of items:       movie id                                movie title release date  \
0            1                           Toy Story (1995)  01-Jan-1995   
1            2                           GoldenEye (1995)  01-Jan-1995   
2            3                          Four Rooms (1995)  01-Jan-1995   
3            4                          Get Shorty (1995)  01-Jan-1995   
4            5                             Copycat (1995)  01-Jan-1995   
...        ...                                        ...          ...   
1677      1678                          Mat' i syn (1997)  06-Feb-1998   
1678      1679                           B. Monkey (1998)  06-Feb-1998   
1679      1680                       Sliding Doors (1998)  01-Jan-1998   
1680      1681                        You So Crazy (1994)  01-Jan-1994   
1681      1682  Scream of Stone (Schrei aus Stein) (1991)  08-Mar-1996   

      video release date                                           IMDb URL  \
0              

In [46]:
X0 = items.values
X_train_counts = X0[:, -19:]
# print(X_train_counts)
items.shape

(1682, 24)

In [47]:
#tfidf
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
tfidf = transformer.fit_transform(X_train_counts.tolist()).toarray()
print(tfidf)

[[0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.54 0.65 ... 0.54 0.   0.  ]
 [0.   0.   0.   ... 1.   0.   0.  ]
 ...
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]]


In [48]:
import numpy as np
def get_items_rated_by_user(rate_matrix, user_id):
  
    """
    in each line of rate_matrix, we have infor: user_id, item_id, rating (scores), time_stamp
    we care about the first three values
    return (item_ids, scores) rated by user user_id
    """
    y = rate_matrix[:,0] # all users
    # item indices rated by user_id
    # we need to +1 to user_id since in the rate_matrix, id starts from 1 
    # while index in python starts from 0
    ids = np.where(y == user_id +1)[0] 
    item_ids = rate_matrix[ids, 1] - 1 # index starts from 0 
    scores = rate_matrix[ids, 2]

    return (item_ids, scores)


In [49]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

d = tfidf.shape[1] # data dimension
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

for n in range(n_users):    
    ids, scores = get_items_rated_by_user(rate_train, n)

    clf = Ridge(alpha=0.01, fit_intercept  = True)
    Xhat = tfidf[ids, :]
    clf.fit(Xhat, scores)

    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

<class 'numpy.int64'>


In [50]:

Yhat = tfidf.dot(W) + b
print(Yhat)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [51]:
n = 10
np.set_printoptions(precision=2) # 2 digits after . 
ids, scores = get_items_rated_by_user(rate_test, n)
# print(Yhat[n, ids])
print('Rated movies ids :', ids )
print('True ratings     :', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids : [ 37 109 110 226 424 557 722 724 731 739]
True ratings     : [3 3 4 3 4 3 5 3 3 4]
Predicted ratings: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
