# Reading data

In [1]:
dataset_folder = '/home/erwin/Downloads/ml-100k'

In [2]:
import pandas as pd
import numpy as np

In [3]:
training_data = dataset_folder + '/u1.base'

In [4]:
columns = ['user_id', 'movie_id', 'rating', 'timestamp']
training_df = pd.read_csv(training_data, names=columns, sep='\t')

In [5]:
training_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [6]:
training_df['user_id'].value_counts()

655    685
405    582
450    494
537    490
416    417
      ... 
93       9
172      9
258      8
228      7
310      4
Name: user_id, Length: 943, dtype: int64

We get n_user = 943

In [7]:
n_user = 943

In [8]:
training_df['movie_id'].value_counts()

50      484
181     422
258     402
100     395
294     394
       ... 
1548      1
1676      1
1461      1
1525      1
1663      1
Name: movie_id, Length: 1650, dtype: int64

We get n_movie = 1650

In [9]:
n_movie = 1650

In [10]:
training_df['rating'].value_counts()

4    27396
3    21963
5    16744
2     9178
1     4719
Name: rating, dtype: int64

In [11]:
genres = pd.read_csv(dataset_folder + '/u.genre', names=['genre', 'id'], sep='\|')

  """Entry point for launching an IPython kernel.


In [12]:
genres.head()

Unnamed: 0,genre,id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4


In [13]:
list_genres = list(genres['genre'])

In [14]:
print(list_genres)

['unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [15]:
df_movie = pd.read_csv(dataset_folder + '/u.item', names=['movie_id', 'movie_title', 'release_date', 'video_release_date', \
                                                          'imdb_url'] + list_genres, sep='\|')

  


In [16]:
df_movie.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


Let's find out the watched movies for user 13

In [17]:
movies_watched = pd.merge(training_df[['user_id', 'movie_id', 'rating']], df_movie[['movie_id', 'movie_title']], on='movie_id')

In [18]:
movies_watched.head()

Unnamed: 0,user_id,movie_id,rating,movie_title
0,1,1,5,Toy Story (1995)
1,2,1,4,Toy Story (1995)
2,6,1,4,Toy Story (1995)
3,13,1,3,Toy Story (1995)
4,16,1,5,Toy Story (1995)


In [19]:
movies_watched[movies_watched['user_id'] == 13].head(10)

Unnamed: 0,user_id,movie_id,rating,movie_title
3,13,1,3,Toy Story (1995)
566,13,4,5,Get Shorty (1995)
735,13,5,1,Copycat (1995)
806,13,7,2,Twelve Monkeys (1995)
1113,13,8,4,Babe (1995)
1287,13,9,3,Dead Man Walking (1995)
1726,13,13,5,Mighty Aphrodite (1995)
2260,13,22,4,Braveheart (1995)
2778,13,28,5,Apollo 13 (1995)
3125,13,32,4,Crumb (1994)


# Creating model

Let X be user's latent features and Y be item's latent features. X has (n_user, k) dimension and Y has (n_item, k) dimension. Let R = X*Y.T

We will use k = 32 in this code, and dimension of R will be (1000, 2000) to deal with 1-index array.

In [20]:
k = 32

In [21]:
R = np.zeros((1000, 2000))

In [22]:
R

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [23]:
for row in training_df.values:
    R[row[0], row[1]] = row[2]

In [24]:
R

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 3., ..., 0., 0., 0.],
       [0., 4., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
def ALS(R, epochs=10, _lambda=0.1):
    # initialize X and Y first, might be random, we will use matrix of 1's here
    X = np.ones((R.shape[0],k))
    Y = np.ones((R.shape[1],k))
    
    for _ in range(epochs):
        X = np.matmul(R, np.matmul(Y, np.linalg.inv(np.matmul(Y.T, Y) + (_lambda * np.eye(Y.shape[1])))))
        Y = np.matmul(R.T, np.matmul(X, np.linalg.inv(np.matmul(X.T, X) + (_lambda * np.eye(X.shape[1])))))
        
    return X, Y

In [26]:
X, Y = ALS(R, epochs=10000)

In [27]:
X

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.13987593,  0.25735238,  0.66721166, ...,  0.78050565,
         0.58605553, -0.61258072],
       [ 0.11032128, -0.13402094,  0.00634766, ...,  0.345138  ,
         0.0434053 ,  0.22908705],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [28]:
Y

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.81328038,  0.46164783,  2.15671158, ...,  0.36025032,
         0.77295538,  0.29858563],
       [ 0.38673386,  0.3284106 ,  0.22188541, ...,  0.02202617,
         0.3433343 , -0.17571973],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [29]:
predicted_R = np.matmul(X, Y.T)

In [30]:
predicted_R

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 4.69352073, 1.20494701, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.95072928, 0.03036072, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [31]:
print(R[13,4])
print(predicted_R[13,4])

5.0
4.680622495919718


# Get Recommendation

input = (original ratings, prediction ratings, user id, top n reco)

In [32]:
def valid_user(R, user_id):
    try:
        return sum(R[user_id, :]) > 0
    except:
        return False

In [33]:
def get_reco(original_R, predicted_R, user_id, top_n=10):
    if valid_user(original_R, user_id):
        try:
            result = []
            predicted_ratings = predicted_R[user_id, :]
            result = list(zip(predicted_ratings, [i for i in range(len(predicted_ratings))]))
            result.sort(key=lambda x: (-x[0]))

            filtered = [x for x in result if original_R[user_id, x[1]] == 0]
            return [x[1] for x in filtered[:top_n]]
        except:
            return []
    else:
        return []

In [34]:
reco_user_13 = get_reco(R, predicted_R, 13, 25)

In [35]:
print(reco_user_13)

[47, 71, 304, 11, 15, 588, 272, 178, 230, 238, 192, 153, 715, 180, 198, 195, 434, 173, 591, 125, 513, 229, 530, 923, 305]


In [36]:
df_movie[df_movie.movie_id.isin(reco_user_13)].head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
10,11,Seven (Se7en) (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Se7en%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
14,15,Mr. Holland's Opus (1995),29-Jan-1996,,http://us.imdb.com/M/title-exact?Mr.%20Holland...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46,47,Ed Wood (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Ed%20Wood%20(...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
70,71,"Lion King, The (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Lion%20King,%...",0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0
124,125,Phenomenon (1996),29-Jun-1996,,http://us.imdb.com/M/title-exact?Phenomenon%20...,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [37]:
df_movie[df_movie.movie_id == reco_user_13[0]].head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
46,47,Ed Wood (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Ed%20Wood%20(...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Compute RMSE score between predicted ratings and labeled ratings in testing data

In [38]:
columns = ['user_id', 'movie_id', 'rating', 'timestamp']
testing_data = dataset_folder + '/u1.test'
testing_df = pd.read_csv(testing_data, names=columns, sep='\t')

In [39]:
testing_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


In [40]:
def predict_rating(user_id, movie_id):
    return round(predicted_R[user_id, movie_id])

In [41]:
testing_df['predicted_rating'] = testing_df.apply(lambda row: predict_rating(row[0], row[1]), axis=1)
testing_df['diff'] = testing_df.apply(lambda row: abs(row[2]-row[4]), axis=1)

In [42]:
testing_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,predicted_rating,diff
0,1,6,5,887431973,0.0,5.0
1,1,10,3,875693118,1.0,2.0
2,1,12,5,878542960,1.0,4.0
3,1,14,5,874965706,1.0,4.0
4,1,17,3,875073198,1.0,2.0


In [43]:
sum(testing_df['diff'])

54651.0

In [44]:
len(testing_df[testing_df['diff'] == 0]['diff'])

620

In [45]:
len(testing_df['diff'])

20000

In [46]:
acc = 620/20000*100

In [47]:
acc

3.1