In [63]:
import pandas as pd
import numpy as np
import os
import time

In [64]:
# path to the data directory
data_path = '/mnt/DataSets/MovieLens/ml-latest-small'

# read ratings and links into dataframes
df_ratings = pd.read_csv(os.path.join(data_path,'ratings.csv'), sep=',')         
df_links = pd.read_csv(os.path.join(data_path,'links.csv'), sep=',')

# joining ratings with links to get ImdbId which is needed for comparison to the DL approach
df_joined = pd.merge(df_ratings, df_links, on=['movieId'])
df_joined = df_joined.drop(['tmdbId'], axis=1)

# creating a bunch of dicts on the various item Ids
# note that UserIds don't need to be indexed here because they are consecutive integers from 1 to 671
#movie_to_imdb = {} # key:MovieId, val:ImdbId
movie_to_idx = {}  # key:MovieId, val: idx
idx_to_imdb = {}   # key:idx, val:ImdbId
idx = 0
for row in df_joined.itertuples():
    if(row[2] not in movie_to_idx):
        #movie_to_imdb[row[2]] = row[5]
        movie_to_idx[row[2]] = idx
        idx_to_imdb[idx] = row[5]
        idx +=1   

print (df_joined.head(5))
print (df_joined.shape)

num_movies = len(movie_to_idx)
print("Number of Unique Movies:",num_movies)
num_users = df_joined.userId.unique().shape[0]
print("Number of Unique Users:",num_users)

   userId  movieId  rating   timestamp  imdbId
0       1       31     2.5  1260759144  112792
1       7       31     3.0   851868750  112792
2      31       31     4.0  1273541953  112792
3      32       31     4.0   834828440  112792
4      36       31     3.0   847057202  112792
(100004, 5)
Number of Unique Movies: 9066
Number of Unique Users: 671


In [65]:
# get the sparse ratings matrix
rating_matrix = np.zeros((num_users, num_movies))
for row in df_joined.itertuples():
    rating_matrix[row[1]-1, movie_to_idx[row[2]]] = row[3]

print ("Shape of Ratings Matrix:",rating_matrix.shape)

sparsity = float(len(rating_matrix.nonzero()[0]))
sparsity /= (rating_matrix.shape[0] * rating_matrix.shape[1])
sparsity *= 100
print ("Sparsity %: ",sparsity)

Shape of Ratings Matrix: (671, 9066)
Sparsity %:  1.6439141608663475


In [66]:
# compute the cosine similarity between Items by default (can be used for users if passing rating_matrix.T)
def cosine_sim(rating_matrix):
    sim_matrix = rating_matrix.T.dot(rating_matrix)
    norm_matrix = np.array([np.sqrt(np.diagonal(sim_matrix))])
    sim_matrix = (sim_matrix/(norm_matrix*norm_matrix.T))
    return sim_matrix

start_time = time.time()
item_sim_matrix = cosine_sim(rating_matrix)

print("Rating Matrix shape:",rating_matrix.shape)
print("Sim Matrix shape:", item_sim_matrix.shape)
print("--- It took %s seconds ---" % (time.time() - start_time))

Rating Matrix shape: (671, 9066)
Sim Matrix shape: (9066, 9066)
--- It took 1.2085046768188477 seconds ---


In [67]:
# consider only similarities greater than a threshold
sim_th = np.mean(item_sim_matrix)

sim_list = []
for i in range(0,num_movies-1):
    for j in range(i+1,num_movies):
        if(item_sim_matrix[i][j] > sim_th):
            sim_list.append((idx_to_imdb[i],idx_to_imdb[j],item_sim_matrix[i][j])) 

print("Similarity threshold:", sim_th)
print("Number of similarities:",len(sim_list))

Similarity threshold: 0.0517881350127
Number of similarities: 9809734


In [89]:
# save to a gzipped csv file for inspection and later re-loading
# avoiding pickle here because I'd like to be able to read the output as well as load it anywhere

df_out = pd.DataFrame(sim_list, columns =['ItemId1','ItemId2','Sim'])

df_out.to_csv(data_path + "/item_sim_cosine.gz", compression="gzip", index = False)

**Evaluation**

Single-Item recommendation evaluation by considering the last two ratings of each User: SecondToLast used as a seed to get recommendations; Last used to evaluate whether the User ended up watching/rating that item. Metric: Precision@5

In [132]:
k = 10   # number of most similar movies to display

# read in the seed file
df_seeds = pd.read_csv(os.path.join(data_path,'SecondToLastItemPerUser.csv'), sep=',', header = None, names=['UserId','ItemId'])
df_seeds.head(5)

# iterating over the dataframe rows and requesting 5 recos for each seed Item
#for i in range(simDF.shape[0]):
reco_list_all = []
for row in df_seeds.itertuples(): # row[0] is the df index
    movie_seed = row[2]
    sorted_indx = np.argsort(item_sim_matrix[movie_to_idx[movie_seed],:])[::-1][:k+1]
    #scores = np.sort(item_sim_matrix[movie_to_idx[movie_seed],:])[::-1][:k+1]
    reco_list = []
    reco_list.append(row[1])
    reco_list.append(row[2])
    for i in range(1,len(sorted_indx)):
        reco_list.append(idx_to_imdb[sorted_indx[i]])
    reco_list_all.append(reco_list)


df_recos = pd.DataFrame(reco_list_all, dtype='str')
df_recos.columns = ['UserId', 'Seed','Reco1', 'Reco2', 'Reco3', 'Reco4', 'Reco5', 'Reco6', 'Reco7', 'Reco8', 'Reco9', 'Reco10'] 
df_recos.head(5)
#df_recos.to_csv(data_path + '/CF_cosine_ItemRecos.csv', index=False, header=False

[ 1.          0.50157897  0.48288893  0.46210555  0.45734757  0.44842716
  0.44495504  0.41988885  0.41948274  0.41781405  0.41302175]


Unnamed: 0,UserId,Seed,Reco1,Reco2,Reco3,Reco4,Reco5,Reco6,Reco7,Reco8,Reco9,Reco10
0,1,2968,84827,101889,82348,83791,90728,84726,93870,101272,82340,94721
1,2,661,116583,29583,57546,67992,32455,33563,32910,58331,59742,107688
2,3,58559,371746,372784,1375666,499549,910970,167260,407887,416449,1345836,796366
3,4,3265,103905,120265,117786,75029,19422,92263,94138,84156,3799694,3065204
4,5,41569,407304,258000,892782,1230414,338751,1058017,405159,978764,265086,411477


In [131]:
df_test = pd.read_csv(os.path.join(data_path,'lastItemPerUser.csv'), sep=',', header = None, names=['UserId','ItemId'])

df_eval = pd.merge(df_recos, df_test, on=['UserId'])
print(df_eval.head(5))
sum = 0
for row in df_eval.itertuples(): # row[0] is the df index
    if(row[13] in row[2:-1]):
        sum +=1
     
print("Precision:", sum/df_eval.shape[0])

  UserId   Seed   Reco1   Reco2    Reco3    Reco4   Reco5    Reco6   Reco7  \
0      1   2968   84827  101889    82348    83791   90728    84726   93870   
1      2    661  116583   29583    57546    67992   32455    33563   32910   
2      3  58559  371746  372784  1375666   499549  910970   167260  407887   
3      4   3265  103905  120265   117786    75029   19422    92263   94138   
4      5  41569  407304  258000   892782  1230414  338751  1058017  405159   

    Reco8    Reco9   Reco10  ItemId  
0  101272    82340    94721    3671  
1   58331    59742   107688     720  
2  416449  1345836   796366   84236  
3   84156  3799694  3065204    4006  
4  978764   265086   411477   48385  
Precision: 0.0
