In [None]:
!pip install rpy2==3.5.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rpy2==3.5.1
  Downloading rpy2-3.5.1.tar.gz (201 kB)
[K     |████████████████████████████████| 201 kB 5.2 MB/s 
Building wheels for collected packages: rpy2
  Building wheel for rpy2 (setup.py) ... [?25l[?25hdone
  Created wheel for rpy2: filename=rpy2-3.5.1-cp38-cp38-linux_x86_64.whl size=310198 sha256=eb72f52eb0466b0a15f96f25cee60fcd32768329d5388adf0f09c3f4b1096bda
  Stored in directory: /root/.cache/pip/wheels/6b/40/7d/f63e87fd83e8b99ee837c8e3489081c4b3489134bc520235ed
Successfully built rpy2
Installing collected packages: rpy2
  Attempting uninstall: rpy2
    Found existing installation: rpy2 3.5.5
    Uninstalling rpy2-3.5.5:
      Successfully uninstalled rpy2-3.5.5
Successfully installed rpy2-3.5.1


## Restart the runtime (I am using Google Colab) after the previous step! (No need to run pip install again after the restart.)

In [None]:
import numpy as np
import pandas as pd
import rpy2.robjects as robjects
from collections import defaultdict

**Reading data: convert UserID, MovieID to string**

In [None]:
# MovieLens ratings, one "UserID::MovieID::Rating::Timestamp" record per line.
# The separator is the two-character "::", hence the python parsing engine.
ratings = pd.read_table(
    "https://liangfgithub.github.io/MovieData/ratings.dat",
    sep="::",
    header=None,
    engine='python',
)
ratings.columns = ['UserID','MovieID','Rating','Timestamp']
# Turn the numeric ids into strings: users become 'u<id>', movies become 'm<id>'
ratings['UserID'] = 'u' + ratings['UserID'].astype(str)
ratings['MovieID'] = 'm' + ratings['MovieID'].astype(str)

**Some data wrangling for Movies data**

In [None]:
movies = pd.read_table(
    "https://liangfgithub.github.io/MovieData/movies.dat",
    sep="::",
    header=None,
    engine='python',
    encoding='latin-1',
)
movies.columns = ['MovieID','Title','Genres']
# The release year is the four characters just before the title's last character,
# e.g. "Chinatown (1974)" -> 1974
movies['year'] = movies['Title'].str[-5:-1].astype('int64')
movies['MovieID'] = 'm' + movies['MovieID'].astype(str)
# "Action|Crime" -> ['Action', 'Crime']
movies['Genres'] = movies['Genres'].map(lambda genres: genres.split("|"))
# One row per (movie, genre, rating); the left join keeps movies that have no ratings
movies_w_ratings = movies.explode('Genres').merge(ratings,on='MovieID',how='left')


# with open("movies.dat",encoding='latin-1') as f:
#   movies = f.readlines()

# System I: Recommendation based on genres

In [None]:
# Per-(genre, movie) summary used by System I, indexed by genre:
#   numRatings - number of ratings the movie received (count of non-null UserID)
#   Rating     - average rating
#   year       - release year
# The string alias 'mean' is used instead of np.mean: recent pandas versions warn
# when a NumPy callable is passed to groupby.agg, and the result is the same.
sys1_data = (
    movies_w_ratings
    .groupby(['Genres','MovieID','Title'])
    .agg({'UserID':'count','Rating':'mean','year':'first'})
    .rename(columns={'UserID':'numRatings'})
    .reset_index()
    .set_index('Genres')
)

In [None]:
sys1_data.loc['Film-Noir']

Unnamed: 0_level_0,MovieID,Title,numRatings,Rating,year
Genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Film-Noir,m1068,Crossfire (1947),35,3.685714,1947
Film-Noir,m1069,"Murder, My Sweet (1944)",110,3.927273,1944
Film-Noir,m1152,He Walked by Night (1948),29,3.482759,1948
Film-Noir,m1153,Raw Deal (1948),14,3.428571,1948
Film-Noir,m1154,T-Men (1947),15,3.733333,1947
Film-Noir,m1179,"Grifters, The (1990)",885,3.788701,1990
Film-Noir,m1248,Touch of Evil (1958),514,4.215953,1958
Film-Noir,m1252,Chinatown (1974),1185,4.339241,1974
Film-Noir,m1260,M (1931),308,4.301948,1931
Film-Noir,m1267,"Manchurian Candidate, The (1962)",765,4.333333,1962


In [None]:
def system1(genre,method,data=None):
  """Recommend up to 10 movies of one genre.

  Parameters
  ----------
  genre : str
      A genre present in the index of ``data``.
  method : str
      'Rating'     - among movies with more than 100 ratings, top 10 by average rating.
      'Popularity' - among recent movies of the genre, top 10 by number of ratings.
  data : DataFrame, optional
      Table indexed by genre with columns 'numRatings', 'Rating' and 'year'.
      Defaults to the notebook-level ``sys1_data``, looked up at call time so a
      re-computed table is picked up without re-defining this function.

  Returns
  -------
  DataFrame with at most 10 rows (possibly empty), or None after printing a
  message when ``genre`` or ``method`` is invalid.
  """
  if data is None:
    data = sys1_data
  genres = list(data.index.unique())
  if genre not in genres:
    print('Genre must be one of the following')
    print(" ".join(genres))
    return
  if method not in ['Rating','Popularity']:
    print('Method must be either "Rating" or "Popularity"')
    return
  # Select the genre first, with a list-based .loc: this always yields a DataFrame,
  # even when the genre has a single row, and filtering afterwards can at worst
  # produce an empty frame instead of a KeyError.
  genre_rows = data.loc[[genre]]
  if method == 'Rating':
    well_rated = genre_rows[genre_rows['numRatings'] > 100]
    return well_rated.sort_values('Rating',ascending=False)[:10]
  # Popularity: smaller categories such as 'Western' or 'Film-Noir' need a more
  # relaxed definition of 'recent', i.e. a longer lookback window (in years).
  lookback_by_genre = {'Western':6,'Musical':4,'Fantasy':4,'War':4,'Mystery':4,'Film-Noir':10}
  lookback = lookback_by_genre.get(genre,2)
  # 'Recent' is relative to the newest film of this genre in the table that was
  # passed in (not the global one). Rating quality is ignored for this method.
  recent_year = genre_rows['year'].max()-lookback
  recent = genre_rows[genre_rows['year'] > recent_year]
  return recent.sort_values('numRatings',ascending=False)[:10]

**Make sure that every genre/method combination returns a non-empty result**

In [None]:
# Run system1 for every (genre, method) pair and record the shape of what comes back
genre_method_pairs = [
    (genre, method)
    for genre in sys1_data.index.unique()
    for method in ['Rating','Popularity']
]
summary_hm = {
    'Genre': [genre for genre, _ in genre_method_pairs],
    'Method': [method for _, method in genre_method_pairs],
    'DFShape': [system1(genre,method).shape for genre, method in genre_method_pairs],
}

pd.DataFrame(summary_hm)

Unnamed: 0,Genre,Method,DFShape
0,Action,Rating,"(10, 5)"
1,Action,Popularity,"(10, 5)"
2,Adventure,Rating,"(10, 5)"
3,Adventure,Popularity,"(10, 5)"
4,Animation,Rating,"(10, 5)"
5,Animation,Popularity,"(10, 5)"
6,Children's,Rating,"(10, 5)"
7,Children's,Popularity,"(10, 5)"
8,Comedy,Rating,"(10, 5)"
9,Comedy,Popularity,"(10, 5)"


**Trying a few examples**

First method for recommendation (Rating): for movies with more than 100 ratings, get top 10 by average rating

In [None]:
system1('Action','Rating')

Unnamed: 0_level_0,MovieID,Title,numRatings,Rating,year
Genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Action,m2019,Seven Samurai (The Magnificent Seven) (Shichin...,628,4.56051,1954
Action,m858,"Godfather, The (1972)",2223,4.524966,1972
Action,m1198,Raiders of the Lost Ark (1981),2514,4.477725,1981
Action,m260,Star Wars: Episode IV - A New Hope (1977),2991,4.453694,1977
Action,m1221,"Godfather: Part II, The (1974)",1692,4.357565,1974
Action,m2028,Saving Private Ryan (1998),2653,4.337354,1998
Action,m2571,"Matrix, The (1999)",2590,4.31583,1999
Action,m1197,"Princess Bride, The (1987)",2318,4.30371,1987
Action,m1233,"Boat, The (Das Boot) (1981)",1001,4.302697,1981
Action,m1196,Star Wars: Episode V - The Empire Strikes Back...,2990,4.292977,1980


Second method for recommendation (Popularity): restrict to recent movies (note that the dataset only contains movies released up to the year 2000).

'Recent' is defined by

**year > most recent release year of any film of given genre - lookback window** 

where the lookback window depends on the genre: less common genres such as Western and Film-Noir get a larger lookback window.

Then take the top 10 of those movies by number of ratings received.

In [None]:
system1("Film-Noir",'Popularity')#This is a very small category. The recent (for this category, it means last 10 years) movie at 10th place based on number of Ratings only has 5 ratings.

Unnamed: 0_level_0,MovieID,Title,numRatings,Rating,year
Genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Film-Noir,m1617,L.A. Confidential (1997),2288,4.219406,1997
Film-Noir,m1179,"Grifters, The (1990)",885,3.788701,1990
Film-Noir,m1748,Dark City (1998),874,3.822654,1998
Film-Noir,m164,Devil in a Blue Dress (1995),379,3.567282,1995
Film-Noir,m707,Mulholland Falls (1996),275,3.087273,1996
Film-Noir,m1783,Palmetto (1998),220,2.927273,1998
Film-Noir,m1601,Hoodlum (1997),75,2.72,1997
Film-Noir,m320,Suture (1993),32,3.40625,1993
Film-Noir,m2008,"This World, Then the Fireworks (1996)",15,2.533333,1996
Film-Noir,m2631,Frogs for Snakes (1998),5,2.0,1998


In case user gives an invalid genre or method...

In [None]:
system1("Pop",'Popularity')

Genre must be one of the following
Action Adventure Animation Children's Comedy Crime Documentary Drama Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western


# System2

## UBCF

In [None]:
# Id of the 501st user when the 'u'-prefixed ids are sorted as strings
user501 = sorted(ratings['UserID'].unique())[500]

# Keep only the users whose id sorts at or before user501 (the first 501 users)
ratings_sub = ratings[ratings['UserID'] <= user501].copy()

# Nested dict: alldata[u][m] is the rating UserID=u gave MovieID=m, when that pair exists
alldata = {}
for user, movie, rating in zip(ratings_sub['UserID'], ratings_sub['MovieID'], ratings_sub['Rating']):
    alldata.setdefault(user, {})[movie] = rating

In [None]:
# Mean of the raw ratings given by user501; passed to mypredict later, which adds
# it back onto the mean-centred predictions
mean501 = np.mean(list(alldata[user501].values()))

In [None]:
mean501

2.8061142397425582

In [None]:
# Every movie id in the full ratings table (not just ratings_sub), sorted as strings
allMovies = sorted(ratings['MovieID'].unique())
allMovies[:10]

['m1',
 'm10',
 'm100',
 'm1000',
 'm1002',
 'm1003',
 'm1004',
 'm1005',
 'm1006',
 'm1007']

In [None]:
len(allMovies)

3706

In [None]:
def normalize(alldata):
    """Return a copy of ``alldata`` with every user's ratings mean-centred.

    ``alldata[user][movie]`` is a rating; in the result each rating has that
    user's own average rating subtracted. The input mapping is not modified.
    """
    centred = alldata.copy()
    for user, user_ratings in alldata.items():
        mean_user = np.mean(list(user_ratings.values()))
        centred[user] = {movie: (rating-mean_user) for movie, rating in user_ratings.items()}
    return centred
def similarity(user1,user2,alldata_norm):
    """Cosine similarity between two users, rescaled from [-1, 1] to [0, 1].

    Only movies rated by both users enter the computation: the dot product and
    both norms are taken over the common movies, not over each user's full vector.

    Parameters
    ----------
    user1, user2 : keys of ``alldata_norm``
    alldata_norm : dict of dict, ``alldata_norm[user][movie]`` = mean-centred rating

    Returns
    -------
    ``(cos + 1) / 2``, or 0 when the users share no movie or when either user's
    common ratings are all zero (the cosine is undefined there). Returning 0 in
    the degenerate case mirrors ``similarity_ibcf`` and keeps NaN out of the
    sort performed by the caller.
    """
    common_movies = set(alldata_norm[user1]).intersection(set(alldata_norm[user2]))
    if len(common_movies) == 0:
        return 0
    num = sq1= sq2= 0
    for movie in common_movies:
        r1 = alldata_norm[user1][movie]
        r2 = alldata_norm[user2][movie]
        num += r1*r2
        sq1 += r1**2
        sq2 += r2**2
    # A zero norm would make the division produce NaN
    if sq1*sq2 == 0:
        return 0
    m = num/np.sqrt(sq1*sq2)
    return (m+1)/2
def topKSimilarUsers(user,alldata_norm,k):
    """Return the ``k`` users most similar to ``user``.

    The result is a list of ``[similarity, other_user]`` pairs in ascending
    order (the most similar user is last); ties in similarity are ordered by
    user id because the pairs are compared element by element.
    """
    scored = [
        [similarity(user,candidate,alldata_norm),candidate]
        for candidate in alldata_norm
        if candidate != user
    ]
    scored.sort()
    return scored[-k:]
def mypredict(user,others,allMovies,alldata_norm,user_mean):
    """Predict ratings for the movies ``user`` has not rated (user-based CF).

    Parameters
    ----------
    user : key of ``alldata_norm`` to predict for
    others : iterable of ``(similarity, neighbour)`` pairs
    allMovies : iterable of candidate movie ids
    alldata_norm : dict of dict of mean-centred ratings
    user_mean : value added back onto every centred prediction

    Returns
    -------
    dict mapping movie -> similarity-weighted average of the neighbours' centred
    ratings plus ``user_mean``. Movies that none of the neighbours rated (or
    whose total weight is not positive) are left out.
    """
    already_rated = alldata_norm[user]
    predictions = {}
    for movie in allMovies:
        if movie not in already_rated:
            weighted_sum = weight_total = 0
            for weight, neighbour in others:
                neighbour_ratings = alldata_norm[neighbour]
                if movie in neighbour_ratings:
                    weight_total += weight
                    weighted_sum += weight*neighbour_ratings[movie]
            if weight_total > 0:
                predictions[movie] = weighted_sum/weight_total + user_mean
    return predictions

In [None]:
# Centre every user's ratings on that user's own mean
alldata_norm = normalize(alldata)
# The 20 users most similar to user501 (same neighbourhood size as nn = 20 in the R recommender)
others = topKSimilarUsers(user501,alldata_norm,20)
# Predictions for the movies user501 has not rated, shifted back by user501's mean rating
ubcf_pred = mypredict(user501,others,allMovies,alldata_norm,mean501)

In [None]:
%load_ext rpy2.ipython

# %%R
# options(warn=-1)
# install.packages("recommenderlab")
#suppressMessages(install.packages("recommenderlab"))

In [None]:
%%R
suppressMessages(install.packages("recommenderlab")) #I omit the output of this cell to make the HTML more readable

In [None]:
%%R
library(recommenderlab)
myurl = "https://liangfgithub.github.io/MovieData/"
# Fields in ratings.dat are separated by '::'. Splitting on the single character
# ':' leaves an empty column between consecutive fields; colClasses is recycled
# across the columns, so every second (empty) column is dropped via 'NULL'.
ratings = read.csv(paste0(myurl, 'ratings.dat?raw=true'), 
                   sep = ':',
                   colClasses = c('integer', 'NULL'), 
                   header = FALSE)
colnames(ratings) = c('UserID', 'MovieID', 'Rating', 'Timestamp')
# Same 'u'/'m' prefixes as on the Python side
i = paste0('u', ratings$UserID)
j = paste0('m', ratings$MovieID)
x = ratings$Rating
# With stringsAsFactors the ids become factors; their integer codes index the
# rows (users) and columns (movies) of the sparse rating matrix.
tmp = data.frame(i, j, x, stringsAsFactors = T)
Rmat = sparseMatrix(as.integer(tmp$i), as.integer(tmp$j), x = tmp$x)
rownames(Rmat) = levels(tmp$i)
colnames(Rmat) = levels(tmp$j)
Rmat = new('realRatingMatrix', data = Rmat)

# Train on the first 500 rows and predict for row 501.
# NOTE(review): this presumably lines up with user501 on the Python side because
# factor levels are sorted as strings - confirm.
train = Rmat[1:500, ]
test = Rmat[501, ]

# User-based CF: mean-centred ratings, cosine similarity, 20 nearest neighbours
recommender.UBCF <- Recommender(train, method = "UBCF",
                                parameter = list(normalize = 'center', 
                                                 method = 'Cosine', 
                                                 nn = 20))

p.UBCF <- predict(recommender.UBCF, test, type="ratings")
# Flatten the prediction matrix to a numeric vector, then pair it with the movie ids
p.UBCF <- as.numeric(as(p.UBCF, "matrix"))
df <- data.frame("Movie"=colnames(Rmat),"p.UBCF"=p.UBCF)
head(df,20)

   Movie   p.UBCF
1     m1       NA
2    m10       NA
3   m100       NA
4  m1000       NA
5  m1002       NA
6  m1003       NA
7  m1004       NA
8  m1005       NA
9  m1006 0.622441
10 m1007       NA
11 m1008 1.784375
12 m1009       NA
13  m101 3.622441
14 m1010       NA
15 m1011 0.948170
16 m1012       NA
17 m1013       NA
18 m1014       NA
19 m1015       NA
20 m1016       NA


In [None]:
def dataFrame_r2python(rdfname):
  """Bring an R data frame into Python through rpy2's pandas converter.

  Parameters
  ----------
  rdfname : str
      R code naming the data frame. It is concatenated into an R assignment
      and evaluated as-is, so it must be trusted input.

  Returns
  -------
  The object produced by converting the R value with the default + pandas2ri
  converters (used downstream as a pandas DataFrame).

  Side effect: creates or overwrites the variable ``r_df`` in the R session.
  """
  # Alias the requested object as `r_df` in R so it can be read via robjects.r.r_df
  robjects.reval("r_df <-"+rdfname)
  # NOTE(review): robjects.pandas2ri is only an attribute once rpy2.robjects.pandas2ri
  # has been imported - presumably done by `%load_ext rpy2.ipython`; confirm.
  with robjects.conversion.localconverter(robjects.default_converter + robjects.pandas2ri.converter):
    pd_from_r_df = robjects.conversion.rpy2py(robjects.r.r_df)
  return pd_from_r_df
# Pull the recommenderlab UBCF predictions (R variable `df`) into Python
UBCF_R = dataFrame_r2python("df")

In [None]:
UBCF_R

Unnamed: 0,Movie,p.UBCF
1,m1,
2,m10,
3,m100,
4,m1000,
5,m1002,
...,...,...
3702,m994,
3703,m996,
3704,m997,
3705,m998,


In [None]:
# Tabulate the hand-rolled UBCF predictions: one row per predicted movie
mypredict_df = pd.DataFrame(list(ubcf_pred.items()), columns=["Movie","my_UBCF"])
mypredict_df

Unnamed: 0,Movie,my_UBCF
0,m1006,0.622441
1,m1008,1.784375
2,m101,3.622441
3,m1011,0.948170
4,m1019,3.328744
...,...,...
748,m967,1.784375
749,m971,3.844676
750,m972,1.497101
751,m986,2.497101


In [None]:
# Left-join onto the R table so every movie stays present; movies without a
# hand-rolled prediction get NaN in my_UBCF
scoring_df = pd.merge(UBCF_R, mypredict_df, on='Movie', how="left")
scoring_df.head(15)

Unnamed: 0,Movie,p.UBCF,my_UBCF
0,m1,,
1,m10,,
2,m100,,
3,m1000,,
4,m1002,,
5,m1003,,
6,m1004,,
7,m1005,,
8,m1006,0.622441,0.622441
9,m1007,,


**Equivalent to the following R code**

Equivalent of sum(is.na(p.UBCF) != is.na(mypred)) ### should be zero

In [None]:
# Number of movies for which exactly one of the two implementations has a prediction.
# R equivalent: sum(is.na(p.UBCF) != is.na(mypred))  ### should be zero
(scoring_df['my_UBCF'].isna() != scoring_df['p.UBCF'].isna()).sum()

0

**Equivalent to the following R code**

max(abs(p.UBCF - mypred), na.rm = TRUE)  ### should be less than 1e-06 

In [None]:
# Largest absolute gap between the two sets of predictions, ignoring missing values.
# R equivalent: max(abs(p.UBCF - mypred), na.rm = TRUE)  ### should be less than 1e-06
(scoring_df['p.UBCF'] - scoring_df['my_UBCF']).abs().max()

4.953572276811258e-07

## IBCF

In [None]:
%%R
# Item-based CF on the same train/test split: mean-centred ratings, cosine
# similarity, and the 30 most similar items kept per movie (k = 30)
recommender.IBCF <- Recommender(train, method = "IBCF",
                                parameter = list(normalize = 'center', 
                                                 method = 'Cosine', 
                                                 k = 30))

p.IBCF <- predict(recommender.IBCF, test, type="ratings")
# Flatten the prediction matrix to a numeric vector, then pair it with the movie ids
p.IBCF <- as.numeric(as(p.IBCF, "matrix"))
df_ibcf <- data.frame("Movie"=colnames(Rmat),"p.IBCF"=p.IBCF)
head(df_ibcf,15)

   Movie   p.IBCF
1     m1       NA
2    m10       NA
3   m100       NA
4  m1000       NA
5  m1002 3.875000
6  m1003 2.833333
7  m1004 3.250000
8  m1005       NA
9  m1006 2.285714
10 m1007       NA
11 m1008 2.250000
12 m1009       NA
13  m101 2.250000
14 m1010       NA
15 m1011 2.111111


In [None]:
# Pull the recommenderlab IBCF predictions (R variable `df_ibcf`) into Python
IBCF_R = dataFrame_r2python("df_ibcf")

In [None]:
# Transpose the centred ratings so that alldata_ibcf_norm[movie][user] is the
# centred rating. user501 is left out because it must not be part of training.
alldata_ibcf_norm = {}
for user, centred_ratings in alldata_norm.items():
    if user != user501:
        for movie, rating in centred_ratings.items():
            alldata_ibcf_norm.setdefault(movie, {})[user] = rating

In [None]:
def similarity_ibcf(movie1,movie2,alldata_ibcf_norm):
    """Cosine similarity between two movies, rescaled from [-1, 1] to [0, 1].

    ``alldata_ibcf_norm[movie][user]`` holds centred ratings. Only users who
    rated both movies contribute to the dot product and to both norms.

    Returns 0 when either movie is unknown, when no user rated both, or when
    one of the norms over the common users is zero; otherwise ``(cos + 1) / 2``.
    """
    if movie1 not in alldata_ibcf_norm or movie2 not in alldata_ibcf_norm:
        return 0
    ratings1, ratings2 = alldata_ibcf_norm[movie1], alldata_ibcf_norm[movie2]
    common_users = set(ratings1).intersection(set(ratings2))
    if not common_users:
        return 0
    dot = norm1 = norm2 = 0
    for user in common_users:
        a, b = ratings1[user], ratings2[user]
        dot += a*b
        norm1 += np.power(a,2)
        norm2 += np.power(b,2)
    # Undefined cosine (a zero norm) is reported as 0
    if norm1*norm2 == 0:
        return 0
    return (dot/np.sqrt(norm1*norm2)+1)/2

In [None]:
# Symmetric movie-movie similarity table: each unordered pair is computed once
# and stored under both orderings.
sim_movies = defaultdict(dict)
for i, m1 in enumerate(allMovies):
    for m2 in allMovies[i+1:]:
        s = similarity_ibcf(m1,m2,alldata_ibcf_norm)
        # Values within 1e-7 of 1 are snapped to exactly 1 so that rounding noise
        # does not push them below genuine 1s when ranking
        sim_movies[m1][m2] = sim_movies[m2][m1] = 1 if abs(s-1) < 1e-7 else s

In [None]:
def topKSimilarMovies(movie,sim_movies,k):
  """Return the ``k`` movies most similar to ``movie``.

  Parameters
  ----------
  movie : movie id to look up
  sim_movies : mapping such that ``sim_movies[movie][other]`` is a similarity
  k : number of neighbours to keep

  Returns
  -------
  List of ``(similarity, other_movie)`` tuples in ascending order (the most
  similar movie is last); ties in similarity are ordered by movie id.

  The candidates are the movies stored under ``sim_movies[movie]``, so the
  function depends only on its arguments rather than on the notebook-level
  ``allMovies`` list. For a fully populated similarity table the result is the
  same as iterating over every other movie.
  """
  scored = [(s,other) for other,s in sim_movies[movie].items() if other != movie]
  scored.sort()
  return scored[-k:]

In [None]:
def mypredict_ibcf(user,allMovies,sim_movies,alldata):
    """Predict ratings for the movies ``user`` has not rated (item-based CF).

    For each unrated movie, take its 30 most similar movies and average the
    ratings ``user`` gave to those of them the user has rated, weighted by
    similarity. ``alldata[user][movie]`` holds the ratings used in the average.

    Returns a dict movie -> predicted rating; movies whose total weight is not
    positive (e.g. the user rated none of the 30 neighbours) are left out.
    """
    user_ratings = alldata[user]
    predictions = {}
    for movie in allMovies:
        if movie in user_ratings:
            continue
        weighted_sum = weight_total = 0
        for weight, neighbour in topKSimilarMovies(movie,sim_movies,30):
            if neighbour in user_ratings:
                weighted_sum += user_ratings[neighbour]*weight
                weight_total += weight
        if weight_total > 0:
            predictions[movie] = weighted_sum/weight_total
    return predictions

In [None]:
# Hand-rolled IBCF predictions for user501; note that the raw ratings in alldata
# are passed here, not the centred alldata_norm
pred_ibcf=  mypredict_ibcf(user501,allMovies,sim_movies,alldata)

In [None]:
# One row per movie that received a hand-rolled IBCF prediction
mydf_ibcf = pd.DataFrame(list(pred_ibcf.items()), columns=["Movie","mypred_ibcf"])

In [None]:
# Left-join onto the R table so every movie stays present (NaN where I made no prediction)
scoring_ibcf = pd.merge(IBCF_R, mydf_ibcf, how='left', on='Movie')

In [None]:
scoring_ibcf.head(15)

Unnamed: 0,Movie,p.IBCF,mypred_ibcf
0,m1,,
1,m10,,
2,m100,,
3,m1000,,
4,m1002,3.875,3.875
5,m1003,2.833333,2.833333
6,m1004,3.25,3.25
7,m1005,,
8,m1006,2.285714,2.285714
9,m1007,,


**Equivalent to the following R code**

sum(is.na(p.IBCF) != is.na(mypred))  

In [None]:
# Number of movies for which exactly one of the two implementations has a prediction
(scoring_ibcf['mypred_ibcf'].isna() != scoring_ibcf['p.IBCF'].isna()).sum()

1

**Equivalent to the following R code**

mydiff = abs(p.IBCF - mypred)
sum(mydiff[!is.na(mydiff)] > 1e-6) / sum(!is.na(mydiff)) 

In [None]:
# Share of movies predicted by both implementations whose predictions differ by more than 1e-6
mydiff = (scoring_ibcf['p.IBCF'] - scoring_ibcf['mypred_ibcf']).abs()
has_both = mydiff.notna()
((mydiff > 1e-6) & has_both).sum() / has_both.sum()

0.06170513424399435

## Why are there discrepancies in IBCF?

1. There are some discrepancies in cosine similarity (in at least some cases, recommenderlab appears to give an incorrect cosine similarity).

For example, 'm1014' and 'm810' have a cosine similarity of 0 in recommenderlab. However, both 'u1169' and 'u1434' gave both movies a 1-star rating, and my cosine similarity between these two movies is 1.

2. Many movie pairs have similarity 1. Depending on how ties are broken, the top-30 similar movies for a given movie can be very different, and therefore so can the predicted ratings.

3. Also, I previously found that (possibly because of rounding error in Python) many movie pairs had a similarity of 0.999999, which would demote their ranks in the top-30 list. I therefore set a tolerance of 1e-7: any similarity within that tolerance of 1 is treated as exactly 1.