In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# User-based recommendations

In [None]:
# Ratings table; the CSV itself is kept in a Box folder.
ratings = pd.read_csv(filepath_or_buffer='ratings_large.csv')
ratings

In [None]:
# (transpose) ratings matrix: one row per movie title, one column per userId;
# each cell holds that user's rating of that movie (NaN when not rated).
# index/columns/values are passed by keyword because pandas >= 2.0 rejects
# positional arguments to DataFrame.pivot.
ratings_matrix = ratings.pivot(index='title', columns='userId', values='rating')
ratings_matrix

In [None]:
# target user: choose a userId and pull out that user's column of ratings
user = 4
user_ratings = ratings_matrix[user]
user_ratings

In [None]:
# drop user column from the ratings matrix, so the target user is never picked
# as one of their own neighbors.
# The membership check makes the cell safe to re-run: an unconditional in-place
# drop raises KeyError the second time because the column is already gone.
if user in ratings_matrix.columns:
    ratings_matrix = ratings_matrix.drop(columns=user)

In [None]:
# titles the user has rated
user_ratings.dropna().index.to_list()

In [None]:
# titles the user has not rated
user_ratings.index[user_ratings.isna()].to_list()

In [None]:
# how often the user gave each rating value (the NaN row counts unrated movies)
user_ratings.value_counts(sort=True, dropna=False)

In [None]:
# the 20 movies the user rated highest
user_ratings.sort_values(ascending=False).iloc[:20]

In [None]:
# the 20 movies the user rated lowest
user_ratings.sort_values(ascending=True).iloc[:20]

In [None]:
# average of the user's ratings (unrated movies are ignored)
user_mean = user_ratings.mean(skipna=True)
user_mean

In [None]:
# sample standard deviation (ddof=1) of the user's ratings
user_std = user_ratings.std(ddof=1)
user_std

## The z-scores matrix

In [None]:
# matrix of z-scores (scale-free ratings): every user's column is centred on
# that user's mean rating and divided by that user's standard deviation
z_scores = (
    ratings_matrix
    .sub(ratings_matrix.mean(), axis='columns')
    .div(ratings_matrix.std(), axis='columns')
)
z_scores

## Similarities

In [None]:
# similarity of every other user to the target user, measured as the
# correlation between their rating columns
similarities = ratings_matrix.corrwith(user_ratings, axis=0)
similarities

## Prediction function

In [None]:
# number of neighbors, and the (unrated) item to predict; other unrated
# examples: 'Dawn of the Dead (2004)', "Zoolander (2001)"
k, item = 20, 'Duck Soup (1933)'

In [None]:
# k nearest neighbors similarities: among the users who rated the item, keep the
# k with the highest similarity to the target user.
# Undefined (NaN) similarities are dropped first. sort_values places NaN last,
# so whenever fewer than k valid candidates exist they would otherwise slip into
# the neighbourhood and turn the prediction into NaN.
knn_sim = (
    similarities[ratings_matrix.loc[item].notna()]
    .dropna()
    .sort_values(ascending=False)
    .head(k)
)
knn_sim

In [None]:
# normalization factor: sum of the absolute neighbor similarities
total = np.abs(knn_sim).sum()
total

In [None]:
# labels (userIds) of the k nearest neighbors
knn = knn_sim.keys()
knn

In [None]:
# z-scores the k nearest neighbors gave to the item
knn_z_scores = z_scores.loc[item].loc[knn]
knn_z_scores

In [None]:
# prediction: similarity-weighted average of the neighbors' z-scores, mapped
# back to the target user's own rating scale (mean and standard deviation)
prediction = user_mean + user_std*knn_sim.dot(knn_z_scores)/total
print(': '.join([item, str(np.round(prediction, 1))]))

In [None]:
# the same in one cell:

# select number of neighbors
k = 20

# select item
item = "Zoolander (2001)"

# k nearest neighbors similarities: among the users who rated the item, keep the
# k most similar to the target user. NaN similarities are dropped first because
# sort_values places them last, where they would be picked up whenever fewer
# than k valid candidates exist and make the prediction NaN.
knn_sim = (
    similarities[ratings_matrix.loc[item].notna()]
    .dropna()
    .sort_values(ascending=False)
    .head(k)
)

# normalization factor
total = knn_sim.abs().sum()

# k nearest neighbors
knn = knn_sim.index

# k nearest neighbors z-scores
knn_z_scores = z_scores.loc[item, knn]

# prediction: similarity-weighted average of the neighbors' z-scores, mapped
# back to the user's own scale. With no usable neighbor the normalization
# factor is zero, so the prediction is reported as NaN instead of dividing 0/0.
if total > 0:
    prediction = user_mean + user_std*knn_sim.dot(knn_z_scores)/total
else:
    prediction = np.nan

print(item+': '+str(np.round(prediction,1)))

## Fake user

In [None]:
# fake user likes sci-fi and action-adventure films, and dislikes romance films
# and kid movies
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Theory/master/Data/fake_user.csv'
# read_csv's `squeeze=True` argument was removed in pandas 2.0; squeezing the
# resulting frame along its columns gives the same Series indexed by title.
user = pd.read_csv(url, index_col='title').squeeze('columns')
user

In [None]:
# mean and standard deviation of the fake user's ratings
user_mean, user_std = user.mean(), user.std()

In [None]:
# correlation between the fake user's ratings and every other user's ratings
similarities = ratings_matrix.corrwith(user, axis=0)

In [None]:
# titles the fake user has no rating for
not_rated_movies = user.index[user.isna()].to_list()
not_rated_movies

In [None]:
# Predict a rating for every movie the user has not rated and store it in `user`.
k = 20
for item in not_rated_movies:
    # k nearest neighbors similarities: among the users who rated the item,
    # keep the k most similar to the user. NaN similarities are dropped first
    # because sort_values places them last, where they would be selected
    # whenever fewer than k valid candidates exist and make the prediction NaN.
    knn_sim = (
        similarities[ratings_matrix.loc[item].notna()]
        .dropna()
        .sort_values(ascending=False)
        .head(k)
    )

    # normalization factor
    total = knn_sim.abs().sum()

    # no usable neighbor: leave the item unpredicted (NaN) rather than divide 0/0
    if total == 0:
        continue

    # k nearest neighbors
    knn = knn_sim.index

    # k nearest neighbors z-scores
    knn_z_scores = z_scores.loc[item, knn]

    # prediction: similarity-weighted average of the neighbors' z-scores,
    # mapped back to the user's own mean and standard deviation
    prediction = user_mean + user_std*knn_sim.dot(knn_z_scores)/total
    user.loc[item] = prediction

In [None]:
# top 20 recommendations: highest predicted ratings among the unrated movies
user.loc[not_rated_movies].sort_values(ascending=False).iloc[:20]

In [None]:
# bottom 20 recommendations: lowest predicted ratings among the unrated movies,
# listed from highest to lowest.
# Movies without a prediction (NaN) are removed first: sort_values places NaN at
# the end whatever the sort direction, so they would otherwise fill the tail.
user.loc[not_rated_movies].dropna().sort_values(ascending=False).tail(20)

## The impact of the long tail

In [None]:
# fake user likes sci-fi and action-adventure films, and dislikes romance films
# and kid movies. Reloading discards the predictions written into `user` above.
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Theory/master/Data/fake_user.csv'
# read_csv's `squeeze=True` argument was removed in pandas 2.0; squeezing the
# resulting frame along its columns gives the same Series indexed by title.
user = pd.read_csv(url, index_col='title').squeeze('columns')
user

In [None]:
# how many ratings each movie title received
n_ratings = ratings.groupby('title')['rating'].agg('count')
n_ratings

In [None]:
# histogram of ratings-per-movie
n_ratings.plot(kind='hist', bins=20)
plt.xlabel('number of ratings')
plt.ylabel('number of movies')

In [None]:
# popular movies: titles with more than 20000 ratings, most rated first
n_ratings.loc[n_ratings.gt(20000)].sort_values(ascending=False)

In [None]:
# count of distinct userIds in the ratings table
n_users = ratings.loc[:, 'userId'].nunique()
n_users

In [None]:
# per-movie weights: log of (number of users / number of ratings the movie
# received), so the more ratings a movie has, the smaller its weight
weights = np.log(n_ratings.rdiv(n_users))
weights

In [None]:
# weight of one specific title
weights.loc['Forrest Gump (1994)']

In [None]:
# pandas does not have a built-in function that computes weighted correlations
def weighted_corrwith(y, x=None, w=None):
    """Weighted Pearson correlation between two users' ratings.

    Only movies rated by both users (and carrying a weight) contribute, and the
    means are weighted means over those movies. With equal weights the result
    therefore matches the pairwise-complete correlation that pandas computes.

    Parameters
    ----------
    y : pd.Series
        Ratings of one user, indexed by movie title (NaN = not rated).
    x : pd.Series, optional
        Ratings of the other user, indexed the same way. Defaults to the
        notebook-level `user` Series.
    w : pd.Series, optional
        Per-movie weights indexed by title. Defaults to the notebook-level
        `weights` Series.

    Returns
    -------
    float
        The weighted correlation, or NaN when it is undefined: fewer than two
        co-rated movies, non-positive total weight, or zero weighted variance
        in either user's co-rated ratings.
    """
    if x is None:
        x = user
    if w is None:
        w = weights

    # line the other user's ratings and the weights up with y's titles
    x_aligned = x.reindex(y.index)
    w_aligned = w.reindex(y.index)

    # restrict every sum to the same set of movies: those rated by both users
    co_rated = y.notna() & x_aligned.notna() & w_aligned.notna()
    if co_rated.sum() < 2:
        return np.nan
    xs = x_aligned[co_rated]
    ys = y[co_rated]
    ws = w_aligned[co_rated]

    w_total = ws.sum()
    if not w_total > 0:
        return np.nan

    # weighted means
    mean_x = (ws*xs).sum()/w_total
    mean_y = (ws*ys).sum()/w_total

    # denominator
    den1 = np.sqrt((ws*(xs-mean_x)**2).sum())
    den2 = np.sqrt((ws*(ys-mean_y)**2).sum())
    den = den1*den2
    if not den > 0:
        return np.nan

    # numerator
    num = (ws*(xs-mean_x)*(ys-mean_y)).sum()

    return num/den

In [None]:
# weighted similarity of every user (one column each) to the fake user
similarities = ratings_matrix.apply(weighted_corrwith, axis=0)
similarities

In [None]:
# titles the fake user has no rating for
not_rated_movies = user.index[user.isna()].to_list()
not_rated_movies

In [None]:
# Predict a rating for every movie the user has not rated and store it in `user`.
k = 20
for item in not_rated_movies:
    # k nearest neighbors similarities: among the users who rated the item,
    # keep the k most similar to the user. NaN similarities are dropped first
    # because sort_values places them last, where they would be selected
    # whenever fewer than k valid candidates exist and make the prediction NaN.
    knn_sim = (
        similarities[ratings_matrix.loc[item].notna()]
        .dropna()
        .sort_values(ascending=False)
        .head(k)
    )

    # normalization factor
    total = knn_sim.abs().sum()

    # no usable neighbor: leave the item unpredicted (NaN) rather than divide 0/0
    if total == 0:
        continue

    # k nearest neighbors
    knn = knn_sim.index

    # k nearest neighbors z-scores
    knn_z_scores = z_scores.loc[item, knn]

    # prediction: similarity-weighted average of the neighbors' z-scores,
    # mapped back to the user's own mean and standard deviation
    prediction = user_mean + user_std*knn_sim.dot(knn_z_scores)/total
    user.loc[item] = prediction

In [None]:
# top 20 recommendations: highest predicted ratings among the unrated movies
user.loc[not_rated_movies].sort_values(ascending=False).iloc[:20]

## Appendix: create a fake user

In [None]:
# empty single-column ('rating') frame with one row per movie title
fake_user = pd.DataFrame(columns=['rating'], index=ratings_matrix.index)
fake_user

In [None]:
# write the fake-user frame to a CSV file in the working directory
fake_user.to_csv(path_or_buf='fake_kid_user.csv')