In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# User-based recommendations

In [2]:
# the file is in a Box folder (each movie has at least 2000 ratings)
ratings = pd.read_csv('ratings_large.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp,title
0,4,1,4.0,1113765937,Toy Story (1995)
1,4,2,4.0,1113767306,Jumanji (1995)
2,4,5,2.0,1123990453,Father of the Bride Part II (1995)
3,4,6,4.5,1113767242,Heat (1995)
4,4,10,4.0,1113765995,GoldenEye (1995)
...,...,...,...,...,...
10395149,283224,1388,2.0,851001995,Jaws 2 (1978)
10395150,283224,1394,3.0,851002354,Raising Arizona (1987)
10395151,283224,1396,3.0,851002354,Sneakers (1992)
10395152,283224,2019,5.0,851000812,Seven Samurai (Shichinin no samurai) (1954)


In [4]:
# (transposed) ratings matrix
ratings_matrix = ratings.pivot('userId','title')['rating'].T
ratings_matrix

userId,4,19,42,43,51,55,56,71,73,79,...,283153,283164,283165,283170,283183,283184,283185,283195,283204,283224
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),,,4.0,,,,,2.5,4.5,,...,,,,4.0,,4.0,,,,
10 Things I Hate About You (1999),,,,,,1.5,,,,4.0,...,3.0,,4.5,,,,,2.5,,
101 Dalmatians (1996),,,4.0,,,,0.5,,,,...,,,,,,,,,,
101 Dalmatians (One Hundred and One Dalmatians) (1961),,,,,,3.5,1.0,,,,...,,,,,,,3.0,4.5,,
12 Angry Men (1957),1.5,5.0,,4.5,5.0,,4.0,,,,...,,,3.0,,,,,,,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoolander (2001),,,,,,,2.5,,2.5,,...,3.0,,,,,2.0,,3.5,,
Zootopia (2016),,,,,,,,,,,...,,,,,,,,,,
eXistenZ (1999),3.5,,,,,,,,,,...,,,,,,,,,,
xXx (2002),3.5,,,,,,,,,,...,0.5,,,,,,,,,


In [5]:
# pick a userId
user = 4
user_ratings = ratings_matrix.loc[:,user]
user_ratings

title
(500) Days of Summer (2009)                               NaN
10 Things I Hate About You (1999)                         NaN
101 Dalmatians (1996)                                     NaN
101 Dalmatians (One Hundred and One Dalmatians) (1961)    NaN
12 Angry Men (1957)                                       1.5
                                                         ... 
Zoolander (2001)                                          NaN
Zootopia (2016)                                           NaN
eXistenZ (1999)                                           3.5
xXx (2002)                                                3.5
¡Three Amigos! (1986)                                     3.0
Name: 4, Length: 1344, dtype: float64

In [6]:
# number of ratings
np.sum(user_ratings.notna())

532

In [7]:
# ratings distribution
user_ratings.value_counts(dropna=False)

NaN    812
4.0    126
3.5     97
4.5     90
5.0     60
3.0     46
2.5     34
2.0     34
0.5     27
1.0      9
1.5      9
Name: 4, dtype: int64

In [8]:
# user top movies
user_ratings.sort_values(ascending=False).head(20)

title
Shrek (2001)                                                     5.0
Batman Begins (2005)                                             5.0
Incredibles, The (2004)                                          5.0
Rock, The (1996)                                                 5.0
Insomnia (2002)                                                  5.0
Spider-Man (2002)                                                5.0
Spider-Man 2 (2004)                                              5.0
Road to Perdition (2002)                                         5.0
Spy Game (2001)                                                  5.0
Clear and Present Danger (1994)                                  5.0
Traffic (2000)                                                   5.0
Pulp Fiction (1994)                                              5.0
Kung Fu Hustle (Gong fu) (2004)                                  5.0
L.A. Confidential (1997)                                         5.0
Last Samurai, The (2003)    

In [9]:
# user bottom movies
user_ratings.sort_values().head(20)

title
Napoleon Dynamite (2004)                  0.5
Showgirls (1995)                          0.5
Spy Kids (2001)                           0.5
Royal Tenenbaums, The (2001)              0.5
Beavis and Butt-Head Do America (1996)    0.5
Super Mario Bros. (1993)                  0.5
Big Lebowski, The (1998)                  0.5
Titanic (1997)                            0.5
Bio-Dome (1996)                           0.5
Blair Witch Project, The (1999)           0.5
Blues Brothers, The (1980)                0.5
Sleepless in Seattle (1993)               0.5
Miss Congeniality (2000)                  0.5
Breakfast Club, The (1985)                0.5
Bridget Jones's Diary (2001)              0.5
Moulin Rouge (2001)                       0.5
Cable Guy, The (1996)                     0.5
Dude, Where's My Car? (2000)              0.5
Dumb & Dumber (Dumb and Dumber) (1994)    0.5
Eyes Wide Shut (1999)                     0.5
Name: 4, dtype: float64

In [10]:
# user mean rating
user_mean = user_ratings.mean()
user_mean

3.5253759398496243

In [134]:
# user standard deviation rating
user_std = user_ratings.std()
user_std

1.1591017244209465

In [11]:
# drop user column from the ratings matrix
ratings_matrix.drop(user,axis=1,inplace=True)

## The z-scores matrix

In [121]:
z_scores = (ratings_matrix - ratings_matrix.mean())/ratings_matrix.std()
z_scores

userId,19,42,43,51,55,56,71,73,79,81,...,283153,283164,283165,283170,283183,283184,283185,283195,283204,283224
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),,0.441226,,,,,-1.447012,0.883303,,-0.337638,...,,,,-0.552253,,0.258297,,,,
10 Things I Hate About You (1999),,,,,-1.913005,,,,-0.311645,-0.337638,...,0.197451,,1.846808,,,,,-1.542636,,
101 Dalmatians (1996),,0.441226,,,,-2.040622,,,,,...,,,,,,,,,,
101 Dalmatians (One Hundred and One Dalmatians) (1961),,,,,-0.111992,-1.576142,,,,,...,,,,,,,-0.538664,0.772019,,
12 Angry Men (1957),1.905267,,0.523548,1.245453,,1.210739,,,,1.501957,...,,,0.006166,,,,,,,1.290966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zoolander (2001),,,,,,-0.182702,,-1.694737,,-0.337638,...,0.197451,,,,,-1.559117,,-0.385309,,
Zootopia (2016),,,,,,,,,,,...,,,,,,,,,,
eXistenZ (1999),,,,,,,,,,,...,,,,,,,,,,
xXx (2002),,,,,,,,,,-2.637132,...,-1.652619,,,,,,,,,


## Similarities

In [None]:
# correlations
correlations = ratings_matrix.corrwith(user_ratings)

In [176]:
# similarities
alpha = 2
similarities = correlations**alpha

In [160]:
similarities

userId
19        0.004030
42        0.009382
43        0.000455
51        0.000162
55        0.001399
            ...   
283184    0.020518
283185   -0.013222
283195    0.017568
283204    0.148858
283224    0.024471
Length: 35288, dtype: float64

## Prediction function

In [223]:
# number of nearest neighbors
k = 10

# item
item = "League of Extraordinary Gentlemen, The (a.k.a. LXG) (2003)"# '300 (2007)'

# k nearest neighbors similarities
knn_sim = similarities[ratings_matrix.loc[item].notna()].sort_values(ascending=False).head(k)
    
# normalization factor
total = knn_sim.abs().sum()

# k nearest neighbors z-scores
knn_z_scores = z_scores.loc[item,knn_sim.index]

# prediction
prediction = user_mean + user_std*knn_sim.dot(knn_z_scores)/total

print(item+': '+str(np.round(prediction,1)))

League of Extraordinary Gentlemen, The (a.k.a. LXG) (2003): 2.0


## Fake user

In [244]:
fake_user = pd.read_csv('fake_user.csv',index_col='title', squeeze=True)
fake_user

title
(500) Days of Summer (2009)                              NaN
10 Things I Hate About You (1999)                        NaN
101 Dalmatians (1996)                                    NaN
101 Dalmatians (One Hundred and One Dalmatians) (1961)   NaN
12 Angry Men (1957)                                      NaN
                                                          ..
Zoolander (2001)                                         NaN
Zootopia (2016)                                          NaN
eXistenZ (1999)                                          NaN
xXx (2002)                                               NaN
¡Three Amigos! (1986)                                    NaN
Name: rating, Length: 1344, dtype: float64

In [258]:
fake_user_mean = fake_user.mean()
fake_user_std = fake_user.std()

In [254]:
# correlations
correlations = ratings_matrix.corrwith(fake_user)

In [255]:
# similarities
alpha = 2
similarities = correlations**alpha

In [269]:
# number of nearest neighbors
k = 20

# item
item = "Dark Knight, The (2008)"# '300 (2007)'

# k nearest neighbors similarities
knn_sim = similarities[ratings_matrix.loc[item].notna()].sort_values(ascending=False).head(k)
    
# normalization factor
total = knn_sim.abs().sum()

# k nearest neighbors z-scores
knn_z_scores = z_scores.loc[item,knn_sim.index]

# prediction
prediction = fake_user_mean + fake_user_std*knn_sim.dot(knn_z_scores)/total

print(item+': '+str(np.round(prediction,1)))

Dark Knight, The (2008): 3.0


In [271]:
fake_user_pred = pd.Series(index=fake_user.index)
fake_user_pred

  """Entry point for launching an IPython kernel.


title
(500) Days of Summer (2009)                              NaN
10 Things I Hate About You (1999)                        NaN
101 Dalmatians (1996)                                    NaN
101 Dalmatians (One Hundred and One Dalmatians) (1961)   NaN
12 Angry Men (1957)                                      NaN
                                                          ..
Zoolander (2001)                                         NaN
Zootopia (2016)                                          NaN
eXistenZ (1999)                                          NaN
xXx (2002)                                               NaN
¡Three Amigos! (1986)                                    NaN
Length: 1344, dtype: float64

In [272]:
for item in fake_user.index:
    # k nearest neighbors similarities
    knn_sim = similarities[ratings_matrix.loc[item].notna()].sort_values(ascending=False).head(k)

    # normalization factor
    total = knn_sim.abs().sum()

    # k nearest neighbors z-scores
    knn_z_scores = z_scores.loc[item,knn_sim.index]

    # prediction
    prediction = fake_user_mean + fake_user_std*knn_sim.dot(knn_z_scores)/total
    fake_user_pred[item] = prediction

In [275]:
# top 20 recommendations
fake_user_pred.sort_values(ascending=False).head(20)

title
Great Escape, The (1963)                                                     4.357516
Léon: The Professional (a.k.a. The Professional) (Léon) (1994)               4.255870
Once Upon a Time in the West (C'era una volta il West) (1968)                4.249854
American History X (1998)                                                    4.113102
Boondock Saints, The (2000)                                                  4.087678
Crow, The (1994)                                                             4.055658
Apocalypse Now (1979)                                                        4.020279
Usual Suspects, The (1995)                                                   4.016448
Dark City (1998)                                                             3.985395
Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)    3.981048
Highlander (1986)                                                            3.978339
Spotlight (2015)                                

In [276]:
# bottom 20 recommendations
fake_user_pred.sort_values(ascending=False).tail(20)

title
Dick Tracy (1990)                  1.290860
Richie Rich (1994)                 1.278722
Superman III (1983)                1.259494
Flintstones, The (1994)            1.253136
Miracle on 34th Street (1994)      1.238037
Addams Family Values (1993)        1.209867
Spy Kids (2001)                    1.137306
Fantastic Four (2005)              1.121693
Pee-wee's Big Adventure (1985)     1.093370
RoboCop 3 (1993)                   1.086190
Lilo & Stitch (2002)               1.015310
Honey, I Blew Up the Kid (1992)    0.995885
Muppet Treasure Island (1996)      0.971641
Free Willy (1993)                  0.928857
Barb Wire (1996)                   0.824697
Brokeback Mountain (2005)          0.788435
Babe: Pig in the City (1998)       0.771914
Brady Bunch Movie, The (1995)      0.759411
Twilight (2008)                    0.644088
Super Mario Bros. (1993)           0.632490
dtype: float64