In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
ratings = pd.read_csv('./data/ratings.csv', index_col=0)

In [3]:
ratings

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
...,...,...,...
610,166534,4.0,1493848402
610,168248,5.0,1493850091
610,168250,5.0,1494273047
610,168252,5.0,1493846352


In [4]:
ratings.drop('timestamp', inplace=True, axis=1)

In [5]:
R = ratings.pivot(columns='movieId', values='rating')

In [6]:
R.shape

(610, 9724)

In [7]:
R.isna().sum()

movieId
1         395
2         500
3         558
4         603
5         561
         ... 
193581    609
193583    609
193585    609
193587    609
193609    609
Length: 9724, dtype: int64

In [8]:
R.mean().sort_values(ascending=False)

movieId
88448     5.0
100556    5.0
143031    5.0
143511    5.0
143559    5.0
         ... 
157172    0.5
85334     0.5
53453     0.5
8494      0.5
71810     0.5
Length: 9724, dtype: float64

In [9]:
#calculate the number of ratings per user
R.count(axis=1).sort_values(ascending=False)

userId
414    2698
599    2478
474    2108
448    1864
274    1346
       ... 
442      20
569      20
320      20
576      20
53       20
Length: 610, dtype: int64

In [10]:
#Filter for items that have been rated by less than 20 users
less_than_20=R.loc[:, R.count()< 20]
len(less_than_20)

610

In [11]:
#Impute missing values with the global mean
global_mean=R.mean().mean()
global_mean

3.2624482748109656

In [12]:
movie_ratings=R.fillna(R.mean())

In [13]:
movie_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.431818,4.0,2.357143,3.071429,4.0,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
2,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
3,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
4,3.92093,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
5,4.0,3.431818,3.259615,2.357143,3.071429,3.946078,3.185185,2.875,3.125,3.496212,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0


In [14]:
user_mean=R.mean(axis=1)

In [15]:
# With the user averages:
user_ratings=R.transpose().fillna(user_mean).transpose().head()

In [17]:
user_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,4.366379,4.0,4.366379,4.366379,4.0,4.366379,4.366379,4.366379,4.366379,...,4.366379,4.366379,4.366379,4.366379,4.366379,4.366379,4.366379,4.366379,4.366379,4.366379
2,3.948276,3.948276,3.948276,3.948276,3.948276,3.948276,3.948276,3.948276,3.948276,3.948276,...,3.948276,3.948276,3.948276,3.948276,3.948276,3.948276,3.948276,3.948276,3.948276,3.948276
3,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,...,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897,2.435897
4,3.555556,3.555556,3.555556,3.555556,3.555556,3.555556,3.555556,3.555556,3.555556,3.555556,...,3.555556,3.555556,3.555556,3.555556,3.555556,3.555556,3.555556,3.555556,3.555556,3.555556
5,4.0,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,...,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364,3.636364


In [18]:
user_ratings.mean(axis=0).sort_values(ascending=False)

movieId
3703    4.228039
475     4.150111
1587    4.128039
5919    4.101315
4518    4.101315
          ...   
3176    3.004107
441     3.004107
2628    3.004107
3489    3.004107
316     2.987946
Length: 9724, dtype: float64

In [19]:
movie_ratings.mean(axis=0).sort_values(ascending=False)

movieId
88448     5.0
100556    5.0
143031    5.0
143511    5.0
143559    5.0
         ... 
95796     0.5
141668    0.5
136297    0.5
122627    0.5
145724    0.5
Length: 9724, dtype: float64

In [20]:
less_than_20.mean(axis=0).sort_values(ascending=False)

movieId
5088      5.0
626       5.0
159811    5.0
96608     5.0
130970    5.0
         ... 
65350     0.5
7636      0.5
102025    0.5
175475    0.5
72696     0.5
Length: 8427, dtype: float64