Data Source: https://grouplens.org/datasets/movielens/
Data used: ml-latest-small.zip (1MB)

### Load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

from sklearn.decomposition import NMF

### Load dataframes

In [2]:
links = pd.read_csv('./data/links.csv')
movies = pd.read_csv('./data/movies.csv')
ratings = pd.read_csv('./data/ratings.csv')
tags = pd.read_csv('./data/tags.csv')

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


### Check for nulls

In [7]:
links.isna().sum()

movieId    0
imdbId     0
tmdbId     8
dtype: int64

In [8]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [9]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [10]:
tags.isna().sum()

userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

### Pivot ratings table

In [11]:
user_movie_rating_df = pd.pivot_table(ratings
                                      , index = 'userId'
                                      , columns = 'movieId'
                                      , values = 'rating'
                                     )
user_movie_rating_df.fillna(0, inplace = True)
user_movie_rating_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Single value decomposition SVD method

In [12]:
Mat = np.array(user_movie_rating_df)
Mat.shape

(610, 9724)

In [13]:
U, s, V = np.linalg.svd(Mat, full_matrices=False)
Pred_Mat = U @ np.diag(s) @ V + np.mean(Mat)

In [14]:
Pred_Mat.shape

(610, 9724)

In [15]:
Pred_Mat

array([[4.05952536, 0.05952536, 4.05952536, ..., 0.05952536, 0.05952536,
        0.05952536],
       [0.05952536, 0.05952536, 0.05952536, ..., 0.05952536, 0.05952536,
        0.05952536],
       [0.05952536, 0.05952536, 0.05952536, ..., 0.05952536, 0.05952536,
        0.05952536],
       ...,
       [2.55952536, 2.05952536, 2.05952536, ..., 0.05952536, 0.05952536,
        0.05952536],
       [3.05952536, 0.05952536, 0.05952536, ..., 0.05952536, 0.05952536,
        0.05952536],
       [5.05952536, 0.05952536, 0.05952536, ..., 0.05952536, 0.05952536,
        0.05952536]])

In [16]:
Pred_Mat[0].argsort()[-10:][::-1]

array([ 398,  291,  898, 2019,  461,  973,  485, 2670,  197, 1180])

In [17]:
movieIds = user_movie_rating_df.columns
movieIds[398]

457

In [18]:
print(Pred_Mat[0][398], movies.title[movies.movieId == movieIds[398]])
print(Pred_Mat[0][291], movies.title[movies.movieId == movieIds[291]])
print(Pred_Mat[0][898], movies.title[movies.movieId == movieIds[898]])
print(Pred_Mat[0][2019], movies.title[movies.movieId == movieIds[2019]])

5.059525358922696 398    Fugitive, The (1993)
Name: title, dtype: object
5.059525358922691 291    Tommy Boy (1995)
Name: title, dtype: object
5.059525358922685 899    Princess Bride, The (1987)
Name: title, dtype: object
5.059525358922685 2020    Run Lola Run (Lola rennt) (1998)
Name: title, dtype: object


### Verify user 1's viewing patterns

In [19]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [20]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [21]:
ratings_movies_df = ratings.merge(right = movies
                                  , how = 'left'
                                  , left_on = ['movieId']
                                  , right_on = ['movieId']
                                 )
ratings_movies_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [22]:
ratings_movies_df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

In [23]:
ratings_movies_df.loc[(ratings_movies_df.userId == 1)
                      & (ratings_movies_df.rating == 5.0)
                     ][:10]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
6,1,101,5.0,964980868,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance
8,1,151,5.0,964984041,Rob Roy (1995),Action|Drama|Romance|War
9,1,157,5.0,964984100,Canadian Bacon (1995),Comedy|War
10,1,163,5.0,964983650,Desperado (1995),Action|Romance|Western
11,1,216,5.0,964981208,Billy Madison (1995),Comedy
13,1,231,5.0,964981179,Dumb & Dumber (Dumb and Dumber) (1994),Adventure|Comedy
15,1,260,5.0,964981680,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
18,1,333,5.0,964981179,Tommy Boy (1995),Comedy
