In [1]:
import os

# data science imports
import numpy as np
import pandas as pd

# sklearn imports
from sklearn.model_selection import train_test_split

# keras/tensorflow imports
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Multiply, Concatenate
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
# from tensorflow.keras.optimizers import Adagrad, Adam, SGD, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow.keras.backend as K
from tensorflow.keras.models import load_model

# visualization imports
import matplotlib.pyplot as plt
%matplotlib inline

In [33]:
movie_path = 'dataset/ml-latest-small/movies.csv'
df_movie = pd.read_csv(movie_path)
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
title      9742 non-null object
genres     9742 non-null object
dtypes: int64(1), object(2)
memory usage: 228.4+ KB


In [34]:
df_movie.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [35]:
data_path = 'dataset/ml-latest-small/ratings.csv'
df_ratings = pd.read_csv(
    data_path,
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [36]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
userId     100836 non-null int32
movieId    100836 non-null int32
rating     100836 non-null float32
dtypes: float32(1), int32(2)
memory usage: 1.2 MB


In [37]:
df_ratings.head(10)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
5,1,70,3.0
6,1,101,5.0
7,1,110,4.0
8,1,151,5.0
9,1,157,5.0


In [38]:
unique_users = df_ratings.userId.unique()
unique_movies = df_ratings.movieId.unique()

In [39]:
len(unique_users)

610

In [40]:
len(unique_movies)

9724

In [55]:
user2index = {user:index for index, user in enumerate(unique_users)}
user2index.tail()

AttributeError: 'dict' object has no attribute 'tail'

In [42]:
index2user = {index:user for user, index in user2index.items()}
index2user

{0: 1,
 1: 2,
 2: 3,
 3: 4,
 4: 5,
 5: 6,
 6: 7,
 7: 8,
 8: 9,
 9: 10,
 10: 11,
 11: 12,
 12: 13,
 13: 14,
 14: 15,
 15: 16,
 16: 17,
 17: 18,
 18: 19,
 19: 20,
 20: 21,
 21: 22,
 22: 23,
 23: 24,
 24: 25,
 25: 26,
 26: 27,
 27: 28,
 28: 29,
 29: 30,
 30: 31,
 31: 32,
 32: 33,
 33: 34,
 34: 35,
 35: 36,
 36: 37,
 37: 38,
 38: 39,
 39: 40,
 40: 41,
 41: 42,
 42: 43,
 43: 44,
 44: 45,
 45: 46,
 46: 47,
 47: 48,
 48: 49,
 49: 50,
 50: 51,
 51: 52,
 52: 53,
 53: 54,
 54: 55,
 55: 56,
 56: 57,
 57: 58,
 58: 59,
 59: 60,
 60: 61,
 61: 62,
 62: 63,
 63: 64,
 64: 65,
 65: 66,
 66: 67,
 67: 68,
 68: 69,
 69: 70,
 70: 71,
 71: 72,
 72: 73,
 73: 74,
 74: 75,
 75: 76,
 76: 77,
 77: 78,
 78: 79,
 79: 80,
 80: 81,
 81: 82,
 82: 83,
 83: 84,
 84: 85,
 85: 86,
 86: 87,
 87: 88,
 88: 89,
 89: 90,
 90: 91,
 91: 92,
 92: 93,
 93: 94,
 94: 95,
 95: 96,
 96: 97,
 97: 98,
 98: 99,
 99: 100,
 100: 101,
 101: 102,
 102: 103,
 103: 104,
 104: 105,
 105: 106,
 106: 107,
 107: 108,
 108: 109,
 109: 110,
 110: 11

In [43]:
movie2index = {movie:index for index, movie in enumerate(unique_movies)}
movie2index

{1: 0,
 3: 1,
 6: 2,
 47: 3,
 50: 4,
 70: 5,
 101: 6,
 110: 7,
 151: 8,
 157: 9,
 163: 10,
 216: 11,
 223: 12,
 231: 13,
 235: 14,
 260: 15,
 296: 16,
 316: 17,
 333: 18,
 349: 19,
 356: 20,
 362: 21,
 367: 22,
 423: 23,
 441: 24,
 457: 25,
 480: 26,
 500: 27,
 527: 28,
 543: 29,
 552: 30,
 553: 31,
 590: 32,
 592: 33,
 593: 34,
 596: 35,
 608: 36,
 648: 37,
 661: 38,
 673: 39,
 733: 40,
 736: 41,
 780: 42,
 804: 43,
 919: 44,
 923: 45,
 940: 46,
 943: 47,
 954: 48,
 1009: 49,
 1023: 50,
 1024: 51,
 1025: 52,
 1029: 53,
 1030: 54,
 1031: 55,
 1032: 56,
 1042: 57,
 1049: 58,
 1060: 59,
 1073: 60,
 1080: 61,
 1089: 62,
 1090: 63,
 1092: 64,
 1097: 65,
 1127: 66,
 1136: 67,
 1196: 68,
 1197: 69,
 1198: 70,
 1206: 71,
 1208: 72,
 1210: 73,
 1213: 74,
 1214: 75,
 1219: 76,
 1220: 77,
 1222: 78,
 1224: 79,
 1226: 80,
 1240: 81,
 1256: 82,
 1258: 83,
 1265: 84,
 1270: 85,
 1275: 86,
 1278: 87,
 1282: 88,
 1291: 89,
 1298: 90,
 1348: 91,
 1377: 92,
 1396: 93,
 1408: 94,
 1445: 95,
 1473: 96,
 

In [44]:
index2movie = {index:movie for movie, index in movie2index.items()}
index2movie

{0: 1,
 1: 3,
 2: 6,
 3: 47,
 4: 50,
 5: 70,
 6: 101,
 7: 110,
 8: 151,
 9: 157,
 10: 163,
 11: 216,
 12: 223,
 13: 231,
 14: 235,
 15: 260,
 16: 296,
 17: 316,
 18: 333,
 19: 349,
 20: 356,
 21: 362,
 22: 367,
 23: 423,
 24: 441,
 25: 457,
 26: 480,
 27: 500,
 28: 527,
 29: 543,
 30: 552,
 31: 553,
 32: 590,
 33: 592,
 34: 593,
 35: 596,
 36: 608,
 37: 648,
 38: 661,
 39: 673,
 40: 733,
 41: 736,
 42: 780,
 43: 804,
 44: 919,
 45: 923,
 46: 940,
 47: 943,
 48: 954,
 49: 1009,
 50: 1023,
 51: 1024,
 52: 1025,
 53: 1029,
 54: 1030,
 55: 1031,
 56: 1032,
 57: 1042,
 58: 1049,
 59: 1060,
 60: 1073,
 61: 1080,
 62: 1089,
 63: 1090,
 64: 1092,
 65: 1097,
 66: 1127,
 67: 1136,
 68: 1196,
 69: 1197,
 70: 1198,
 71: 1206,
 72: 1208,
 73: 1210,
 74: 1213,
 75: 1214,
 76: 1219,
 77: 1220,
 78: 1222,
 79: 1224,
 80: 1226,
 81: 1240,
 82: 1256,
 83: 1258,
 84: 1265,
 85: 1270,
 86: 1275,
 87: 1278,
 88: 1282,
 89: 1291,
 90: 1298,
 91: 1348,
 92: 1377,
 93: 1396,
 94: 1408,
 95: 1445,
 96: 1473,
 

In [51]:
df_ratings_user = df_ratings['userId']
# df_ratings['user_index'] = 
df_ratings_user.apply(lambda  x: user2index[x])

df_ratings_user
# user2index(df_ratings['userId'])
# df_ratings['userId']

0           1
1           1
2           1
3           1
4           1
5           1
6           1
7           1
8           1
9           1
10          1
11          1
12          1
13          1
14          1
15          1
16          1
17          1
18          1
19          1
20          1
21          1
22          1
23          1
24          1
25          1
26          1
27          1
28          1
29          1
         ... 
100806    610
100807    610
100808    610
100809    610
100810    610
100811    610
100812    610
100813    610
100814    610
100815    610
100816    610
100817    610
100818    610
100819    610
100820    610
100821    610
100822    610
100823    610
100824    610
100825    610
100826    610
100827    610
100828    610
100829    610
100830    610
100831    610
100832    610
100833    610
100834    610
100835    610
Name: userId, Length: 100836, dtype: int32

In [56]:
df_ratings['user_index'] = df_ratings['userId'].map(lambda x: user2index[x])
df_ratings['movie_index'] = df_ratings['movieId'].map(lambda x: movie2index[x])
df_ratings

Unnamed: 0,userId,movieId,rating,user_index,movie_index
0,1,1,4.0,0,0
1,1,3,4.0,0,1
2,1,6,4.0,0,2
3,1,47,5.0,0,3
4,1,50,5.0,0,4
5,1,70,3.0,0,5
6,1,101,5.0,0,6
7,1,110,4.0,0,7
8,1,151,5.0,0,8
9,1,157,5.0,0,9


In [57]:
index2movie[2873]

170875

In [58]:
index2movie[9723]

163981

In [60]:
df_ratings['user_index'].values

array([  0,   0,   0, ..., 609, 609, 609], dtype=int64)

In [61]:
len(df_ratings)

100836

In [62]:
movie_path = 'dataset/ml-latest-small/movies.csv'
df_movies = pd.read_csv(movie_path, index_col='movieId')
df_movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [75]:
df_movies.loc[[10,1,2,3]]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
10,GoldenEye (1995),Action|Adventure|Thriller
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance


In [24]:
num_users = len(df_ratings.userId.unique())
num_items = len(df_ratings.movieId.unique())
print('There are {} unique users and {} unique movies in this data set'.format(num_users, num_items))

There are 610 unique users and 9724 unique movies in this data set


In [25]:
user_maxId = df_ratings.userId.max()
item_maxId = df_ratings.movieId.max()
print('There are {} distinct users and the max of user ID is also {}'.format(num_users, user_maxId))
print('There are {} distinct movies, however, the max of movie ID is {}'.format(num_items, item_maxId))
print('In the context of matrix factorization, the current item vector is in unnecessarily high dimensional space')
print('So we need to do some data cleaning to reduce the dimension of item vector back to {}'.format(num_items))

There are 610 distinct users and the max of user ID is also 610
There are 9724 distinct movies, however, the max of movie ID is 193609
In the context of matrix factorization, the current item vector is in unnecessarily high dimensional space
So we need to do some data cleaning to reduce the dimension of item vector back to 9724


In [26]:

def reduce_item_dim(df_ratings):
    """
    Reduce item vector dimension to the number of distinct items in our data sets
    
    input: pd.DataFrame, df_ratings should have columns ['userId', 'movieId', 'rating']
    output: pd.DataFrame, df_ratings with new 'MovieID' that is compressed
    """
    # pivot
    df_user_item = df_ratings.pivot(index='userId', columns='movieId', values='rating')
    # reset movieId
    df_user_item = df_user_item.T.reset_index(drop=True).T
    # undo pivot/melt - compress data frame
    df_ratings_new = df_user_item \
        .reset_index('userId') \
        .melt(
            id_vars='userId', 
            value_vars=df_user_item.columns,
            var_name='movieId',
            value_name='rating')
    # drop nan and final clean up
    return df_ratings_new.dropna().sort_values(['userId', 'movieId']).reset_index(drop=True)

In [27]:
print('reduce item dimension before:')
df_ratings.head(10)
df_ratings.describe()

reduce item dimension before:


Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [29]:
df_ratings.columns

Index(['userId', 'movieId', 'rating'], dtype='object')

In [30]:
df_ratings = reduce_item_dim(df_ratings)
print('reduce item dimension after:')
df_ratings.head(10)

reduce item dimension after:


Unnamed: 0,userId,movieId,rating
0,1,0,4.0
1,1,2,4.0
2,1,5,4.0
3,1,43,5.0
4,1,46,5.0
5,1,62,3.0
6,1,89,5.0
7,1,97,4.0
8,1,124,5.0
9,1,130,5.0


In [31]:
df_ratings.describe()

Unnamed: 0,userId,rating
count,100836.0,100836.0
mean,326.127564,3.501557
std,182.618491,1.042529
min,1.0,0.5
25%,177.0,3.0
50%,325.0,3.5
75%,477.0,4.0
max,610.0,5.0


In [32]:
df_ratings.columns

Index(['userId', 'movieId', 'rating'], dtype='object')