In [1]:
import numpy as np
from collections import defaultdict
import pandas as pd
import csv

# Load the original data

In [2]:
def load_org_data():
    
    movies = pd.read_csv('./ml-latest-small/movies.csv')
    ratings = pd.read_csv('./ml-latest-small/ratings.csv')
    tags = pd.read_csv('./ml-latest-small/tags.csv')

    return movies, ratings, tags

In [3]:
movies, ratings, tags = load_org_data()

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
print(movies.shape)
print(ratings.shape)

(9742, 3)
(100836, 4)


# Get movies dataset

## Filter out insignificant data

In [7]:
# timestamp is not significant 
ratings = ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [8]:
# only count movies with more than 5 ratings so that the reviews are fair
fair_rated_movie = ratings.groupby('movieId')['rating'].count() >= 5

# convert series to dataframe
fair_rated_movie = fair_rated_movie.to_frame().reset_index()
# rename the column for more clarity
fair_rated_movie.rename(columns={'rating':'more_than_five_rate'}, inplace=True)
fair_rated_movie.shape

(9724, 2)

In [9]:
# remove the rows where the movie with movieId has less than 5 rating
fair_rated_movie = fair_rated_movie.drop(fair_rated_movie[fair_rated_movie['more_than_five_rate']
                                                          == False].index).reset_index(drop=True)
fair_rated_movie

Unnamed: 0,movieId,more_than_five_rate
0,1,True
1,2,True
2,3,True
3,4,True
4,5,True
...,...,...
3645,180031,True
3646,180985,True
3647,183897,True
3648,187593,True


In [10]:
# remove the rows where the movie with movieId has less than 5 rating
ratings = ratings[ratings['movieId'].isin(fair_rated_movie['movieId'])]
movies = movies[movies['movieId'].isin(fair_rated_movie['movieId'])]
ratings = ratings.reset_index(drop=True)
movies = movies.reset_index(drop=True)

In [11]:
print(movies.shape)
print(ratings.shape)

(3650, 3)
(90274, 3)


In [12]:
movies['year'] = movies.apply(lambda x: x.title.split()[-1][1:-1], axis=1)
movies['year'] = movies['year'].astype('int')
movies = movies.drop('title', axis=1)
movies

Unnamed: 0,movieId,genres,year
0,1,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Adventure|Children|Fantasy,1995
2,3,Comedy|Romance,1995
3,4,Comedy|Drama|Romance,1995
4,5,Comedy,1995
...,...,...,...
3645,180031,Adventure|Drama|Fantasy,2017
3646,180985,Drama,2017
3647,183897,Animation|Comedy,2018
3648,187593,Action|Comedy|Sci-Fi,2018


In [13]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
90269,610,166528,4.0
90270,610,166534,4.0
90271,610,168248,5.0
90272,610,168250,5.0


In [14]:
movies_rate = pd.merge(ratings, movies, how='left', on=['movieId', 'movieId'])
movies_rate

Unnamed: 0,userId,movieId,rating,genres,year
0,1,1,4.0,Adventure|Animation|Children|Comedy|Fantasy,1995
1,1,3,4.0,Comedy|Romance,1995
2,1,6,4.0,Action|Crime|Thriller,1995
3,1,47,5.0,Mystery|Thriller,1995
4,1,50,5.0,Crime|Mystery|Thriller,1995
...,...,...,...,...,...
90269,610,166528,4.0,Action|Adventure|Fantasy|Sci-Fi,2016
90270,610,166534,4.0,Drama|Horror|Thriller,2017
90271,610,168248,5.0,Action|Crime|Thriller,2017
90272,610,168250,5.0,Horror,2017


In [15]:
movies_rate.shape

(90274, 5)

## Split the movie genres in the column 'genres' into multiple rows

In [16]:
# split the movie genres in the column genres into multiple rows
genres_rows = (movies_rate.set_index(['userId', 'movieId', 'rating', 'year'])).apply(lambda 
                                                                                     x: x.str.split('|').explode()).reset_index()

# In the original movie table, movieId 1 has genres 'Adventure|Animation|Children|Comedy|Fantasy', so it's 
# splitted into 5 rows
genres_rows.head(10)

Unnamed: 0,userId,movieId,rating,year,genres
0,1,1,4.0,1995,Adventure
1,1,1,4.0,1995,Animation
2,1,1,4.0,1995,Children
3,1,1,4.0,1995,Comedy
4,1,1,4.0,1995,Fantasy
5,1,3,4.0,1995,Comedy
6,1,3,4.0,1995,Romance
7,1,6,4.0,1995,Action
8,1,6,4.0,1995,Crime
9,1,6,4.0,1995,Thriller


In [17]:
genres_rows.shape

(251979, 5)

## Find the average ratings

In [18]:
# find the average rating of each user
user_ave_rating = ratings.groupby('userId')['rating'].sum() / ratings.groupby('userId')['rating'].count()

In [19]:
# find the average rating of each movie
movie_ave_rating = ratings.groupby('movieId')['rating'].sum() / ratings.groupby('movieId')['rating'].count()

In [20]:
print(user_ave_rating.shape)
print(movie_ave_rating.shape)
print(ratings.shape)
print("There are {} users that have given ratings".format(user_ave_rating.shape[0]))
print("There are {} movies with more than 5 rating".format(movie_ave_rating.shape[0]))
print("There are {} movie ratings where each unique movie has at least 5 ratings".format(ratings.shape[0]))

(610,)
(3650,)
(90274, 3)
There are 610 users that have given ratings
There are 3650 movies with more than 5 rating
There are 90274 movie ratings where each unique movie has at least 5 ratings


In [21]:
# convert series to dataframe
user_ave_rating = user_ave_rating.to_frame().reset_index()
# rename the column for more clarity
user_ave_rating.rename(columns={'rating':'user_ave_rating'}, inplace=True)
user_ave_rating

Unnamed: 0,userId,user_ave_rating
0,1,4.361233
1,2,3.981481
2,3,1.482143
3,4,3.542714
4,5,3.636364
...,...,...
605,606,3.662162
606,607,3.785714
607,608,3.137995
608,609,3.218750


In [22]:
# convert series to dataframe
movie_ave_rating = movie_ave_rating.to_frame().reset_index()
# rename the column for more clarity
movie_ave_rating.rename(columns={'rating':'movie_ave_rating'}, inplace=True)
movie_ave_rating

Unnamed: 0,movieId,movie_ave_rating
0,1,3.920930
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429
...,...,...
3645,180031,3.687500
3646,180985,3.000000
3647,183897,3.500000
3648,187593,3.875000


## Merge average ratings

In [23]:
genres_rows

Unnamed: 0,userId,movieId,rating,year,genres
0,1,1,4.0,1995,Adventure
1,1,1,4.0,1995,Animation
2,1,1,4.0,1995,Children
3,1,1,4.0,1995,Comedy
4,1,1,4.0,1995,Fantasy
...,...,...,...,...,...
251974,610,168248,5.0,2017,Crime
251975,610,168248,5.0,2017,Thriller
251976,610,168250,5.0,2017,Horror
251977,610,168252,5.0,2017,Action


In [24]:
# merge the table with movie genres to the table of average ratings of movies based on their movieId
genres_ave_rating = pd.merge(genres_rows, movie_ave_rating, how='left', on=['movieId', 'movieId'])
genres_ave_rating

Unnamed: 0,userId,movieId,rating,year,genres,movie_ave_rating
0,1,1,4.0,1995,Adventure,3.920930
1,1,1,4.0,1995,Animation,3.920930
2,1,1,4.0,1995,Children,3.920930
3,1,1,4.0,1995,Comedy,3.920930
4,1,1,4.0,1995,Fantasy,3.920930
...,...,...,...,...,...,...
251974,610,168248,5.0,2017,Crime,4.142857
251975,610,168248,5.0,2017,Thriller,4.142857
251976,610,168250,5.0,2017,Horror,3.633333
251977,610,168252,5.0,2017,Action,4.280000


## Filter out insignificant genres

In [25]:
# check number of genres
genres = list(genres_ave_rating.genres.unique())
genres.sort()
print(genres)
print(len(genres))

['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
20


In [26]:
# 'IMAX' and '(no genres listed)' would not contribute much to the recommendation system, so we drop these columns
# After dropping some rows, the indices would not be consecutively, so use .reset_index() to fix it 
genres_ave_rating = genres_ave_rating.drop(genres_ave_rating[(genres_ave_rating.genres == '(no genres listed)') | 
                                           (genres_ave_rating.genres == 'IMAX')].index).reset_index(drop=True)
# equivalently
# genres_rows = genres_rows[~((genres_rows.genres == '(no genres listed)') | (genres_rows.genres == 'IMAX'))]

In [27]:
# check number of genres
genres = list(genres_ave_rating.genres.unique())
genres.sort()
print(genres)
print(len(genres))

['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
18


In [28]:
genres_ave_rating

Unnamed: 0,userId,movieId,rating,year,genres,movie_ave_rating
0,1,1,4.0,1995,Adventure,3.920930
1,1,1,4.0,1995,Animation,3.920930
2,1,1,4.0,1995,Children,3.920930
3,1,1,4.0,1995,Comedy,3.920930
4,1,1,4.0,1995,Fantasy,3.920930
...,...,...,...,...,...,...
247895,610,168248,5.0,2017,Crime,4.142857
247896,610,168248,5.0,2017,Thriller,4.142857
247897,610,168250,5.0,2017,Horror,3.633333
247898,610,168252,5.0,2017,Action,4.280000


## Create one hot encoding for the genres

In [29]:
from sklearn.preprocessing import OneHotEncoder

# creating instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='error')

# perform one-hot encoding on 'genres' column 
encoder_df = pd.DataFrame(encoder.fit_transform(genres_ave_rating[['genres']]).toarray(), columns = genres)

# merge one-hot encoded columns back with original DataFrame
movie_data = genres_ave_rating.join(encoder_df)
# pd.concat([genres_rows, encoder_df], axis=1)

In [30]:
# make sure the one hot encoding matches the movie genres
movie_data

Unnamed: 0,userId,movieId,rating,year,genres,movie_ave_rating,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,1995,Adventure,3.920930,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,4.0,1995,Animation,3.920930,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,4.0,1995,Children,3.920930,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1,4.0,1995,Comedy,3.920930,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1,4.0,1995,Fantasy,3.920930,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247895,610,168248,5.0,2017,Crime,4.142857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
247896,610,168248,5.0,2017,Thriller,4.142857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
247897,610,168250,5.0,2017,Horror,3.633333,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
247898,610,168252,5.0,2017,Action,4.280000,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
movie_data = movie_data.drop('genres', axis=1)
movie_data = movie_data.drop('rating', axis=1)
movie_data = movie_data.drop('userId', axis=1)
# display all columns
pd.set_option('display.max_columns', None)
movie_data.head()

Unnamed: 0,movieId,year,movie_ave_rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.92093,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1995,3.92093,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1995,3.92093,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1995,3.92093,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1995,3.92093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Get actual rating (Y) data

In [32]:
y = genres_ave_rating['rating'].to_frame()
y

Unnamed: 0,rating
0,4.0
1,4.0
2,4.0
3,4.0
4,4.0
...,...
247895,5.0
247896,5.0
247897,5.0
247898,5.0


# Get user data

## Find the number of ratings and the average of genres ratings given by each user

In [33]:
# Find the number of ratings given by each user
rating_count = genres_ave_rating.groupby(['userId'])['rating'].count()

In [34]:
# Find the average of ratings for each genres by each user
user_ave = genres_ave_rating.groupby(['userId', 'genres'])['rating'].mean()

In [35]:
# convert series to dataframe
rating_count = rating_count.to_frame().reset_index()
# rename the column for more clarity
rating_count.rename(columns={'rating':'user_rating_count'}, inplace=True)
rating_count

Unnamed: 0,userId,user_rating_count
0,1,683
1,2,67
2,3,80
3,4,488
4,5,125
...,...,...
605,606,1958
606,607,509
607,608,2219
608,609,83


In [36]:
# convert series to dataframe
user_ave = user_ave.to_frame().reset_index()
# rename the column for more clarity
user_ave.rename(columns={'rating':'ave_rating_genres'}, inplace=True)
user_ave

Unnamed: 0,userId,genres,ave_rating_genres
0,1,Action,4.318182
1,1,Adventure,4.380952
2,1,Animation,4.678571
3,1,Children,4.536585
4,1,Comedy,4.283951
...,...,...,...
9483,610,Romance,3.725000
9484,610,Sci-Fi,3.710900
9485,610,Thriller,3.662890
9486,610,War,3.833333


In [37]:
# merge the average rating datas
user_data = pd.merge(user_ave, rating_count, how='left', on=['userId', 'userId'])
user_data = pd.merge(user_data, user_ave_rating, how='left', on=['userId', 'userId'])
user_data

Unnamed: 0,userId,genres,ave_rating_genres,user_rating_count,user_ave_rating
0,1,Action,4.318182,683,4.361233
1,1,Adventure,4.380952,683,4.361233
2,1,Animation,4.678571,683,4.361233
3,1,Children,4.536585,683,4.361233
4,1,Comedy,4.283951,683,4.361233
...,...,...,...,...,...
9483,610,Romance,3.725000,2701,3.759514
9484,610,Sci-Fi,3.710900,2701,3.759514
9485,610,Thriller,3.662890,2701,3.759514
9486,610,War,3.833333,2701,3.759514


## Create a pivot table where each row contains the data of a unique user

In [38]:
# .reset_index() make pivot table back to datafram
user_data = user_data.pivot_table('ave_rating_genres', ['userId', 'user_ave_rating', 'user_rating_count'], 
                                                  'genres', fill_value = 0.0).reset_index()
user_data

genres,userId,user_ave_rating,user_rating_count,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.00,4.523077,4.297872,5.000000,3.470588,4.681818,4.166667,4.307692,4.225000,4.129630,4.500000,4.285714
1,2,3.981481,67,3.954545,4.166667,0.000000,0.000000,4.000000,4.000000,4.00,4.000000,0.000000,0.000000,3.000000,0.000000,4.000000,4.500000,3.875000,3.888889,4.500000,3.500000
2,3,1.482143,80,2.833333,2.500000,0.500000,0.500000,0.500000,0.500000,0.00,0.500000,2.833333,0.000000,4.250000,0.500000,0.000000,0.500000,3.428571,3.625000,0.500000,0.000000
3,4,3.542714,488,3.333333,3.655172,4.000000,3.800000,3.489796,3.814815,5.00,3.467890,3.684211,4.000000,4.250000,4.000000,3.409091,3.314815,2.833333,3.513514,3.571429,3.800000
4,5,3.636364,125,3.111111,3.250000,4.333333,4.111111,3.466667,3.833333,0.00,3.800000,4.142857,0.000000,3.000000,4.400000,4.000000,3.090909,2.500000,3.555556,3.333333,3.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,606,3.662162,1958,3.221374,3.511278,3.743590,3.475000,3.531690,3.786408,3.75,3.803797,3.600000,3.785714,3.384615,3.840000,3.833333,3.721519,3.559211,3.549383,3.768293,3.583333
606,607,3.785714,509,3.718310,3.466667,3.333333,3.388889,3.327273,3.814815,0.00,4.012658,3.571429,0.000000,4.147059,3.600000,4.687500,3.517241,3.250000,4.118644,4.166667,4.000000
607,608,3.137995,2219,3.336364,3.220994,3.118182,2.460227,2.742733,3.607639,2.80,3.429104,3.000000,3.750000,3.340426,2.718750,3.550725,2.876238,3.310976,3.537402,3.578947,2.636364
608,609,3.218750,83,3.090909,3.200000,3.000000,3.000000,3.285714,3.400000,3.00,3.312500,3.000000,0.000000,3.500000,0.000000,0.000000,3.200000,3.000000,3.230769,3.333333,4.000000


## Merge the data to create user_data

In [39]:
genres_ave_rating

Unnamed: 0,userId,movieId,rating,year,genres,movie_ave_rating
0,1,1,4.0,1995,Adventure,3.920930
1,1,1,4.0,1995,Animation,3.920930
2,1,1,4.0,1995,Children,3.920930
3,1,1,4.0,1995,Comedy,3.920930
4,1,1,4.0,1995,Fantasy,3.920930
...,...,...,...,...,...,...
247895,610,168248,5.0,2017,Crime,4.142857
247896,610,168248,5.0,2017,Thriller,4.142857
247897,610,168250,5.0,2017,Horror,3.633333
247898,610,168252,5.0,2017,Action,4.280000


In [40]:
user_data  = pd.merge(genres_ave_rating, user_data, how='inner', on=['userId', 'userId'])

In [41]:
user_data = user_data.drop('genres', axis=1)
user_data = user_data.drop('rating', axis=1)
user_data = user_data.drop('movieId', axis=1)
user_data = user_data.drop('year', axis=1)
user_data = user_data.drop('movie_ave_rating', axis=1)
# display all columns
pd.set_option('display.max_columns', None)
user_data.head()

Unnamed: 0,userId,user_ave_rating,user_rating_count,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.0,4.523077,4.297872,5.0,3.470588,4.681818,4.166667,4.307692,4.225,4.12963,4.5,4.285714
1,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.0,4.523077,4.297872,5.0,3.470588,4.681818,4.166667,4.307692,4.225,4.12963,4.5,4.285714
2,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.0,4.523077,4.297872,5.0,3.470588,4.681818,4.166667,4.307692,4.225,4.12963,4.5,4.285714
3,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.0,4.523077,4.297872,5.0,3.470588,4.681818,4.166667,4.307692,4.225,4.12963,4.5,4.285714
4,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.0,4.523077,4.297872,5.0,3.470588,4.681818,4.166667,4.307692,4.225,4.12963,4.5,4.285714


In [42]:
movie_data.head()

Unnamed: 0,movieId,year,movie_ave_rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.92093,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1995,3.92093,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1995,3.92093,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1995,3.92093,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1995,3.92093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
y.head()

Unnamed: 0,rating
0,4.0
1,4.0
2,4.0
3,4.0
4,4.0


In [44]:
user_data.shape

(247900, 21)

In [45]:
movie_data.shape

(247900, 21)

In [46]:
y.shape

(247900, 1)

# Get movie_dict

In [47]:
# movie_dict = defaultdict(dict)
# count = 0
# with open('./data/content_movie_list.csv', newline='') as movie:
#     reader = csv.reader(movie, delimiter=',', quotechar='"')
#     for line in reader:
#         if count == 0:
#             count += 1  # skip header
#         else:
#             count += 1
#             movie_id = int(line[0])
#             movie_dict[movie_id]["title"] = line[1]
#             movie_dict[movie_id]["genres"] = line[2]

In [48]:
# movie_dict

# Get item_vecs

In [49]:
movies

Unnamed: 0,movieId,genres,year
0,1,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Adventure|Children|Fantasy,1995
2,3,Comedy|Romance,1995
3,4,Comedy|Drama|Romance,1995
4,5,Comedy,1995
...,...,...,...
3645,180031,Adventure|Drama|Fantasy,2017
3646,180985,Drama,2017
3647,183897,Animation|Comedy,2018
3648,187593,Action|Comedy|Sci-Fi,2018


In [50]:
movies = pd.merge(movies, movie_ave_rating, how='inner', on=['movieId', 'movieId'])
movies

Unnamed: 0,movieId,genres,year,movie_ave_rating
0,1,Adventure|Animation|Children|Comedy|Fantasy,1995,3.920930
1,2,Adventure|Children|Fantasy,1995,3.431818
2,3,Comedy|Romance,1995,3.259615
3,4,Comedy|Drama|Romance,1995,2.357143
4,5,Comedy,1995,3.071429
...,...,...,...,...
3645,180031,Adventure|Drama|Fantasy,2017,3.687500
3646,180985,Drama,2017,3.000000
3647,183897,Animation|Comedy,2018,3.500000
3648,187593,Action|Comedy|Sci-Fi,2018,3.875000


In [51]:
# split the movie genres in the column genres into multiple rows
movies = (movies.set_index(['movieId', 'year', 'movie_ave_rating'])).apply(lambda x: x.str.split('|').explode()).reset_index()
movies

Unnamed: 0,movieId,year,movie_ave_rating,genres
0,1,1995,3.92093,Adventure
1,1,1995,3.92093,Animation
2,1,1995,3.92093,Children
3,1,1995,3.92093,Comedy
4,1,1995,3.92093,Fantasy
...,...,...,...,...
9284,187593,2018,3.87500,Sci-Fi
9285,187595,2018,3.90000,Action
9286,187595,2018,3.90000,Adventure
9287,187595,2018,3.90000,Children


In [52]:
# 'IMAX' and '(no genres listed)' would not contribute much to the recommendation system, so we drop these columns
# After dropping some rows, the indices would not be consecutively, so use .reset_index() to fix it 
movies = movies.drop(movies[(movies.genres == '(no genres listed)') | 
                                           (movies.genres == 'IMAX')].index).reset_index(drop=True)

In [53]:
len(movies['genres'].unique())

18

In [54]:
# creating instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='error')

# perform one-hot encoding on 'genres' column 
encoder_df = pd.DataFrame(encoder.fit_transform(movies[['genres']]).toarray(), columns = genres)

# merge one-hot encoded columns back with original DataFrame
item_vecs = movies.join(encoder_df)
# pd.concat([genres_rows, encoder_df], axis=1)

In [55]:
item_vecs = item_vecs.drop('genres', axis=1)

In [56]:
item_vecs.head()

Unnamed: 0,movieId,year,movie_ave_rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.92093,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1995,3.92093,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1995,3.92093,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1995,3.92093,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1995,3.92093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
item_vecs.shape

(9167, 21)

In [58]:
# item_vecs = item_vecs.to_numpy()

# Get ratings

In [59]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
90269,610,166528,4.0
90270,610,166534,4.0
90271,610,168248,5.0
90272,610,168250,5.0


In [60]:
ratings.loc[ratings['userId'] == 1].loc[ratings['movieId'] == 50].iloc[0]['rating']

5.0

In [61]:
movie_id = list(ratings.loc[ratings['userId'] == 1]['movieId'])

In [62]:
3 in movie_id

True

In [63]:
user = ratings['userId'].unique()

In [64]:
len(user)

610

In [65]:
user_data

Unnamed: 0,userId,user_ave_rating,user_rating_count,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.00,4.523077,4.297872,5.0000,3.470588,4.681818,4.166667,4.307692,4.2250,4.12963,4.500000,4.285714
1,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.00,4.523077,4.297872,5.0000,3.470588,4.681818,4.166667,4.307692,4.2250,4.12963,4.500000,4.285714
2,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.00,4.523077,4.297872,5.0000,3.470588,4.681818,4.166667,4.307692,4.2250,4.12963,4.500000,4.285714
3,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.00,4.523077,4.297872,5.0000,3.470588,4.681818,4.166667,4.307692,4.2250,4.12963,4.500000,4.285714
4,1,4.361233,683,4.318182,4.380952,4.678571,4.536585,4.283951,4.333333,0.00,4.523077,4.297872,5.0000,3.470588,4.681818,4.166667,4.307692,4.2250,4.12963,4.500000,4.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247895,610,3.759514,2701,3.630273,3.743697,3.919643,3.755556,3.769841,3.815534,4.25,3.926075,3.632479,4.5625,3.616279,3.944444,3.956250,3.725000,3.7109,3.66289,3.833333,3.895833
247896,610,3.759514,2701,3.630273,3.743697,3.919643,3.755556,3.769841,3.815534,4.25,3.926075,3.632479,4.5625,3.616279,3.944444,3.956250,3.725000,3.7109,3.66289,3.833333,3.895833
247897,610,3.759514,2701,3.630273,3.743697,3.919643,3.755556,3.769841,3.815534,4.25,3.926075,3.632479,4.5625,3.616279,3.944444,3.956250,3.725000,3.7109,3.66289,3.833333,3.895833
247898,610,3.759514,2701,3.630273,3.743697,3.919643,3.755556,3.769841,3.815534,4.25,3.926075,3.632479,4.5625,3.616279,3.944444,3.956250,3.725000,3.7109,3.66289,3.833333,3.895833


# Export the dataframes to csv for future training

In [66]:
user_data.to_csv('ml-latest-small/user_data.csv', sep=',', index=False)

In [67]:
movie_data.to_csv('ml-latest-small/movie_data.csv', sep=',', index=False)

In [68]:
y.to_csv('ml-latest-small/y.csv', sep=',', index=False)

In [69]:
item_vecs.to_csv('ml-latest-small/item_vecs.csv', sep=',', index=False)