In [44]:
##Import the libraries 
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F
from sklearn import preprocessing
import torch.utils.data

In [45]:
# Read both CSV files, keeping only the columns this notebook works with.
# NOTE(review): a later output shows the title "AlienÂ³", which looks like
# UTF-8 text decoded as latin-1 — confirm the files' actual encoding.
movies = pd.read_csv(
    'movies.csv',
    sep=',',
    encoding='latin-1',
    usecols=['movieid', 'title', 'genre'],
)
ratings = pd.read_csv(
    'ratings.csv',
    sep=',',
    encoding='latin-1',
    usecols=['userid', 'movieid', 'rating'],
)

In [46]:
# Preview the first five rows of the freshly loaded movies table.
movies.head()

Unnamed: 0,movieid,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [47]:
# Pull the release year, still wrapped in parentheses, out of each title.
# Matching the parentheses as well keeps us from picking up four-digit numbers
# that are part of the title itself.
# The pattern is a raw string so that `\(` and `\d` reach the regex engine
# untouched instead of being parsed as (invalid) Python string escapes.
movies['year'] = movies.title.str.extract(r'(\(\d\d\d\d\))', expand=False)
movies.head(3)

Unnamed: 0,movieid,title,genre,year
0,1,Toy Story (1995),Animation|Children's|Comedy,(1995)
1,2,Jumanji (1995),Adventure|Children's|Fantasy,(1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,(1995)


In [48]:
# Keep only the four digits of the year, discarding the surrounding
# parentheses. Raw string: `\d` must be seen by the regex engine, not treated
# as an (invalid) Python string escape.
movies['year'] = movies.year.str.extract(r'(\d\d\d\d)', expand=False)
movies.head(3)

Unnamed: 0,movieid,title,genre,year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995


In [49]:
# Remove the "(YYYY)" suffix from the 'title' column.
# regex=True is passed explicitly: the default for Series.str.replace changed
# to a literal (non-regex) match in pandas 2.0, under which this pattern would
# match nothing and the years would be left in the titles.
movies['title'] = movies.title.str.replace(r'(\(\d\d\d\d\))', '', regex=True)
movies.head(3)

  movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')


Unnamed: 0,movieid,title,genre,year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995


In [50]:
# Trim leading/trailing whitespace left behind once the year was removed.
# The vectorised .str.strip() replaces a per-row Python lambda and passes
# missing values through instead of raising on them.
movies['title'] = movies['title'].str.strip()
movies.head()

Unnamed: 0,movieid,title,genre,year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995


In [51]:
# Genres arrive as a single '|'-delimited string per movie; break each one
# into a list of genre names.
movies['genre'] = movies['genre'].str.split(pat='|')
movies.head()

Unnamed: 0,movieid,title,genre,year
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [52]:
# Summarise the columns: non-null counts, dtypes and memory usage.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieid  3883 non-null   int64 
 1   title    3883 non-null   object
 2   genre    3883 non-null   object
 3   year     3883 non-null   object
dtypes: int64(1), object(3)
memory usage: 121.5+ KB


In [53]:
# Downcast the movieid column from int64 to int32 to save memory.
movies.movieid = movies.movieid.astype('int32')

In [54]:
# Count missing values per column.
movies.isna().sum()


movieid    0
title      0
genre      0
year       0
dtype: int64

In [55]:
# Convert the year column from strings (object dtype) to int16. Four-digit
# years fit comfortably in 16 bits, so this also saves memory.
movies.year = movies.year.astype('int16')

In [56]:
# Confirm the dtypes after the two casts above.
movies.dtypes

movieid     int32
title      object
genre      object
year        int16
dtype: object

In [57]:
# Preview the cleaned movies table.
movies.head()

Unnamed: 0,movieid,title,genre,year
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama]",1995
4,5,Father of the Bride Part II,[Comedy],1995


Now, let's one-hot encode the list of genres. This encoding is needed to work with categorical data numerically. In this case, we store each distinct genre in its own column containing either 1 or 0: 1 shows that a movie has that genre and 0 shows that it doesn't. Let's also store this dataframe in a separate variable, just in case we need the one without genre columns at some point.

In [58]:
# Work on a deep copy so that `movies` keeps its original columns.
movies_with_genres = movies.copy(deep=True)

# One entry per (movie, genre) pair; each entry keeps the row label of the
# movie it came from. Missing values are dropped so they never become a column.
genre_pairs = movies['genre'].explode().dropna()

# Add one indicator column per genre, in order of first appearance: 1.0 where
# the movie has that genre, NaN everywhere else. Iterating over the distinct
# genres replaces the original row-by-row iterrows() loop over every movie.
genre_names = genre_pairs.unique().tolist()
for genre in genre_names:
    rows_with_genre = genre_pairs.index[genre_pairs == genre]
    movies_with_genres.loc[rows_with_genre, genre] = 1.0

# Confirm that every movie received at least one genre flag
print(movies_with_genres[genre_names].notna().any(axis=1).all())

movies_with_genres.head(5)

True


Unnamed: 0,movieid,title,genre,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,"[Animation, Children's, Comedy]",1995,1.0,1.0,1.0,,,,...,,,,,,,,,,
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995,,1.0,,1.0,1.0,,...,,,,,,,,,,
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,,,1.0,,,1.0,...,,,,,,,,,,
3,4,Waiting to Exhale,"[Comedy, Drama]",1995,,,1.0,,,,...,,,,,,,,,,
4,5,Father of the Bride Part II,[Comedy],1995,,,1.0,,,,...,,,,,,,,,,


In [59]:
# A NaN in a genre column means the movie does not belong to that genre;
# replace every NaN with 0 so the columns become plain 0/1 indicators.
movies_with_genres = movies_with_genres.fillna(value=0)
movies_with_genres.head(n=5)

Unnamed: 0,movieid,title,genre,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,"[Animation, Children's, Comedy]",1995,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama]",1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
# Preview the first five rows of the ratings table.
ratings.head()

Unnamed: 0,userid,movieid,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


# Content Based recommender System
Now, let's implement a Content-Based (or Item-Item) recommendation system. This technique attempts to figure out which aspects of an item a user likes most, and then recommends items that share those aspects.

Let's begin by creating an input user to recommend movies to. The user's name will be Lawrence, and we will assume Lawrence has rated the following movies with the following ratings:

Note: To add more movies, simply add more elements to the `Lawrence_movie_ratings` list. Feel free to add more! Just be sure to capitalise each title exactly as it appears in the dataset, and if a movie starts with "The", like "The Matrix", write it like this: 'Matrix, The'.

Step 1: Creating Lawrence's Profile

In [61]:
# Lawrence's ratings on a scale of 0 (minimum) to 5 (maximum), written as
# (title, rating) pairs and then loaded into a two-column DataFrame.
_lawrence_rated = [
    ('Predator', 4.9),
    ('Final Destination', 4.9),
    ('Mission Impossible', 4),
    ("Beverly Hills Cop", 3),
    ('Exorcist, The', 4.8),
    ('Waiting to Exhale', 3.9),
    ('Avengers, The', 4.5),
    ('Omen, The', 5.0),
]
Lawrence_movie_ratings = pd.DataFrame(_lawrence_rated, columns=['title', 'rating'])
Lawrence_movie_ratings

Unnamed: 0,title,rating
0,Predator,4.9
1,Final Destination,4.9
2,Mission Impossible,4.0
3,Beverly Hills Cop,3.0
4,"Exorcist, The",4.8
5,Waiting to Exhale,3.9
6,"Avengers, The",4.5
7,"Omen, The",5.0


**Add movieId to the input user.** With the input complete, let's extract the input movies' IDs from the movies dataframe and add them to it.

We can achieve this by first filtering out the rows that contain the input movie's title and then merging this subset with the input dataframe. We also drop unnecessary columns for the input to save memory space.

In [62]:
# Look up Lawrence's movies in the catalogue by title. isin() is an exact,
# case-sensitive comparison against the cleaned titles.
Lawrence_movie_Id = movies[movies['title'].isin(Lawrence_movie_ratings['title'])]

# Titles with no exact match disappear in the inner merge below, so list them
# for the reader instead of dropping them without notice.
unmatched_titles = Lawrence_movie_ratings.loc[
    ~Lawrence_movie_ratings['title'].isin(movies['title']), 'title'
].tolist()
if unmatched_titles:
    print('Titles not found in movies (dropped):', unmatched_titles)

# Attach the ratings to the matched catalogue rows. The join key is spelled
# out rather than left to pandas' "all shared column names" default, so the
# merge cannot silently change meaning if either frame gains another column.
Lawrence_movie_ratings = pd.merge(
    Lawrence_movie_Id, Lawrence_movie_ratings, on='title', how='inner'
)

# Display the merged and updated data frame.
Lawrence_movie_ratings

Unnamed: 0,movieid,title,genre,year,rating
0,4,Waiting to Exhale,"[Comedy, Drama]",1995,3.9
1,1350,"Omen, The",[Horror],1976,5.0
2,1997,"Exorcist, The",[Horror],1973,4.8
3,2153,"Avengers, The","[Action, Adventure]",1998,4.5
4,3409,Final Destination,"[Drama, Thriller]",2000,4.9
5,3527,Predator,"[Action, Sci-Fi, Thriller]",1987,4.9


In [64]:
# Keep only the columns needed downstream. `columns=` replaces the positional
# axis argument (`drop([...], 1)`), which pandas deprecated and later removed.
Lawrence_movie_ratings = Lawrence_movie_ratings.drop(columns=['genre', 'year'])


# Final input dataframe.
# If a movie you added above isn't here, it might not be in the original
# dataframe or it might be spelled differently; please check capitalisation.
Lawrence_movie_ratings

Unnamed: 0,movieid,title,rating
0,4,Waiting to Exhale,3.9
1,1350,"Omen, The",5.0
2,1997,"Exorcist, The",4.8
3,2153,"Avengers, The",4.5
4,3409,Final Destination,4.9
5,3527,Predator,4.9


Step 2: Learning Lawrence's Profile

We're going to start by learning the input's preferences, so let's get the subset of movies that the input has watched from the Dataframe containing genres defined with binary values.

In [66]:
# Keep only the one-hot rows for movies Lawrence has rated (matched on movieid).
# .copy() makes this an independent frame, so modifying it later does not
# trigger pandas' SettingWithCopyWarning about writing to a slice of
# movies_with_genres.
Lawrence_genres_df = movies_with_genres[
    movies_with_genres.movieid.isin(Lawrence_movie_ratings.movieid)
].copy()
Lawrence_genres_df

Unnamed: 0,movieid,title,genre,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
3,4,Waiting to Exhale,"[Comedy, Drama]",1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1329,1350,"Omen, The",[Horror],1976,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1928,1997,"Exorcist, The",[Horror],1973,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2084,2153,"Avengers, The","[Action, Adventure]",1998,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3340,3409,Final Destination,"[Drama, Thriller]",2000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3458,3527,Predator,"[Action, Sci-Fi, Thriller]",1987,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
# Reset to a default 0..n-1 index (discarding the old labels) and drop the
# non-genre columns. The result is bound back to the name instead of using
# inplace=True: Lawrence_genres_df was produced by filtering
# movies_with_genres, and mutating such a frame in place is what raises
# pandas' SettingWithCopyWarning.
Lawrence_genres_df = (
    Lawrence_genres_df
    .reset_index(drop=True)
    .drop(['movieid', 'title', 'genre', 'year'], axis=1)
)

# Let's view the changes

Lawrence_genres_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


Step 3: Building Lawrence's Profile
To do this, we're going to turn each genre into a weight by multiplying the Lawrence_genres_df table by Lawrence's movie ratings and then summing the resulting table by column. This operation is actually a dot product between a matrix and a vector. First, let's confirm the shapes of the data frames we have recently defined.

In [69]:
# Confirm the shapes of both data frames before the matrix multiplication.

print('Shape of Lawrence_movie_ratings is:',Lawrence_movie_ratings.shape)
print('Shape of Lawrence_genres_df is:',Lawrence_genres_df.shape)

# The profile is built by pairing each rated movie with one row of genre
# flags, so the two frames must have the same number of rows. Fail here, with
# a clear message, rather than inside the dot product.
assert len(Lawrence_movie_ratings) == len(Lawrence_genres_df), (
    'Lawrence_movie_ratings and Lawrence_genres_df must have one row per rated movie'
)

Shape of Lawrence_movie_ratings is: (6, 3)
Shape of Lawrence_genres_df is: (6, 18)


In [70]:
# Transposing puts genres on the rows and Lawrence's movies on the columns;
# the dot product with his rating column then yields, for each genre, the sum
# of the ratings of the movies carrying that genre.
genres_by_movie = Lawrence_genres_df.transpose()
Lawrence_profile = genres_by_movie.dot(Lawrence_movie_ratings['rating'])

# Show the resulting per-genre weights
Lawrence_profile

Animation      0.0
Children's     0.0
Comedy         3.9
Adventure      4.5
Fantasy        0.0
Romance        0.0
Drama          8.8
Action         9.4
Crime          0.0
Thriller       9.8
Horror         9.8
Sci-Fi         4.9
Documentary    0.0
War            0.0
Musical        0.0
Mystery        0.0
Film-Noir      0.0
Western        0.0
dtype: float64

Just by Eye-balling his profile, it is clear that Lawrence loves 'Thriller', 'Action' and 'Horror' movies the most… apt as can be.
Now, we have the weights for all his preferences. This is known as the User Profile. We can now recommend movies that satisfy Lawrence.
Let's start by editing the original movies_with_genres data frame that contains all movies and their genres columns.

Step 4: Deploying The Content-Based Recommender System.

In [71]:
# Use movieid as the row index; drop=False keeps the movieid column in the
# frame as well (it is removed in a later step).
movies_with_genres = movies_with_genres.set_index('movieid', drop=False)

# Preview the re-indexed frame
movies_with_genres.head()

Unnamed: 0_level_0,movieid,title,genre,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,Toy Story,"[Animation, Children's, Comedy]",1995,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Jumanji,"[Adventure, Children's, Fantasy]",1995,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Waiting to Exhale,"[Comedy, Drama]",1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Strip the four descriptive columns so that only the genre indicator
# columns remain.
movies_with_genres = movies_with_genres.drop(
    columns=['movieid', 'title', 'genre', 'year']
)

# Preview the genre-only frame
movies_with_genres.head()

Unnamed: 0_level_0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
# Score every movie: sum the profile weights of the genres it carries, then
# normalise by the total profile weight to get a weighted average.
genre_weight_totals = movies_with_genres.dot(Lawrence_profile)
recommendation_table_df = genre_weight_totals / Lawrence_profile.sum()

# Preview the first few scores
recommendation_table_df.head()

movieid
1    0.076321
2    0.088063
3    0.076321
4    0.248532
5    0.076321
dtype: float64

In [74]:
# Order the scores from highest to lowest.
recommendation_table_df = recommendation_table_df.sort_values(ascending=False)

# Peek at the 20 highest scores
recommendation_table_df.head(20)

movieid
1320    0.663405
1214    0.663405
2288    0.663405
2617    0.655577
1876    0.643836
70      0.643836
2344    0.636008
1215    0.636008
2826    0.567515
610     0.559687
2916    0.559687
1127    0.559687
1544    0.559687
2322    0.559687
1591    0.559687
849     0.559687
1129    0.559687
1917    0.559687
2488    0.555773
591     0.547945
dtype: float64

In [77]:
# Deep-copy the movies frame and index the copy by movieid, so that rows can
# be looked up by id without altering `movies` itself.
copy = movies.copy(deep=True).set_index('movieid', drop=True)

# The scores are already sorted in descending order, so the first 20 index
# labels are the ids of the 20 best-scoring movies.
top_20_index = recommendation_table_df.head(20).index.tolist()

# Fetch those movies' rows, in the same order as the ids.
recommended_movies = copy.loc[top_20_index]

# Display the recommendations in descending order of preference
recommended_movies

Unnamed: 0_level_0,title,genre,year
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1320,AlienÂ³,"[Action, Horror, Sci-Fi, Thriller]",1992
1214,Alien,"[Action, Horror, Sci-Fi, Thriller]",1979
2288,"Thing, The","[Action, Horror, Sci-Fi, Thriller]",1982
2617,"Mummy, The","[Action, Adventure, Horror, Thriller]",1999
1876,Deep Impact,"[Action, Drama, Sci-Fi, Thriller]",1998
70,From Dusk Till Dawn,"[Action, Comedy, Crime, Horror, Thriller]",1996
2344,Runaway Train,"[Action, Adventure, Drama, Thriller]",1985
1215,Army of Darkness,"[Action, Adventure, Comedy, Horror, Sci-Fi]",1993
2826,"13th Warrior, The","[Action, Horror, Thriller]",1999
610,Heavy Metal,"[Action, Adventure, Animation, Horror, Sci-Fi]",1981
