# Content Based Filtering

In [2]:
import pandas as pd
import numpy as np

In [3]:
movie_data = pd.read_csv("Data\movies.csv")
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# remove the year from title columns and store it in new column
movie_data["year"] = movie_data['title'].str.extract('(\(\d\d\d\d\))', expand=False)
movie_data["year"] = movie_data['year'].str.extract('(\d\d\d\d)', expand=False)     #Removing the parentheses
movie_data['title'] = movie_data.title.str.replace('(\(\d\d\d\d\))', '')        #Removing the years from the title
#movie_data['title'] = movie_data['title'].apply(lambda x: x.strip())
movie_data['title'] = movie_data.title.str.strip()
movie_data.head()

  movie_data['title'] = movie_data.title.str.replace('(\(\d\d\d\d\))', '')        #Removing the years from the title


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [5]:
# save the dataframe for collaborative filtering
#movie_data.to_csv('movie_data.csv')

In [6]:
# convert genres column into list
movie_data['genres'] = movie_data.genres.str.split('|')
movie_data.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


genres in a list format isn't optimal for the content-based recommendation system.

convert the list of genres to a vector where each column corresponds to one possible value of the feature.

This is needed for feeding categorical data, every different genre in columns contain either 1 or 0.

1 shows that a movie has that genre and 0 shows that it doesn't.

In [7]:
movie_genres = movie_data.copy()

#For every row in the movie_data, iterate through the list of genres and place a 1 into the corresponding column
for index, row in movie_data.iterrows():
    for genre in row['genres']:
        movie_genres.at[index, genre] = 1

movie_genres.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,,...,,,,,,,,,,
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,,1.0,,1.0,,...,,,,,,,,,,
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,,,,1.0,,1.0,...,,,,,,,,,,
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,,,,1.0,,1.0,...,,,,,,,,,,
4,5,Father of the Bride Part II,[Comedy],1995,,,,1.0,,,...,,,,,,,,,,


In [8]:
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
movie_genres = movie_genres.fillna(0)
movie_genres.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### creating an input user to recommend movies to

In [9]:
userInput = [
            {'title':'Batman: The Dark Knight Returns, Part 1', 'rating':5},
            {'title':'3 Idiots', 'rating':5},
            {'title':'Rockstar', 'rating':3.5},
            {'title':'Toy Story', 'rating':2},
            {'title':"Iron Man", 'rating':5},
            {'title':"Pirates of the Caribbean: At World's End", 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Batman: The Dark Knight Returns, Part 1",5.0
1,3 Idiots,5.0
2,Rockstar,3.5
3,Toy Story,2.0
4,Iron Man,5.0
5,Pirates of the Caribbean: At World's End,4.5


#### Add movieId to input user
##### extract the input movie's ID's from the movie_data dataframe and add them into imputMovies.
##### first filter out the rows that contain the input movie's title and then merge with the inputMovies dataframe.

In [10]:
#Filtering out the movies by title
input_movieid = movie_data[movie_data['title'].isin(inputMovies['title'].tolist())]

# merge the both dataframe
inputMovies = pd.merge(input_movieid, inputMovies)
inputMovies


Unnamed: 0,movieId,title,genres,year,rating
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,2.0
1,53125,Pirates of the Caribbean: At World's End,"[Action, Adventure, Comedy, Fantasy]",2007,4.5
2,59315,Iron Man,"[Action, Adventure, Sci-Fi]",2008,5.0
3,73881,3 Idiots,"[Comedy, Drama, Romance]",2009,5.0
4,98124,"Batman: The Dark Knight Returns, Part 1","[Action, Animation, Sci-Fi]",2012,5.0
5,135601,Rockstar,"[Drama, Romance]",2011,3.5


In [11]:
# drop unwanted columns
inputMovies = inputMovies.drop(['genres', 'year'], axis = 1)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,2.0
1,53125,Pirates of the Caribbean: At World's End,4.5
2,59315,Iron Man,5.0
3,73881,3 Idiots,5.0
4,98124,"Batman: The Dark Knight Returns, Part 1",5.0
5,135601,Rockstar,3.5


In [12]:
# filtering out the movie from main data
user_movie = movie_genres[movie_genres['movieId'].isin(inputMovies['movieId'].tolist())]
user_movie

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11885,53125,Pirates of the Caribbean: At World's End,"[Action, Adventure, Comedy, Fantasy]",2007,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12646,59315,Iron Man,"[Action, Adventure, Sci-Fi]",2008,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14801,73881,3 Idiots,"[Comedy, Drama, Romance]",2009,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19930,98124,"Batman: The Dark Knight Returns, Part 1","[Action, Animation, Sci-Fi]",2012,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29705,135601,Rockstar,"[Drama, Romance]",2011,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# resetting index to avoid future issues
user_movie = user_movie.reset_index(drop=True)

#dropping unwanted columns
user_movie = user_movie.drop(['genres', 'year', 'movieId', 'title'], axis=1)
user_movie

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
inputMovies.rating

0    2.0
1    4.5
2    5.0
3    5.0
4    5.0
5    3.5
Name: rating, dtype: float64

##### get weights for every of the user's preferences. 
##### This is known as the User Profile. 
##### Using this we can recommend movies that satisfy the user's preferences.

In [15]:
# Dot product to gets weights
user_profile = user_movie.transpose().dot(inputMovies['rating'])
user_profile

Adventure             11.5
Animation              7.0
Children               2.0
Comedy                11.5
Fantasy                6.5
Romance                8.5
Drama                  8.5
Action                14.5
Crime                  0.0
Thriller               0.0
Horror                 0.0
Mystery                0.0
Sci-Fi                10.0
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [16]:
#get the genres of every movie in our movie_genres
genres_df = movie_genres.set_index(movie_genres['movieId'])

genres_df = genres_df.drop(['genres', 'year', 'movieId', 'title'], axis=1)
genres_df.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# multiply the genres by the weights and then take the weights average.
weighted_movie_matrix = genres_df * user_profile
weighted_movie_matrix.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,11.5,7.0,2.0,11.5,6.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11.5,0.0,2.0,0.0,6.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,11.5,0.0,8.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,11.5,0.0,8.5,8.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,11.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# weighted avg
recommendation_matrix = (weighted_movie_matrix.sum(axis=1))/(user_profile.sum())
recommendation_matrix.head()

movieId
1    0.48125
2    0.25000
3    0.25000
4    0.35625
5    0.14375
dtype: float64

In [19]:
# sort the recommendation matrix into descending order, its our recommenadation dataframe

recommendation_matrix = recommendation_matrix.sort_values(ascending=False)
recommendation_matrix.head()

movieId
27344    0.85000
62956    0.76250
52462    0.76250
49593    0.74375
71999    0.74375
dtype: float64

## Final Recommendation table

In [20]:
recommendation_df = movie_data.loc[movie_data['movieId'].isin(recommendation_matrix.head(10).keys())]
recommendation_df

Unnamed: 0,movieId,title,genres,year
9296,27344,Revolutionary Girl Utena: Adolescence of Utena...,"[Action, Adventure, Animation, Comedy, Drama, ...",1999
10575,40339,Chicken Little,"[Action, Adventure, Animation, Children, Comed...",2005
11497,49593,She,"[Action, Adventure, Drama, Fantasy, Horror, Ro...",1965
11785,52287,Meet the Robinsons,"[Action, Adventure, Animation, Children, Comed...",2007
11806,52462,Aqua Teen Hunger Force Colon Movie Film for Th...,"[Action, Adventure, Animation, Comedy, Fantasy...",2007
12021,54278,Underdog,"[Action, Adventure, Children, Comedy, Fantasy,...",2007
13109,62956,Futurama: Bender's Game,"[Action, Adventure, Animation, Comedy, Fantasy...",2008
14397,71999,Aelita: The Queen of Mars (Aelita),"[Action, Adventure, Drama, Fantasy, Romance, S...",1924
16884,85261,Mars Needs Moms,"[Action, Adventure, Animation, Children, Comed...",2011
22145,106240,Free Birds,"[Action, Adventure, Animation, Children, Comed...",2013
