In [1]:
import pandas as pd
from math import sqrt
import numpy as np

In [2]:
# Read the raw movie catalogue from its CSV file into a DataFrame.
movies_csv_path = 'E:/Projects/Content Based Movie Recommendation System/movies.csv'
movies_df = pd.read_csv(movies_csv_path)

In [3]:
# Split the release year out of the title.
# The pattern requires the surrounding parentheses (but captures only the four
# digits) so that titles which merely contain a four-digit number are not
# mistaken for a year. Raw strings keep the backslashes intact for the regex
# engine instead of relying on Python passing through unknown escapes.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)

# Remove every "(dddd)" group from the title, then trim the whitespace that is
# left behind. regex=True is passed explicitly: newer pandas versions treat the
# pattern as a literal string by default, which would leave the years in place.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)

movies_df

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
34203,151697,Grand Slam,Thriller,1967
34204,151701,Bloodmoney,(no genres listed),2010
34205,151703,The Butterfly Circus,Drama,2009
34206,151709,Zero,Drama|Sci-Fi,2015


In [4]:
# Genres are stored as one '|'-delimited string per movie; turn each string
# into a list of individual genre names.
movies_df['genres'] = movies_df['genres'].str.split(pat='|')

# Work on an independent copy from here on, so movies_df itself keeps only the
# list-valued 'genres' column and the original metadata.
moviesWithGenres_df = movies_df.copy(deep=True)

moviesWithGenres_df

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995
...,...,...,...,...
34203,151697,Grand Slam,[Thriller],1967
34204,151701,Bloodmoney,[(no genres listed)],2010
34205,151703,The Butterfly Circus,[Drama],2009
34206,151709,Zero,"[Drama, Sci-Fi]",2015


In [5]:
# One-hot encode the genre lists: one column per genre, 1.0 where the movie
# has that genre and 0.0 where it does not.
#
# This is done in a vectorised way instead of looping over rows and writing one
# cell at a time, which is very slow on a catalogue of this size.

# One row per (movie, genre) pair, keeping the movie's index label. Movies with
# an empty or missing genre list produce NaN here and are dropped.
genre_long = movies_df['genres'].explode().dropna()

# Genre columns in order of first appearance in the catalogue.
genre_columns = list(pd.unique(genre_long))

genre_flags = (
    pd.get_dummies(genre_long, dtype=float)
    # Collapse the (movie, genre) rows back to a single row per movie.
    .groupby(level=0)
    .max()
    # Restore movies without any genre and fix the column order; absent
    # entries mean "movie does not have this genre", hence 0.0.
    .reindex(index=movies_df.index, columns=genre_columns, fill_value=0.0)
)

# Only the genre columns are zero-filled; missing values in other columns
# (e.g. a title with no year) are left as missing rather than turned into 0.
# Dropping any pre-existing genre columns first keeps the cell safe to re-run.
moviesWithGenres_df = (
    moviesWithGenres_df
    .drop(columns=genre_columns, errors='ignore')
    .join(genre_flags)
)
moviesWithGenres_df

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34203,151697,Grand Slam,[Thriller],1967,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34204,151701,Bloodmoney,[(no genres listed)],2010,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34205,151703,The Butterfly Circus,[Drama],2009,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34206,151709,Zero,"[Drama, Sci-Fi]",2015,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Load the user's rated movies and a fresh, unmodified copy of the catalogue.
# The raw catalogue is used for the lookup because movies_df has had its
# titles altered by the earlier cleanup.
# NOTE(review): assumes the titles in the spreadsheet match the raw catalogue
# titles exactly — confirm against the input file.
inputMovies = pd.read_excel('C:/Users/swapn/OneDrive/Desktop/Content.xlsx')
df = pd.read_csv('E:/Projects/Content Based Movie Recommendation System/movies.csv')
inputMovies.columns = ['movieId', 'title', 'rating']

# The spreadsheet's own id column is discarded; the catalogue's movieId is
# attached via the merge below. `columns=` is used because passing the axis
# positionally is no longer accepted by current pandas.
inputMovies = inputMovies.drop(columns='movieId')

# Catalogue rows whose title appears in the user's input
inputId = df[df['title'].isin(inputMovies['title'].tolist())]

# Attach the catalogue movieId to each rating. 'title' is the only column the
# two frames share at this point, so it is named explicitly as the join key.
inputMovies = pd.merge(inputId, inputMovies, on='title')

# Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop(columns='genres')

# Strip the "(dddd)" year suffix from the title and trim leftover whitespace,
# so titles look the same as in movies_df. regex=True is explicit because
# newer pandas versions default to literal matching.
inputMovies['title'] = (
    inputMovies['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)

inputMovies


Unnamed: 0,movieId,title,rating
0,11,"American President, The",3.2
1,16,Casino,2.8
2,18,Four Rooms,4.0
3,20,Money Train,3.6


In [7]:
# Select the one-hot encoded catalogue rows for the movies the user rated
rated_ids = inputMovies['movieId'].tolist()
is_rated = moviesWithGenres_df['movieId'].isin(rated_ids)

# Renumber the selected rows 0..n-1 so the old catalogue index is discarded
userMovies = moviesWithGenres_df[is_rated].reset_index(drop=True)
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,11,"American President, The","[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16,Casino,"[Crime, Drama]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,18,Four Rooms,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20,Money Train,"[Action, Comedy, Crime, Drama, Thriller]",1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Drop the non-genre columns so only the one-hot genre matrix remains.
# `columns=` is used because passing the axis positionally is no longer
# accepted by current pandas.
userGenreTable = userMovies.drop(columns=['movieId', 'title', 'genres', 'year'])

# Dot product of (genres x movies) with the user's ratings: each genre's weight
# is the sum of the ratings of the input movies that carry that genre.
# NOTE(review): ratings are matched to movies by row label (0..n-1), which
# presumes userMovies and inputMovies list the same movies in the same order —
# confirm there are no duplicate titles in the input.
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])
userProfile

Adventure              0.0
Animation              0.0
Children               0.0
Comedy                10.8
Fantasy                0.0
Romance                3.2
Drama                  9.6
Action                 3.6
Crime                  6.4
Thriller               3.6
Horror                 0.0
Mystery                0.0
Sci-Fi                 0.0
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [9]:
# Genre matrix for every movie in the catalogue, indexed by movieId.
# set_index('movieId') moves the column into the index; the remaining
# non-genre columns are dropped by name (`columns=` is used because passing the
# axis positionally is no longer accepted by current pandas).
genreTable = (
    moviesWithGenres_df
    .set_index('movieId')
    .drop(columns=['title', 'genres', 'year'])
)

# Weight each movie's genres by the user profile and normalise by the total
# profile weight, giving a score between 0 and 1 per movie.
recommendationTable_df = (genreTable * userProfile).sum(axis=1) / userProfile.sum()

# Sort our recommendations in descending order
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)

# Same scores as a single-column DataFrame, for merging with movie metadata
rdf = pd.DataFrame(recommendationTable_df)
rdf.columns = ['Predicted Rating']

rdf

Unnamed: 0_level_0,Predicted Rating
movieId,Unnamed: 1_level_1
127341,1.000000
76153,1.000000
75408,1.000000
4719,1.000000
1432,0.913978
...,...
128544,0.000000
99103,0.000000
99119,0.000000
128540,0.000000


In [10]:
# Build the final recommendation table from the 20 highest-scoring movies
top_ids = recommendationTable_df.head(20).index
final_df = movies_df.loc[movies_df['movieId'].isin(top_ids)]

# Attach title / genres / year to the scores, matching on movieId
result = rdf.merge(final_df, on='movieId')

# Rescale the score by a factor of 10 before exporting
result['Predicted Rating'] = result['Predicted Rating'].mul(10)
result.to_excel("C:/Users/swapn/OneDrive/Desktop/Content Results.xlsx")
result

Unnamed: 0,movieId,Predicted Rating,title,genres,year
0,127341,10.0,Longshot,"[Action, Comedy, Crime, Drama, Romance, Thriller]",2001
1,76153,10.0,Lupin III: First Contact (Rupan Sansei: Faasut...,"[Action, Animation, Comedy, Crime, Drama, Myst...",2002
2,75408,10.0,Lupin III: Sweet Lost Night (Rupan Sansei: Swe...,"[Action, Animation, Comedy, Crime, Drama, Myst...",2008
3,4719,10.0,Osmosis Jones,"[Action, Animation, Comedy, Crime, Drama, Roma...",2001
4,1432,9.139785,Metro,"[Action, Comedy, Crime, Drama, Thriller]",1997
5,145,9.139785,Bad Boys,"[Action, Comedy, Crime, Drama, Thriller]",1995
6,7235,9.139785,Ichi the Killer (Koroshiya 1),"[Action, Comedy, Crime, Drama, Horror, Thriller]",2001
7,69136,9.139785,Don,"[Action, Comedy, Crime, Drama, Musical, Thriller]",1978
8,43853,9.139785,"Business, The","[Action, Comedy, Crime, Drama, Thriller]",2005
9,81132,9.139785,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010
