In [1]:
import pandas as pd

In [2]:
# Relative path of the movies catalogue CSV that the notebook loads.
movies_data = "data/movies.csv"

In [3]:
# Cap rendered frames at 20 rows so displaying a large frame stays readable.
pd.options.display.max_rows = 20

In [4]:
# Placeholder tokens that should be read as missing (NaN) when loading the CSV.
missing_values = [
    'na',
    '--',
    '?',
    '-',
    'None',
    'none',
    'non',
]

In [5]:
# Load the movie catalogue; every token listed in missing_values is parsed as NaN.
movies_df = pd.read_csv(
    movies_data,
    na_values=missing_values,
)

In [6]:
# Report the (rows, columns) of the loaded frame, then display it
# (the rendering is truncated by the display.max_rows option).
print('Movies_df Shape:',movies_df.shape)
movies_df

Movies_df Shape: (62423, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [7]:
# Pull the four-digit release year out of the first "(YYYY)" marker in the title.
# The pattern is a raw string so that \( and \d reach the regex engine instead of
# being parsed as (invalid) Python string escapes; the inner capture group drops
# the parentheses in the same pass. Titles without a "(YYYY)" marker get NaN.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
# Remove every "(YYYY)" marker from the title, then trim the whitespace left behind.
# The vectorised .str.strip() passes missing titles through as NaN, whereas calling
# x.strip() on each element raises on a NaN (float) value.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)
movies_df

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
62418,209157,We,Drama,2018
62419,209159,Window of the Soul,Documentary,2001
62420,209163,Bad Poems,Comedy|Drama,2018
62421,209169,A Girl Thing,(no genres listed),2001


In [8]:
# Turn the pipe-delimited genre string into a list of genre names per movie.
movies_df['genres'] = movies_df['genres'].str.split(pat='|')
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [9]:
# Inspect the column dtypes; 'year' is still an object (string) column at this point.
movies_df.dtypes

movieId     int64
title      object
genres     object
year       object
dtype: object

In [11]:
# Titles without a "(YYYY)" marker have no year. Use 0 as the "unknown year"
# sentinel so the column can be stored as an integer.
# Only the 'year' column is filled: a frame-wide fillna would also write the
# number into any missing title or genres value.
# pd.to_numeric parses the year strings first, so re-running this cell on an
# already-converted column is harmless.
movies_df['year'] = pd.to_numeric(movies_df['year']).fillna(0).astype('int16')
# Downcast movieId from int64 to int32 to save memory.
movies_df['movieId'] = movies_df['movieId'].astype('int32')

In [12]:
# Build a per-genre indicator table on a copy of movies_df: one column per genre,
# holding 1.0 where the movie has that genre and NaN where it does not.
# This is done with vectorised pandas operations rather than a Python loop over rows.
movies_with_genres = movies_df.copy(deep=True)

# One entry per (row label, genre): each row label is repeated once per genre in its list.
genre_per_row = movies_df['genres'].explode()
# Genres in order of first appearance; used as the order of the new columns.
genre_order = pd.unique(genre_per_row.dropna())

genre_flags = (
    pd.get_dummies(genre_per_row, dtype='float64')
    .groupby(level=0).max()          # collapse back to one row per original row label
    .reindex(columns=genre_order)
)
# Keep 1.0 where the genre is present and turn the 0.0 entries into NaN.
genre_flags = genre_flags.where(genre_flags.gt(0))
# Attach the genre columns by row label.
movies_with_genres = movies_with_genres.join(genre_flags)

# Confirm that no rows were lost or duplicated while attaching the genre columns.
print(len(movies_with_genres) == len(movies_df))
movies_with_genres.head(3)

True


Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,,...,,,,,,,,,,
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,,1.0,,1.0,,...,,,,,,,,,,
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,,,,1.0,,1.0,...,,,,,,,,,,


In [13]:
# A missing genre flag means the movie does not belong to that genre: encode it as 0.
movies_with_genres = movies_with_genres.fillna(value=0)
movies_with_genres.head(n=3)


Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# One user's ratings as (title, rating) pairs.
rated_titles = [
    ('Predator', 4.9),
    ('Final Destination', 4.9),
    ('Mission Impossible', 4),
    ("Beverly Hills Cop", 3),
    ('Exorcist, The', 4.8),
    ('Waiting to Exhale', 3.9),
    ('Avengers, The', 4.5),
    ('Omen, The', 5.0),
]
# Two-column frame: 'title' (str) and 'rating' (float).
a_movie_ratings = pd.DataFrame(rated_titles, columns=['title', 'rating'])
a_movie_ratings

Unnamed: 0,title,rating
0,Predator,4.9
1,Final Destination,4.9
2,Mission Impossible,4.0
3,Beverly Hills Cop,3.0
4,"Exorcist, The",4.8
5,Waiting to Exhale,3.9
6,"Avengers, The",4.5
7,"Omen, The",5.0


In [15]:
# Rows of movies_df whose title exactly matches one of the rated titles.
a_movie_Id = movies_df[movies_df['title'].isin(a_movie_ratings['title'])]

# Report rated titles that have no exact match instead of dropping them silently.
unmatched_titles = a_movie_ratings.loc[
    ~a_movie_ratings['title'].isin(a_movie_Id['title']), 'title'
].tolist()
if unmatched_titles:
    print('Rated titles not found in movies_df (ignored):', unmatched_titles)

# Attach movieId (plus genres and year) to each rating. The join key is named
# explicitly so the merge does not depend on which column names the two frames
# happen to share. A title shared by several movies matches all of them, and
# each match receives the same rating.
a_movie_ratings = pd.merge(a_movie_Id, a_movie_ratings, on='title', how='inner')
# Display the merged and updated data frame.
a_movie_ratings

Unnamed: 0,movieId,title,genres,year,rating
0,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,3.9
1,1350,"Omen, The","[Horror, Mystery, Thriller]",1976,5.0
2,1997,"Exorcist, The","[Horror, Mystery]",1973,4.8
3,2153,"Avengers, The","[Action, Adventure]",1998,4.5
4,3409,Final Destination,"[Drama, Thriller]",2000,4.9
5,3527,Predator,"[Action, Sci-Fi, Thriller]",1987,4.9
6,4085,Beverly Hills Cop,"[Action, Comedy, Crime, Drama]",1984,3.0
7,45662,"Omen, The","[Horror, Thriller]",2006,5.0
8,89745,"Avengers, The","[Action, Adventure, Sci-Fi, IMAX]",2012,4.5


In [16]:
# Keep only movieId, title and rating in the user's profile frame.
# errors='ignore' makes the drop a no-op when the columns are already gone,
# so this cell can be re-run safely.
a_movie_ratings = a_movie_ratings.drop(columns=['genres', 'year'], errors='ignore')

# Genre rows of the movies this user rated, in movies_with_genres row order.
# Built unconditionally so a_genres_df is always defined for the cells that use it.
a_genres_df = movies_with_genres[
    movies_with_genres['movieId'].isin(a_movie_ratings['movieId'])
]
a_genres_df

In [17]:

# Renumber the rows 0..n-1 and keep only the genre indicator columns
# (any of the four metadata columns that are present are removed).
a_genres_df = (
    a_genres_df
    .reset_index(drop=True)
    .drop(columns=['movieId', 'title', 'genres', 'year'], errors='ignore')
)

# Show the shapes of the two frames.
print('Shape of a_movie_ratings is:', a_movie_ratings.shape)
print('Shape of a_genres_df is:', a_genres_df.shape)


Shape of a_movie_ratings is: (9, 3)
Shape of a_genres_df is: (9, 20)


In [18]:
# User profile: the transposed genre matrix (genres x movies) times the ratings
# vector, giving one weight per genre (the sum of the ratings of movies in it).
a_profile = a_genres_df.T @ a_movie_ratings['rating']
a_profile

Adventure              9.0
Animation              0.0
Children               0.0
Comedy                 6.9
Fantasy                0.0
Romance                3.9
Drama                 11.8
Action                16.9
Crime                  3.0
Thriller              19.8
Horror                14.8
Mystery                9.8
Sci-Fi                 9.4
IMAX                   4.5
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [19]:
# Total of all genre weights in the user profile.
print(a_profile.sum())

109.80000000000001


In [20]:
# Index the genre table by movieId and keep only the genre indicator columns.
movies_with_genres = (
    movies_with_genres
    .set_index('movieId')
    .drop(columns=['title', 'genres', 'year'])
)
movies_with_genres.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Score every movie: add up the profile weights of its genres, then divide by
# the total profile weight to get a weighted average.
recommendation_table_df = movies_with_genres.dot(a_profile).div(a_profile.sum())

In [22]:
# Order the scores from best match to worst and preview the top entries.
recommendation_table_df = recommendation_table_df.sort_values(ascending=False)
recommendation_table_df.head()

movieId
81132     0.837887
43932     0.751366
36509     0.725865
83266     0.711293
192789    0.699454
dtype: float64

In [23]:
# Independent copy of movies_df, re-indexed by movieId so rows can be looked up by id.
copy = movies_df.copy(deep=True).set_index('movieId', drop=True)
# movieIds of the 20 highest-scoring entries (the table is already sorted descending).
top_20_index = recommendation_table_df.head(20).index.tolist()
# Look those ids up in score order to get title, genres and year.
recommended_movies = copy.loc[top_20_index]
# Display the top 20 movies in descending order of preference.
recommended_movies

Unnamed: 0_level_0,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010
43932,Pulse,"[Action, Drama, Fantasy, Horror, Mystery, Sci-...",2006
36509,"Cave, The","[Action, Adventure, Horror, Mystery, Sci-Fi, T...",2005
83266,Kaho Naa... Pyaar Hai,"[Action, Adventure, Comedy, Drama, Mystery, Ro...",2000
192789,Future-Kill,"[Action, Adventure, Comedy, Horror, Sci-Fi, Th...",1985
192495,Big Bad,"[Action, Adventure, Comedy, Horror, Sci-Fi, Th...",2016
79132,Inception,"[Action, Crime, Drama, Mystery, Sci-Fi, Thrill...",2010
7235,Ichi the Killer (Koroshiya 1),"[Action, Comedy, Crime, Drama, Horror, Thriller]",2001
180677,Child of Satan,"[Action, Drama, Horror, Mystery, Thriller]",2016
139012,Empty,"[Action, Drama, Horror, Sci-Fi, Thriller]",2011
