# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack, csr_matrix
import math
from google.colab import drive
drive.mount('/content/drive')
# Load the dataset from Google Drive
df_ratings = pd.read_csv('/content/drive/My Drive/rating.csv')

Mounted at /content/drive


# Load the data

In [2]:
movies = pd.read_csv('movie.csv')
ratings = df_ratings

## Explore the data

In [3]:
# List of the features within the dataset
print("Movies : ", movies.columns,end="\n\n")
print("Ratings : ", ratings.columns,end="\n\n")

Movies :  Index(['movieId', 'title', 'genres'], dtype='object')

Ratings :  Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')



In [4]:
# Check the number of rows and columns in the movies data
print(movies.shape)

# Check the data types of each column in the movies data
print(movies.info())

(27278, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB
None


In [5]:
# Check for null values
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [6]:
for df in [movies, ratings]:

    # Check the number of rows and columns in the ratings data
    print('Shape:', df.shape)

    # Check the data types of each column in the ratings data
    print('Info:', df.info())

    # Check the summary statistics of the ratings data
    print('Describe: ', df.describe())

    # Check for null values
    print('Null values:', df.isnull().sum())

    # Check for duplicated values
    print('Duplicated values:',df.duplicated().sum())

    # Display the first few rows of the data
    print('First few rows:', df.head())
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

Shape: (27278, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB
Info: None
Describe:               movieId
count   27278.000000
mean    59855.480570
std     44429.314697
min         1.000000
25%      6931.250000
50%     68068.000000
75%    100293.250000
max    131262.000000
Null values: movieId    0
title      0
genres     0
dtype: int64
Duplicated values: 0
First few rows:    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                       

**Summary**
* Features:
    * Numerical features: userId, movieId, rating
    * Categorical features: title, genres
    * Alphanumerical Features: timestamp

* Both datasets don't have missing or duplicated values
* There are a total of 27278 movie in our dataset with 20000263 user ratings.

In [7]:
# movieId: Unique Id provided for each movie.
print(pd.merge(movies, ratings, on='movieId', how='inner')['movieId'].nunique(),'movie out of',movies['movieId'].nunique(), 'has at least one rating')

24236 movie out of 27278 has at least one rating


In [8]:
# rating: Range from (0.5 - 5.0) with 0.5 increments
print('Mean rating of a movie:',ratings['rating'].mean())

Mean rating of a movie: 3.5248581540724766


In [9]:
# userId: Unique Id provided for each user.
print('Number of users:', ratings['userId'].nunique())
print('Mean number of ratings for each user:', ratings.groupby(by='userId').size().mean())

Number of users: 85788
Mean number of ratings for each user: 144.76715857695714


In [10]:
print('Number of ratings made by each user:\n', ratings.groupby(by='userId').size())

Number of ratings made by each user:
 userId
1        175
2         61
3        187
4         28
5         66
        ... 
85784     53
85785     25
85786    129
85787     53
85788    181
Length: 85788, dtype: int64


In [11]:
print('Count of most given ratings in desc order:\n')
ratings['rating'].value_counts()

Count of most given ratings in desc order:



rating
4.0    3446921
3.0    2659199
5.0    1797318
3.5    1374316
4.5     955176
2.0     889165
2.5     553000
1.0     420266
1.5     172891
0.5     151033
Name: count, dtype: int64

## Preprocess the data

In [12]:
# Extract the release year from the movie titles
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)')

In [13]:
movies['year'].unique()

array(['1995', '1994', '1996', '1976', '1992', '1988', '1967', '1993',
       '1964', '1977', '1965', '1982', '1985', '1990', '1991', '1989',
       '1937', '1940', '1969', '1981', '1973', '1970', '1960', '1955',
       '1959', '1968', '1980', '1975', '1986', '1948', '1943', '1950',
       '1946', '1987', '1997', '1974', '1956', '1958', '1949', '1972',
       '1998', '1933', '1952', '1951', '1957', '1961', '1954', '1934',
       '1944', '1963', '1942', '1941', '1953', '1939', '1947', '1945',
       '1938', '1935', '1936', '1926', '1932', '1979', '1971', '1978',
       '1966', '1962', '1983', '1984', '1931', '1922', '1999', '1927',
       '1929', '1930', '1928', '1925', '1914', '2000', '1919', '1923',
       '1920', '1918', '1921', '2001', '1924', '2002', '2003', '1915',
       '2004', '1916', '1917', '2005', '2006', '1902', nan, '1903',
       '2007', '2008', '2009', '1912', '2010', '1913', '2011', '1898',
       '1899', '1894', '2012', '1909', '1910', '1901', '1893', '2013',
       '1

In [14]:
movies.isnull().sum()

movieId     0
title       0
genres      0
year       22
dtype: int64

In [15]:
movies.loc[movies['year'].isnull()]

Unnamed: 0,movieId,title,genres,year
10593,40697,Babylon 5,Sci-Fi,
15646,79607,"Millions Game, The (Das Millionenspiel)",Action|Drama|Sci-Fi|Thriller,
17341,87442,"Bicycle, Spoon, Apple (Bicicleta, cullera, poma)",Documentary,
22368,107434,Diplomatic Immunity (2009– ),Comedy,
22669,108548,"Big Bang Theory, The (2007-)",Comedy,
22679,108583,Fawlty Towers (1975-1979),Comedy,
23617,112406,Brazil: In the Shadow of the Stadiums,Documentary,
23824,113190,Slaying the Badger,Documentary,
24286,115133,Tatort: Im Schmerz geboren,Crime,
24412,115685,National Theatre Live: Frankenstein,Drama|Fantasy,


In [16]:
# Remove any movies without a valid year of release
movies.dropna(subset=['year'], inplace=True)

In [17]:
movies['year'].astype(int)

0        1995
1        1995
2        1995
3        1995
4        1995
         ... 
27273    2007
27274    2002
27275    2014
27276    2001
27277    2014
Name: year, Length: 27256, dtype: int64

In [18]:
movies.isnull().sum()

movieId    0
title      0
genres     0
year       0
dtype: int64

In [19]:
movies.shape

(27256, 4)

In [20]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [21]:
# Extract genres

# Split the "genres" column into multiple columns using get_dummies()
genres_df = movies['genres'].str.get_dummies('|')
print('These are the available genres:\n',genres_df.columns.values)

These are the available genres:
 ['(no genres listed)' 'Action' 'Adventure' 'Animation' 'Children' 'Comedy'
 'Crime' 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'IMAX'
 'Musical' 'Mystery' 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']


In [22]:
genres_df

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27273,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27274,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27275,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27276,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Visualize the data

# How to compute the recommendations for one user

**1. First, we choose the movies features that we are going to use to compute similarity between different movies.**

* Genres: each genre is a feature
    
* Release year
    
* Both together
    
* We can extract other features (by adding infos from imdb; other features like actors, directors, and plot keywords can also be useful, or using tags)
    
**2. Then, we fix the similarity metric that measures how similar two movies are, based on their feature vectors.**

   * For genres as features: Cosine similarity metric
    
   * For release year as feature: Euclidian distance, Exponential Decay Similarity Metric.
    
   The choice of similarity metric depends on the features being used. For example, cosine similarity works well for genre features because they are binary (a movie either has a particular genre or it doesn't), while Euclidean distance may work better for continuous features like release year.
    
**3. Next, for each movie that the user has rated, we need to find the top K most similar movies based on the computed similarities. We are going to use the KNeighborsClassifier.**

The choice of k (the number of nearest neighbors to consider) can also affect the recommendations. A larger k may result in more diverse recommendations, while a smaller k may result in more similar recommendations.

**4. We combine the recommendations from all of the movies that the user has rated to create a list of recommended movies for the user.**

The method for combining recommendations from different movies can also affect the recommendations. One common approach is to simply aggregate the recommendations and sort them by some relevance metric.

**5. Sort the recommended movies by some relevance metric: similarity score weighted by user ratings.**
For example, user rated high movie1 and rated low movie2, then it is only logical that he gets recommended to more movies similar to movie1.


# Data preparation


In [23]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [24]:
ratings.drop(['timestamp'], axis=1, inplace= True)

In [25]:
# Drop unused attributes
movies_features = movies.drop(['title', 'genres'], axis=1, inplace=False)

In [26]:
# Merge with genres
movies_features = pd.merge(movies_features, genres_df, left_index=True, right_index=True)

In [27]:
movies_features.tail()

Unnamed: 0,movieId,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
27273,131254,2007,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
27274,131256,2002,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
27275,131258,2014,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27276,131260,2001,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27277,131262,2014,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# Content based filtering, with Features = Genres

In [28]:
movies_features.iloc[:, 2:]

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27273,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27274,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27275,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27276,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
model = NearestNeighbors(n_neighbors=10, metric='cosine')
model.fit(movies_features.iloc[:, 2:])

In [30]:
# Define a function to recommend movies based on a given movie
def recommend_movies(movie_idx, features, model):
    # Find the n nearest neighbors based on the cosine similarity
    distances, indices = model.kneighbors(features)

    # Get the attributes of the nearest neighbor movies
    recommended_movies = pd.DataFrame([movies.iloc[idx] for idx in indices.flatten()])
    recommended_movies['distance'] = distances.flatten()

    # Print the recommended movies
#     print(movies.loc[movie_idx])
#     print("Movies similar to", find_title_by_idx(movie_idx), ":\n")
#     print(recommended_movies)
    # Return the recommended movies
    return recommended_movies

In [31]:
def find_idx_by_title(title):
    idx = movies[movies['title'] == title].index
    return idx[0]

In [None]:
def find_title_by_idx(idx):
    title = movies.loc[idx]['title']
    return title

In [32]:
# Test recs on "Toy Story (1995)" :
index = find_idx_by_title("Toy Story (1995)")

recommend_movies(index, pd.DataFrame(movies_features.iloc[index, 2:]).transpose(), model)

Unnamed: 0,movieId,title,genres,year,distance
24092,114240,Aladdin (1992),Adventure|Animation|Children|Comedy|Fantasy,1992,0.0
3663,3754,"Adventures of Rocky and Bullwinkle, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,2000,0.0
10987,45074,"Wild, The (2006)",Adventure|Animation|Children|Comedy|Fantasy,2006,0.0
24156,114552,"Boxtrolls, The (2014)",Adventure|Animation|Children|Comedy|Fantasy,2014,0.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,0.0
3027,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,1999,0.0
3922,4016,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,2000,0.0
24458,115875,Toy Story Toons: Hawaiian Vacation (2011),Adventure|Animation|Children|Comedy|Fantasy,2011,0.0
18274,91355,Asterix and the Vikings (Astérix et les Viking...,Adventure|Animation|Children|Comedy|Fantasy,2006,0.0
24460,115879,Toy Story Toons: Small Fry (2011),Adventure|Animation|Children|Comedy|Fantasy,2011,0.0


As you can see, all the recommended movies have similar genres as the movie "Toy Story (1995)"


# Content based filtering, with Features = Release year

In [33]:
model_1 = NearestNeighbors(n_neighbors=10, metric='euclidean')
model_1.fit(pd.DataFrame(movies_features.iloc[:, 1]))

In [34]:
pd.DataFrame(movies_features.iloc[:, 1])

Unnamed: 0,year
0,1995
1,1995
2,1995
3,1995
4,1995
...,...
27273,2007
27274,2002
27275,2014
27276,2001


In [35]:
pd.DataFrame(movies_features.iloc[index, 1:2]).transpose()

Unnamed: 0,year
0,1995


In [36]:
# Test recs on "Toy Story (1995)" :
index = find_idx_by_title("Toy Story (1995)")

recommend_movies(index, pd.DataFrame(movies_features.iloc[index, 1:2]).transpose(), model_1)

Unnamed: 0,movieId,title,genres,year,distance
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,0.0
7,8,Tom and Huck (1995),Adventure|Children,1995,0.0
4,5,Father of the Bride Part II (1995),Comedy,1995,0.0
8,9,Sudden Death (1995),Action,1995,0.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,0.0
5,6,Heat (1995),Action|Crime|Thriller,1995,0.0
13,14,Nixon (1995),Drama,1995,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,0.0
9,10,GoldenEye (1995),Action|Adventure|Thriller,1995,0.0


# Content based filtering, with Features = genres + release year

In [37]:
# Cosine sim
def compute_genre_similarity(movie1, movie2):
    genres1 = movie1[1:]
    genres2 = movie2[1:]
#     sumxx, sumxy, sumyy = 0, 0, 0
#     for i in range(len(genres1)):
#         x = genres1[i]
#         y = genres2[i]
#         sumxx += x * x
#         sumyy += y * y
#         sumxy += x * y

    #return (1 - sumxy/math.sqrt(sumxx*sumyy))

    return sklearn.metrics.pairwise.cosine_distances(genres1.reshape(1, -1), genres2.reshape(1, -1))

In [38]:
# exponential decay similarity score
def compute_year_similarity(x1, x2):
    diff = abs(x1[0] - x2[0])
    sim = math.exp(-diff / 10.0)
    #print(sklearn.metrics.pairwise.euclidean_distances(x1[0].reshape(1, -1), x2[0].reshape(1, -1)))
    #print(1 - sim)
    return (1 - sim)

The next function can be a good starting point to compute similarity between two movies based on genres and release year. However, it depends on the specific use case and the weight given to each similarity measure.


Adding these scores together assumes that they have equal weight in determining the overall similarity between two movies. This may not be appropriate in all cases, as the importance of genres and release years in determining similarity can vary depending on the context.

Multiplying them means that the final similarity score will be affected by the difference in magnitude between the genre similarity score and the year similarity score.

In [39]:
def compute_distance(x1, x2):
    genre_similarity= compute_genre_similarity(x1, x2)
    year_similarity= compute_year_similarity(x1, x2)

    #return year_similarity + genre_similarity
    #return year_similarity * genre_similarity
    return (year_similarity*0.2 + genre_similarity*0.8)


In [40]:
movies_features.head()

Unnamed: 0,movieId,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1995,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
pd.DataFrame(movies_features.iloc[:, 1:])

Unnamed: 0,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1995,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1995,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1995,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1995,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,1995,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27273,2007,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27274,2002,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27275,2014,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27276,2001,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
nn = NearestNeighbors(n_neighbors=10, metric=compute_distance)
nn.fit(pd.DataFrame(movies_features.iloc[:, 1:]))

In [43]:
# Test recs on "Toy Story (1995)" :
index = find_idx_by_title("Toy Story (1995)")

recommend_movies(index, pd.DataFrame(movies_features.iloc[index, 1:]).transpose(), nn)

Unnamed: 0,movieId,title,genres,year,distance
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,0.0
2209,2294,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy,1998,0.051836
24092,114240,Aladdin (1992),Adventure|Animation|Children|Comedy|Fantasy,1992,0.051836
3027,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,1999,0.065936
10114,33463,DuckTales: The Movie - Treasure of the Lost La...,Adventure|Animation|Children|Comedy|Fantasy,1990,0.078694
3663,3754,"Adventures of Rocky and Bullwinkle, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,2000,0.078694
3922,4016,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy,2000,0.078694
9890,32352,"Thief and the Cobbler, The (a.k.a. Arabian Kni...",Adventure|Animation|Comedy|Fantasy,1995,0.084458
26053,124919,The Wind in the Willows (1995),Adventure|Animation|Children|Comedy,1995,0.084458
664,673,Space Jam (1996),Adventure|Animation|Children|Comedy|Fantasy|Sc...,1996,0.088736


The movies recommended are pretty similar to the input movie, in terms of genres and release decade (year).

# Recommend movies for user

In [44]:
user_id = 6526

In [45]:
print(ratings['userId'].value_counts())

userId
8405     7515
82418    5646
74142    5447
34576    5356
83090    5169
         ... 
21602      20
62526      20
42370      20
67408      20
13467      20
Name: count, Length: 85788, dtype: int64


In [46]:
def get_ratings_per_user (user_id):
    # Get all rows corresponding to the user_id
    user_ratings = ratings[ratings['userId'] == user_id]

    # Drop userId col
    user_ratings = user_ratings.drop(['userId'], axis=1)

    return user_ratings

In [47]:
ratings_per_user = get_ratings_per_user(user_id)

In [48]:
ratings_per_user

Unnamed: 0,movieId,rating
968935,168,2.5
968936,520,3.0
968937,543,2.5
968938,552,3.0
968939,596,4.5
968940,903,5.0
968941,1148,5.0
968942,1250,4.5
968943,1320,3.5
968944,1376,2.5


In [49]:
rated_movies = pd.merge(ratings_per_user, movies, on='movieId')

In [50]:
rated_movies

Unnamed: 0,movieId,rating,title,genres,year
0,168,2.5,First Knight (1995),Action|Drama|Romance,1995
1,520,3.0,Robin Hood: Men in Tights (1993),Comedy,1993
2,543,2.5,So I Married an Axe Murderer (1993),Comedy|Romance|Thriller,1993
3,552,3.0,"Three Musketeers, The (1993)",Action|Adventure|Comedy|Romance,1993
4,596,4.5,Pinocchio (1940),Animation|Children|Fantasy|Musical,1940
5,903,5.0,Vertigo (1958),Drama|Mystery|Romance|Thriller,1958
6,1148,5.0,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,1993
7,1250,4.5,"Bridge on the River Kwai, The (1957)",Adventure|Drama|War,1957
8,1320,3.5,Alien³ (a.k.a. Alien 3) (1992),Action|Horror|Sci-Fi|Thriller,1992
9,1376,2.5,Star Trek IV: The Voyage Home (1986),Adventure|Comedy|Sci-Fi,1986


Looks like this user likes a lot of genres, but it is clear that he hates horror movies 👻.

In [51]:
def find_index_by_id(movie_id):
    idx = movies[movies['movieId'] == movie_id].index
    return idx[0]

In [52]:
recs = pd.DataFrame()
for index, row in ratings_per_user.iterrows():
    movie_id = int(row['movieId'])
    rating = row['rating']
    features = pd.DataFrame(movies_features.iloc[find_index_by_id(movie_id), 1:]).transpose()
    recs_by_movie = recommend_movies(movie_id, features, nn)
    # Multiplying by (1/rating), so that movies similar to those high rated get more chance
    # to appear in the top n list.
    # Adding 1 to avoid 0 values (0*1 = 0*5: ratings make no difference in this case)
    # The smaller relevance is, the better recommendation is
    recs_by_movie['relevance'] = (1+recs_by_movie['distance'])*(1/rating)
    recs = pd.concat([recs, recs_by_movie], ignore_index=True)

In [53]:
recs

Unnamed: 0,movieId,title,genres,year,distance,relevance
0,168,First Knight (1995),Action|Drama|Romance,1995,0.000000,0.400000
1,118916,Titanic (1996),Action|Drama|Romance,1996,0.019033,0.407613
2,91112,Blown Away (1993),Action|Drama|Romance,1993,0.036254,0.414502
3,3996,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",Action|Drama|Romance,2000,0.078694,0.431478
4,1100,Days of Thunder (1990),Action|Drama|Romance,1990,0.078694,0.431478
...,...,...,...,...,...,...
195,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,2001,0.069703,0.534852
196,45074,"Wild, The (2006)",Adventure|Animation|Children|Comedy|Fantasy,2006,0.078694,0.539347
197,131248,Brother Bear 2 (2006),Adventure|Animation|Children|Comedy|Fantasy,2006,0.078694,0.539347
198,91355,Asterix and the Vikings (Astérix et les Viking...,Adventure|Animation|Children|Comedy|Fantasy,2006,0.078694,0.539347


In [54]:
bottom_20 = recs.nlargest(20, 'relevance')

In [55]:
bottom_20

Unnamed: 0,movieId,title,genres,year,distance,relevance
158,3018,Re-Animator (1985),Comedy|Horror|Sci-Fi,1985,0.2336038,1.233604
159,4533,"Return of the Living Dead, The (1985)",Comedy|Horror|Sci-Fi,1985,0.2336038,1.233604
156,105761,Blood Suckers from Outer Space (1984),Comedy|Horror|Sci-Fi,1984,0.2258657,1.225866
157,2613,Night of the Comet (1984),Comedy|Horror|Sci-Fi,1984,0.2258657,1.225866
155,8519,"Bat People, The (1974)",Comedy|Drama|Horror|Sci-Fi,1974,0.2190325,1.219033
154,5855,Shock Treatment (1981),Comedy|Musical|Sci-Fi,1981,0.1974173,1.197417
153,109767,Toomorrow (1970),Comedy|Musical|Sci-Fi,1970,0.1858735,1.185874
152,4412,"Thing with Two Heads, The (1972)",Comedy|Horror|Sci-Fi,1972,0.159016,1.159016
151,5724,"Creature Wasn't Nice, The (a.k.a. Naked Space)...",Comedy|Horror|Musical|Sci-Fi,1981,0.09023767,1.090238
167,3695,Toxic Avenger Part III: The Last Temptation of...,Comedy|Horror,1989,0.01903252,1.019033


Our recommender system is aware that this user hates horror movies

In [56]:
top_10 = recs.nsmallest(10, 'relevance')

In [57]:
top_10

Unnamed: 0,movieId,title,genres,year,distance,relevance
50,903,Vertigo (1958),Drama|Mystery|Romance|Thriller,1958,0.0,0.2
60,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,1993,0.0,0.2
51,2181,Marnie (1964),Drama|Mystery|Romance|Thriller,1964,0.09023767,0.218048
40,1282,Fantasia (1940),Animation|Children|Fantasy|Musical,1940,0.0,0.222222
41,596,Pinocchio (1940),Animation|Children|Fantasy|Musical,1940,0.0,0.222222
70,1250,"Bridge on the River Kwai, The (1957)",Adventure|Drama|War,1957,0.0,0.222222
120,2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance,1989,8.881784000000001e-17,0.222222
180,3298,Boiler Room (2000),Crime|Drama|Thriller,2000,0.0,0.222222
181,5520,"Matter of Taste, A (Affaire de Goût, Une) (2000)",Crime|Drama|Thriller,2000,0.0,0.222222
182,6009,"City of Lost Souls, The (Hyôryuu-gai) (2000)",Crime|Drama|Thriller,2000,0.0,0.222222


In [58]:
top_20 = recs.nsmallest(20, 'relevance')

In [59]:
top_20

Unnamed: 0,movieId,title,genres,year,distance,relevance
50,903,Vertigo (1958),Drama|Mystery|Romance|Thriller,1958,0.0,0.2
60,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,1993,0.0,0.2
51,2181,Marnie (1964),Drama|Mystery|Romance|Thriller,1964,0.09023767,0.218048
40,1282,Fantasia (1940),Animation|Children|Fantasy|Musical,1940,0.0,0.222222
41,596,Pinocchio (1940),Animation|Children|Fantasy|Musical,1940,0.0,0.222222
70,1250,"Bridge on the River Kwai, The (1957)",Adventure|Drama|War,1957,0.0,0.222222
120,2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance,1989,8.881784000000001e-17,0.222222
180,3298,Boiler Room (2000),Crime|Drama|Thriller,2000,0.0,0.222222
181,5520,"Matter of Taste, A (Affaire de Goût, Une) (2000)",Crime|Drama|Thriller,2000,0.0,0.222222
182,6009,"City of Lost Souls, The (Hyôryuu-gai) (2000)",Crime|Drama|Thriller,2000,0.0,0.222222
