# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack, csr_matrix
import math

# Load the data

In [None]:
# Read the movie catalogue and the user ratings from CSV files located in the
# current working directory; both are kept as module-level dataframes.
movies = pd.read_csv('movie.csv')
ratings = pd.read_csv('rating.csv')

## Explore the data

In [None]:
# List the column names (features) of each dataframe; end="\n\n" leaves a blank
# line after each listing
print("Movies : ", movies.columns,end="\n\n")
print("Ratings : ", ratings.columns,end="\n\n")

Movies :  Index(['movieId', 'title', 'genres'], dtype='object')

Ratings :  Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')



In [None]:
# Check the number of rows and columns in the movies data
print(movies.shape)

# Check the data types of each column in the movies data.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly: wrapping it in print() would append a stray "None" to the output.
movies.info()

(27278, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB
None


In [None]:
# Count the missing values in each column of the movies data
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [None]:
# Run the same sanity checks on both dataframes (movies first, then ratings)
for df in [movies, ratings]:

    # Number of rows and columns
    print('Shape:', df.shape)

    # Data types and non-null counts of each column. info() prints its own
    # report and returns None, so it must not be passed to print().
    print('Info:')
    df.info()

    # Summary statistics of the numeric columns
    print('Describe: ', df.describe())

    # Missing values per column
    print('Null values:', df.isnull().sum())

    # Number of fully duplicated rows
    print('Duplicated values:',df.duplicated().sum())

    # First few rows of the data
    print('First few rows:', df.head())
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

Shape: (27278, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB
Info: None
Describe:               movieId
count   27278.000000
mean    59855.480570
std     44429.314697
min         1.000000
25%      6931.250000
50%     68068.000000
75%    100293.250000
max    131262.000000
Null values: movieId    0
title      0
genres     0
dtype: int64
Duplicated values: 0
First few rows:    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                       

**Summary**
* Features:
    * Numerical features: userId, movieId, rating
    * Categorical features: title, genres
    * Alphanumerical Features: timestamp

* Both datasets don't have missing or duplicated values
* There are a total of 27278 movies in our dataset, with 20000263 user ratings.

In [None]:
# movieId: Unique Id provided for each movie.
# Count the movies that have at least one rating. Filtering the (small) movies
# table with isin() gives the same number as counting distinct movieIds in an
# inner merge, without materialising the join of every rating row with its
# movie (that merge ran out of memory).
print(movies.loc[movies['movieId'].isin(ratings['movieId']), 'movieId'].nunique(),'movie out of',movies['movieId'].nunique(), 'has at least one rating')

MemoryError: Unable to allocate 153. MiB for an array with shape (1, 20000263) and data type float64

In [None]:
# rating: Range from (0.5 - 5.0) with 0.5 increments
# Average over every individual rating row (not an average of per-movie means)
print('Mean rating of a movie:',ratings['rating'].mean())

In [None]:
# userId: Unique Id provided for each user.
# Number of distinct users, then the average number of rating rows per user
print('Number of users:', ratings['userId'].nunique())
print('Mean number of ratings for each user:', ratings.groupby(by='userId').size().mean())

In [None]:
# Rating count for every user: one entry per userId, holding the group size
print('Number of ratings made by each user:\n', ratings.groupby(by='userId').size())

In [None]:
# How often each rating value was given; value_counts() lists the most frequent first
print('Count of most given ratings in desc order:\n')
ratings['rating'].value_counts()

## Preprocess the data

In [None]:
# Extract the release year from the movie titles: the first four-digit number
# enclosed in parentheses. expand=False makes str.extract return a Series, so a
# plain column (rather than a one-column DataFrame) is assigned. Titles without
# such a pattern get NaN; the extracted values are strings at this point.
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)

In [None]:
# Inspect the distinct extracted year values
movies['year'].unique()

In [None]:
# Count missing values per column now that the year column has been added
movies.isnull().sum()

In [None]:
# Show the movies for which no release year could be extracted from the title
movies.loc[movies['year'].isnull()]

In [None]:
# Remove any movies without a valid year of release.
# The index is rebuilt afterwards so that index labels and row positions stay
# identical: later cells look a movie up by label (find_idx_by_title /
# find_index_by_id) and then use that value positionally with .iloc, which would
# point at the wrong row once the dropped rows leave gaps in the index.
movies = movies.dropna(subset=['year']).reset_index(drop=True)

In [None]:
# Convert the extracted year strings to integers. astype() returns a new Series,
# so the result has to be assigned back for the conversion to take effect.
movies['year'] = movies['year'].astype(int)

In [None]:
# Re-check missing values after the rows without a year were dropped
movies.isnull().sum()

In [None]:
# Rows and columns remaining in the movies data after the cleanup
movies.shape

In [None]:
# Preview the movies data, which now carries the extracted year column
movies.head()

In [None]:
# Extract genres

# Split the pipe-separated "genres" string into one indicator column per genre
# using get_dummies(); genres_df keeps the same row index as movies
genres_df = movies['genres'].str.get_dummies('|')
print('These are the available genres:\n',genres_df.columns.values)

In [None]:
# Display the genre indicator table (one row per movie, one column per genre)
genres_df

## Visualize the data

# How to compute the recommendations for one user

**1. First, we choose the movies features that we are going to use to compute similarity between different movies.**

* Genres: each genre is a feature
    
* Release year
    
* Both together
    
* We could also add other features, for example by enriching the data with information from IMDb (actors, directors, plot keywords) or by using tags.
    
**2. Then, we fix the similarity metric that measures how similar two movies are, based on their feature vectors.**

   * For genres as features: Cosine similarity metric
    
   * For release year as feature: Euclidean distance or an exponential-decay similarity metric.
    
   The choice of similarity metric depends on the features being used. For example, cosine similarity works well for genre features because they are binary (a movie either has a particular genre or it doesn't), while Euclidean distance may work better for continuous features like release year.
    
**3. Next, for each movie that the user has rated, we need to find the top K most similar movies based on the computed similarities. We are going to use scikit-learn's `NearestNeighbors`.**

The choice of k (the number of nearest neighbors to consider) can also affect the recommendations. A larger k may result in more diverse recommendations, while a smaller k may result in more similar recommendations.

**4. We combine the recommendations from all of the movies that the user has rated to create a list of recommended movies for the user.**

The method for combining recommendations from different movies can also affect the recommendations. One common approach is to simply aggregate the recommendations and sort them by some relevance metric.

**5. Sort the recommended movies by some relevance metric: similarity score weighted by user ratings.**
For example, if the user rated movie1 high and movie2 low, it is only logical to recommend more movies similar to movie1.


# Data preparation


In [None]:
# Preview the cleaned movies data before building the feature table
movies.head()

In [None]:
# Drop the timestamp column, which is not used by the recommender. Reassigning
# the result with errors='ignore' keeps the cell re-runnable: an in-place drop
# raises a KeyError when executed a second time because the column is gone.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')

In [None]:
# Start the feature table from a copy of movies without the free-text columns
# (title and the raw genres string); the original movies frame is left untouched
movies_features = movies.drop(columns=['title', 'genres'])

In [None]:
# Merge with genres
# The feature table is rebuilt from `movies` here instead of merging into the
# existing `movies_features`, so re-running this cell cannot merge the genre
# columns in a second time (which would produce suffixed duplicate columns).
# Rows are matched on the shared index; the resulting column order is
# movieId, year, then one indicator column per genre.
movies_features = pd.merge(
    movies.drop(columns=['title', 'genres']),
    genres_df,
    left_index=True,
    right_index=True,
)

In [None]:
# Last rows of the feature table: movieId, year, then the genre indicator columns
movies_features.tail()

# Content based filtering, with Features = Genres

In [None]:
# The genre indicator columns only (columns 0 and 1 are movieId and year)
movies_features.iloc[:, 2:]

In [None]:
# Nearest-neighbour index over the genre indicator columns only: column 0 is
# movieId and column 1 is year, so iloc[:, 2:] keeps just the genres.
# Neighbours are ranked by cosine distance; 10 are returned per query.
model = NearestNeighbors(n_neighbors=10, metric='cosine')
model.fit(movies_features.iloc[:, 2:])

In [None]:
# Define a function to recommend movies based on a given movie
def recommend_movies(movie_idx, features, model):
    """Return the nearest neighbours of a feature vector as rows of `movies`.

    Parameters
    ----------
    movie_idx
        Identifier of the query movie. Accepted for the callers' convenience
        but not used by the lookup itself.
    features
        2-D array-like / DataFrame of query rows, laid out like the data the
        model was fitted on.
    model
        A fitted estimator exposing ``kneighbors(features)`` that returns
        ``(distances, indices)``.

    Returns
    -------
    pandas.DataFrame
        The rows of the module-level ``movies`` frame found at the returned
        neighbour positions, in the order given by the model, with an extra
        ``distance`` column holding the matching distances.
    """
    distances, indices = model.kneighbors(features)

    # The model returns row positions, so the movies are selected positionally
    # in one step; the copy keeps the new column out of the shared `movies` frame.
    recommended_movies = movies.iloc[indices.ravel()].copy()
    recommended_movies['distance'] = distances.ravel()

    return recommended_movies

In [None]:
def find_idx_by_title(title):
    """Return the index label of the first movie whose title equals `title`.

    The value comes from the index of the module-level ``movies`` frame, so it
    is a label (as used by ``.loc``).
    NOTE(review): callers in this notebook also pass it to ``.iloc``; that is
    only valid while the index of ``movies`` is a gap-free 0..n-1 range.

    Raises
    ------
    IndexError
        If no movie has exactly this title.
    """
    matches = movies[movies['title'] == title].index
    if len(matches) == 0:
        # Same exception type as before, but with a message that names the cause
        raise IndexError(f"no movie titled {title!r} in the movies dataframe")
    return matches[0]

In [None]:
def find_title_by_idx(idx):
    """Return the title of the movie stored under index label `idx` in `movies`."""
    return movies.loc[idx, 'title']

In [None]:
# Sanity check on "Toy Story (1995)": query the genre-only model with that
# movie's own genre columns (a one-row frame selected with a list indexer)
index = find_idx_by_title("Toy Story (1995)")

recommend_movies(index, movies_features.iloc[[index], 2:], model)

As you can see, all the recommended movies have genres similar to those of "Toy Story (1995)".


# Content based filtering, with Features = Release year

In [None]:
# Nearest-neighbour index over the release year alone (column 1 of
# movies_features, wrapped in a one-column frame), ranked by Euclidean distance
model_1 = NearestNeighbors(n_neighbors=10, metric='euclidean')
model_1.fit(pd.DataFrame(movies_features.iloc[:, 1]))

In [None]:
# The single feature column (year) that model_1 was fitted on
pd.DataFrame(movies_features.iloc[:, 1])

In [None]:
# The one-row, year-only query frame for the movie at position `index`
pd.DataFrame(movies_features.iloc[index, 1:2]).transpose()

In [None]:
# Sanity check on "Toy Story (1995)": query the year-only model with that
# movie's release year (a one-row, one-column frame)
index = find_idx_by_title("Toy Story (1995)")

recommend_movies(index, movies_features.iloc[[index], 1:2], model_1)

# Content based filtering, with Features = genres + release year

In [None]:
# Cosine distance between the genre parts of two feature vectors
def compute_genre_similarity(movie1, movie2):
    """Return the cosine *distance* between the genre parts of two vectors.

    Despite the name, the result is a distance: 0.0 when the genre vectors
    point in the same direction, growing as they diverge.

    Parameters
    ----------
    movie1, movie2 : 1-D array-like of numbers
        Feature vectors whose first element is skipped (compute_year_similarity
        reads that element as the year); the remaining elements are compared.

    Returns
    -------
    float
        ``1 - cos(angle)`` clipped to ``[0, 2]``. A plain Python float is
        returned (not a 1x1 array), which is what a distance callable has to
        hand back. If either genre vector is all zeros the distance is 1.0.
    """
    genres1 = np.asarray(movie1, dtype=float)[1:]
    genres2 = np.asarray(movie2, dtype=float)[1:]

    norm_product = float(np.linalg.norm(genres1) * np.linalg.norm(genres2))
    if norm_product == 0.0:
        # A zero vector has no direction: treat its cosine similarity as 0
        return 1.0

    cosine = float(np.dot(genres1, genres2)) / norm_product
    # Clip away floating-point round-off outside the valid range
    return float(min(max(1.0 - cosine, 0.0), 2.0))

In [None]:
# Exponential-decay distance between two release years
def compute_year_similarity(x1, x2, decay=10.0):
    """Return ``1 - exp(-|year1 - year2| / decay)`` for two feature vectors.

    Despite the name, the result is a distance: 0.0 for the same year,
    approaching 1.0 as the gap between the years grows.

    Parameters
    ----------
    x1, x2 : sequence
        Feature vectors whose first element is the release year. The year is
        converted with ``float()``, so numeric strings are accepted too.
    decay : float, default 10.0
        Scale of the decay in years; must be positive.

    Returns
    -------
    float
        Distance in the range ``[0, 1)``.

    Raises
    ------
    ValueError
        If `decay` is not positive.
    """
    if decay <= 0:
        raise ValueError("decay must be positive")
    year_gap = abs(float(x1[0]) - float(x2[0]))
    return 1.0 - math.exp(-year_gap / decay)

The next function can be a good starting point to compute similarity between two movies based on genres and release year. However, it depends on the specific use case and the weight given to each similarity measure.


Adding these scores together assumes that they have equal weight in determining the overall similarity between two movies. This may not be appropriate in all cases, as the importance of genres and release years in determining similarity can vary depending on the context.

Multiplying them means that the final similarity score will be affected by the difference in magnitude between the genre similarity score and the year similarity score.

In [None]:
def compute_distance(x1, x2, year_weight=0.2, genre_weight=0.8):
    """Weighted sum of the year distance and the genre distance of two movies.

    Used as the callable metric of the combined nearest-neighbour model.

    Parameters
    ----------
    x1, x2 : 1-D array-like
        Feature vectors laid out as ``[year, genre_1, ..., genre_n]``.
    year_weight : float, default 0.2
        Weight of the value returned by compute_year_similarity.
    genre_weight : float, default 0.8
        Weight of the value returned by compute_genre_similarity.

    Returns
    -------
    float
        ``year_weight * year_distance + genre_weight * genre_distance`` as a
        plain scalar.
    """
    # np.squeeze collapses a 1x1 array result to a scalar, so the metric returns
    # a plain number whatever the genre helper hands back
    genre_distance = float(np.squeeze(compute_genre_similarity(x1, x2)))
    year_distance = float(compute_year_similarity(x1, x2))

    return year_distance * year_weight + genre_distance * genre_weight


In [None]:
# Preview the feature table before fitting the combined model
movies_features.head()

In [None]:
# The columns the combined model is fitted on: year plus genres (movieId, column 0, is excluded)
pd.DataFrame(movies_features.iloc[:, 1:])

In [None]:
# Nearest-neighbour index over year + genres (every column except movieId),
# using the custom compute_distance callable as the metric
nn = NearestNeighbors(n_neighbors=10, metric=compute_distance)
nn.fit(pd.DataFrame(movies_features.iloc[:, 1:]))

In [None]:
# Sanity check on "Toy Story (1995)": query the combined model with that
# movie's year and genre columns (a one-row frame, movieId left out)
index = find_idx_by_title("Toy Story (1995)")

recommend_movies(index, movies_features.iloc[[index], 1:], nn)

The movies recommended are pretty similar to the input movie, in terms of genres and release decade (year).

# Recommend movies for user

In [None]:
# userId of the user for whom recommendations are generated below
user_id = 6526

In [None]:
# Number of ratings per userId; value_counts() lists the most active users first
print(ratings['userId'].value_counts())

In [None]:
def get_ratings_per_user(user_id):
    """Return the rows of `ratings` that belong to `user_id`, without the userId column."""
    belongs_to_user = ratings['userId'] == user_id
    return ratings.loc[belongs_to_user].drop(columns=['userId'])

In [None]:
# All ratings given by the selected user (userId column removed)
ratings_per_user = get_ratings_per_user(user_id)

In [None]:
# Display the selected user's ratings
ratings_per_user

In [None]:
# Attach the movie attributes (title, genres, year) to the user's ratings by joining on movieId
rated_movies = pd.merge(ratings_per_user, movies, on='movieId')

In [None]:
# Display the user's ratings together with the movie attributes
rated_movies

Looks like this user likes a lot of genres, but it is clear that he hates horror movies 👻.

In [None]:
def find_index_by_id(movie_id):
    """Return the index label of the first movie whose movieId equals `movie_id`.

    The value comes from the index of the module-level ``movies`` frame, so it
    is a label (as used by ``.loc``).
    NOTE(review): it is only interchangeable with a row position for ``.iloc``
    while the index of ``movies`` is a gap-free 0..n-1 range.

    Raises
    ------
    IndexError
        If `movie_id` is not present in ``movies`` (for example because the
        movie was removed during preprocessing).
    """
    matches = movies[movies['movieId'] == movie_id].index
    if len(matches) == 0:
        # Same exception type as before, but with a message that names the cause
        raise IndexError(f"movieId {movie_id!r} is not present in the movies dataframe")
    return matches[0]

In [None]:
# Collect the neighbours of every movie the user rated and score them.
# relevance = (1 + distance) * (1 / rating); the smaller, the better:
#   * dividing by the rating favours neighbours of highly rated movies;
#   * adding 1 keeps the rating influential when the distance is 0
#     (otherwise 0 * 1/rating would be 0 for every rating).
recs_per_movie = []
for rated in ratings_per_user.itertuples(index=False):
    movie_id = int(rated.movieId)

    # Select the feature row by movieId rather than by position, so the lookup
    # stays correct even when index labels and row positions differ.
    features = movies_features.loc[movies_features['movieId'] == movie_id].iloc[:, 1:]
    if features.empty:
        # The rated movie is not in the feature table (e.g. it was dropped for
        # lacking a release year): nothing to base a recommendation on.
        continue

    recs_by_movie = recommend_movies(movie_id, features, nn)
    recs_by_movie['relevance'] = (1 + recs_by_movie['distance']) * (1 / rated.rating)
    recs_per_movie.append(recs_by_movie)

# Concatenate once at the end instead of growing a DataFrame inside the loop
if recs_per_movie:
    recs = pd.concat(recs_per_movie, ignore_index=True)
else:
    recs = pd.DataFrame(columns=[*movies.columns, 'distance', 'relevance'])

# Every rated movie is its own nearest neighbour (distance 0), so drop what the
# user has already rated, and keep each remaining movie once with its best
# (smallest) relevance.
recs = (
    recs.loc[~recs['movieId'].isin(ratings_per_user['movieId'])]
    .sort_values('relevance')
    .drop_duplicates(subset='movieId', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Display all candidate recommendations with their distance and relevance
recs

In [None]:
# The 20 candidates with the largest relevance value; since smaller relevance
# means a better recommendation, these are the worst matches for the user
bottom_20 = recs.nlargest(20, 'relevance')

In [None]:
# Display the 20 worst-matching candidates
bottom_20

Our recommender system is aware that this user hates horror movies

In [None]:
# The 10 candidates with the smallest relevance value, i.e. the best recommendations
top_10 = recs.nsmallest(10, 'relevance')

In [None]:
# Display the 10 best recommendations
top_10

In [None]:
# The 20 candidates with the smallest relevance value, i.e. the best recommendations
top_20 = recs.nsmallest(20, 'relevance')

In [None]:
# Display the 20 best recommendations
top_20