In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import re

# Data Preparation

In [50]:
# import data
movies =    pd.read_csv('data/movies.csv')
links =     pd.read_csv('data/links.csv')
ratings =   pd.read_csv('data/ratings.csv')
tags =      pd.read_csv('data/tags.csv')

# clean titles column by moving "The" to the beginning of the string
# # this makes it more searchable for users
movies.loc[lambda df: df["title"].str.contains(", The", regex=True), 'title'] = 'The ' + movies['title']
movies.loc[lambda df: df["title"].str.contains(", The", regex=True), 'title'] = movies['title'].str.replace(", The", '', regex=True)

# Popularity-Based Recommender

In [27]:
def get_popular_recommendations(n, genres):
    return (
        ratings
            .groupby('movieId')
            .agg(avg_rating = ('rating', 'mean'), num_ratings = ('rating', 'count'))
            .merge(movies, on='movieId')
            .assign(combined_rating = lambda x: x['avg_rating'] * x['num_ratings']**0.5)
            [lambda df: df["genres"].str.contains(genres, regex=True)]
            .sort_values('combined_rating', ascending=False)
            .head(n)
            [['title', 'avg_rating', 'num_ratings', 'genres']]
    )

In [28]:
def transform_genre_to_regex(genres):
    regex = ""
    index = 0
    for genre in genres:
        regex += f"(?=.*{genre})"

    return regex

In [29]:
genres = ['Action', 'Drama', 'Comedy']

In [30]:
get_popular_recommendations(10, transform_genre_to_regex(genres))

Unnamed: 0,title,avg_rating,num_ratings,genres
1474,Lethal Weapon (1987),3.673333,75,Action|Comedy|Crime|Drama
2173,Three Kings (1999),3.711111,45,Action|Adventure|Comedy|Drama|War
1075,Sneakers (1992),3.478261,46,Action|Comedy|Crime|Drama|Sci-Fi
118,Bad Boys (1995),3.245098,51,Action|Comedy|Crime|Drama|Thriller
3047,Beverly Hills Cop (1984),3.402174,46,Action|Comedy|Crime|Drama
1475,Lethal Weapon 2 (1989),3.180851,47,Action|Comedy|Crime|Drama
1476,Lethal Weapon 3 (1992),2.934783,46,Action|Comedy|Crime|Drama
5158,The Blind Swordsman: Zatoichi (Zatôichi) (2003),3.958333,12,Action|Comedy|Crime|Drama
3662,48 Hrs. (1982),3.590909,11,Action|Comedy|Crime|Drama
7109,The Men Who Stare at Goats (2009),3.388889,9,Action|Comedy|Drama


# Similarity-Based Recommender

In [51]:
movie_user_matrix = (
                ratings
                    .merge(movies, on='movieId')[['title', 'rating', 'userId']]
                    .pivot_table(index='title', columns='userId', values='rating')
                    .fillna(0)
                )
similarities_movies = pd.DataFrame(cosine_similarity(movie_user_matrix),
                                  index=movie_user_matrix.index,
                                  columns=movie_user_matrix.index)

In [31]:
def get_similar_recommendations(movie_title, n, genres):

    # calculate similarity for chosen movie
    similarities = pd.DataFrame(
        (similarities_movies.query("index != @movie_title")[movie_title] / sum(similarities_movies.query("index != @movie_title")[movie_title]))
        .sort_values(ascending= False))

    # compute weighted averages and return the n movies with the highest predicted ratings
    recommendations = (
        similarities
            .head(n)
            .merge(movies, how= 'left', left_index = True, right_on = 'title')
            [lambda df: df["genres"].str.contains(genres, regex=True)]
            .title)

    return recommendations

In [111]:
# calculate similarity for chosen movie
similarities = pd.DataFrame(
    (similarities_movies.query("index != @movie_title")[movie_title] / sum(similarities_movies.query("index != @movie_title")[movie_title]))
    .sort_values(ascending= False))

# compute weighted averages and return the n movies with the highest predicted ratings
recommendations = (
    similarities
        .head(10)
        .merge(movies, how= 'left', left_index = True, right_on = 'title')
        [lambda df: df["genres"].str.contains(genres, regex=True)]
        [['title', 'genres']]
        )


In [53]:
movie_title = "The King's Speech (2010)" 

In [56]:
get_similar_recommendations(movie_title, 10, '')

7039                                            Up (2009)
7665                                      The Help (2011)
7214                               Sherlock Holmes (2009)
7372                                     Inception (2010)
7906                                         Brave (2012)
4644                                 Love Actually (2003)
8372                      The Grand Budapest Hotel (2014)
8303                                        Frozen (2013)
6772                                        WALL·E (2008)
7644    Harry Potter and the Deathly Hallows: Part 2 (...
Name: title, dtype: object

In [125]:
def find_movie_title(user_input):
    title_list = movies.title.unique()
    
    r = re.compile(f".*{user_input}.*")
    result = []

    for title in title_list:
        match = r.findall(title)
        if match:
            result.append(match)
    
    return result[0][0]

In [126]:
find_movie_title('Matrix')

'Matrix, The (1999)'

# User-Based Recommender

In [8]:
# create "database" to use for recommendations
user_item_matrix = all_ratings_pivoted.fillna(0)

similarities_users = pd.DataFrame(cosine_similarity(user_item_matrix),
                                  index=user_item_matrix.index,
                                  columns=user_item_matrix.index)

In [9]:
def get_user_recommendations(user_id, n):
    
    # calculate weights for ratings
    weights = similarities_users.query("index != @user_id")[user_id] / sum(similarities_users.query("index != @user_id")[user_id])

    # get unwatched movies for recommendations
    unwatched_movies = user_item_matrix.loc[user_item_matrix.index != user_id, user_item_matrix.loc[user_id,:] == 0].T

    # compute weighted averages and return the n movies with the highest predicted ratings
    weighted_averages = pd.DataFrame(unwatched_movies.dot(weights), columns = ["predicted_rating"])
    recommendations = weighted_averages.sort_values("predicted_rating", ascending=False).head(n).index

    return recommendations

In [58]:
get_user_recommendations(5, 10)

Index(['Forrest Gump (1994)', 'Silence of the Lambs, The (1991)',
       'Jurassic Park (1993)', 'Seven (a.k.a. Se7en) (1995)', 'Speed (1994)',
       'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
       'Die Hard: With a Vengeance (1995)',
       'Star Wars: Episode IV - A New Hope (1977)', 'Matrix, The (1999)',
       'Independence Day (a.k.a. ID4) (1996)'],
      dtype='object', name='title')