In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import re
import requests
import json
import api_keys

# Data Preparation

In [35]:
# import data
movies =    pd.read_csv('data/movies.csv')
links =     pd.read_csv('data/links.csv')
ratings =   pd.read_csv('data/ratings.csv')
tags =      pd.read_csv('data/tags.csv')

# clean titles column by moving "The" and "A" to the beginning of the string
# # this makes it more searchable for users
movies.loc[lambda df: df["title"].str.contains(", The", regex=True), 'title'] = 'The ' + movies['title']
movies.loc[lambda df: df["title"].str.contains(", The", regex=True), 'title'] = movies['title'].str.replace(", The", '', regex=True)

movies.loc[lambda df: df["title"].str.contains(", A", regex=True), 'title'] = 'A ' + movies['title']
movies.loc[lambda df: df["title"].str.contains(", A", regex=True), 'title'] = movies['title'].str.replace(", A", '', regex=True)

# Popularity-Based Recommender

In [118]:
def get_popular_recommendations(n, genres):
    recommendations = (
        ratings
            .groupby('movieId')
            .agg(avg_rating = ('rating', 'mean'), num_ratings = ('rating', 'count'))
            .merge(movies, on='movieId')
            .assign(combined_rating = lambda x: x['avg_rating'] * x['num_ratings']**0.5)
            [lambda df: df["genres"].str.contains(genres, regex=True)]
            .sort_values('combined_rating', ascending=False)
            .head(n)
            [['title', 'avg_rating', 'num_ratings', 'genres']]
    )

    recommendations_ids =   (
                            recommendations
                                .merge(movies, how = 'left', on = 'title')
                                .merge(links, how = 'left', on = 'movieId')
                                # [['title', 'genres', 'imdbId']]
                            )
    recommendations_ids['imdbId'] = 'tt0' + recommendations_ids['imdbId'].astype('str')
    imdb_ids = list(recommendations_ids['imdbId'])

    return recommendations_ids

In [126]:
recommendations = (
    ratings
        .groupby('movieId')
        .agg(avg_rating = ('rating', 'mean'), num_ratings = ('rating', 'count'))
        .merge(movies, on='movieId')
        .assign(combined_rating = lambda x: x['avg_rating'] * x['num_ratings']**0.5)
        [lambda df: df["genres"].str.contains(transform_genre_to_regex(genres), regex=True)]
        .sort_values('combined_rating', ascending=False)
        .head(n)
        [['title', 'avg_rating', 'num_ratings', 'genres', 'movieId']]
)

recommendations_ids =   (
                        recommendations
                            .merge(links, how = 'left', on = 'movieId')
                            # [['title', 'genres', 'imdbId']]
                        )
recommendations_ids['imdbId'] = 'tt0' + recommendations_ids['imdbId'].astype('str')
imdb_ids = list(recommendations_ids['imdbId'])

# recommendations_ids

In [121]:
recommendations.merge(movies, how = 'left', on = 'title').

Unnamed: 0,title,avg_rating,num_ratings,genres_x,movieId,genres_y
0,Lethal Weapon (1987),3.673333,75,Action|Comedy|Crime|Drama,2000,Action|Comedy|Crime|Drama
1,Three Kings (1999),3.711111,45,Action|Adventure|Comedy|Drama|War,2890,Action|Adventure|Comedy|Drama|War
2,Sneakers (1992),3.478261,46,Action|Comedy|Crime|Drama|Sci-Fi,1396,Action|Comedy|Crime|Drama|Sci-Fi


In [113]:
def transform_genre_to_regex(genres):
    regex = ""
    index = 0
    for genre in genres:
        regex += f"(?=.*{genre})"

    return regex

In [114]:
genres = ['Action', 'Drama', 'Comedy']

In [119]:
get_popular_recommendations(10, transform_genre_to_regex(genres))

KeyError: "['genres'] not in index"

# Similarity-Based Recommender

In [31]:
movie_user_matrix = (
                ratings
                    .merge(movies, on='movieId')[['title', 'rating', 'userId']]
                    .pivot_table(index='title', columns='userId', values='rating')
                    .fillna(0)
                )
similarities_movies = pd.DataFrame(cosine_similarity(movie_user_matrix),
                                  index=movie_user_matrix.index,
                                  columns=movie_user_matrix.index)

In [87]:
def get_similar_recommendations(movie_title, n, genres):
    
    # select similarity for chosen movie
    similarities = pd.DataFrame(
        (similarities_movies.query("index != @movie_title")[movie_title] / sum(similarities_movies.query("index != @movie_title")[movie_title]))
        .sort_values(ascending= False))
 
    # exclude genres if necessary and return the n movies with the highest similarity
    recommendations = (
        similarities
            .merge(movies, how= 'left', left_index = True, right_on = 'title')
            [lambda df: df["genres"].str.contains(genres, regex=True)]
            .head(n)
            [['title', 'genres']]
            )

    return recommendations

In [90]:
movie_title = "A Beautiful Mind (2001)" 
n = 3
genres = ''

In [91]:
# select similarity for chosen movie
similarities = pd.DataFrame(
    (similarities_movies.query("index != @movie_title")[movie_title] / sum(similarities_movies.query("index != @movie_title")[movie_title]))
    .sort_values(ascending= False))

# exclude genres if necessary and return the n movies with the highest similarity
recommendations = (
    similarities
        .merge(movies, how= 'left', left_index = True, right_on = 'title')
        [lambda df: df["genres"].str.contains(genres, regex=True)]
        .head(n)
        [['title', 'genres', 'movieId']]
        )

In [62]:
sample_output = get_similar_recommendations(movie_title, 3, '')

In [125]:
def find_movie_title(user_input):
    title_list = movies.title.unique()
    
    r = re.compile(f".*{user_input}.*")
    result = []

    for title in title_list:
        match = r.findall(title)
        if match:
            result.append(match)
    
    return result[0][0]

In [126]:
find_movie_title('Matrix')

'Matrix, The (1999)'

In [None]:
def get_similar_recommendations_streaming(movie_title, n, genres, country, url, headers):

    # select similarity for chosen movie
    similarities = pd.DataFrame(
        (similarities_movies.query("index != @movie_title")[movie_title] / sum(similarities_movies.query("index != @movie_title")[movie_title]))
        .sort_values(ascending= False))

    # exclude genres if necessary and return the n movies with the highest similarity
    recommendations = (
        similarities
            .merge(movies, how= 'left', left_index = True, right_on = 'title')
            [lambda df: df["genres"].str.contains(genres, regex=True)]
            .head(n)
            [['title', 'genres', 'movieId']]
            )

    # merge recommendations with links df to get imdbIds for the API calls
    recommendations_ids = recommendations.merge(links, how = 'left', on = 'movieId')[['title', 'genres', 'imdbId']]
    recommendations_ids['imdbId'] = 'tt0' + recommendations_ids['imdbId'].astype('str')
    imdb_ids = list(recommendations_ids['imdbId'])

    # create new column for streaming links
    recommendations_ids['Streaming Availability'] = ""

    # loop through imdb_ids to make one api call for each to get available streaming links
    for id in imdb_ids:

        # make api call
        querystring = {"country":country,"imdb_id":id,"output_language":"en"}
        response = requests.request("GET", url, headers=headers, params=querystring)
        streaming_info = response.json()

        for streaming_service in streaming_info['streamingInfo']:
            recommendations_ids.loc[recommendations_ids['imdbId'] == id, 'Streaming Availability'] += f"{streaming_service}: {streaming_info['streamingInfo'][streaming_service][country]['link']} \n" 

# User-Based Recommender

In [8]:
# create "database" to use for recommendations
user_item_matrix = all_ratings_pivoted.fillna(0)

similarities_users = pd.DataFrame(cosine_similarity(user_item_matrix),
                                  index=user_item_matrix.index,
                                  columns=user_item_matrix.index)

In [9]:
def get_user_recommendations(user_id, n):
    
    # calculate weights for ratings
    weights = similarities_users.query("index != @user_id")[user_id] / sum(similarities_users.query("index != @user_id")[user_id])

    # get unwatched movies for recommendations
    unwatched_movies = user_item_matrix.loc[user_item_matrix.index != user_id, user_item_matrix.loc[user_id,:] == 0].T

    # compute weighted averages and return the n movies with the highest predicted ratings
    weighted_averages = pd.DataFrame(unwatched_movies.dot(weights), columns = ["predicted_rating"])
    recommendations = weighted_averages.sort_values("predicted_rating", ascending=False).head(n).index

    return recommendations

In [58]:
get_user_recommendations(5, 10)

Index(['Forrest Gump (1994)', 'Silence of the Lambs, The (1991)',
       'Jurassic Park (1993)', 'Seven (a.k.a. Se7en) (1995)', 'Speed (1994)',
       'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
       'Die Hard: With a Vengeance (1995)',
       'Star Wars: Episode IV - A New Hope (1977)', 'Matrix, The (1999)',
       'Independence Day (a.k.a. ID4) (1996)'],
      dtype='object', name='title')

# Streaming API

## notes

* imdb-ids have "tt0" as prefix

In [20]:
url = "https://streaming-availability.p.rapidapi.com/get/basic"

querystring = {"country":"de","imdb_id":"tt0268978","output_language":"en"}

headers = {
	"X-RapidAPI-Key": "d3bf10e76bmshe58fc89e9aaa547p1c119cjsncdd51d83db32",
	"X-RapidAPI-Host": "streaming-availability.p.rapidapi.com"
}

response = requests.request("GET", url, headers=headers, params=querystring)

In [27]:
country = 'de'
streaming_platform = 'netflix'

In [21]:
streaming_info = response.json()

In [44]:
streaming_info['streamingInfo']

{'netflix': {'de': {'link': 'https://www.netflix.com/title/60021793/',
   'added': 1659998955,
   'leaving': 0}},
 'prime': {'de': {'link': 'https://www.amazon.de/gp/video/detail/0J3OTJNXZRY1JRCLCIMD37FKTZ/',
   'added': 1636925663,
   'leaving': 0}}}

In [108]:
movie_title = "A Beautiful Mind (2001)" 
n = 3
genres = ''

# select similarity for chosen movie
similarities = pd.DataFrame(
    (similarities_movies.query("index != @movie_title")[movie_title] / sum(similarities_movies.query("index != @movie_title")[movie_title]))
    .sort_values(ascending= False))

# exclude genres if necessary and return the n movies with the highest similarity
recommendations = (
    similarities
        .merge(movies, how= 'left', left_index = True, right_on = 'title')
        [lambda df: df["genres"].str.contains(genres, regex=True)]
        .head(n)
        [['title', 'genres', 'movieId']]
        )

# merge recommendations with links df to get imdbIds for the API calls
recommendations_ids = recommendations.merge(links, how = 'left', on = 'movieId')[['title', 'genres', 'imdbId']]
recommendations_ids['imdbId'] = 'tt0' + recommendations_ids['imdbId'].astype('str')
imdb_ids = list(recommendations_ids['imdbId'])

# create new column for streaming links
recommendations_ids['Streaming Availability'] = ""

# loop through imdb_ids to make one api call for each to get available streaming links
for id in imdb_ids:

    # make api call
    querystring = {"country":country,"imdb_id":id,"output_language":"en"}
    response = requests.request("GET", url, headers=headers, params=querystring)
    streaming_info = response.json()

    for streaming_service in streaming_info['streamingInfo']:
        recommendations_ids.loc[recommendations_ids['imdbId'] == id, 'Streaming Availability'] += f"{streaming_service}: {streaming_info['streamingInfo'][streaming_service][country]['link']} \n"  

In [109]:
for id in imdb_ids:

    # make api call
    querystring = {"country":country,"imdb_id":id,"output_language":"en"}
    response = requests.request("GET", url, headers=headers, params=querystring)
    streaming_info = response.json()

    for streaming_service in streaming_info['streamingInfo']:
        recommendations_ids.loc[recommendations_ids['imdbId'] == id, 'Streaming Availability'] += f"{streaming_service}: {streaming_info['streamingInfo'][streaming_service][country]['link']} \n"  

In [107]:
for id in imdb_ids:

    test_list= [1 , 2 , 3]

    for streaming_service in test_list:
        recommendations_ids.loc[recommendations_ids['imdbId'] == id, 'Streaming Availability'] += f"{streaming_service}: {streaming_service} \n" 

In [61]:
recommendations_ids['Streaming Availability'] = ""

for streaming_service in streaming_info['streamingInfo']:
    sample_output['Streaming Availability'] += f"{streaming_service}: {streaming_info['streamingInfo'][streaming_service][country]['link']} \n"  

In [29]:
streaming_info['streamingInfo'][streaming_platform][country]['link']

'https://www.netflix.com/title/60021793/'

In [19]:
movies.merge(links, how= 'left', on= 'movieId').loc[lambda df: df['title'].str.contains('Beautiful Mind')]

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
3640,4995,A Beautiful Mind (2001),Drama|Romance,268978,453.0
