In [1]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

### Load prepared data

In [2]:
ratings = pd.read_csv("data/ratings_prepared.csv")
movies = pd.read_csv("data/movies_prepared.csv")

In [3]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
4,0,4,5.0,964982931


In [4]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,Grumpier Old Men (1995),Comedy|Romance
2,2,Heat (1995),Action|Crime|Thriller
3,3,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,4,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


### Initialize a sparse user-item rating matrix

In [5]:
# (data, (row_ind, col_ind))
R = csr_matrix((ratings["rating"], (ratings["user_id"], ratings["movie_id"])))

In [6]:
df_r = pd.DataFrame(R.todense())
df_r

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
0,4.0,4.0,4.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.5,0.0,0.0,3.0,4.5,4.0,0.0,3.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
606,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,2.5,2.0,0.0,4.5,4.5,3.0,0.0,4.0,0.0,3.0,...,0.0,0.5,0.0,0.0,4.5,0.0,2.5,1.0,1.5,0.0
608,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
R.shape

(610, 1235)

### Create a model and set the hyperparameters

In [8]:
# Initialize the NearestNeighbors model
model = NearestNeighbors(metric='cosine')

# fit it to the Ratings matrix
model.fit(R)

### Save/load model to file

In [9]:
with open('./model_neighbors.pkl', 'wb') as file:
    pickle.dump(model, file)

In [10]:
with open('./model_neighbors.pkl', 'rb') as file:
    model = pickle.load(file)

### Recommend movies

In [11]:
# movie_id: rating
user_query = {
    10: 4,  # Billy Madison (1995)
    100: 3, # Bambi (1942)
    555: 3.5,  # Mortal Kombat (1995)
    756: 2,  # Inside Man (2006)
    1224: 5,  # Babe: Pig in the City (1998)
}

#### Construct a user vector

Input has to be the same as during training

In [12]:
df_new_user = pd.DataFrame(user_query, columns=movies["movie_id"], index=["new_user"])
df_new_user

movie_id,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
new_user,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Fill missing values
df_new_user_filled = df_new_user.fillna(0)
df_new_user_filled

movie_id,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
new_user,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Calculate the score**

1. find the neighborhood of $n$ similar users
2. use their ratings to calculate a score

In [14]:
# Calculate the distances to all other users in the data!
similarity_scores, neighbor_ids = model.kneighbors(
    df_new_user_filled,
    n_neighbors=5,
    return_distance=True,
)

# sklearn returns a list of predictions
# extract the first and only value of the list
df_neighbors = pd.DataFrame(
    data={
        "neighbor_id": neighbor_ids[0],
        "similarity_score": similarity_scores[0],
    }
)

df_neighbors.sort_values("similarity_score", ascending=False, inplace=True)
df_neighbors

Unnamed: 0,neighbor_id,similarity_score
4,283,0.904735
3,497,0.900674
2,135,0.900255
1,239,0.898896
0,499,0.89807


In [15]:
# Look at ratings for users that are similar
neighborhood = df_r.iloc[neighbor_ids[0]]
neighborhood

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
499,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239,5.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,0.0,0.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283,0.0,0.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Filter out seen movies

In [16]:
neighborhood_filtered = neighborhood.drop(user_query.keys(), axis=1)
neighborhood_filtered

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
499,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239,5.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,0.0,0.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283,0.0,0.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Calculate score

In [17]:
df_get_score = df_neighbors.set_index("neighbor_id")
df_get_score

Unnamed: 0_level_0,similarity_score
neighbor_id,Unnamed: 1_level_1
283,0.904735
497,0.900674
135,0.900255
239,0.898896
499,0.89807


In [18]:
# Multiply the ratings with the similarity score of each user and
# calculate the summed up rating for each movie

df_score = neighborhood_filtered.apply(
    lambda x: df_get_score.loc[x.index]["similarity_score"] * x
)
df_score

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
499,3.592278,0.0,0.0,0.0,0.0,0.0,0.89807,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239,4.494479,3.595583,0.0,0.0,0.0,0.0,0.0,4.494479,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135,0.0,0.0,0.0,4.501273,0.0,0.0,0.0,4.501273,4.501273,0.900255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,0.0,0.0,0.0,2.702023,0.0,0.0,0.0,4.503372,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
283,0.0,0.0,0.0,2.714206,0.0,0.0,0.0,4.523677,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Rank the movie

In [19]:
df_score_ranked = df_score.sum(axis=0).reset_index().sort_values(0, ascending=False)
df_score_ranked.columns = ["movie_id", "score"]
df_score_ranked.reset_index(drop=True, inplace=True)
df_score_ranked.head(5)

Unnamed: 0,movie_id,score
0,233,18.915674
1,7,18.0228
2,17,17.118065
3,340,17.113806
4,19,17.113806


#### Give recommendations

In [20]:
recommendations = movies[movies["movie_id"].isin(df_score_ranked.iloc[:5]["movie_id"])]
recommendations[["movie_id", "title"]]

Unnamed: 0,movie_id,title
7,7,Braveheart (1995)
17,17,Tommy Boy (1995)
19,19,Forrest Gump (1994)
233,233,Aladdin (1992)
340,340,Happy Gilmore (1996)


## Recommender function

In [21]:
def recommended_movies(query, model, ratings, movies, k=10):
    """
    Filters and recommends the top k movies for any given input query
    based on a trained nearest neighbors model.
    Returns a list of k movie ids and corresponding movie titles.
    """

    # Create user vector
    df_new_user = pd.DataFrame(query, columns=movies["movie_id"], index=["new_user"])

    # Fill missing values
    df_new_user_filled = df_new_user.fillna(0)

    # Calculate the distances to other users
    similarity_scores, neighbor_ids = model.kneighbors(
        df_new_user_filled,
        n_neighbors=5,
        return_distance=True,
    )

    # Save ids and scores in a DataFrame
    df_neighbors = pd.DataFrame(
        data={
            "neighbor_id": neighbor_ids[0],
            "similarity_score": similarity_scores[0],
        }
    )

    df_neighbors.sort_values("similarity_score", ascending=False, inplace=True)

    # Calculate CSR Matrix (R) and convert do Dataframe
    R = csr_matrix((ratings["rating"], (ratings["user_id"], ratings["movie_id"])))
    df_r = pd.DataFrame(R.todense())

    # Filte to only show similar users
    neighborhood = df_r.iloc[neighbor_ids[0]]

    # Filter out movies rated by the user
    neighborhood_filtered = neighborhood.drop(user_query.keys(), axis=1)

    # Multiply the ratings with the similarity score of each user and
    # calculate the summed up rating for each movie
    df_get_score = df_neighbors.set_index("neighbor_id")
    df_score = neighborhood_filtered.apply(
        lambda x: df_get_score.loc[x.index]["similarity_score"] * x
    )
    df_score_ranked = df_score.sum(axis=0).reset_index().sort_values(0, ascending=False)
    df_score_ranked.columns = ["movie_id", "score"]
    df_score_ranked.reset_index(drop=True, inplace=True)

    # Get movie ids of k best rated movies
    movie_ids = df_score_ranked.iloc[:k]["movie_id"]

    # Get corresponding titles in the same order
    titles = [movies.loc[id]["title"] for id in movie_ids]

    return movie_ids, titles

## Try some predictions

In [22]:
# movie_id: rating
user_query = {
    10: 4,  # Billy Madison (1995)
    100: 3, # Bambi (1942)
    555: 3.5,  # Mortal Kombat (1995)
    756: 2,  # Inside Man (2006)
    1224: 5,  # Babe: Pig in the City (1998)
}

movie_ids, titles = recommended_movies(user_query, model, ratings, movies, k=5)

print("Recommended movies:\n")
for i, title in enumerate(titles):
    print(f"{i+1}. {title}")

Recommended movies:

1. Aladdin (1992)
2. Braveheart (1995)
3. Tommy Boy (1995)
4. Happy Gilmore (1996)
5. Forrest Gump (1994)


In [23]:
# movie_id: rating
user_query = {
    2: 5,  # Heat (1995)
    34: 3,  # Fargo (1996)
    142: 3.5,  # Big (1988)
    556: 1,  # River Wild, The (1994)
    765: 5,  # Simpsons Movie, The (2007)
}

movie_ids, titles = recommended_movies(user_query, model, ratings, movies, k=5)

print("Recommended movies:\n")
for i, title in enumerate(titles):
    print(f"{i+1}. {title}")

Recommended movies:

1. Leaving Las Vegas (1995)
2. Rock, The (1996)
3. Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
4. Willy Wonka & the Chocolate Factory (1971)
5. Independence Day (a.k.a. ID4) (1996)


In [24]:
# movie_id: rating
user_query = {
    0: 5,  # Toy Story (1995)
    20: 4,  # Jungle Book, The (1994)
}

movie_ids, titles = recommended_movies(user_query, model, ratings, movies, k=5)

print("Recommended movies:\n")
for i, title in enumerate(titles):
    print(f"{i+1}. {title}")

Recommended movies:

1. Shawshank Redemption, The (1994)
2. Aladdin (1992)
3. Beauty and the Beast (1991)
4. Dances with Wolves (1990)
5. Usual Suspects, The (1995)


In [18]:
from imdb import Cinemagoer
from imdb.helpers import resizeImage

# create an instance of the Cinemagoer class
ia = Cinemagoer()

movies = ia.search_movie('matrix')

movies

[<Movie id:0133093[http] title:_The Matrix (1999)_>,
 <Movie id:10838180[http] title:_The Matrix Resurrections (2021)_>,
 <Movie id:0234215[http] title:_The Matrix Reloaded (2003)_>,
 <Movie id:0242653[http] title:_The Matrix Revolutions (2003)_>,
 <Movie id:0106062[http] title:_"Matrix" (1993)_>,
 <Movie id:9847360[http] title:_A Glitch in the Matrix (2021)_>,
 <Movie id:0410519[http] title:_The Matrix Recalibrated (2004) (V)_>,
 <Movie id:0365467[http] title:_Making 'The Matrix' (1999) (TV)_>,
 <Movie id:0364888[http] title:_"Threat Matrix" (2004) (2003)_>,
 <Movie id:11749868[http] title:_Matrix (2020)_>,
 <Movie id:11574780[http] title:_The Matrix Redeemed (2020)_>,
 <Movie id:0274085[http] title:_Sex and the Matrix (2000)_>,
 <Movie id:0109151[http] title:_Armitage III: Polymatrix (1996) (V)_>,
 <Movie id:0295432[http] title:_The Matrix Revisited (2001) (V)_>,
 <Movie id:0303678[http] title:_Armitage III: Dual Matrix (2001) (V)_>,
 <Movie id:2579522[http] title:_Escape the Matrix 

In [10]:
print(ia.get_movie_infoset())

['airing', 'akas', 'alternate versions', 'awards', 'connections', 'crazy credits', 'critic reviews', 'episodes', 'external reviews', 'external sites', 'faqs', 'full credits', 'goofs', 'keywords', 'list', 'locations', 'main', 'misc sites', 'news', 'official sites', 'parents guide', 'photo sites', 'plot', 'quotes', 'recommendations', 'release dates', 'release info', 'reviews', 'sound clips', 'soundtrack', 'synopsis', 'taglines', 'technical', 'trivia', 'tv schedule', 'video clips', 'vote details']


In [15]:
movie = ia.get_movie(movies[0].movieID)
print(sorted(movie.keys()))

['akas', 'animation department', 'art department', 'art direction', 'aspect ratio', 'assistant director', 'box office', 'camera and electrical department', 'canonical title', 'cast', 'casting department', 'casting director', 'certificates', 'cinematographer', 'color info', 'composer', 'costume department', 'costume designer', 'countries', 'country codes', 'cover url', 'director', 'distributors', 'editor', 'editorial department', 'full-size cover url', 'genres', 'imdbID', 'kind', 'language codes', 'languages', 'localized title', 'location management', 'long imdb canonical title', 'long imdb title', 'make up', 'miscellaneous crew', 'music department', 'original air date', 'original title', 'other companies', 'plot', 'plot outline', 'producer', 'production companies', 'production design', 'production manager', 'rating', 'runtimes', 'script department', 'set decoration', 'smart canonical title', 'smart long imdb canonical title', 'sound crew', 'sound mix', 'special effects', 'special effec

In [17]:
print(f"ID: {movies[0].movieID}\nTitle: {movies[0]['title']}\nCover: {movies[0]['full-size cover url']}")

ID: 0133093
Title: The Matrix
Cover: https://m.media-amazon.com/images/M/MV5BNzQzOTk3OTAtNDQ0Zi00ZTVkLWI0MTEtMDllZjNkYzNjNTc4L2ltYWdlXkEyXkFqcGdeQXVyNjU0OTQ0OTY@.jpg


In [20]:
image_url = resizeImage(movies[0]['full-size cover url'], width=200)

In [65]:
def display_movie(movie):
    directors = [director['name'] for director in movie['director']]
    cast = [actor['name'] for actor in movie['cast']]

    print(f"<img src=\"{resizeImage(movie['full-size cover url'], width=200)}\">")
    print(f"{movie['title']} ({movie['year']})")
    print(f"IMDB-rating: {movie['rating']}/10")
    print(f"Genres: {', '.join(movie['genres'])}")
    print(f"Director(s): {', '.join(directors)}")
    print(f"Cast: {', '.join(cast[:5])}, ...")
    print(f"{movie['plot outline']}")

In [67]:
display_movie(movies[0])

KeyError: 'director'

In [1]:
import pandas as pd

In [7]:
df = pd.read_csv("../data/data_with_imdb.csv", index_col=0)

In [8]:
df

Unnamed: 0,movie_id,imdb_id,title,rating,year,genre,director,cast,cover_url,plot
0,0,114709.0,Toy Story,8.3,1995.0,"['Animation', 'Adventure', 'Comedy', 'Family',...",['John Lasseter'],"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",https://m.media-amazon.com/images/M/MV5BMDU2ZW...,A little boy named Andy loves to be in his roo...
1,1,113228.0,Grumpier Old Men,6.6,1995.0,"['Comedy', 'Romance']",['Howard Deutch'],"['Walter Matthau', 'Jack Lemmon', 'Sophia Lore...",https://m.media-amazon.com/images/M/MV5BMjQxM2...,Things don't seem to change much in Wabasha Co...
2,2,113277.0,Heat,8.3,1995.0,"['Action', 'Crime', 'Drama']",['Michael Mann'],"['Al Pacino', 'Robert De Niro', 'Val Kilmer', ...",https://m.media-amazon.com/images/M/MV5BYjZjNT...,Hunters and their prey--Neil and his professio...
3,3,,Seven (a.k.a. Se7en) (1995),3.975369,,,,,,
4,4,114814.0,The Usual Suspects,8.5,1995.0,"['Crime', 'Drama', 'Mystery', 'Thriller']",['Bryan Singer'],"['Stephen Baldwin', 'Gabriel Byrne', 'Benicio ...",https://m.media-amazon.com/images/M/MV5BYTViNj...,"Following a truck hijack in New York, five cri..."
