In [107]:
import pandas as pd
import re

movies = pd.read_csv("movies.csv")

In [108]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [109]:
movies["clean_title"] = movies["title"].apply(clean_title).str[:-5]

In [17]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II
...,...,...,...,...
62418,209157,We (2018),Drama,We
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing


In [110]:
ratings = pd.read_csv("ratings.csv")

In [111]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
22327994,145199,87232,4.0,1494733204
22327995,145199,88125,5.0,1494733106
22327996,145199,91630,2.0,1494734866
22327997,145199,95167,4.5,1494735246


In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [113]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Find id given title
def find(movie_title):
    movie_title = clean_title(movie_title)
    query_vec = vectorizer.transform([movie_title])
    
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    
    index = np.argpartition(similarity, -5)[-5:]
    result = movies.iloc[index][::-1][:1]
    
    return result.movieId.item()

In [114]:
find("Batman")

153

In [117]:
# Recommendation algo
def recommend(movie_title):
    movie_id = find(movie_title)
    
    # finding recommendations from users similar to us
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    
    # adjusting so we only have recommendations where over 10% of users recommended that movie
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    
    # finding how common the recommendation is among all users
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    # creating recommendation score
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

In [118]:
recommend("Harry Potter")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
13512,1.0,0.032988,30.313919,69844,Harry Potter and the Half-Blood Prince (2009),Adventure|Fantasy|Mystery|Romance|IMAX,Harry Potter and the HalfBlood Prince
15538,0.731713,0.034705,21.083634,81834,Harry Potter and the Deathly Hallows: Part 1 (...,Action|Adventure|Fantasy|IMAX,Harry Potter and the Deathly Hallows Part 1
11700,0.638615,0.030295,21.079716,54001,Harry Potter and the Order of the Phoenix (2007),Adventure|Drama|Fantasy|IMAX,Harry Potter and the Order of the Phoenix
16718,0.668654,0.036105,18.51988,88125,Harry Potter and the Deathly Hallows: Part 2 (...,Action|Adventure|Drama|Fantasy|Mystery|IMAX,Harry Potter and the Deathly Hallows Part 2
10408,0.65008,0.03851,16.880762,40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX,Harry Potter and the Goblet of Fire
29962,0.123137,0.008404,14.652182,135143,Fantastic Beasts and Where to Find Them (2016),Fantasy,Fantastic Beasts and Where to Find Them
5704,0.528778,0.037398,14.139138,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy,Harry Potter and the Chamber of Secrets
7742,0.685393,0.049395,13.875697,8368,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX,Harry Potter and the Prisoner of Azkaban
23024,0.120844,0.008911,13.56149,116823,The Hunger Games: Mockingjay - Part 1 (2014),Adventure|Sci-Fi|Thriller,The Hunger Games Mockingjay Part 1
23594,0.127494,0.009773,13.045309,118696,The Hobbit: The Battle of the Five Armies (2014),Adventure|Fantasy,The Hobbit The Battle of the Five Armies


In [121]:
import pickle

pickle.dump(movies.to_dict(), open("movies_dict.pkl", "wb"))

In [122]:
pickle.dump(ratings, open("ratings.pkl", "wb"))

In [123]:
recommend("Sabrina")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title


In [124]:
recommend("Dracula")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
12951,0.25,7e-06,34834.75,65352,"Have Rocket, Will Travel (1959)",Comedy|Sci-Fi,Have Rocket Will Travel
3671,0.25,7e-06,34834.75,3772,Hatchet for the Honeymoon (Rosso segno della f...,Fantasy|Horror|Thriller,Hatchet for the Honeymoon Rosso segno della fo...
13430,0.25,7e-06,34834.75,69419,"Crawling Hand, The (1963)",Horror|Sci-Fi,Crawling Hand The
12962,0.25,7e-06,34834.75,65518,"Dungeonmaster, The (1985)",Fantasy|Horror|Sci-Fi,Dungeonmaster The
5780,0.25,7e-06,34834.75,5892,"Island at the Top of the World, The (1974)",Action|Adventure|Children,Island at the Top of the World The
9676,0.25,7e-06,34834.75,32088,DNA (1997),Action|Sci-Fi,DNA
12646,0.25,7e-06,34834.75,61638,Flu Bird Horror (2008),Horror|Thriller,Flu Bird Horror
4939,0.25,7e-06,34834.75,5045,Galaxina (1980),Comedy|Sci-Fi,Galaxina
6592,0.25,7e-06,34834.75,6715,Children of the Night (1991),Horror,Children of the Night
2361,0.25,7e-06,34834.75,2452,"Gate II: Trespassers, The (1990)",Horror,Gate II Trespassers The


In [128]:
recommend("Sudden Death")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
