# In [1]:

import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display
from ipywidgets import widgets

# Read both CSV tables from the working directory. Code further down this
# file reads the movieId/title/genres columns of `movies` and the
# userId/movieId/rating columns of `ratings`.
movies, ratings = (pd.read_csv(csv_path) for csv_path in ("movies.csv", "ratings.csv"))
def clean_title(title):
    """Return *title* with everything but ASCII letters, digits and spaces removed.

    Disallowed characters (punctuation, accented letters, ...) are deleted
    outright rather than replaced by a space, so "Toy Story (1995)" becomes
    "Toy Story 1995".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

# Normalise every title once and keep the result as a new column so it can
# be inspected next to the original title.
cleaned_titles = movies["title"].apply(clean_title)
movies["clean_title"] = cleaned_titles

# Bag-of-words model over single words and adjacent word pairs; X holds one
# row of term counts per movie, in the same row order as `movies`.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(cleaned_titles)

def search(title, top_n=5):
    """Return the rows of ``movies`` whose titles best match *title*.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    module-level ``vectorizer`` and compared against every row of ``X`` by
    cosine similarity.

    Args:
        title: Free-text title to look up.
        top_n: Maximum number of matches to return (default 5). Fewer rows
            are returned when ``movies`` has fewer than ``top_n`` rows.

    Returns:
        A DataFrame slice of ``movies`` ordered from most to least similar,
        so ``.iloc[0]`` is the best match.
    """
    query = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(X, query).flatten()

    # Clamp so argpartition never gets an out-of-bounds kth on tiny tables.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition isolates the k largest values cheaply but leaves them in
    # unspecified order, so rank just those k candidates explicitly.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

def find_similar_movies(movie_id, *, min_rating=4, min_share=0.2, top_n=10):
    """Recommend movies favoured by the users who liked *movie_id*.

    A rating counts as a "like" when it is at least ``min_rating``. The
    users who liked ``movie_id`` are the "similar users". Each movie liked
    by more than ``min_share`` of them becomes a candidate (the seed movie
    itself is normally among them), and is scored as

        score = similar / all

    where ``similar`` is the fraction of similar users who liked the movie
    and ``all`` is the fraction who liked it among every user that liked at
    least one candidate. A high score therefore means the movie is
    disproportionately popular with the similar users.

    Args:
        movie_id: ``movieId`` of the seed movie.
        min_rating: Lowest rating treated as a like (default 4).
        min_share: A candidate must be liked by strictly more than this
            fraction of similar users (default 0.2).
        top_n: Number of recommendations to return (default 10).

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres``, sorted by
        descending score.
    """
    # Every step below only considers liked ratings, so filter once.
    liked = ratings[ratings["rating"] >= min_rating]

    # Users who liked the seed movie.
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()

    # Fraction of those users that liked each movie, keeping only movies
    # that clear the popularity threshold.
    similar_users_rec = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_users_rec = similar_users_rec.value_counts() / len(similar_users)
    similar_users_rec = similar_users_rec[similar_users_rec > min_share]

    # Baseline: how popular the same candidates are among everyone who liked
    # at least one of them.
    all_users = liked[liked["movieId"].isin(similar_users_rec.index)]
    all_users_rec = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Align both fractions on movieId and score by their ratio (not their
    # difference).
    rec_percentages = pd.concat([similar_users_rec, all_users_rec], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # The index is movieId; join it to the movie metadata for display.
    top = rec_percentages.head(top_n)
    return top.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

# Recommendation UI: an output area that will hold the results table, and a
# free-text box in which the user types a movie title.
recommendation_list = widgets.Output()
movie_input_name = widgets.Text(value="", description="Movie Title: ", disabled=False)

def on_button_clicked(data):
    """Refresh the recommendation table from a text-box change event.

    *data* is the change notification; its ``"new"`` entry is the text
    currently typed. The output area is always cleared; recommendations are
    only computed once more than five characters have been entered, using
    the first row returned by ``search`` as the seed movie.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            # Too short to search on; leave the output area empty.
            return
        best_match = search(typed).iloc[0]
        display(find_similar_movies(best_match["movieId"]))
            
# Call on_button_clicked whenever the text box's `value` changes; the
# callback receives a change mapping and reads the new text from data["new"].
movie_input_name.observe(on_button_clicked, names="value")

# Render the text box followed by the output area that the callback writes into.
display(movie_input_name, recommendation_list)

# Cell output (widget reprs):
# Text(value='', description='Movie Title: ')
#
# Output()