In [23]:
import pandas as pd
movies = pd.read_csv("movies.csv")

In [24]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [28]:
import re

def clean_title(title):
  cleaned_title = re.sub(r"[^\w\s]", "", title)
  # Print to check cleaning logic
  print(f"Cleaned Title: {cleaned_title}")
  return cleaned_title


In [29]:
movies["clean_title"] = movies["title"].apply(clean_title)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Cleaned Title: The Day Time Ended 1980
Cleaned Title: Courier X 2016
Cleaned Title: Sabrina 2018
Cleaned Title: Second Act 2018
Cleaned Title: Amazing Grace 2018
Cleaned Title: Jonestown Terror in the Jungle 2018
Cleaned Title: The Sacred Science 2011
Cleaned Title: Memorias de un hombre en pijama 2018
Cleaned Title: Разжалованный 2009
Cleaned Title: Crush 2009
Cleaned Title: The Song of Sway Lake 2017
Cleaned Title: Ana Maria in Novela Land 2015
Cleaned Title: The Pink Cloud Syndrome 2018
Cleaned Title: Battle 2018
Cleaned Title: Walt The Man Behind the Myth 2001
Cleaned Title: Pixie Hollow Games 2011
Cleaned Title: A German Youth 2015
Cleaned Title: The Appearance 2018
Cleaned Title: The Dream Lady 1918
Cleaned Title: Robinson Crusoe 2003
Cleaned Title: Hypnotized and Hysterical Hairstylist Wanted 2002
Cleaned Title: Leprechaun Returns 2018
Cleaned Title: Wheres Firuze 2004
Cleaned Title: Propaganda 1999
Cleaned Title: 

In [30]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [32]:
#Creating TFIDF Matrix
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [34]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]

    return results

In [37]:
!pip install ipywidgets
!jupyter labextension install @jupyter-widgets/jupyterlab-manager

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.19.1
usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir] [--paths] [--json]
               [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available sub

In [38]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [39]:
movie_id = 89745

#def find_similar_movies(movie_id):
movie = movies[movies["movieId"] == movie_id]

In [40]:
ratings = pd.read_csv("ratings.csv")

In [41]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp    float64
dtype: object

In [42]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [43]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [44]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [45]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [46]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [47]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]


In [48]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
89745,1.000000,0.045426
58559,0.643836,0.133790
79132,0.575342,0.131923
2571,0.479452,0.241444
59315,0.479452,0.049782
...,...,...
8360,0.109589,0.019913
115149,0.109589,0.011201
69757,0.109589,0.023647
3996,0.109589,0.067206


In [49]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [50]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [51]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
17067,1.0,0.045426,22.013699,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
16312,0.150685,0.009334,16.143379,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor 2011
16725,0.191781,0.01369,14.008717,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,Captain America The First Avenger 2011
25071,0.232877,0.016801,13.860477,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016
21348,0.273973,0.019913,13.758562,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014
21606,0.287671,0.021157,13.596696,111362,X-Men: Days of Future Past (2014),Action|Adventure|Sci-Fi,XMen Days of Future Past 2014
25058,0.178082,0.01369,13.008095,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015
20603,0.136986,0.010579,12.949234,106489,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy|IMAX,Hobbit The Desolation of Smaug The 2013
25068,0.136986,0.010579,12.949234,122914,Avengers: Infinity War - Part II (2019),Action|Adventure|Sci-Fi,Avengers Infinity War Part II 2019
20492,0.109589,0.008712,12.579256,106002,Ender's Game (2013),Action|Adventure|Sci-Fi|IMAX,Enders Game 2013


In [52]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [54]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()