# Movie Recommendation System

### The MovieLens [dataset](https://grouplens.org/datasets/movielens/) provided by GroupLens
We import *pandas* and our first dataset.

In [45]:
import pandas as pd

movies = pd.read_csv('movies.csv')

In [44]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


### We use *RegEx* to clean the titles of the movies.

In [3]:
import re

def clean_title(title):
    """Normalise a movie title for text matching.

    The title is lower-cased, every character other than ``a-z``, ``0-9``
    and whitespace is removed, and surrounding whitespace is trimmed.
    Interior whitespace is left exactly as it is after the removal.
    """
    return re.sub(r'[^a-z0-9\s]', '', title.lower()).strip()

In [4]:
# Store the normalised form of every title next to the original one.
movies['clean_title'] = movies['title'].map(clean_title)

In [5]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,grumpier old men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,waiting to exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,father of the bride part ii 1995
...,...,...,...,...
62418,209157,We (2018),Drama,we 2018
62419,209159,Window of the Soul (2001),Documentary,window of the soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,bad poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),a girl thing 2001


### TF-IDF matrix for the search engine.
We will use [sklearn](https://github.com/scikit-learn/scikit-learn), a Python ML library, to turn the cleaned titles into TF-IDF vectors.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the cleaned titles using both single words and two-word phrases.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# One row per entry of `movies`, in the same order as the DataFrame.
tfidf_matrix = vectorizer.fit_transform(movies.loc[:, 'clean_title'])

### Create a search function.

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles best match ``title``.

    The query is normalised with ``clean_title``, projected with the fitted
    ``vectorizer`` and compared against every row of ``tfidf_matrix`` using
    cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar, so that
        ``.iloc[0]`` is the best match. Empty when ``top_n`` is not positive
        or there is nothing to search.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Never ask for more rows than exist: argpartition raises when the
    # requested position is out of bounds.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k largest scores land in the last
    # k slots, in no particular order, so rank that small slice explicitly.
    top = np.argpartition(similarity, -k)[-k:]
    ranked = top[np.argsort(-similarity[top], kind="stable")]
    return movies.iloc[ranked]

In [8]:
search("Avengers")

Unnamed: 0,movieId,title,genres,clean_title
34536,145676,3 Avengers (1964),(no genres listed),3 avengers 1964
2063,2153,"Avengers, The (1998)",Action|Adventure,avengers the 1998
17067,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,avengers the 2012
40636,159920,Shaolin Avengers (1994),Action,shaolin avengers 1994
45394,170297,Ultimate Avengers 2 (2006),Action|Animation|Sci-Fi,ultimate avengers 2 2006


### Interactive Jupyter Notebook movie search engine using [widgets](https://pypi.org/project/ipywidgets/).

In [9]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into.
movie_input = widgets.Text(
    description="Movie: ",
    placeholder="Insert movie title",
    value="",
    disabled=False,
)

# Output area that holds the most recent search results.
movie_list = widgets.Output()


def on_type(data):
    """Refresh the result list whenever the text box value changes.

    ``data`` is the change notification passed in by ``observe``; its
    ``"new"`` entry holds the current text. Inputs of five characters or
    fewer only clear the output and trigger no search.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))


movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='', description='Movie: ', placeholder='Insert movie title')

Output()

### Reading movie ratings. Our second dataset.

In [10]:
ratings = pd.read_csv("ratings.csv")

In [11]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [12]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [13]:
movie_id = 1

In [14]:
# Users who gave the chosen movie a rating of 4 or higher.
similar_users = ratings.loc[
    ratings["movieId"].eq(movie_id) & ratings["rating"].ge(4),
    "userId",
].unique()

In [15]:
similar_users

array([     3,      5,      8, ..., 162530, 162533, 162534])

In [16]:
# Every movie those users rated strictly above 4 (one entry per rating).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & ratings["rating"].gt(4),
    "movieId",
]

In [17]:
similar_user_recs

255             29
256             32
257             50
261            214
263            293
             ...  
24999248    101962
24999269    109487
24999326    164179
24999329    165549
24999348    177593
Name: movieId, Length: 2321248, dtype: int64

### *Narrow recommendations* to movies that more than 10% of the similar users also liked.

In [18]:
# Turn the raw movie ids into "share of similar users who liked it" and keep
# only movies above the 10% mark.
# NOTE: this rebinds similar_user_recs, so the cell is not re-runnable on its
# own; re-run the cell that builds the raw series first.
similar_user_recs = (
    similar_user_recs
    .value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .1]
)

In [19]:
similar_user_recs

1       0.499483
318     0.421226
260     0.367817
296     0.353337
356     0.322708
          ...   
1148    0.103609
1527    0.102867
4995    0.102522
778     0.102495
34      0.100162
Name: movieId, Length: 90, dtype: float64

Find everyone who liked the candidate movies (rating above 4), not only the *similar* users, so the two groups can be compared.

In [20]:
# All ratings above 4 given to any of the candidate movies, by any user.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & ratings["rating"].gt(4)
]

In [21]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000058,162541,4995,5.0,1240951903
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613


### Find the % of all users who recommend each movie.

In [22]:
# Share of those users who liked each candidate movie.
all_users_recs = all_users["movieId"].value_counts().div(all_users["userId"].nunique())

In [23]:
all_users_recs

318     0.345497
296     0.287399
2571    0.246370
356     0.237518
593     0.228071
          ...   
3114    0.054220
2716    0.053892
34      0.052729
1073    0.049232
1148    0.047922
Name: movieId, Length: 90, dtype: float64

### Creating a *recommendation score* based on user taste and our dataset.

In [24]:
# Put both shares side by side, aligned on movieId.
rec_percen = pd.concat(
    [similar_user_recs, all_users_recs],
    axis=1,
    keys=["similar", "all"],
)

In [30]:
rec_percen

Unnamed: 0,similar,all,score
1,0.499483,0.125923,3.966586
32,0.151608,0.101253,1.497324
34,0.100162,0.052729,1.899556
47,0.208385,0.145852,1.428742
50,0.266594,0.202432,1.316955
...,...,...,...
33794,0.117160,0.074036,1.582479
58559,0.180461,0.147871,1.220392
60069,0.134371,0.077038,1.744224
68954,0.119070,0.065565,1.816047


### We want the movies that have a big difference between the two columns.

In [31]:
# A ratio above 1 means similar users like the movie more often than users overall.
rec_percen["score"] = rec_percen["similar"].div(rec_percen["all"])

### We *sort* results.

In [32]:
# Highest score first.
rec_percen = rec_percen.sort_values(by="score", ascending=False)

In [35]:
rec_percen

Unnamed: 0,similar,all,score
1,0.499483,0.125923,3.966586
3114,0.170357,0.054220,3.141967
4886,0.166645,0.071489,2.331060
6377,0.166565,0.072960,2.282977
1073,0.111591,0.049232,2.266621
...,...,...,...
58559,0.180461,0.147871,1.220392
318,0.421226,0.345497,1.219189
4973,0.136148,0.113481,1.199744
2959,0.252380,0.218792,1.153517


### *Top 10* Recommendations.

In [36]:
# Attach title and genres to the ten best-scoring movie ids.
pd.merge(rec_percen.head(10), movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,0.499483,0.125923,3.966586,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story 1995
3021,0.170357,0.05422,3.141967,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,toy story 2 1999
4780,0.166645,0.071489,2.33106,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,monsters inc 2001
6258,0.166565,0.07296,2.282977,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,finding nemo 2003
1047,0.111591,0.049232,2.266621,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,willy wonka the chocolate factory 1971
8246,0.154207,0.069109,2.231373,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,incredibles the 2004
580,0.151449,0.068159,2.221989,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,aladdin 1992
1120,0.103609,0.047922,2.162033,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime,wallace gromit the wrong trousers 1993
359,0.18473,0.086585,2.133522,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,lion king the 1994
587,0.12806,0.060551,2.1149,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,beauty and the beast 1991


## Recommendation function
#### We wrap everything we did up to this point in a single function.

In [37]:
def find_similar_movies(movie_id, top_n=10, min_share=0.1, ratings_df=None, movies_df=None):
    """Recommend movies favoured by the users who liked ``movie_id``.

    "Similar users" are those who rated ``movie_id`` 4 or higher. A movie
    counts as liked when its rating is strictly above 4. Each candidate is
    scored by the share of similar users who liked it divided by the share
    of all users (among those who liked any candidate) who liked it.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to start from.
    top_n : int, default 10
        Number of best-scoring movies to return.
    min_share : float, default 0.1
        A candidate is kept only if more than this fraction of the similar
        users liked it.
    ratings_df : pandas.DataFrame, optional
        Frame with ``userId``, ``movieId`` and ``rating`` columns. Defaults
        to the notebook-level ``ratings``.
    movies_df : pandas.DataFrame, optional
        Frame with ``movieId``, ``title`` and ``genres`` columns. Defaults
        to the notebook-level ``movies``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, best score first.
        Empty when nobody rated ``movie_id`` 4 or higher.
    """
    # Fall back to the notebook globals only when no frame was supplied.
    ratings_df = ratings if ratings_df is None else ratings_df
    movies_df = movies if movies_df is None else movies_df

    liked = ratings_df["rating"] > 4
    is_fan = (ratings_df["movieId"] == movie_id) & (ratings_df["rating"] >= 4)
    similar_users = ratings_df.loc[is_fan, "userId"].unique()

    # Without similar users there is nothing to score; return early instead
    # of dividing empty counts by zero.
    if len(similar_users) == 0:
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_recs = ratings_df.loc[ratings_df["userId"].isin(similar_users) & liked, "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = ratings_df[ratings_df["movieId"].isin(similar_user_recs.index) & liked]
    all_users_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    # Align both shares on movieId and name the columns in one step.
    rec_percen = pd.concat([similar_user_recs, all_users_recs], axis=1, keys=["similar", "all"])
    rec_percen["score"] = rec_percen["similar"] / rec_percen["all"]
    rec_percen = rec_percen.sort_values("score", ascending=False)

    return rec_percen.head(top_n).merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]


## Interactive Recommendation Widget
#### We combine our previous search widget with the recommendation function to build the actual recommendation search engine.

In [46]:
# Text box for the title of a movie the user already likes.
movie_name_input = widgets.Text(
    description="Movie: ",
    placeholder="Insert movie title",
    value="",
    disabled=False,
)

# Output area that holds the latest recommendations.
recommendation_list = widgets.Output()


# NOTE: this rebinds the notebook-level name `on_type` that the earlier search
# widget cell also defines; that widget keeps the callback object it observed.
def on_type(data):
    """Show recommendations for the movie matching the typed title.

    ``data`` is the change notification passed in by ``observe``; its
    ``"new"`` entry holds the current text. Inputs of five characters or
    fewer only clear the output. Otherwise the first row returned by
    ``search`` is taken as the movie to recommend from.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        matched_id = search(typed).iloc[0]["movieId"]
        display(find_similar_movies(matched_id))


movie_name_input.observe(on_type, names="value")

display(movie_name_input, recommendation_list)

Text(value='', description='Movie: ', placeholder='Insert movie title')

Output()