In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ipywidgets as widgets
from IPython.display import display

In [2]:
movie=pd.read_csv("ml-25m/movies.csv")

In [3]:
movie.head(10)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:

def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [5]:
movie["clean_title"]=movie["title"].apply(clean_title)

In [6]:
movie.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [7]:
vectroization=TfidfVectorizer(ngram_range=(1,2))
tfidf=vectroization.fit_transform(movie["clean_title"])

In [8]:
tfidf

<62423x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 446566 stored elements in Compressed Sparse Row format>

In [9]:
vectroization


In [10]:

def search(title):
    title=clean_title(title)
    query_vec=vectroization.transform([title])
    similarity=cosine_similarity(query_vec,tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movie.iloc[indices].iloc[::-1]
    return results

In [11]:

movie_input = widgets.Text(
    value='Jumanji',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()




def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)


Text(value='Jumanji', description='Movie Title:')

Output()

In [13]:
ratings=pd.read_csv("ml-25m/ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [15]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [18]:
movie_id = 89745


In [43]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()


In [44]:
similar_users

array([    21,    187,    208, ..., 162469, 162485, 162532], dtype=int64)

In [49]:
similar_user_recs =ratings[(ratings["movieId"].isin(similar_users))&(ratings["rating"]>4)]

In [50]:
similar_user_recs

Unnamed: 0,userId,movieId,rating,timestamp
33,1,5767,5.0,1147878729
56,1,8154,5.0,1147868865
76,2,260,5.0,1141417172
149,2,2194,5.0,1141415685
159,2,2501,4.5,1141416992
...,...,...,...,...
24999891,162540,56941,4.5,1248860345
24999893,162540,58293,5.0,1248858327
24999923,162541,260,5.0,1240952836
24999935,162541,541,5.0,1240952537
