In [2]:
import pandas as pd 
import numpy as np

In [3]:
# Load the MovieLens movie catalogue (columns: movieId, title, genres).
# NOTE(review): this is an absolute path on one specific Windows machine; the
# cell fails anywhere else until the path is pointed at a local copy of movies.csv.
movies = pd.read_csv(r"C:\Users\vijay\Downloads\ml-25m\ml-25m\movies.csv")
# Display the loaded frame.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
import re 

def clean_title(title):
    """Strip everything except ASCII letters, digits and spaces from ``title``.

    Punctuation such as parentheses, commas and apostrophes is removed outright
    (not replaced by a space); letter case and existing spaces are preserved.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [5]:
# Add a punctuation-free copy of each title; this column is what the
# TF-IDF vectorizer is fitted on.
movies['clean_title'] = movies['title'].map(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF model over the cleaned titles; ngram_range=(1,2) indexes single words
# as well as two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fit on every cleaned title and keep the resulting matrix (one row per movie);
# search() compares queries against it.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, vectorised with the notebook's
    fitted ``vectorizer`` and compared by cosine similarity against ``tfidf``
    (one row per row of ``movies``).

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of matches to return. Clamped to the number of movies.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered by descending similarity,
        so the first row is the closest match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises if asked for more elements than exist.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees *which* entries are the top_n largest, not
    # their order, so rank the candidates explicitly before returning them.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranking = np.argsort(-similarity[candidates], kind="stable")
    return movies.iloc[candidates[ranking]]

In [8]:
# Ask for the movie title to look up; the cell blocks until a line is entered.
title = input("Search movie -> ")

Search movie -> Toy Story


In [9]:
# Run the title search on the text entered above and show the matching rows.
results = search(title=title)
results

working


Unnamed: 0,movieId,title,genres,clean_title
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [10]:
# View the search hits ordered by ascending movieId; sort_values returns a new
# frame, so `results` keeps its original order.
df_sorted = results.sort_values(by="movieId")
df_sorted

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019


In [11]:
# Load the MovieLens ratings table (columns: userId, movieId, rating, timestamp).
# NOTE(review): this is an absolute path on one specific Windows machine; point
# it at a local copy of ratings.csv before running elsewhere.
ratings = pd.read_csv(r"C:\Users\vijay\Downloads\ml-25m\ml-25m\ratings.csv")

In [12]:
# Show the ratings frame (the notebook display truncates the middle rows).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [13]:
# Check the column dtypes of the ratings frame.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [14]:
# movieId of the first search hit; it seeds the recommendation steps below.
# Pull the scalar out with .iloc[0] before converting: calling int() directly on
# a one-element Series is deprecated in pandas and slated to raise TypeError.
top_result = int(results['movieId'].iloc[0])

In [15]:
# Distinct users who gave the seed movie (top_result) a rating above 4.
similar_users = ratings.loc[
    ratings['movieId'].eq(top_result) & ratings['rating'].gt(4),
    'userId',
].unique()

In [16]:
# Inspect the array of userIds selected above.
similar_users

array([     2,     86,    160, ..., 162508, 162519, 162530], dtype=int64)

In [17]:
# movieId of every rating above 4 given by the similar users (one entry per rating).
similar_users_recs = ratings.loc[
    ratings['userId'].isin(similar_users) & ratings['rating'].gt(4),
    'movieId',
]

In [18]:
# Count of >4 ratings per movie divided by the number of similar users.
similar_users_recs = similar_users_recs.value_counts().div(len(similar_users))
# Keep only the movies whose fraction exceeds 0.1.
similar_users_recs = similar_users_recs.loc[similar_users_recs.gt(0.1)]

In [19]:
# Inspect the per-movie fractions that passed the 0.1 cutoff.
similar_users_recs

3114     1.000000
1        0.651788
318      0.451418
260      0.440567
1196     0.431443
           ...   
1079     0.101850
91529    0.101480
903      0.101480
48394    0.100370
590      0.100370
Name: movieId, Length: 174, dtype: float64

In [20]:
# The index of this Series holds the movieIds of the candidate movies.
similar_users_recs.index

Int64Index([ 3114,     1,   318,   260,  1196,  2571,  1198,  4993,  4886,
              356,
            ...
            68157,  1230,   899,  3175, 44191,  1079, 91529,   903, 48394,
              590],
           dtype='int64', length=174)

In [21]:
# Every rating above 4, from any user, for one of the candidate movies.
all_users = ratings.loc[
    ratings['movieId'].isin(similar_users_recs.index) & ratings['rating'].gt(4)
]

In [22]:
# Inspect the filtered ratings rows.
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
19,1,2692,5.0,1147869100
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [23]:
# Count of >4 ratings per candidate movie divided by the number of distinct
# users appearing in all_users.
all_users_recs = all_users['movieId'].value_counts().div(all_users['userId'].nunique())

In [24]:
# Inspect the per-movie fractions computed over all of those users.
all_users_recs

318     0.338189
296     0.281320
2571    0.241159
356     0.232494
593     0.223247
          ...   
2081    0.021354
3751    0.021170
2761    0.020680
1907    0.019796
3175    0.019417
Name: movieId, Length: 174, dtype: float64

In [31]:
# Put the two per-movie fractions side by side, aligned on movieId; the keys
# become the column labels.
rec_percentages = pd.concat(
    [similar_users_recs, all_users_recs],
    axis=1,
    keys=["similar", "all"],
)

In [32]:
# Inspect the combined "similar" / "all" table.
rec_percentages

Unnamed: 0,similar,all
3114,1.000000,0.053073
1,0.651788,0.123259
318,0.451418,0.338189
260,0.440567,0.219589
1196,0.431443,0.185815
...,...,...
1079,0.101850,0.036418
91529,0.101480,0.054245
903,0.101480,0.044913
48394,0.100370,0.054441


In [33]:
# Score = fraction among similar users relative to the fraction among all users;
# values above 1 mean the movie is liked disproportionately by the similar users.
rec_percentages['score'] = rec_percentages['similar'].div(rec_percentages['all'])

In [34]:
# Inspect the table with the new score column.
rec_percentages

Unnamed: 0,similar,all,score
3114,1.000000,0.053073,18.841924
1,0.651788,0.123259,5.287943
318,0.451418,0.338189,1.334809
260,0.440567,0.219589,2.006324
1196,0.431443,0.185815,2.321895
...,...,...,...
1079,0.101850,0.036418,2.796663
91529,0.101480,0.054245,1.870781
903,0.101480,0.044913,2.259493
48394,0.100370,0.054441,1.843650


In [39]:
# Rank the candidates from highest to lowest score.
rec_percentages = rec_percentages.sort_values(by='score', ascending=False)

In [40]:
# Inspect the ranked table.
rec_percentages

Unnamed: 0,similar,all,score
3114,1.000000,0.053073,18.841924
2355,0.203576,0.024796,8.210086
2761,0.142047,0.020680,6.868954
78499,0.225771,0.034717,6.503216
3751,0.132799,0.021170,6.272875
...,...,...,...
2959,0.308015,0.214164,1.438218
593,0.314673,0.223247,1.409527
79132,0.176326,0.129836,1.358062
318,0.451418,0.338189,1.334809


In [41]:
# The ten highest-scoring candidates.
rec_percentages.head(10)

Unnamed: 0,similar,all,score
3114,1.0,0.053073,18.841924
2355,0.203576,0.024796,8.210086
2761,0.142047,0.02068,6.868954
78499,0.225771,0.034717,6.503216
3751,0.132799,0.02117,6.272875
2081,0.118126,0.021354,5.531892
1907,0.106165,0.019796,5.362941
2987,0.135758,0.025378,5.349396
1,0.651788,0.123259,5.287943
3175,0.102589,0.019417,5.283613


In [45]:
# Attach movie metadata to the ten best candidates: the left frame is keyed by
# its index (movieId), the right one by its movieId column.
pd.merge(
    rec_percentages.head(10),
    movies,
    left_index=True,
    right_on="movieId",
)

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
3021,1.0,0.053073,18.841924,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.203576,0.024796,8.210086,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
2669,0.142047,0.02068,6.868954,2761,"Iron Giant, The (1999)",Adventure|Animation|Children|Drama|Sci-Fi,Iron Giant The 1999
14813,0.225771,0.034717,6.503216,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
3650,0.132799,0.02117,6.272875,3751,Chicken Run (2000),Animation|Children|Comedy,Chicken Run 2000
1992,0.118126,0.021354,5.531892,2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance,Little Mermaid The 1989
1818,0.106165,0.019796,5.362941,1907,Mulan (1998),Adventure|Animation|Children|Comedy|Drama|Musi...,Mulan 1998
2895,0.135758,0.025378,5.349396,2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...,Who Framed Roger Rabbit 1988
0,0.651788,0.123259,5.287943,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3082,0.102589,0.019417,5.283613,3175,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi,Galaxy Quest 1999


In [50]:
# Same join as the previous cell, reduced to the two human-readable columns.
pd.merge(
    rec_percentages.head(10),
    movies,
    left_index=True,
    right_on="movieId",
).loc[:, ['title', 'genres']]

Unnamed: 0,title,genres
3021,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2264,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
2669,"Iron Giant, The (1999)",Adventure|Animation|Children|Drama|Sci-Fi
14813,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
3650,Chicken Run (2000),Animation|Children|Comedy
1992,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance
1818,Mulan (1998),Adventure|Animation|Children|Comedy|Drama|Musi...
2895,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3082,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi
