In [13]:
import os
import re
import unicodedata
from pathlib import Path

import ipywidgets as widgets
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Folder that holds the ml-25m CSV files. Set the ML25M_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original Windows location is used, so existing setups keep working.
DATA_DIR = Path(os.environ.get("ML25M_DIR", "D:\\other\\code\\aiml_project\\ml-25m"))

# Movie catalogue, one row per movie (the preview below shows the columns
# movieId, title and genres).
movies_df = pd.read_csv(DATA_DIR / "movies.csv")

In [15]:
# Preview the first rows to check which columns were loaded.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [16]:
def clean_title(title):
    """Return `title` reduced to ASCII letters, digits and spaces.

    Used both to build the searchable title column and to normalise what the
    user types, so both sides of the comparison are cleaned the same way.

    Accented letters are decomposed first (NFKD) so that their base letter
    survives the filter: "Amélie" becomes "Amelie" instead of losing the
    letter altogether. Punctuation and any other symbol are removed; spaces
    are kept as they are.

    Parameters
    ----------
    title : str
        Raw title or search text.

    Returns
    -------
    str
        The cleaned text.
    """
    # NFKD splits "é" into "e" + a combining accent; the accent is then
    # dropped by the character filter while the plain "e" is kept.
    decomposed = unicodedata.normalize("NFKD", title)
    return re.sub("[^a-zA-Z0-9 ]", "", decomposed)

In [17]:
# Store a cleaned copy of every title next to the original one.
movies_df["clean_title"] = movies_df["title"].map(clean_title)

In [18]:
# Display the frame to confirm the clean_title column was added.
movies_df

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [19]:
# Index single words as well as adjacent word pairs of each cleaned title.
NGRAM_RANGE = (1, 2)
vectorizer = TfidfVectorizer(ngram_range=NGRAM_RANGE)

# Learn the vocabulary from the cleaned titles and encode every title as a
# TF-IDF row; row i corresponds to row i of movies_df.
tfidf = vectorizer.fit_transform(movies_df["clean_title"])

In [20]:
def search(title, top_n=5):
    """Return the movies whose titles are most similar to `title`, best first.

    The query is cleaned with `clean_title`, encoded with the fitted
    `vectorizer`, and compared to every row of `tfidf` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text search string.
    top_n : int, default 5
        Maximum number of rows to return. It is clamped to the number of
        indexed titles, so a small catalogue does not raise.

    Returns
    -------
    pandas.DataFrame
        Rows of `movies_df`, ordered from highest to lowest similarity, so
        `.iloc[0]` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies_df.iloc[:0]

    # argpartition only guarantees that the last `top_n` positions hold the
    # largest scores; their relative order is unspecified ...
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    # ... so rank those few candidates explicitly, highest score first.
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies_df.iloc[ranked]

In [21]:
# Text box for the query and an output area that shows the matching titles.
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh the match list whenever the text box value changes.

    `data` is the change notification passed by `observe`; its "new" entry
    is the current text. The previous output is always cleared; a search is
    run only when the text is longer than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))


# Only react to changes of the widget's value.
movie_input.observe(on_type, names="value")

display(movie_input, movie_list)


Text(value='Toy Story', description='Movie Title:')

Output()

In [22]:
# Folder that holds the ml-25m CSV files (defined here as well so this cell
# runs on its own). Set the ML25M_DIR environment variable to point at a
# different location; when unset the original Windows path is used.
DATA_DIR = Path(os.environ.get("ML25M_DIR", "D:\\other\\code\\aiml_project\\ml-25m"))

# User ratings, one row per (user, movie) rating (the preview below shows
# the columns userId, movieId, rating and timestamp).
ratings = pd.read_csv(DATA_DIR / "ratings.csv")

In [23]:
# Preview the first rows to check which columns were loaded.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [24]:
# Walk-through for a single movie: movieId 1.
movie_id = 1

# Users who rated that movie above 4 are treated as "similar" users.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

In [25]:
# Inspect the userIds collected in the previous cell.
similar_users

array([  36,   75,   86,   90,   93,   95,   96,   98,  111,  120,  127,
        143,  152,  158,  160,  162,  171,  186,  188,  211,  217,  229,
        230,  235,  249,  257,  259,  297,  298,  302,  323,  329,  355,
        359,  369,  371,  381,  392,  402,  411,  428,  435,  439,  447,
        449,  468,  469,  477,  484,  513,  519,  537,  540,  541,  548,
        551,  553,  561,  567,  573,  582,  593,  607,  609,  611,  623,
        624,  626,  628,  631,  644,  653,  654,  670,  683,  686,  694,
        697,  702,  709,  727,  733,  741,  749,  752,  765,  768,  773,
        785,  791,  793,  796,  803,  805,  807,  811,  830,  834,  839,
        848,  856,  896,  904,  905,  911,  927,  947,  950,  956,  966,
        969,  986,  997, 1007, 1010, 1013, 1036, 1038, 1042, 1065, 1079,
       1092, 1096, 1101, 1118, 1123, 1131, 1138, 1140, 1141, 1143, 1146,
       1150, 1159, 1166, 1167, 1169, 1171, 1176, 1179, 1192, 1196, 1198,
       1199, 1200, 1228, 1230, 1232, 1240, 1242, 12

In [26]:
# Every movieId that a similar user rated above 4 (one entry per rating row,
# so a movie liked by many of them appears many times).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [27]:
# Inspect the movieIds liked by the similar users.
similar_user_recs

5101          1
5105         34
5111        110
5114        150
5127        260
           ... 
1048561    3996
1048563    4022
1048564    4027
1048565    4223
1048573    4886
Name: movieId, Length: 54307, dtype: int64

In [28]:
# Convert the raw movieId list into, per movie, the fraction of similar users
# who liked it, and keep only movies liked by more than 10% of them.
# NOTE(review): this overwrites its own input, so re-running the cell without
# re-running the previous one operates on the fractions instead of the ids.
similar_user_recs = (
    similar_user_recs.value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > 0.1]
)

In [29]:
# Inspect the per-movie share among similar users after the 10% cut-off.
similar_user_recs

1        1.000000
318      0.430189
260      0.374843
356      0.339623
296      0.338365
           ...   
1278     0.101887
780      0.101887
33794    0.101887
953      0.100629
1246     0.100629
Name: movieId, Length: 101, dtype: float64

In [30]:
# All rating rows above 4 for the candidate movies, regardless of who gave
# them; this is the baseline the similar users are compared against.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & ratings["rating"].gt(4)
]

In [31]:
# Per candidate movie: share of the users present in `all_users` who liked it.
all_users_recs = all_users["movieId"].value_counts().div(
    all_users["userId"].unique().size
)

In [32]:
# Inspect the baseline share computed in the previous cell.
all_users_recs

318      0.348655
296      0.283321
2571     0.241814
356      0.234743
593      0.225211
           ...   
953      0.042583
1278     0.039508
50872    0.038432
78499    0.036434
2355     0.024750
Name: movieId, Length: 101, dtype: float64

In [33]:
# Put both shares side by side, aligned on movieId: "similar" is the share
# among similar users, "all" the share among the baseline users.
rec_percentages = pd.concat(
    [similar_user_recs, all_users_recs],
    axis=1,
    keys=["similar", "all"],
)

In [34]:
# Inspect the two shares side by side, indexed by movieId.
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.122214
318,0.430189,0.348655
260,0.374843,0.211376
356,0.339623,0.234743
296,0.338365,0.283321
...,...,...
1278,0.101887,0.039508
780,0.101887,0.052882
33794,0.101887,0.064566
953,0.100629,0.042583


In [35]:
# Score = how much more popular a movie is among similar users than among
# the baseline users; values above 1 mean "liked more than usual".
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [36]:
# Rank the candidates, highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [37]:
# Inspect the ranking, highest score first.
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.122214,8.182390
3114,0.267925,0.051038,5.249545
2355,0.113208,0.024750,4.574007
78499,0.153459,0.036434,4.212032
588,0.221384,0.067025,3.302983
...,...,...,...
4973,0.125786,0.106226,1.184137
2329,0.110692,0.094696,1.168913
2858,0.192453,0.165257,1.164563
7361,0.111950,0.102998,1.086914


In [38]:
# Attach title and genres to the ten best-scoring movieIds (the index of
# rec_percentages is matched against the movieId column of movies_df).
pd.merge(
    rec_percentages.head(10),
    movies_df,
    left_index=True,
    right_on="movieId",
)

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.122214,8.18239,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.267925,0.051038,5.249545,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.113208,0.02475,4.574007,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.153459,0.036434,4.212032,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.221384,0.067025,3.302983,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
4780,0.212579,0.067487,3.149941,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
587,0.188679,0.064258,2.936264,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
6258,0.191195,0.067333,2.839551,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
1047,0.140881,0.050423,2.793987,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
359,0.242767,0.088701,2.736917,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [39]:
def find_similar_movies(movie_id, rating_threshold=4, min_share=0.10, top_n=10):
    """Recommend movies liked by the users who also liked `movie_id`.

    Steps:
      1. Collect the users who rated `movie_id` above `rating_threshold`
         ("similar users").
      2. For every movie those users rated above the threshold, compute the
         share of similar users who liked it, and keep movies whose share is
         above `min_share`.
      3. For the same movies, compute the share among all users who liked at
         least one of them (the baseline).
      4. Score each movie as similar share / baseline share and return the
         `top_n` highest scores with their title and genres.

    Parameters
    ----------
    movie_id : int
        movieId of the movie to start from.
    rating_threshold : float, default 4
        A rating counts as a "like" only when it is strictly greater than
        this value.
    min_share : float, default 0.10
        A candidate must be liked by strictly more than this fraction of the
        similar users.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns "score", "title" and "genres", highest score first. Empty
        when nobody rated `movie_id` above the threshold.

    Reads the module-level `ratings` and `movies_df` frames.
    """
    # Filter the ratings once; every step below only looks at "likes".
    liked = ratings[ratings["rating"] > rating_threshold]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nothing to compare against; return an empty result with the usual
        # columns instead of relying on 0/0 arithmetic further down.
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # The index of rec_percentages holds the movieIds.
    top = rec_percentages.head(top_n).merge(movies_df, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [40]:
# Text box for the query and an output area that shows the recommendations.
movie_input_name = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)
recommendation_list = widgets.Output()


# NOTE(review): this rebinds the name `on_type` used by the earlier search
# cell. The earlier callback keeps working because `observe` was handed the
# function object, but a distinct name would be clearer.
def on_type(data):
    """Refresh the recommendations whenever the text box value changes.

    `data` is the change notification passed by `observe`; its "new" entry
    is the current text. The previous output is always cleared. When the
    text is longer than two characters, the first row returned by `search`
    is taken as the movie and its recommendations are displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 2:
            return
        first_hit = search(typed).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))


# Only react to changes of the widget's value.
movie_input_name.observe(on_type, names="value")

display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()