In [1]:
# Use the pandas library
import pandas as pd

# Load the movie catalogue. Presumably movies.csv was extracted from the MovieLens archive at
# http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# The path is relative, so the file is read from the current working directory.
movies = pd.read_csv("movies.csv")

In [2]:
# Show the first 5 rows of movies
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Import the re (regular expressions) library
import re
# Normalise a movie title before it is vectorised or searched.
def clean_title(title: str) -> str:
    """Return `title` with every character removed that is not an ASCII letter, digit or space.

    Punctuation is dropped rather than replaced, so "Ant-Man (2015)" becomes
    "AntMan 2015". Non-ASCII letters are dropped as well.
    """
    unwanted = re.compile("[^a-zA-Z0-9 ]")
    return unwanted.sub("", title)

In [4]:
# Add a clean_title column holding each title after clean_title() has been applied to it
movies["clean_title"] = movies["title"].apply(clean_title)

In [5]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler Book of the Atlantic 2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life Zero 2017
9739,193585,Flint (2017),Drama,Flint 2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs Dead Apple 2018


In [6]:
# Use scikit-learn to build TF-IDF vectors from the strings in the "clean_title" column.
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): the vocabulary covers single words and two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and vectorise them in one step;
# row i of tfidf corresponds to row i of movies.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [7]:
tfidf

<9742x33421 sparse matrix of type '<class 'numpy.float64'>'
	with 70422 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles are most similar to `title`.

    The query is cleaned with `clean_title`, vectorised with the module-level
    `vectorizer`, and compared against every row of the module-level `tfidf`
    matrix using cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `movies`, ordered from the most to the least
        similar, so `.iloc[0]` is the best match.
    """
    # Clean the query the same way the indexed titles were cleaned.
    title = clean_title(title)
    # Turn the query into a TF-IDF vector in the fitted vocabulary.
    query_vec = vectorizer.transform([title])
    # Similarity between the query and every indexed title.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises on an out-of-range kth.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[0:0]

    # argpartition isolates the k largest scores cheaply but leaves them in
    # undefined order, so the k candidates are sorted explicitly afterwards.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]

    return movies.iloc[ranked]

In [9]:
import ipywidgets as widgets
from IPython.display import display
# Text box in which the user types a movie title to search for
movie_input = widgets.Text(
    value='',
    description='Movie Title:',
    disabled=False
)
# Output area in which the search results are rendered
movie_list = widgets.Output()
# Change handler for the search box.
def on_type(data):
    """Refresh `movie_list` with search results for the text box's new value.

    `data` is the change dictionary passed by the widget observer; only its
    "new" entry (the current text) is read. The output area is always cleared;
    results are shown only when the text is longer than 5 characters.
    """
    with movie_list:
        # Drop whatever was rendered for the previous keystroke.
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))

# Call on_type whenever the text box's value changes
movie_input.observe(on_type, names='value')


# Render the text box with the results area below it
display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [10]:
# Seed movie for the walkthrough below (89745 is "Avengers, The (2012)" in movies.csv)
movie_id = 89745

# The steps from here on are later gathered into find_similar_movies(movie_id).
movie = movies[movies["movieId"] == movie_id]

In [11]:
movie

Unnamed: 0,movieId,title,genres,clean_title
7693,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012


In [12]:
# Load the user ratings (columns: userId, movieId, rating, timestamp) from the working directory
ratings = pd.read_csv("ratings.csv")

In [13]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [14]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [15]:
# Find the users who rated the seed movie above 4 ("similar users")
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [16]:
similar_users

array([ 52,  68, 154, 184, 210, 211, 249, 291, 308, 344, 377, 380, 382,
       393, 489, 509, 511, 522, 525, 550, 561, 573, 582, 586, 601, 610])

In [17]:
# Movie ids of every rating above 4 given by the similar users (one entry per rating row)
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [18]:
similar_user_recs

7784         318
7785         356
7786         364
7787         588
7788         733
           ...  
100821    160527
100829    164179
100832    168248
100833    168250
100834    168252
Name: movieId, Length: 2253, dtype: int64

In [19]:
# For each movie, the fraction of similar users who rated it above 4
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only the movies rated above 4 by more than 10% of the similar users
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [20]:
similar_user_recs

movieId
89745     1.000000
79132     0.615385
2571      0.615385
112852    0.576923
58559     0.576923
            ...   
2951      0.115385
1222      0.115385
58998     0.115385
1240      0.115385
86377     0.115385
Name: count, Length: 241, dtype: float64

In [21]:
# All ratings above 4, from any user, for the movies kept in similar_user_recs
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [22]:
all_users 

Unnamed: 0,userId,movieId,rating,timestamp
3,1,47,5.0,964983815
4,1,50,5.0,964982931
10,1,163,5.0,964983650
15,1,260,5.0,964981680
25,1,457,5.0,964981909
...,...,...,...,...
100743,610,122920,5.0,1493845626
100780,610,139385,4.5,1493846777
100814,610,158238,5.0,1479545219
100829,610,164179,5.0,1493845631


In [23]:
# For each movie, the fraction of the users present in all_users who rated it above 4
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [24]:
all_user_recs 

movieId
318       0.353765
296       0.292469
356       0.271454
2571      0.262697
2959      0.227671
            ...   
122906    0.007005
179819    0.007005
158238    0.007005
93721     0.005254
112623    0.005254
Name: count, Length: 241, dtype: float64

In [25]:
# Build rec_percentages by placing similar_user_recs and all_user_recs side by side,
# aligned on movieId, then give the two columns readable names
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [26]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
89745,1.000000,0.045534
79132,0.615385,0.120841
2571,0.615385,0.262697
112852,0.576923,0.047285
58559,0.576923,0.138354
...,...,...
2951,0.115385,0.026270
1222,0.115385,0.075306
58998,0.115385,0.021016
1240,0.115385,0.057793


In [27]:
# score = similar / all: how much more often similar users rate a movie highly than users in general
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [28]:
# Sort by score, highest first
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [29]:
# Show the 10 highest-scoring rows, joined with the movie details on movieId
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
8469,0.115385,0.005254,21.961538,112623,Dawn of the Planet of the Apes (2014),Sci-Fi,Dawn of the Planet of the Apes 2014
7850,0.115385,0.005254,21.961538,93721,Jiro Dreams of Sushi (2011),Documentary,Jiro Dreams of Sushi 2011
8301,0.192308,0.008757,21.961538,106642,"Day of the Doctor, The (2013)",Adventure|Drama|Sci-Fi,Day of the Doctor The 2013
8696,0.192308,0.008757,21.961538,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016
7693,1.0,0.045534,21.961538,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
8151,0.230769,0.012259,18.824176,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013
8451,0.192308,0.010508,18.301282,112175,How to Train Your Dragon 2 (2014),Action|Adventure|Animation,How to Train Your Dragon 2 2014
8689,0.192308,0.010508,18.301282,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan 2015
8395,0.307692,0.017513,17.569231,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014
8686,0.153846,0.008757,17.569231,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015


In [30]:
# Bundle the exploratory steps above into one reusable function.
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies that fans of `movie_id` rate highly unusually often.

    Users who rated `movie_id` strictly above `min_rating` are treated as
    "similar users". A movie's score is the share of similar users who rated
    it above `min_rating`, divided by the share of all relevant users who did.
    Reads the module-level `ratings` and `movies` DataFrames.

    Parameters
    ----------
    movie_id : int
        `movieId` of the seed movie.
    min_rating : float, default 4
        A rating counts as "high" only when strictly greater than this value.
    min_share : float, default 0.10
        A movie is considered only if more than this fraction of the similar
        users rated it highly.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        The seed movie is not filtered out, so it can appear in the result.
        Empty when no user rated `movie_id` above `min_rating`.
    """
    # Every step only looks at high ratings, so filter once up front.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # No fans of this movie: nothing to base a recommendation on, and the
        # share computation below would divide by zero.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of similar users who rated each movie highly.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of all users (with a high rating on a shortlisted movie) who rated each movie highly.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A score above 1 means the movie is more popular with similar users than in general.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [31]:
import ipywidgets as widgets
from IPython.display import display
# Text box in which the user types a movie title
movie_name_input = widgets.Text(
    value='',
    description='Movie Title:',
    disabled=False
)
# Output area in which the list of recommendations is rendered
recommendation_list = widgets.Output()

# NOTE(review): an earlier cell also defines `on_type`; this definition rebinds the name.
def on_type(data):
    """Refresh `recommendation_list` with recommendations for the typed title.

    `data` is the change dictionary passed by the widget observer; only its
    "new" entry (the current text) is read. The output area is always cleared.
    When the text is longer than 5 characters, the first row returned by
    `search` is taken as the intended movie and its recommendations are shown.
    """
    with recommendation_list:
        # Drop whatever was rendered for the previous keystroke.
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        best_match = search(query).iloc[0]
        display(find_similar_movies(best_match["movieId"]))

# Call on_type whenever the text box's value changes
movie_name_input.observe(on_type, names='value')

# Render the text box with the recommendations area below it
display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()