In [1]:
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display

# Đọc dữ liệu từ file CSV
df_movies = pd.read_csv('movies.csv')
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [2]:
df_ratings = pd.read_csv('ratings.csv')
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [3]:
# Xóa các dòng có giá trị "(no genres listed)" theo số movieId và giữ lại một dòng duy nhất cho mỗi số movieId
df_moviesclean = df_movies[df_movies['genres'] != '(no genres listed)'].drop_duplicates(subset='movieId', keep='first')

# In ra DataFrame sau khi xóa
print("\nDataFrame sau khi xóa:")
df_moviesclean


DataFrame sau khi xóa:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
# Sử dụng hàm str.extract để tách "title" thành "title" và "year"
df_moviesclean[['title', 'year']] = df_moviesclean['title'].str.extract(r'([^\(]+)\s*\((\d{4})\)', expand=True)
# Hiển thị DataFrame mới
df_moviesclean[['movieId', 'title', 'year', 'genres']]

Unnamed: 0,movieId,title,year,genres
0,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,1995,Adventure|Children|Fantasy
2,3,Grumpier Old Men,1995,Comedy|Romance
3,4,Waiting to Exhale,1995,Comedy|Drama|Romance
4,5,Father of the Bride Part II,1995,Comedy
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,2017,Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero,2017,Animation|Comedy|Fantasy
9739,193585,Flint,2017,Drama
9740,193587,Bungo Stray Dogs: Dead Apple,2018,Action|Animation


In [5]:
# Tính toán số lượng cột cần tạo
max_genres_count = df_moviesclean['genres'].str.count('|').max() + 1

# Tạo danh sách cột mới
genres_columns = [f'genres{i+1}' for i in range(max_genres_count)]

# Tạo ra đúng số lượng cột cần thiết với giá trị None
for col in genres_columns:
    df_moviesclean[col] = None

# Thêm giá trị cho từng cột
for i in range(max_genres_count):
    # Kiểm tra nếu chỉ mục vượt quá số lượng cột thực tế
    if i < len(df_movies['genres'].str.split('|', expand=True).columns):
        df_moviesclean[genres_columns[i]] = df_moviesclean['genres'].str.split('|', expand=True).iloc[:, i]

# In ra DataFrame mới
df_moviesclean

Unnamed: 0,movieId,title,genres,year,genres1,genres2,genres3,genres4,genres5,genres6,...,genres70,genres71,genres72,genres73,genres74,genres75,genres76,genres77,genres78,genres79
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,Adventure,Animation,Children,Comedy,Fantasy,,...,,,,,,,,,,
1,2,Jumanji,Adventure|Children|Fantasy,1995,Adventure,Children,Fantasy,,,,...,,,,,,,,,,
2,3,Grumpier Old Men,Comedy|Romance,1995,Comedy,Romance,,,,,...,,,,,,,,,,
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,Comedy,Drama,Romance,,,,...,,,,,,,,,,
4,5,Father of the Bride Part II,Comedy,1995,Comedy,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017,Action,Animation,Comedy,Fantasy,,,...,,,,,,,,,,
9738,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,2017,Animation,Comedy,Fantasy,,,,...,,,,,,,,,,
9739,193585,Flint,Drama,2017,Drama,,,,,,...,,,,,,,,,,
9740,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,2018,Action,Animation,,,,,...,,,,,,,,,,


In [6]:
# Xóa những cột có toàn bộ giá trị là None
df_moviesclean = df_moviesclean.dropna(axis=1, how='all')

In [7]:
df_moviesclean[['movieId', 'title', 'year', 'genres1', 'genres2', 'genres3', 'genres4', 'genres5', 'genres6', 'genres7', 'genres8', 'genres9', 'genres10']]

Unnamed: 0,movieId,title,year,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10
0,1,Toy Story,1995,Adventure,Animation,Children,Comedy,Fantasy,,,,,
1,2,Jumanji,1995,Adventure,Children,Fantasy,,,,,,,
2,3,Grumpier Old Men,1995,Comedy,Romance,,,,,,,,
3,4,Waiting to Exhale,1995,Comedy,Drama,Romance,,,,,,,
4,5,Father of the Bride Part II,1995,Comedy,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,2017,Action,Animation,Comedy,Fantasy,,,,,,
9738,193583,No Game No Life: Zero,2017,Animation,Comedy,Fantasy,,,,,,,
9739,193585,Flint,2017,Drama,,,,,,,,,
9740,193587,Bungo Stray Dogs: Dead Apple,2018,Action,Animation,,,,,,,,


In [8]:
# Xác định các hàng chứa giá trị None
rows_with_none = df_moviesclean.isna().any(axis=1)

# Thay thế giá trị None thành chuỗi rỗng chỉ trong các hàng có giá trị None
df_moviesclean.loc[rows_with_none] = df_moviesclean.loc[rows_with_none].fillna('')

# In ra DataFrame sau khi thực hiện thay thế
df_moviesclean[['movieId', 'title', 'year', 'genres1', 'genres2', 'genres3', 'genres4', 'genres5', 'genres6', 'genres7', 'genres8', 'genres9', 'genres10']]

Unnamed: 0,movieId,title,year,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10
0,1,Toy Story,1995,Adventure,Animation,Children,Comedy,Fantasy,,,,,
1,2,Jumanji,1995,Adventure,Children,Fantasy,,,,,,,
2,3,Grumpier Old Men,1995,Comedy,Romance,,,,,,,,
3,4,Waiting to Exhale,1995,Comedy,Drama,Romance,,,,,,,
4,5,Father of the Bride Part II,1995,Comedy,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,2017,Action,Animation,Comedy,Fantasy,,,,,,
9738,193583,No Game No Life: Zero,2017,Animation,Comedy,Fantasy,,,,,,,
9739,193585,Flint,2017,Drama,,,,,,,,,
9740,193587,Bungo Stray Dogs: Dead Apple,2018,Action,Animation,,,,,,,,


In [9]:
print('Số lượng giá trị movies trùng lặp:', df_moviesclean.duplicated().sum())

print('\nSố lượng giá trị movies bị thiếu:', df_moviesclean.isnull().sum().sum())

Số lượng giá trị movies trùng lặp: 0

Số lượng giá trị movies bị thiếu: 0


In [10]:
print('Số lượng giá trị ratings trùng lặp:', df_ratings.duplicated().sum())

print('\nSố lượng giá trị ratings bị thiếu:', df_ratings.isnull().sum().sum())

Số lượng giá trị ratings trùng lặp: 0

Số lượng giá trị ratings bị thiếu: 0


In [11]:
def de_xuat_ratings(movie_id):
    # Sắp xếp theo cột "rating" từ cao đến thấp
    sapxep_ratings = df_ratings.sort_values("rating", ascending=False)
    
    # Merge với df_cleaned dựa trên cột "movieId" và lấy các cột "rating", "title", "genres"
    recomend_ratings = sapxep_ratings.head(100).merge(df_moviesclean, on="movieId")[["rating", "title", 'year', 'genres1', 'genres2', 'genres3', 'genres4', 'genres5', 'genres6', 'genres7', 'genres8', 'genres9', 'genres10']]
    
    return recomend_ratings

# Gọi hàm find_similar_movies với một movie_id cụ thể
dexuatratings = de_xuat_ratings(1)

# Cấu hình tùy chọn hiển thị để hiển thị tất cả các hàng và cột
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# In ra tất cả 100 kết quả
dexuatratings

Unnamed: 0,rating,title,year,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10
0,5.0,"Green Mile, The",1999,Crime,Drama,,,,,,,,
1,5.0,Monty Python's Life of Brian,1979,Comedy,,,,,,,,,
2,5.0,Lightning Jack,1994,Comedy,Western,,,,,,,,
3,5.0,Jurassic Park,1993,Action,Adventure,Sci-Fi,Thriller,,,,,,
4,5.0,"Flamingo Kid, The",1984,Comedy,Drama,,,,,,,,
5,5.0,Pulp Fiction,1994,Comedy,Crime,Drama,Thriller,,,,,,
6,5.0,Pulp Fiction,1994,Comedy,Crime,Drama,Thriller,,,,,,
7,5.0,Star Wars: Episode IV - A New Hope,1977,Action,Adventure,Sci-Fi,,,,,,,
8,5.0,Star Wars: Episode IV - A New Hope,1977,Action,Adventure,Sci-Fi,,,,,,,
9,5.0,Fargo,1996,Comedy,Crime,Drama,Thriller,,,,,,


In [12]:
# Kết hợp thông tin đánh giá với chi tiết phim
df_ratings_movies = pd.merge(df_ratings, df_moviesclean, on='movieId')

In [13]:
def de_xuat_score(movie_id):
    movie_id_list = [movie_id]  # Convert movie_id to a list
    similar_users = df_ratings_movies[(df_ratings_movies["movieId"].isin(movie_id_list)) & (df_ratings_movies["rating"] > 4)]["userId"].unique()
    similar_user_recs = df_ratings_movies[(df_ratings_movies["userId"].isin(similar_users)) & (df_ratings_movies["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > 0.10]
    all_users = df_ratings_movies[(df_ratings_movies["movieId"].isin(similar_user_recs.index)) & (df_ratings_movies["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(df_moviesclean, left_index=True, right_on="movieId")[["score", "title", 'year', 'genres1', 'genres2', 'genres3', 'genres4', 'genres5', 'genres6', 'genres7', 'genres8', 'genres9', 'genres10']]


dexuatscore = de_xuat_score(1)

dexuatscore

Unnamed: 0,score,title,year,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10
0,8.584615,Toy Story,1995,Adventure,Animation,Children,Comedy,Fantasy,,,,,
2355,5.202797,Toy Story 2,1999,Adventure,Animation,Children,Comedy,Fantasy,,,,,
7355,4.905495,Toy Story 3,2010,Adventure,Animation,Children,Comedy,Fantasy,IMAX,,,,
436,4.544796,Mrs. Doubtfire,1993,Comedy,Drama,,,,,,,,
5374,3.219231,"Incredibles, The",2004,Action,Adventure,Animation,Children,Comedy,,,,,
32,3.147692,Babe,1995,Children,Drama,,,,,,,,
2038,3.147692,a.k.a. Ghost Busters),1984,Action,Comedy,Sci-Fi,,,,,,,
506,2.985953,Aladdin,1992,Adventure,Animation,Children,Comedy,Musical,,,,,
592,2.861538,"Rock, The",1996,Action,Adventure,Thriller,,,,,,,
5260,2.747077,Spider-Man 2,2004,Action,Adventure,Sci-Fi,IMAX,,,,,,


In [19]:
def search_title(query_title):
    # Kiểm tra xem có bất kỳ tiêu đề nào trong danh sách chứa tiêu đề đang tìm kiếm không
    results = df_moviesclean[df_moviesclean['title'].apply(lambda x: query_title.lower() in x.lower())]
    return results


def find_similar_movies(movie_ids):
    similar_users = df_ratings_movies[(df_ratings_movies["movieId"].isin(movie_ids)) & (df_ratings_movies["rating"] > 4)]["userId"].unique()
    similar_user_recs = df_ratings_movies[(df_ratings_movies["userId"].isin(similar_users)) & (df_ratings_movies["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > 0.10]
    all_users = df_ratings_movies[(df_ratings_movies["movieId"].isin(similar_user_recs.index)) & (df_ratings_movies["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(df_moviesclean, left_index=True, right_on="movieId")[["score", "title", 'year', 'genres1', 'genres2', 'genres3', 'genres4', 'genres5', 'genres6', 'genres7', 'genres8', 'genres9', 'genres10']]

# Tạo hộp nhập cho tiêu đề phim
movie_title_input = widgets.Text(
    value='',
    description='Tiêu đề phim:',
    disabled=False
)

# Khu vực xuất kết quả
recommendation_title_list = widgets.Output()

def on_type2(data):
    with recommendation_title_list:
        recommendation_title_list.clear_output()
        title = data["new"]
        if len(title) > 2:  # Điều chỉnh độ dài tối thiểu nếu cần
            results = search_title(title)
            if not results.empty:
                movie_ids = results["movieId"].tolist()
                recommendations = find_similar_movies(movie_ids)
                recommendations = recommendations.sort_values("score", ascending=False)
                display(recommendations)
            else:
                print("Không tìm thấy bộ phim nào chứa từ khóa.")
                
# Theo dõi sự thay đổi trong hộp nhập
movie_title_input.observe(on_type2, names='value')

# Hiển thị các widget
display(movie_title_input, recommendation_title_list)


Text(value='', description='Tiêu đề phim:')

Output()

In [20]:
def search_title_or_genres(query):
    # Kiểm tra xem có bất kỳ tiêu đề hoặc thể loại nào trong danh sách chứa từ khóa tìm kiếm không
    results = df_moviesclean[df_moviesclean['title'].apply(lambda x: query.lower() in x.lower()) | df_moviesclean['genres'].apply(lambda x: query.lower() in x.lower())]
    return results


def find_similar_movies(movie_ids):
    similar_users = df_ratings_movies[(df_ratings_movies["movieId"].isin(movie_ids)) & (df_ratings_movies["rating"] > 4)]["userId"].unique()
    similar_user_recs = df_ratings_movies[(df_ratings_movies["userId"].isin(similar_users)) & (df_ratings_movies["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > 0.10]
    all_users = df_ratings_movies[(df_ratings_movies["movieId"].isin(similar_user_recs.index)) & (df_ratings_movies["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(df_moviesclean, left_index=True, right_on="movieId")[["score", "title", 'year', 'genres1', 'genres2', 'genres3', 'genres4', 'genres5', 'genres6', 'genres7', 'genres8', 'genres9', 'genres10']]


# Tạo hộp nhập cho tiêu đề phim và thể loại
search_input = widgets.Text(
    value='',
    description='Tìm kiếm:',
    disabled=False
)

# Khu vực xuất kết quả
recommendation_list_output = widgets.Output()

def on_search_input_change(change):
    with recommendation_list_output:
        recommendation_list_output.clear_output()
        query = change.new
        if len(query) > 2:  # Điều chỉnh độ dài tối thiểu nếu cần
            results = search_title_or_genres(query)
            if not results.empty:
                movie_ids = results["movieId"].tolist()
                recommendations = find_similar_movies(movie_ids)
                recommendations = recommendations.sort_values("score", ascending=False)  # Sắp xếp giảm dần theo score
                display(recommendations)
            else:
                print("Không tìm thấy bộ phim nào chứa từ khóa.")

# Theo dõi sự thay đổi trong hộp nhập
search_input.observe(on_search_input_change, names='value')

# Hiển thị các widget
display(search_input, recommendation_list_output)


Text(value='', description='Tìm kiếm:')

Output()