In [1]:
##Khai báo thư viện
import pandas as pd
import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F
from sklearn import preprocessing
import torch.utils.data

In [2]:
movies = pd.read_csv('movies.csv', sep=',', encoding='latin-1', usecols=['movieid', 'title', 'genre'])
ratings = pd.read_csv('ratings.csv', sep=',', encoding='latin-1', usecols=['userid', 'movieid', 'rating'])

In [3]:
movies.head()

Unnamed: 0,movieid,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
##tách năm trong tiêu đề
movies['year'] = movies.title.str.extract('(\(\d\d\d\d\))',expand=False)
movies.head(3)

Unnamed: 0,movieid,title,genre,year
0,1,Toy Story (1995),Animation|Children's|Comedy,(1995)
1,2,Jumanji (1995),Adventure|Children's|Fantasy,(1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,(1995)


In [5]:
#Loại bỏ dấu ngoặc
movies['year'] = movies.year.str.extract('(\d\d\d\d)',expand=False)
movies.head(3)

Unnamed: 0,movieid,title,genre,year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995


In [6]:
#Xóa năm trong phần tiêu đề
movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')
movies.head(3)

  movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')


Unnamed: 0,movieid,title,genre,year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995


In [7]:
#Loại bỏ các khoảng trắng
movies['title'] = movies['title'].apply(lambda x: x.strip())
movies.head()

Unnamed: 0,movieid,title,genre,year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995


In [8]:
movies['genre'] = movies.genre.str.split('|')
movies.head()

Unnamed: 0,movieid,title,genre,year
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [9]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieid  3883 non-null   int64 
 1   title    3883 non-null   object
 2   genre    3883 non-null   object
 3   year     3883 non-null   object
dtypes: int64(1), object(3)
memory usage: 121.5+ KB


In [10]:
movies.movieid = movies.movieid.astype('int32')

In [11]:
movies.isna().sum()

movieid    0
title      0
genre      0
year       0
dtype: int64

In [12]:
movies.year = movies.year.astype('int16')

In [13]:
movies.dtypes

movieid     int32
title      object
genre      object
year        int16
dtype: object

In [14]:
movies.head()

Unnamed: 0,movieid,title,genre,year
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama]",1995
4,5,Father of the Bride Part II,[Comedy],1995


Dùng One-Hot-Encode liệt kê danh sách các thể loại.

In [15]:
movies_with_genres = movies.copy(deep=True)

x = []
for index, row in movies.iterrows():
    x.append(index)
    for genre in row['genre']:
        movies_with_genres.at[index, genre] = 1

print(len(x) == len(movies))

movies_with_genres.head(5)

True


Unnamed: 0,movieid,title,genre,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,"[Animation, Children's, Comedy]",1995,1.0,1.0,1.0,,,,...,,,,,,,,,,
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995,,1.0,,1.0,1.0,,...,,,,,,,,,,
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,,,1.0,,,1.0,...,,,,,,,,,,
3,4,Waiting to Exhale,"[Comedy, Drama]",1995,,,1.0,,,,...,,,,,,,,,,
4,5,Father of the Bride Part II,[Comedy],1995,,,1.0,,,,...,,,,,,,,,,


In [16]:
movies_with_genres = movies_with_genres.fillna(0)
movies_with_genres.head(5)

Unnamed: 0,movieid,title,genre,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story,"[Animation, Children's, Comedy]",1995,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama]",1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
ratings.head()

Unnamed: 0,userid,movieid,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


# Content Based recommender System
Triển khai hệ thống đề xuất Dựa trên Nội dung . Kỹ thuật này cố gắng tìm ra khía cạnh yêu thích của người dùng đối với một mặt hàng là gì và sau đó đề xuất các mặt hàng thể hiện những khía cạnh đó.

Hãy bắt đầu bằng cách tạo người dùng đầu vào để giới thiệu phim. Tên của người dùng sẽ là Lawrence và chúng tôi cho rằng Lawrence đã xếp hạng các phim sau với các xếp hạng sau: -

Lưu ý: Để thêm nhiều phim hơn, chỉ cần tăng số lượng phần tử trong userInput. Vui lòng thêm nhiều hơn vào! Chỉ cần nhớ viết nó bằng chữ in hoa và nếu một bộ phim bắt đầu bằng "The", chẳng hạn như "The Matrix" thì hãy viết nó như sau: 'Matrix, The'.

Step 1: Creating Nhien's Profile

In [18]:
# so on a scale of 0 to 5, with 0 min and 5 max, see Lawrence's movie ratings below
Nhien_movie_ratings = [
            {'title':'Predator', 'rating':4.9},
            {'title':'Final Destination', 'rating':4.9},
            {'title':'Mission Impossible', 'rating':4},
            {'title':"Beverly Hills Cop", 'rating':3},
            {'title':'Exorcist, The', 'rating':4.8},
            {'title':'Waiting to Exhale', 'rating':3.9},
            {'title':'Avengers, The', 'rating':4.5},
            {'title':'Omen, The', 'rating':5.0}
         ] 
Nhien_movie_ratings = pd.DataFrame(Nhien_movie_ratings)
Nhien_movie_ratings

Unnamed: 0,title,rating
0,Predator,4.9
1,Final Destination,4.9
2,Mission Impossible,4.0
3,Beverly Hills Cop,3.0
4,"Exorcist, The",4.8
5,Waiting to Exhale,3.9
6,"Avengers, The",4.5
7,"Omen, The",5.0


Thêm movieId cho người.Khi quá trình nhập hoàn tất, hãy trích xuất ID của phim đầu vào từ khung dữ liệu phim và thêm chúng vào đó.

Trước tiên, chúng ta có thể đạt được điều này bằng cách lọc ra các hàng chứa tiêu đề của phim đầu vào và sau đó hợp nhất tập hợp con này với khung dữ liệu đầu vào. Chúng tôi cũng loại bỏ các cột không cần thiết cho đầu vào để tiết kiệm dung lượng bộ nhớ.

In [19]:
Nhien_movie_Id = movies[movies['title'].isin(Nhien_movie_ratings['title'])]

Nhien_movie_ratings = pd.merge(Nhien_movie_Id, Nhien_movie_ratings)

Nhien_movie_ratings

Unnamed: 0,movieid,title,genre,year,rating
0,4,Waiting to Exhale,"[Comedy, Drama]",1995,3.9
1,1350,"Omen, The",[Horror],1976,5.0
2,1997,"Exorcist, The",[Horror],1973,4.8
3,2153,"Avengers, The","[Action, Adventure]",1998,4.5
4,3409,Final Destination,"[Drama, Thriller]",2000,4.9
5,3527,Predator,"[Action, Sci-Fi, Thriller]",1987,4.9


In [20]:
Nhien_movie_ratings = Nhien_movie_ratings.drop(['genre','year'], 1)

Nhien_movie_ratings

Unnamed: 0,movieid,title,rating
0,4,Waiting to Exhale,3.9
1,1350,"Omen, The",5.0
2,1997,"Exorcist, The",4.8
3,2153,"Avengers, The",4.5
4,3409,Final Destination,4.9
5,3527,Predator,4.9


Step 2: Learning Nhien's Profile


In [21]:
Nhien_genres_df = movies_with_genres[movies_with_genres.movieid.isin(Nhien_movie_ratings.movieid)]
Nhien_genres_df

Unnamed: 0,movieid,title,genre,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
3,4,Waiting to Exhale,"[Comedy, Drama]",1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1329,1350,"Omen, The",[Horror],1976,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1928,1997,"Exorcist, The",[Horror],1973,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2084,2153,"Avengers, The","[Action, Adventure]",1998,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3340,3409,Final Destination,"[Drama, Thriller]",2000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3458,3527,Predator,"[Action, Sci-Fi, Thriller]",1987,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
Nhien_genres_df.reset_index(drop=True, inplace=True)


Nhien_genres_df.drop(['movieid','title','genre','year'], axis=1, inplace=True)


Nhien_genres_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


Step 3: Building Nhien's Profile

In [23]:
Nhien_profile = Nhien_genres_df.T.dot(Nhien_movie_ratings.rating)

Nhien_profile

Animation      0.0
Children's     0.0
Comedy         3.9
Adventure      4.5
Fantasy        0.0
Romance        0.0
Drama          8.8
Action         9.4
Crime          0.0
Thriller       9.8
Horror         9.8
Sci-Fi         4.9
Documentary    0.0
War            0.0
Musical        0.0
Mystery        0.0
Film-Noir      0.0
Western        0.0
dtype: float64

Step 4: Deploying The Content-Based Recommender System.

In [24]:
movies_with_genres = movies_with_genres.set_index(movies_with_genres.movieid)

movies_with_genres.head()

Unnamed: 0_level_0,movieid,title,genre,year,Animation,Children's,Comedy,Adventure,Fantasy,Romance,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,Toy Story,"[Animation, Children's, Comedy]",1995,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Jumanji,"[Adventure, Children's, Fantasy]",1995,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Waiting to Exhale,"[Comedy, Drama]",1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
movies_with_genres.drop(['movieid','title','genre','year'], axis=1, inplace=True)
movies_with_genres.head()

Unnamed: 0_level_0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
recommendation_table_df = (movies_with_genres.dot(Nhien_profile)) / Nhien_profile.sum()

recommendation_table_df.head()

movieid
1    0.076321
2    0.088063
3    0.076321
4    0.248532
5    0.076321
dtype: float64

In [27]:
recommendation_table_df.sort_values(ascending=False, inplace=True)

recommendation_table_df.head(20)

movieid
1320    0.663405
1214    0.663405
2288    0.663405
2617    0.655577
1876    0.643836
70      0.643836
2344    0.636008
1215    0.636008
2826    0.567515
610     0.559687
2916    0.559687
1127    0.559687
1544    0.559687
2322    0.559687
1591    0.559687
849     0.559687
1129    0.559687
1917    0.559687
2488    0.555773
591     0.547945
dtype: float64

### Các Bộ Phim Được Đề Xuất

In [28]:
copy = movies.copy(deep=True)

copy = copy.set_index('movieid', drop=True)

top_20_index = recommendation_table_df.index[:20].tolist()

recommended_movies = copy.loc[top_20_index, :]

recommended_movies

Unnamed: 0_level_0,title,genre,year
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1320,AlienÂ³,"[Action, Horror, Sci-Fi, Thriller]",1992
1214,Alien,"[Action, Horror, Sci-Fi, Thriller]",1979
2288,"Thing, The","[Action, Horror, Sci-Fi, Thriller]",1982
2617,"Mummy, The","[Action, Adventure, Horror, Thriller]",1999
1876,Deep Impact,"[Action, Drama, Sci-Fi, Thriller]",1998
70,From Dusk Till Dawn,"[Action, Comedy, Crime, Horror, Thriller]",1996
2344,Runaway Train,"[Action, Adventure, Drama, Thriller]",1985
1215,Army of Darkness,"[Action, Adventure, Comedy, Horror, Sci-Fi]",1993
2826,"13th Warrior, The","[Action, Horror, Thriller]",1999
610,Heavy Metal,"[Action, Adventure, Animation, Horror, Sci-Fi]",1981
