# Movie Recommendation System

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.model_selection import train_test_split
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import warnings; warnings.simplefilter('ignore')
from gensim.models import Word2Vec
import ast
import re
from datetime import datetime, timedelta
import random

In [2]:
# Đọc dữ liệu
md = pd.read_csv("data/metadata.csv")
md.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## Simple Movie Recommender - Gợi ý phim có vote cao nhất theo thể loại

In [3]:
# Chuyển genres từ định dạng json về list
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['genres']

0        [Animation, Comedy, Family]
1       [Adventure, Fantasy, Family]
2                  [Romance, Comedy]
3           [Comedy, Drama, Romance]
4                           [Comedy]
                    ...             
1234                         [Drama]
1235       [Foreign, Drama, Fantasy]
1236                         [Drama]
1237                [Drama, Romance]
1238      [TV Movie, Drama, Romance]
Name: genres, Length: 1239, dtype: object

### Test thử

Công thức tính phim có lượng rating cao nhất dựa vào điểm vote trung bình và số lượng vote <br>
<br>
**Weighted Rating (WR) =  (v / (v+m) * R) + (m / (v+m) * C)** <br>
<br>
where, <br>

v is the number of votes for the movie (Số lượng vote của 1 phim) <br>
m is the minimum votes required to be listed in the chart (Số lượng vote tối thiểu để được đưa vào bảng đánh giá) <br>
R is the average rating of the movie (Điểm vote trung bình của phim) <br>
C is the mean vote across the whole report (Điểm vote trung bình của toàn bộ phim) <br>

In [4]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int') 
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.95) # Chỉ lấy các phim có lượng vote lớn hơn 95% tổng số lượt vote
C, m 

(5.710250201775626, 1569.199999999999)

In [5]:
# Trích xuất năm ra 1 cột riêng
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [6]:
# Lọc ra các hàng giá trị với điều kiện: số lượt vote lớn hơn 95% tổng, vote count khác rỗng, vote average khác rỗng và giữ lại các cột cần thiết
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(62, 6)

In [7]:
# Hàm trả về điểm rating đã được chuẩn hoá
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)
# Danh sách rating trên bộ dữ liệu
qualified['wr'] = qualified.apply(weighted_rating, axis = 1)
qualified = qualified.sort_values('wr', ascending=False).head(250)
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
213,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.649086
251,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.630197
187,Star Wars,1977,6778,8,42.149697,"[Adventure, Action, Science Fiction]",7.569547
780,Back to the Future,1985,6239,8,25.778509,"[Adventure, Comedy, Science Fiction, Family]",7.539833
368,Schindler's List,1993,4436,8,41.725123,"[Drama, History, War]",7.401673
690,Reservoir Dogs,1992,3821,8,12.22034,"[Crime, Thriller]",7.333406
748,GoodFellas,1990,3211,8,15.424092,"[Drama, Crime]",7.248342
736,One Flew Over the Cuckoo's Nest,1975,3001,8,35.529554,[Drama],7.213803
767,Dead Poets Society,1989,2786,8,19.905716,[Drama],7.174992
84,Taxi Driver,1976,2632,8,14.092713,"[Crime, Drama]",7.14475


### Tạo hàm để gợi ý 15 phim có lượng rating cao nhất dựa theo thể loại

In [8]:
# Tách các genres thành các giá trị genre riêng, tạo 1 cột mới chứa từng genre (1 phim có 2 genres sẽ có 2 hàng chứa 2 genre tương ứng)
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [9]:
# Hàm gợi ý
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

In [10]:
# Top 15 phim hành động
build_chart("Romance").head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
251,Forrest Gump,1994,8147,8,48.307194,7.91692
573,Vertigo,1958,1162,8,18.20822,7.526974
578,Some Like It Hot,1959,835,8,11.845107,7.393767
725,Cinema Paradiso,1988,834,8,14.177005,7.393245
1059,Titanic,1997,7770,7,26.88907,6.953341
404,Aladdin,1992,3495,7,16.357419,6.901142
410,Beauty and the Beast,1991,3029,7,23.433511,6.887411
777,Groundhog Day,1993,2358,7,12.989627,6.859266
1019,Gattaca,1997,1846,7,12.89312,6.826095
412,Pretty Woman,1990,1807,7,13.348451,6.822915


In [11]:
# Top 15 phim hành động
build_chart("Action").head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
187,Star Wars,1977,6778,8,42.149697,7.699212
746,Return of the Jedi,1983,4763,7,14.586087,6.752575
749,Alien,1979,4564,7,23.37742,6.743578
405,Terminator 2: Judgment Day,1991,4274,7,22.661695,6.72923
661,Die Hard,1988,4005,7,16.640522,6.714406
738,Raiders of the Lost Ark,1981,3949,7,19.901576,6.711114
83,Braveheart,1995,3404,7,20.755149,6.674608
740,Aliens,1986,3282,7,21.761179,6.665135
799,Indiana Jones and the Last Crusade,1989,3221,7,14.788987,6.660189
408,Batman,1989,2145,7,19.10673,6.540455


## Content-based Filtering Recommender - Gợi ý dựa trên tương đồng loại phim

### Movie Overview-based Recommender (TF-IDF Overview)

In [12]:
# Gộp cột tagline và cột overview vào description
md['tagline'] = md['tagline'].fillna('')
md['description'] = md['overview'] + md['tagline']
md['description'] = md['description'].fillna('')

In [13]:
# Khởi tạo ma trận TF-IDF 
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df= 1, stop_words='english')
tfidf_matrix = tf.fit_transform(md['description'])
tfidf_matrix.shape

(1239, 46887)

In [14]:
# Tính ma trận cosine similarity
cosine_sim_1 = linear_kernel(tfidf_matrix, tfidf_matrix)

Ma trận cosine trả về ma trận 2D với hàng và cột đều là đánh số các phim, thể hiện quan hệ mỗi phim: cosine_sim[0] trả về độ tương quan giữa phim đầu tiên với toàn bộ phim còn lại (phần tử đầu tiên là độ tương đồng giữa chính bộ phim đó nên bằng 1)

In [15]:
def get_recommendations_tfidf_overview(movie_id, top_k=10):
    """
    Get top K movie recommendations based on cosine similarity and TF-IDF, using movie IDs.
    
    Parameters:
        movie_id (int/str): The ID of the movie to find recommendations for.
        top_k (int): Number of top recommendations to return.
    
    Returns:
        List: IDs of the recommended movies.
    """
    # Convert ID to string to maintain consistency
    movie_id = str(movie_id)

    # Check if movie_id exists in the metadata
    if movie_id not in md['id'].astype(str).values:
        print(f"Warning: Movie ID {movie_id} not found in metadata.")
        return []

    # Find the index of the movie ID in the DataFrame
    idx = md[md['id'].astype(str) == movie_id].index[0]

    # Calculate cosine similarity scores for the movie
    sim_scores = list(enumerate(cosine_sim_1[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Extract indices of the top K most similar movies (excluding itself)
    movie_indices = [i[0] for i in sim_scores[1:top_k + 1]]

    # Return the IDs of the recommended movies
    return md.iloc[movie_indices]['id'].tolist()

In [16]:
for i in get_recommendations_tfidf_overview(862, 10): # Toy Story
    print(md[md['id'] == i]['title'])

700    Rebel Without a Cause
Name: title, dtype: object
765    Manhattan
Name: title, dtype: object
317    For Love or Money
Name: title, dtype: object
345    Malice
Name: title, dtype: object
683    Sleeper
Name: title, dtype: object
501    I Shot Andy Warhol
Name: title, dtype: object
413    Window to Paris
Name: title, dtype: object
412    Pretty Woman
Name: title, dtype: object
806    Paris, Texas
Name: title, dtype: object
132    Mute Witness
Name: title, dtype: object


## Metadata-based Recommender - Gợi ý dựa trên keywords, genres, tác giả, dàn diễn viên,...

In [17]:
credits = pd.read_csv('data/credits.csv')
keywords = pd.read_csv('data/keywords.csv')

In [18]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [19]:
# Merge các bảng dữ liệu tương ứng với id
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [20]:
# Chuyển đổi dữ liệu từ dạng JSON về dạng list, sau đó đếm số lượng caster, và số lượng crew
md['cast'] = md['cast'].apply(literal_eval)
md['crew'] = md['crew'].apply(literal_eval)
md['keywords'] = md['keywords'].apply(literal_eval)
md['cast_size'] = md['cast'].apply(lambda x: len(x))
md['crew_size'] = md['crew'].apply(lambda x: len(x))

Chiến thuật sẽ là trong crew sẽ chỉ lấy ra tên của đạo diễn, còn trong cast thì sẽ lấy ra tên 3 nhân vật chính gây ảnh hưởng tới phim

In [21]:
# Lấy tên đạo diễn
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [22]:
md['director'] = md['crew'].apply(get_director)

In [23]:
# Lấy 3 diễn viên cast nhân vật chính
md['cast'] = md['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['cast'] = md['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [24]:
md['keywords'] = md['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

Bây giờ cách thực hiện rất đơn giản: Gom tất cả các cột keywords, cast, director, genres lại với nhau, sau đó sử dụng Count Vectorizer để tạo ma trận, rồi lại tính cosine similarity <br>
<br>
Phần này sẽ hơi thủ thuật một chút: <br>
- Copy 3 lần tên tác giả để nhấn mạnh tác giả <br>
- Bỏ hết dấu cách và chuyển thành lower case hết (để có thể tách biệt tên VD: Johnny Depp vs Johnny Galecki)

In [25]:
# Lower case và bỏ dấu cách trong cast
md['cast'] = md['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [26]:
# Lower case, bỏ dấu cách và tạo 3 lần tên tác giả
md['director'] = md['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
md['director'] = md['director'].apply(lambda x: [x, x, x])

In [27]:
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,vote_average,vote_count,year,description,cast,crew,keywords,cast_size,crew_size,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...","[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friends, riva...",13,106,"[johnlasseter, johnlasseter, johnlasseter]"
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board game, disappearance, based on children'...",26,16,"[joejohnston, joejohnston, joejohnston]"
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,6.5,92.0,1995,A family wedding reignites the ancient feud be...,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best friend, duringcreditsstinger, o...",7,4,"[howarddeutch, howarddeutch, howarddeutch]"
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...","[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based on novel, interracial relationship, sin...",10,10,"[forestwhitaker, forestwhitaker, forestwhitaker]"
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,5.7,173.0,1995,Just when George Banks has recovered from his ...,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife crisis, confidence, aging, daug...",12,7,"[charlesshyer, charlesshyer, charlesshyer]"


In [28]:
# Sử dụng Snowball Stemmer để biến số nhiều thành số ít
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [29]:
# Loại bỏ số nhiều, bỏ dấu cách, lower case
md['keywords'] = md['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
md['keywords'] = md['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [30]:
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,vote_average,vote_count,year,description,cast,crew,keywords,cast_size,crew_size,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...","[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousi, toy, boy, friendship, friend, rival...",13,106,"[johnlasseter, johnlasseter, johnlasseter]"
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[boardgam, disappear, basedonchildren'sbook, n...",26,16,"[joejohnston, joejohnston, joejohnston]"
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,6.5,92.0,1995,A family wedding reignites the ancient feud be...,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fish, bestfriend, duringcreditssting, oldmen]",7,4,"[howarddeutch, howarddeutch, howarddeutch]"
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...","[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[basedonnovel, interracialrelationship, single...",10,10,"[forestwhitaker, forestwhitaker, forestwhitaker]"
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,5.7,173.0,1995,Just when George Banks has recovered from his ...,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[babi, midlifecrisi, confid, age, daughter, mo...",12,7,"[charlesshyer, charlesshyer, charlesshyer]"


Dữ liệu có vẻ đúng định dạng rồi, bây giờ sẽ gộp các cột để tính thôi

In [31]:
# Đặt tên cột gộp là soup, thêm dấu cách cho mỗi phần tử
md['soup'] = md['keywords'] + md['cast'] + md['director'] + md['genres']
md['soup'] = md['soup'].apply(lambda x: ' '.join(x))


In [32]:
md['soup']

0       jealousi toy boy friendship friend rivalri boy...
1       boardgam disappear basedonchildren'sbook newho...
2       fish bestfriend duringcreditssting oldmen walt...
3       basedonnovel interracialrelationship singlemot...
4       babi midlifecrisi confid age daughter motherda...
                              ...                        
1235    biographi claudiacardinale omarsharif nathalie...
1236    tadanobuasano chara koichihashizume shunjiiwai...
1237    alibi mamievandoren meltormé elinordonahue cha...
1238    basedonnovel russia romanc vivienleigh ralphri...
1239    ciaránhinds deborahfindlay lauraharling robert...
Name: soup, Length: 1240, dtype: object

In [33]:
# Khởi tạo ma trận CountVectorizer
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
count_matrix = count.fit_transform(md['soup'])

In [34]:
# Thay thế ma trận cosine mới
cosine_sim_2 = cosine_similarity(count_matrix, count_matrix)

In [35]:
def get_recommendations_tfidf_metadata(movie_id, top_k=10):
    """
    Get top K movie recommendations based on cosine similarity and TF-IDF, using movie IDs.
    
    Parameters:
        movie_id (int/str): The ID of the movie to find recommendations for.
        top_k (int): Number of top recommendations to return.
    
    Returns:
        List: IDs of the recommended movies.
    """
    # Convert ID to string to maintain consistency
    movie_id = str(movie_id)

    # Check if movie_id exists in the metadata
    if movie_id not in md['id'].astype(str).values:
        print(f"Warning: Movie ID {movie_id} not found in metadata.")
        return []

    # Find the index of the movie ID in the DataFrame
    idx = md[md['id'].astype(str) == movie_id].index[0]

    # Calculate cosine similarity scores for the movie
    sim_scores = list(enumerate(cosine_sim_2[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Extract indices of the top K most similar movies (excluding itself)
    movie_indices = [i[0] for i in sim_scores[1:top_k + 1]]

    # Return the IDs of the recommended movies
    return md.iloc[movie_indices]['id'].tolist()

In [36]:
for i in get_recommendations_tfidf_metadata(862, 10): # Toy Story
    print(md[md['id'] == i]['title'])

657    Pete's Dragon
Name: title, dtype: object
474    Oliver & Company
Name: title, dtype: object
658    Bedknobs and Broomsticks
Name: title, dtype: object
664    That Thing You Do!
Name: title, dtype: object
914    Cats Don't Dance
Name: title, dtype: object
109    Casper
Name: title, dtype: object
940    Fathers' Day
Name: title, dtype: object
178    Houseguest
Name: title, dtype: object
1052    Home Alone 3
Name: title, dtype: object
155    Boys on the Side
Name: title, dtype: object


# Fully Content-based Recommendation System

## Fully Connected - Total Average

In [37]:
def safe_list_parse(x):
    """Chuyển đổi chuỗi JSON thành list hoặc trả về list rỗng khi không hợp lệ."""
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return []
    if isinstance(x, (list, np.ndarray)):
        return x
    return []

for col in ['genres', 'cast', 'keywords']:
    md[col] = md[col].apply(safe_list_parse)
    # Xóa khoảng trắng và chuyển về chữ thường
    md[col] = md[col].apply(lambda x: [str(g).lower().replace(" ", "") for g in x])

# Làm sạch cột 'director'
md['director'] = md['director'].fillna('').apply(lambda x: str(x).lower().replace(" ", ""))

# Đảm bảo cột 'year' là chuỗi
md['year'] = md['year'].fillna('').astype(str)


# Tạo câu phim bao gồm: director, genres, cast, keywords, year
def create_movie_sentence(row):
    # movie_sentence = [director] + genres + cast + keywords + overview_tokens + [year]
    return [row['director']] + row['genres'] + row['cast'] + row['keywords'] + [row['year']]

md['movie_sentence'] = md.apply(create_movie_sentence, axis=1)

# Xử lý NaN trong cột 'vote_average' trước khi làm tròn
md['vote_average'] = md['vote_average'].fillna(0)

# Làm tròn rating
md['rounded_rating'] = md['vote_average'].apply(lambda x: max(1, int(round(x))))

# Tạo Weighted Movie Sentence Dataset
weighted_sentences = []
for idx, row in md.iterrows():
    count = row['rounded_rating']  # Số lần lặp
    weighted_sentences.extend([row['movie_sentence']] * count)

# Kiểm tra kết quả
print("Số lượng câu phim đã tạo:", len(weighted_sentences))
print("Ví dụ câu phim:", weighted_sentences[:3])

Số lượng câu phim đã tạo: 7637
Ví dụ câu phim: [["['johnlasseter','johnlasseter','johnlasseter']", 'animation', 'comedy', 'family', 'tomhanks', 'timallen', 'donrickles', 'jealousi', 'toy', 'boy', 'friendship', 'friend', 'rivalri', 'boynextdoor', 'newtoy', 'toycomestolif', '1995'], ["['johnlasseter','johnlasseter','johnlasseter']", 'animation', 'comedy', 'family', 'tomhanks', 'timallen', 'donrickles', 'jealousi', 'toy', 'boy', 'friendship', 'friend', 'rivalri', 'boynextdoor', 'newtoy', 'toycomestolif', '1995'], ["['johnlasseter','johnlasseter','johnlasseter']", 'animation', 'comedy', 'family', 'tomhanks', 'timallen', 'donrickles', 'jealousi', 'toy', 'boy', 'friendship', 'friend', 'rivalri', 'boynextdoor', 'newtoy', 'toycomestolif', '1995']]


In [38]:
# weighted_sentences đã tạo ở bước trước
model = Word2Vec(weighted_sentences, vector_size=100, window=100, min_count=1, workers=4, sg=0)
word_vectors = model.wv

In [39]:
def sentence_to_vectors(sentence, wv):
    vectors = [wv[word] for word in sentence if word in wv]
    if len(vectors) == 0:
        # Nếu câu phim không có từ nào trong từ điển, trả về vector rỗng
        return np.empty((0, wv.vector_size))
    return np.array(vectors)

def movie_similarity_fc_ta(sentence1, sentence2, wv):
    # Fully connected + TA: tính cosine similarity cho mọi cặp từ giữa 2 câu
    vectors1 = sentence_to_vectors(sentence1, wv)
    vectors2 = sentence_to_vectors(sentence2, wv)
    if vectors1.shape[0] == 0 or vectors2.shape[0] == 0:
        return 0.0
    sim_matrix = cosine_similarity(vectors1, vectors2)
    # Total Average: trung bình tất cả giá trị trong ma trận
    return np.mean(sim_matrix)

In [40]:
movie_sentences = md['movie_sentence'].tolist()
movie_ids = md['id'].tolist()

# Tạo mapping id -> index và index -> id
id_to_index = {m_id: i for i, m_id in enumerate(movie_ids)}

def get_recommendations_FC_TA(movie_id, top_k=10):
    if movie_id not in id_to_index:
        print("Movie ID không tồn tại")
        return []
    idx = id_to_index[movie_id]
    sentence_main = movie_sentences[idx]
    similarities = []
    
    for i, sent in enumerate(movie_sentences):
        if i == idx:
            continue
        sim = movie_similarity_fc_ta(sentence_main, sent, word_vectors)
        similarities.append((movie_ids[i], sim))
        
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_k]

# Ví dụ: Gợi ý top 10 phim giống với phim có id=862 (Toy Story)
recommendations = get_recommendations_FC_TA(862, top_k=10)
print("Gợi ý phim tương tự:")
for rec_id, sim_score in recommendations:
    print("Phim", md[md['id'] == rec_id]['title'], " | Similarity:", sim_score)


Gợi ý phim tương tự:
Phim 570    Singin' in the Rain
Name: title, dtype: object  | Similarity: 0.94248027
Phim 652    Winnie the Pooh and the Blustery Day
Name: title, dtype: object  | Similarity: 0.9419588
Phim 602    Top Hat
Name: title, dtype: object  | Similarity: 0.941727
Phim 659    Alice in Wonderland
Name: title, dtype: object  | Similarity: 0.94147086
Phim 527    Joe's Apartment
Name: title, dtype: object  | Similarity: 0.9413759
Phim 656    Dumbo
Name: title, dtype: object  | Similarity: 0.9407806
Phim 655    Mary Poppins
Name: title, dtype: object  | Similarity: 0.94073623
Phim 581    My Fair Lady
Name: title, dtype: object  | Similarity: 0.93970114
Phim 560    Bogus
Name: title, dtype: object  | Similarity: 0.93937546
Phim 584    Meet Me in St. Louis
Name: title, dtype: object  | Similarity: 0.9391597


## Same Metadata Only - Metadata-based

In [41]:
def safe_list_parse(x):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return []
    if isinstance(x, (list, np.ndarray)):
        return x
    return []

for col in ['genres', 'cast', 'keywords']:
    md[col] = md[col].apply(safe_list_parse)
    md[col] = md[col].apply(lambda x: [str(g).lower().replace(" ", "") for g in x])

md['director'] = md['director'].fillna('').apply(lambda x: str(x).lower().replace(" ", ""))

md['year'] = md['year'].fillna('').astype(str)
movie_metadata = []
for idx, row in md.iterrows():
    data = {
        'director': [row['director']] if row['director'] else [],
        'genres': row['genres'],
        'cast': row['cast'],
        'keywords': row['keywords'],
        # 'overview': row['overview_tokens'],
        'year': [row['year']] if row['year'] else []
    }
    movie_metadata.append(data)

md['metadata_dict'] = movie_metadata

# Làm tròn rating
md['vote_average'] = md['vote_average'].fillna(0)
md['rounded_rating'] = md['vote_average'].apply(lambda x: max(1, int(round(x))))

# Tạo Weighted Movie Sentence Dataset:
# Ở đây, thay vì 1 câu phim, chúng ta có nhiều category.
# Chúng ta sẽ kết hợp tất cả category lại thành 1 câu để huấn luyện Word2Vec.
weighted_sentences = []
for idx, row in md.iterrows():
    # Kết hợp tất cả từ trong các metadata để huấn luyện
    full_sentence = (row['metadata_dict']['director'] 
                     + row['metadata_dict']['genres'] 
                     + row['metadata_dict']['cast'] 
                     + row['metadata_dict']['keywords'] 
                    #  + row['metadata_dict']['overview'] 
                     + row['metadata_dict']['year'])
    count = row['rounded_rating']
    for _ in range(count):
        weighted_sentences.append(full_sentence)

# Huấn luyện mô hình CBOW với window lớn
model = Word2Vec(weighted_sentences, vector_size=100, window=50, min_count=1, workers=4, sg=0)
word_vectors = model.wv

In [42]:
def sentence_to_vectors(words, wv):
    vectors = [wv[w] for w in words if w in wv]
    return np.array(vectors)

def metadata_similarity(words1, words2, wv):
    # Tính trung bình tất cả cosin similarity giữa mọi cặp từ trong cùng metadata
    # Nếu một trong hai rỗng, similarity = 0
    if len(words1) == 0 or len(words2) == 0:
        return 0.0
    v1 = sentence_to_vectors(words1, wv)
    v2 = sentence_to_vectors(words2, wv)
    if v1.shape[0] == 0 or v2.shape[0] == 0:
        return 0.0
    sim_matrix = cosine_similarity(v1, v2)
    return np.mean(sim_matrix)

def movie_similarity_smo_mb(meta1, meta2, wv):
    # meta1, meta2 là dictionary {'director': [...], 'genres': [...], ...}
    categories = ['director', 'genres', 'cast', 'keywords', 'year']
    total_similarity = 0.0
    for cat in categories:
        sim = metadata_similarity(meta1[cat], meta2[cat], wv)
        total_similarity += sim
    return total_similarity


In [43]:
movie_ids = md['id'].tolist()
id_to_index = {m_id: i for i, m_id in enumerate(movie_ids)}
metadata_list = md['metadata_dict'].tolist()

def get_recommendations_SMO_MB(movie_id, top_k=10):
    if movie_id not in id_to_index:
        print("Movie ID không tồn tại trong dataset")
        return []
    idx = id_to_index[movie_id]
    meta_main = metadata_list[idx]
    
    similarities = []
    for i, meta_other in enumerate(metadata_list):
        if i == idx:
            continue
        sim = movie_similarity_smo_mb(meta_main, meta_other, word_vectors)
        similarities.append((movie_ids[i], sim))
    
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_k]

# Ví dụ: Gợi ý top 10 phim giống phim id=862
recommendations = get_recommendations_SMO_MB(862, top_k=10)
for rec_id, sim_score in recommendations:
    print("Phim", md[md['id'] == rec_id]['title'], " | Similarity:", sim_score)

Phim 651    Cinderella
Name: title, dtype: object  | Similarity: 4.657984614372253
Phim 410    Beauty and the Beast
Name: title, dtype: object  | Similarity: 4.627187252044678
Phim 444    James and the Giant Peach
Name: title, dtype: object  | Similarity: 4.61075359582901
Phim 1216    Alice in Wonderland
Name: title, dtype: object  | Similarity: 4.593343794345856
Phim 1044    Anastasia
Name: title, dtype: object  | Similarity: 4.592058181762695
Phim 664    That Thing You Do!
Name: title, dtype: object  | Similarity: 4.570770978927612
Phim 474    Oliver & Company
Name: title, dtype: object  | Similarity: 4.556844651699066
Phim 534    Matilda
Name: title, dtype: object  | Similarity: 4.5559751987457275
Phim 946    Shall We Dance?
Name: title, dtype: object  | Similarity: 4.553898930549622
Phim 975    Wild America
Name: title, dtype: object  | Similarity: 4.5536781549453735
