In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import (
    TfidfVectorizer, # i want to down weight a word in global
    CountVectorizer # i don't want to down weight a word in global
)
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval

In [2]:
metadata = pd.read_csv('data/movies_metadata.csv', low_memory=False)
metadata

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [3]:
# Remove rows with bad IDs.
metadata = metadata.drop([19730, 29503, 35587])

In [4]:
metadata['id'] = metadata['id'].astype('int')

In [5]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45463 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45463 non-null  object 
 1   belongs_to_collection  4491 non-null   object 
 2   budget                 45463 non-null  object 
 3   genres                 45463 non-null  object 
 4   homepage               7779 non-null   object 
 5   id                     45463 non-null  int32  
 6   imdb_id                45446 non-null  object 
 7   original_language      45452 non-null  object 
 8   original_title         45463 non-null  object 
 9   overview               44509 non-null  object 
 10  popularity             45460 non-null  object 
 11  poster_path            45077 non-null  object 
 12  production_companies   45460 non-null  object 
 13  production_countries   45460 non-null  object 
 14  release_date           45376 non-null  object 
 15  re

We shoud compute pairwise `cosine` similarity scores for all movies based on their descriptions and recommend movies based on that similarity score threshold.

In [6]:
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [7]:
metadata['overview'][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

Just use the subset of large datasets for memory saving issues

In [8]:
metadata_sampled = metadata.copy().sample(20000, random_state=10)
metadata_sampled = metadata_sampled.reset_index().rename(columns={'index': 'old_index'})
metadata_sampled.head(5)

Unnamed: 0,old_index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,7654,False,,0,"[{'id': 36, 'name': 'History'}, {'id': 35, 'na...",,56167,tt0044487,it,Le Carrosse d'or,...,1952-12-05,0.0,103.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,The Golden Coach,False,7.3,13.0
1,2233,False,"{'id': 374384, 'name': 'Stepford Collection', ...",0,"[{'id': 27, 'name': 'Horror'}, {'id': 9648, 'n...",,12223,tt0073747,en,The Stepford Wives,...,1975-02-12,0.0,117.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Something strange is happening in the town of ...,The Stepford Wives,False,6.7,79.0
2,37093,False,,0,"[{'id': 27, 'name': 'Horror'}]",,77257,tt0071478,es,Exorcismo,...,1975-03-10,0.0,90.0,"[{'iso_639_1': 'es', 'name': 'Español'}]",Released,A theme that has thrilled audiences all over t...,Exorcismo,False,7.5,2.0
3,5792,False,,0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 18, '...",,73116,tt0083693,en,Brimstone & Treacle,...,1982-10-01,0.0,87.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Brimstone & Treacle,False,6.8,8.0
4,35119,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,17223,tt0867205,en,Two Tigers,...,2007-02-14,0.0,0.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,,Two Tigers,False,3.5,2.0


In [9]:
metadata_sampled.to_csv('data/metadata_sampled.csv', index=False)

In [10]:
overview = metadata_sampled['overview']

Calculate the `TF-IDF` of `overfiew`

In [11]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf

TfidfVectorizer(stop_words='english')

In [12]:
overview.isna().sum()

406

In [13]:
overview.fillna('', inplace=True)

In [14]:
%%time

tfidf_matrix = tfidf.fit_transform(overview)
tfidf_matrix.shape

Wall time: 1.02 s


(20000, 50236)

Example of some word in `tfidf`

In [15]:
tfidf.get_feature_names()[1000:1020]

['activates',
 'active',
 'actively',
 'activism',
 'activist',
 'activists',
 'activities',
 'activity',
 'activités',
 'actor',
 'actors',
 'actress',
 'actresses',
 'acts',
 'actt',
 'actual',
 'actuality',
 'actualization',
 'actually',
 'acumen']

**cosine similarity** calculate a numeric that denotes the similarity between two movies.

$\mathbf{\cos}(\mathbf{x}, \mathbf{y}) = \dfrac{\mathbf{x} \cdot \mathbf{y}^\intercal}{\left\Vert \mathbf{x} \right\Vert \cdot \left\Vert \mathbf{y} \right\Vert} = \dfrac{\sum_{i=1}^{n} \mathbf{x}_i \cdot \mathbf{y}^\intercal
}{\sqrt{\sum_{i=1}^{n} (\mathbf{x}_i)^2}\sqrt{\sum_{i=1}^{n} (\mathbf{y}_i)^2}}$

### Calculate Cosine similarity using Overview

In [16]:
%%time

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim.shape

Wall time: 2.97 s


(20000, 20000)

In [17]:
cosine_sim[1]

array([0.        , 1.        , 0.        , ..., 0.        , 0.        ,
       0.02206402])

Created a dataframe to map index with title of a movie

In [18]:
indices = pd.Series(metadata_sampled.index, index=metadata_sampled['title'])
indices

title
The Golden Coach           0
The Stepford Wives         1
Exorcismo                  2
Brimstone & Treacle        3
Two Tigers                 4
                       ...  
Fire                   19995
Voyager                19996
Adam                   19997
Vabank II              19998
Tinpis Run             19999
Length: 20000, dtype: int64

In [19]:
indices.to_csv('data/movies_sample_title.csv')

In [20]:
def get_recommendation(title, cosine_sim=cosine_sim):
    
    try:
        idx = indices[title]
    except:
        print(f"No movies has title '{title}' in database!")
        return
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # sort decending
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # remove the first one, the first one is that title
    # get top 10
    sim_scores = sim_scores[1:11]
    
    movie_indices = [i[0] for i in sim_scores]
    
    return metadata_sampled['title'].loc[movie_indices]

In [21]:
get_recommendation('Voyager')

12497       Intimate Strangers
9757                     Oasis
1612     Brave Hearts: Umizaru
3971             Dead Men Tell
6564                Death Ship
12819              Road to Rio
1305           Going Overboard
18643               Ghost Ship
6484        The Legend of 1900
373                   Area 407
Name: title, dtype: object

### Another way to calculate Cosine Similarity: using keywords, director, cast, genres

In [22]:
credits = pd.read_csv('data/credits_sampled.csv')
keywords = pd.read_csv('data/keywords_sampled.csv')

In [23]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20001 entries, 0 to 20000
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    20001 non-null  object
 1   crew    20001 non-null  object
 2   id      20001 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 468.9+ KB


In [24]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19997 entries, 0 to 19996
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        19997 non-null  int64 
 1   keywords  19997 non-null  object
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [25]:
credits['id'] = credits['id'].astype('int')
keywords['id'] = keywords['id'].astype('int')

In [26]:
metadata_sampled = metadata_sampled.merge(credits, on='id')
metadata_sampled = metadata_sampled.merge(keywords, on='id')

Parse from string to Python object

In [27]:
%%time

features = ['cast', 'crew', 'keywords', 'genres']

for feature in features:
    metadata_sampled[feature] = metadata_sampled[feature].apply(literal_eval)

Wall time: 14 s


In [28]:
metadata_sampled['cast'][0]

[{'cast_id': 2,
  'character': 'Camilla',
  'credit_id': '52fe48f2c3a36847f8181b15',
  'gender': 1,
  'id': 4421,
  'name': 'Anna Magnani',
  'order': 0,
  'profile_path': '/ygpE4VtT0TtrF2mfceJBT8u2Nhh.jpg'},
 {'cast_id': 3,
  'character': 'Don Antonio',
  'credit_id': '52fe48f2c3a36847f8181b19',
  'gender': 0,
  'id': 85861,
  'name': 'Odoardo Spadaro',
  'order': 1,
  'profile_path': None},
 {'cast_id': 4,
  'character': 'Isabella',
  'credit_id': '52fe48f2c3a36847f8181b1d',
  'gender': 0,
  'id': 1112933,
  'name': 'Nada Fiorelli',
  'order': 2,
  'profile_path': None},
 {'cast_id': 5,
  'character': 'Arlequin',
  'credit_id': '52fe48f2c3a36847f8181b21',
  'gender': 0,
  'id': 132838,
  'name': 'Dante',
  'order': 3,
  'profile_path': '/mRBsRkw27vCVl9Q3owBnPn3mieH.jpg'},
 {'cast_id': 7,
  'character': 'Ferdinand, Le Viceroy',
  'credit_id': '552d7316c3a368618e005a01',
  'gender': 2,
  'id': 29659,
  'name': 'Duncan Lamont',
  'order': 4,
  'profile_path': '/xIGr7HDfHlZoIB4K0amQO3uim

In [29]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [30]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        return names[:3]
    return []

In [31]:
metadata_sampled['director'] = metadata_sampled['crew'].apply(get_director)

In [32]:
features = ['cast', 'keywords', 'genres']

for feature in features:
    metadata_sampled[feature] = metadata_sampled[feature].apply(get_list)

In [33]:
metadata_sampled[['title', 'cast', 'director', 'keywords', 'genres']].head(5)

Unnamed: 0,title,cast,director,keywords,genres
0,The Golden Coach,"[Anna Magnani, Odoardo Spadaro, Nada Fiorelli]",Jean Renoir,"[theater, latin america, acting]","[History, Comedy, Drama]"
1,The Stepford Wives,"[Katharine Ross, Paula Prentiss, Nanette Newman]",Bryan Forbes,"[suspense, independent film, cult film]","[Horror, Mystery, Science Fiction]"
2,Exorcismo,"[Paul Naschy, Maria Perschy, María Kosty]",Juan Bosch,"[female nudity, sex, nudity]",[Horror]
3,Brimstone & Treacle,"[Sting, Denholm Elliott, Joan Plowright]",Richard Loncraine,[gothic],"[Mystery, Drama, Horror]"
4,Two Tigers,"[Selena Khoo, Jay Natelle, Andrea Osvárt]",Sandro Cecca,[],"[Action, Adventure, Foreign]"


In [34]:
def clean_data(x):
    if isinstance(x, list):
        return [i.replace(' ', '').lower() for i in x]
    if isinstance(x, str):
        return x.replace(' ', '').lower()
    return ''

In [35]:
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata_sampled[feature] = metadata_sampled[feature].apply(clean_data)

In [36]:
def create_soup(x):
    features = ['keywords', 'cast', 'genres']
    soup = x['director']
    for feature in features:
        soup += ' '.join(x[feature]) + ' '
    return soup

In [37]:
metadata_sampled['soup'] = metadata_sampled.apply(create_soup, axis=1)

In [38]:
metadata_sampled.head()

Unnamed: 0,old_index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,tagline,title,video,vote_average,vote_count,cast,crew,keywords,director,soup
0,7654,False,,0,"[history, comedy, drama]",,56167,tt0044487,it,Le Carrosse d'or,...,,The Golden Coach,False,7.3,13.0,"[annamagnani, odoardospadaro, nadafiorelli]","[{'credit_id': '58a9b825c3a3680bc20094e5', 'de...","[theater, latinamerica, acting]",jeanrenoir,jeanrenoirtheater latinamerica acting annamagn...
1,2233,False,"{'id': 374384, 'name': 'Stepford Collection', ...",0,"[horror, mystery, sciencefiction]",,12223,tt0073747,en,The Stepford Wives,...,Something strange is happening in the town of ...,The Stepford Wives,False,6.7,79.0,"[katharineross, paulaprentiss, nanettenewman]","[{'credit_id': '52fe44cd9251416c7504166d', 'de...","[suspense, independentfilm, cultfilm]",bryanforbes,bryanforbessuspense independentfilm cultfilm k...
2,37093,False,,0,[horror],,77257,tt0071478,es,Exorcismo,...,A theme that has thrilled audiences all over t...,Exorcismo,False,7.5,2.0,"[paulnaschy, mariaperschy, maríakosty]","[{'credit_id': '52fe4964c3a368484e129173', 'de...","[femalenudity, sex, nudity]",juanbosch,juanboschfemalenudity sex nudity paulnaschy ma...
3,5792,False,,0,"[mystery, drama, horror]",,73116,tt0083693,en,Brimstone & Treacle,...,,Brimstone & Treacle,False,6.8,8.0,"[sting, denholmelliott, joanplowright]","[{'credit_id': '52fe488dc3a368484e0fe971', 'de...",[gothic],richardloncraine,richardloncrainegothic sting denholmelliott jo...
4,35119,False,,0,"[action, adventure, foreign]",,17223,tt0867205,en,Two Tigers,...,,Two Tigers,False,3.5,2.0,"[selenakhoo, jaynatelle, andreaosvárt]","[{'credit_id': '52fe47119251416c7508cc03', 'de...",[],sandrocecca,sandrocecca selenakhoo jaynatelle andreaosvárt...


In [39]:
count_vec = CountVectorizer(stop_words='english')
count_vec

CountVectorizer(stop_words='english')

In [40]:
count_matrix = count_vec.fit_transform(metadata_sampled['soup'])

In [41]:
count_matrix.shape

(20003, 51485)

In [42]:
%%time

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
cosine_sim2.shape

Wall time: 10.7 s


(20003, 20003)

In [43]:
metadata_sampled.reset_index(drop=True, inplace=True)
indices = pd.Series(metadata_sampled.index, index=metadata_sampled['title'])

In [44]:
indices

title
The Golden Coach           0
The Stepford Wives         1
Exorcismo                  2
Brimstone & Treacle        3
Two Tigers                 4
                       ...  
Fire                   19998
Voyager                19999
Adam                   20000
Vabank II              20001
Tinpis Run             20002
Length: 20003, dtype: int64

In [45]:
get_recommendation('Brimstone & Treacle', cosine_sim2)

1609               Raising Jeffrey Dahmer
4485                        The Unseeable
9173                 404: Error Not Found
14908                             Absurda
1460                            Ghost Son
2687                      Robin Redbreast
3845                     The Silent House
5709                         The Unwanted
11532       The Hound of the Baskervilles
14948    The Mystery of the Marie Celeste
Name: title, dtype: object

## Conclusion

trên đây là 2 cách sử dụng để tạo hệ thống khuyến nghị dựa trên nội dung: sử dụng overview của phim hoặc là sử dụng các tham số khác như tên đạo diễn, từ khóa, diễn viên phim casting, thể loại.

Có thể kết hợp `cosine_sim` và `cosine_sim2` để đưa ra gợi ý chính xác hơn: dựa vào cả nội dung phim và cả các chi tiết tham số khác