In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Data Understanding

In [10]:
# Load the Netflix titles dataset from the working directory and preview the first rows
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [11]:
# Print a per-column summary of df: non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


## Univariate Data Analysis

In [12]:
# Number of distinct values per column of interest.
# dropna=False keeps NaN as one distinct value, so columns with missing
# entries (e.g. director, country, rating) report one more than their real count.
print('Jumlah Film: ', df['title'].nunique(dropna=False))
print('Jumlah Sutradara: ', df['director'].nunique(dropna=False))
print('Jumlah Asal Negara: ', df['country'].nunique(dropna=False))
print('Jumlah Rate Film: ', df['rating'].nunique(dropna=False))
print('Jumlah Kategori: ', df['listed_in'].nunique(dropna=False))

Jumlah Film:  8807
Jumlah Sutradara:  4529
Jumlah Asal Negara:  749
Jumlah Rate Film:  18
Jumlah Kategori:  514


In [42]:
# Summary statistics (count, unique, top, freq) of the genre column
df['listed_in'].describe()

count                             8807
unique                             514
top       Dramas, International Movies
freq                               362
Name: listed_in, dtype: object

## Data Preparation

In [14]:
# Count the missing values in every column
df.isna().sum()


show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [16]:
# Number of distinct show_id values (NaN, if any, counted as one value)
df['show_id'].nunique(dropna=False)

8807

In [None]:
# List every distinct genre combination found in listed_in
df['listed_in'].unique()

In [22]:
# Show the titles whose genre string is exactly 'Children & Family Movies, Documentaries'
df.loc[df['listed_in'].eq('Children & Family Movies, Documentaries')]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1636,s1637,Movie,Dance Dreams: Hot Chocolate Nutcracker,Oliver Bokelberg,Debbie Allen,United States,"November 27, 2020",2020,TV-PG,81 min,"Children & Family Movies, Documentaries",This documentary spotlights Debbie Allen's car...
2431,s2432,Movie,Spelling the Dream,Sam Rega,,United States,"June 3, 2020",2020,TV-G,83 min,"Children & Family Movies, Documentaries","Following four hopeful competitors, this docum..."
4999,s5000,Movie,Expedition China,Ben Wallis,Maggie Q,,"March 4, 2018",2017,TV-G,78 min,"Children & Family Movies, Documentaries",Cinematographers visit the remote forests and ...
5273,s5274,Movie,Ghost of the Mountains,Ben Wallis,Antoine Fuqua,United States,"September 13, 2017",2017,G,78 min,"Children & Family Movies, Documentaries",An international group of filmmakers sets out ...
5496,s5497,Movie,The Mars Generation,Michael Barnett,"Neil deGrasse Tyson, Bill Nye, Michio Kaku",United States,"May 5, 2017",2017,TV-PG,98 min,"Children & Family Movies, Documentaries","Self-professed teenage ""space nerds"" at Space ..."
5596,s5597,Movie,Growing Up Wild,Keith Scholey,Daveed Diggs,United States,"February 19, 2017",2016,G,78 min,"Children & Family Movies, Documentaries",Five baby animals from different parts of the ...
7045,s7046,Movie,I Dream Of Dance,Maria Demeshkina Peek,,,"January 31, 2019",2017,TV-14,106 min,"Children & Family Movies, Documentaries",Experience the joy and sacrifice of one of the...
7599,s7600,Movie,NOVA: Bird Brain,,Craig Sechler,United States,"July 1, 2019",2017,TV-G,53 min,"Children & Family Movies, Documentaries",Scientists test avian aptitude and challenge c...
7740,s7741,Movie,Pick of the Litter,"Dana Nachman, Don Hardy Jr.",,United States,"August 2, 2020",2018,TV-PG,81 min,"Children & Family Movies, Documentaries",Five Labrador puppies embark on a 20-month tra...


In [23]:
# Show the titles whose genre string is exactly 'TV Action & Adventure, TV Dramas'
df.loc[df['listed_in'].eq('TV Action & Adventure, TV Dramas')]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1717,s1718,TV Show,The Liberator,,"Bradley James, Martin Sensmeier, Jose Miguel V...",United States,"November 11, 2020",2020,TV-MA,1 Season,"TV Action & Adventure, TV Dramas","A diverse, deeply brave crew of ragtag soldier..."
5046,s5047,TV Show,Valor,,"Christina Ochoa, Matt Barr, Corbin Reid, Charl...",United States,"February 6, 2018",2017,TV-14,1 Season,"TV Action & Adventure, TV Dramas","Following an unsuccessful mission in Somalia, ..."
5836,s5837,TV Show,Marco Polo,,"Lorenzo Richelmy, Benedict Wong, Chin Han, Joa...",United States,"July 1, 2016",2016,TV-MA,2 Seasons,"TV Action & Adventure, TV Dramas","Set in a world of greed, betrayal, sexual intr..."
8064,s8065,TV Show,Spartacus,,"Andy Whitfield, Liam McIntyre, Dustin Clare, J...",United States,"February 1, 2015",2013,TV-MA,4 Seasons,"TV Action & Adventure, TV Dramas",A Thracian man is condemned to a brutal death ...


In [25]:
# Create the `preparation` dataframe as an independent copy of the raw data,
# so later preparation steps can never modify `df` through a shared reference.
# No sorting is applied here: rows keep the order they have in the CSV.
preparation = df.copy()
preparation

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [26]:
# Remove rows that repeat a show_id, keeping the first occurrence of each
preparation = preparation.drop_duplicates(subset=['show_id'], keep='first')
preparation

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [27]:
# Convert the 'show_id', 'title' and 'listed_in' series into plain Python lists
movie_id, movie_title, movie_listed_in = (
    preparation[column].tolist()
    for column in ('show_id', 'title', 'listed_in')
)

# Print the three lengths, one per line, to confirm they match
print(len(movie_id), len(movie_title), len(movie_listed_in), sep='\n')

8807
8807
8807


In [29]:
# Assemble a compact dataframe (id, title, listed_in) from the three lists
movie_columns = dict(
    id=movie_id,
    title=movie_title,
    listed_in=movie_listed_in,
)
movie_new = pd.DataFrame(data=movie_columns)
movie_new

Unnamed: 0,id,title,listed_in
0,s1,Dick Johnson Is Dead,Documentaries
1,s2,Blood & Water,"International TV Shows, TV Dramas, TV Mysteries"
2,s3,Ganglands,"Crime TV Shows, International TV Shows, TV Act..."
3,s4,Jailbirds New Orleans,"Docuseries, Reality TV"
4,s5,Kota Factory,"International TV Shows, Romantic TV Shows, TV ..."
...,...,...,...
8802,s8803,Zodiac,"Cult Movies, Dramas, Thrillers"
8803,s8804,Zombie Dumb,"Kids' TV, Korean TV Shows, TV Comedies"
8804,s8805,Zombieland,"Comedies, Horror Movies"
8805,s8806,Zoom,"Children & Family Movies, Comedies"


## **Model Development**

In [30]:
# Use the prepared frame as the modelling input and preview five random rows
# (no random_state is set, so the sampled rows differ between runs)
data = movie_new
data.sample(n=5)

Unnamed: 0,id,title,listed_in
1662,s1663,The Suit,"Action & Adventure, Comedies, International Mo..."
4933,s4934,AMO,"International TV Shows, TV Dramas"
7244,s7245,Kurt & Courtney,"Documentaries, Music & Musicals"
3316,s3317,Bella and the Bulldogs,"Kids' TV, TV Comedies"
5256,s5257,Out of Thin Air,"Documentaries, International Movies"


In [32]:
# Initialize the TfidfVectorizer
tf = TfidfVectorizer()

# Learn the vocabulary and idf weights from the listed_in (genre) text
tf.fit(data['listed_in'])

# Feature names, ordered by their integer feature index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement (requires scikit-learn >= 1.0).
# tolist() keeps the plain Python list display.
tf.get_feature_names_out().tolist()



['action',
 'adventure',
 'anime',
 'british',
 'children',
 'classic',
 'comedies',
 'comedy',
 'crime',
 'cult',
 'documentaries',
 'docuseries',
 'dramas',
 'faith',
 'family',
 'fantasy',
 'features',
 'fi',
 'horror',
 'independent',
 'international',
 'kids',
 'korean',
 'language',
 'lgbtq',
 'movies',
 'music',
 'musicals',
 'mysteries',
 'nature',
 'reality',
 'romantic',
 'sci',
 'science',
 'series',
 'shows',
 'spanish',
 'spirituality',
 'sports',
 'stand',
 'talk',
 'teen',
 'thrillers',
 'tv',
 'up']

In [33]:
# Fit the vectorizer on the genre text and transform it into a TF-IDF matrix in one step
tfidf_matrix = tf.fit_transform(raw_documents=data['listed_in'])

# Inspect the matrix dimensions: (number of titles, number of genre tokens)
tfidf_matrix.shape

(8807, 45)

In [34]:
# Convert the TF-IDF vectors to a dense matrix with todense() so the values can be inspected
tfidf_matrix.todense()

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.71023461,
         0.        ],
        [0.30013058, 0.30013058, 0.        , ..., 0.        , 0.63809531,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [35]:
# Build a dataframe to inspect the TF-IDF matrix:
# columns are the genre tokens, rows are the film titles.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2; toarray() yields a regular ndarray instead of np.matrix.
# A random subset of 22 columns and 10 rows is shown.
pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tf.get_feature_names_out(),
    index=data.title
).sample(22, axis=1).sample(10, axis=0)



Unnamed: 0_level_0,comedies,children,movies,up,adventure,shows,independent,fantasy,features,sports,...,thrillers,musicals,tv,sci,docuseries,language,reality,teen,spirituality,talk
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Book of Eli,0.0,0.0,0.0,0.0,0.363401,0.0,0.0,0.495271,0.0,0.0,...,0.0,0.0,0.0,0.495271,0.0,0.0,0.0,0.0,0.0,0.0
The Debt Collector 2,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
High & Low The Red Rain,0.0,0.0,0.329454,0.0,0.620698,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Seven Souls in the Skull Castle: Season Bird,0.0,0.0,0.306168,0.0,0.576827,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Blurred Lines: Inside the Art World,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Time: The Kalief Browder Story,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
LEGO Ninjago,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.489928,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cabin Fever,0.0,0.0,0.295062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.640972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Silicon Cowboys,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Stargate SG-1,0.0,0.0,0.0,0.0,0.237342,0.0,0.0,0.323468,0.0,0.0,...,0.0,0.0,0.504602,0.323468,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
 
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix
cosine_sim = cosine_similarity(X=tfidf_matrix)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.62052765, ..., 0.        , 0.        ,
        0.11412729],
       [0.        , 0.62052765, 1.        , ..., 0.        , 0.        ,
        0.04447042],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.28783572,
        0.08462917],
       [0.        , 0.        , 0.        , ..., 0.28783572, 1.        ,
        0.0712125 ],
       [0.        , 0.11412729, 0.04447042, ..., 0.08462917, 0.0712125 ,
        1.        ]])

In [37]:
# Wrap the similarity array in a dataframe labelled by film title on both axes
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=data.title,
    columns=data.title,
)
print('Shape:', cosine_sim_df.shape)

# Preview a random 10-row by 5-column slice of the similarity matrix
cosine_sim_df.sample(n=5, axis='columns').sample(n=10, axis='index')

Shape: (8807, 8807)


title,The Roommate,Arjun: The Warrior Prince,The Disaster Artist,Secrets of Underground London,Incarnate
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Humpty Sharma Ki Dulhania,0.0,0.290739,0.327927,0.386071,0.179949
A Man Called God,0.0,0.046636,0.0,0.061928,0.0
Jem and the Holograms,0.0,0.152789,0.15915,0.082652,0.059985
Daybreak,0.0,0.390949,0.193432,0.0,0.306076
Strike a Pose,0.0,0.214905,0.0,0.646042,0.133012
Warrior Nun,0.0,0.296571,0.0,0.0,0.0
High & Low The Movie,0.0,0.92932,0.0,0.283172,0.097209
Verses of Love,0.0,0.437762,0.24499,0.396216,0.184677
Ricardo O'Farrill Abrazo Genial,0.0,0.0,0.0,0.0,0.0
Midnight Sun,0.0,0.304381,0.317053,0.164656,0.1195


In [38]:
def movie_recommendations(title, similarity_data=cosine_sim_df, items=data[['title', 'listed_in']], k=5):
    """
    Recommend the films most similar to a given title.

    Parameters
    ----------
    title : str
        Title of the reference film; must be a column label of
        `similarity_data`.
    similarity_data : pd.DataFrame
        Symmetric similarity matrix with film titles as both index and
        columns.
    items : pd.DataFrame
        Frame holding the film titles together with the feature columns
        displayed next to each recommendation; it is merged on the columns
        it shares with the candidate titles.
    k : int
        Number of recommendations to return (expected to be positive).

    Returns
    -------
    pd.DataFrame
        At most k rows, ordered from most to least similar, with `title`
        itself excluded.

    Raises
    ------
    KeyError
        If `title` is not a column of `similarity_data`.
    """
    # Similarity of every film to the requested title, as a NumPy array
    scores = similarity_data.loc[:, title].to_numpy()

    # k + 1 candidates are needed: the requested title is normally among the
    # most similar entries and is dropped below. Clamp to the number of
    # available films so the partition positions stay within bounds.
    n_candidates = min(k + 1, scores.size)

    # argpartition only guarantees correct placement for the positions passed
    # as kth, so every position that is read afterwards must be listed:
    # -1, -2, ..., -n_candidates (the n_candidates largest similarities).
    order = scores.argpartition(range(-1, -(n_candidates + 1), -1))

    # Walk those positions from the end to get titles in descending similarity
    closest = similarity_data.columns[order[-1:-(n_candidates + 1):-1]]

    # Drop the requested title so it never recommends itself
    closest = closest.drop(title, errors='ignore')

    return pd.DataFrame(closest).merge(items).head(k)

In [40]:
# Look up the row for the title 'Jaguar'
data.loc[data['title'] == 'Jaguar']

Unnamed: 0,id,title,listed_in
19,s20,Jaguar,"International TV Shows, Spanish-Language TV Sh..."


In [41]:
# Get the films most similar to 'Jaguar' (default number of recommendations)
movie_recommendations(title='Jaguar')

Unnamed: 0,title,listed_in
0,The Ministry of Time,"International TV Shows, Spanish-Language TV Sh..."
1,Sky Rojo,"International TV Shows, Spanish-Language TV Sh..."
2,Diablero,"International TV Shows, Spanish-Language TV Sh..."
3,Victim Number 8,"International TV Shows, Spanish-Language TV Sh..."
4,El Chema,"Crime TV Shows, Spanish-Language TV Shows, TV ..."
