In [1]:
! pip install opendatasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import necessary libraries

In [2]:
import opendatasets as od
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# The bare 'seaborn' style name is deprecated in recent Matplotlib releases (this
# cell echoed the deprecation warning); prefer the renamed 'seaborn-v0_8' sheet
# and fall back to the legacy name only when the new one is not available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

  plt.style.use('seaborn')


Download datasets

In [3]:
# Fetch the Kaggle dataset into a folder under the current working directory.
# An existing download is reused unless force=True is passed (see the message printed by this cell).
od.download('https://www.kaggle.com/datasets/prasertk/top-100-rotten-tomatoes-movies-by-genres')

Skipping, found downloaded files in "./top-100-rotten-tomatoes-movies-by-genres" (use force=True to force download)


Create dataframes

In [4]:
# Read the CSV from the folder that od.download() created relative to the working
# directory, rather than from a hard-coded absolute '/content/...' path that only
# exists on Colab.
movies = pd.read_csv('./top-100-rotten-tomatoes-movies-by-genres/top_100_movies_by_genres.csv')

## Univariate Exploratory Data Analysis

In [5]:
# Preview the raw dataframe (the rendered table is truncated to its first and last rows).
movies

Unnamed: 0,Genre,Rank,RatingTomatometer,Title,No. of Reviews
0,Action & Adventure,1.0,96%,Black Panther (2018),525
1,Action & Adventure,2.0,94%,Avengers: Endgame (2019),547
2,Action & Adventure,3.0,97%,Mission: Impossible - Fallout (2018),437
3,Action & Adventure,4.0,97%,Mad Max: Fury Road (2015),434
4,Action & Adventure,5.0,97%,Spider-Man: Into the Spider-Verse (2018),393
...,...,...,...,...,...
1607,Western,82.0,15%,Priest (2011),101
1608,Western,83.0,16%,September Dawn (2007),55
1609,Western,84.0,14%,American Outlaws (2001),103
1610,Western,85.0,12%,Jonah Hex (2010),153


In [6]:
# Summarise column dtypes, non-null counts and memory usage.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1612 entries, 0 to 1611
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Genre              1612 non-null   object 
 1   Rank               1612 non-null   float64
 2   RatingTomatometer  1612 non-null   object 
 3   Title              1612 non-null   object 
 4   No. of Reviews     1612 non-null   int64  
dtypes: float64(1), int64(1), object(3)
memory usage: 63.1+ KB


# Data Preparation

Check the number of unique values in each column

In [7]:
# Count the distinct values in every column.
movies.nunique()

Genre                  17
Rank                  100
RatingTomatometer      70
Title                1007
No. of Reviews        304
dtype: int64

Check missing values

In [8]:
# Count the missing values in every column.
movies.isna().sum()

Genre                0
Rank                 0
RatingTomatometer    0
Title                0
No. of Reviews       0
dtype: int64

Check duplicates

In [9]:
# Count rows that are exact duplicates of an earlier row across all columns.
movies.duplicated().sum()

0

In [10]:
# Compare the number of distinct titles with the number of rows; a gap between
# the two means some titles appear on more than one row.
print('Jumlah Movies: ', len(movies['Title'].unique()))
print('Jumlah Row:', len(movies.index))

Jumlah Movies:  1007
Jumlah Row: 1612


In [11]:
# Collect every row whose title occurs more than once (all occurrences are kept
# so they can be compared), then order them alphabetically by title.
movies_duplicate = movies.loc[movies.duplicated(subset='Title', keep=False)]
fix_movies = movies_duplicate.sort_values(by='Title')
fix_movies

Unnamed: 0,Genre,Rank,RatingTomatometer,Title,No. of Reviews
1086,Mystery & Suspense,87.0,90%,10 Cloverfield Lane (2016),316
1284,Science Fiction & Fantasy,85.0,90%,10 Cloverfield Lane (2016),316
1070,Mystery & Suspense,71.0,98%,101 Dalmatians (1961),52
469,Comedy,70.0,98%,101 Dalmatians (1961),52
391,Classics,92.0,98%,101 Dalmatians (1961),52
...,...,...,...,...,...
142,Animation,43.0,98%,Your Name. (Kimi No Na Wa.) (2017),116
1152,Romance,53.0,98%,Your Name. (Kimi No Na Wa.) (2017),116
18,Action & Adventure,19.0,98%,Zootopia (2016),298
107,Animation,8.0,98%,Zootopia (2016),298


In [12]:
# Keep only the first row for every title so each movie appears once.
# The result is re-bound instead of using inplace=True, so the frame object that
# earlier cells displayed is not mutated and the step stays chainable.
# NOTE(review): in the preview above, rows sharing a title differ in Genre (and
# Rank), so this keeps a single genre per movie — confirm that is intended.
movies = movies.drop_duplicates(subset=['Title'], keep='first')
movies

Unnamed: 0,Genre,Rank,RatingTomatometer,Title,No. of Reviews
0,Action & Adventure,1.0,96%,Black Panther (2018),525
1,Action & Adventure,2.0,94%,Avengers: Endgame (2019),547
2,Action & Adventure,3.0,97%,Mission: Impossible - Fallout (2018),437
3,Action & Adventure,4.0,97%,Mad Max: Fury Road (2015),434
4,Action & Adventure,5.0,97%,Spider-Man: Into the Spider-Verse (2018),393
...,...,...,...,...,...
1607,Western,82.0,15%,Priest (2011),101
1608,Western,83.0,16%,September Dawn (2007),55
1609,Western,84.0,14%,American Outlaws (2001),103
1610,Western,85.0,12%,Jonah Hex (2010),153


Create a new dataframe containing only the columns that will be used

In [13]:
# Narrow the frame to the two columns the recommender needs: title and genre.
movies = movies.loc[:, ['Title', 'Genre']]
movies

Unnamed: 0,Title,Genre
0,Black Panther (2018),Action & Adventure
1,Avengers: Endgame (2019),Action & Adventure
2,Mission: Impossible - Fallout (2018),Action & Adventure
3,Mad Max: Fury Road (2015),Action & Adventure
4,Spider-Man: Into the Spider-Verse (2018),Action & Adventure
...,...,...
1607,Priest (2011),Western
1608,September Dawn (2007),Western
1609,American Outlaws (2001),Western
1610,Jonah Hex (2010),Western


# Model Development
### Model Content Based Filtering

TF-IDF Vectorizer

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer (English stop words removed) and learn the
# vocabulary and idf weights from the genre strings in one chained call.
tf = TfidfVectorizer(stop_words='english').fit(movies['Genre'])

# Array mapping each integer feature index to its feature name
tf.get_feature_names_out()

array(['action', 'adventure', 'animation', 'art', 'arts', 'classics',
       'comedy', 'documentary', 'drama', 'family', 'fantasy', 'fiction',
       'fitness', 'horror', 'house', 'international', 'kids', 'musical',
       'mystery', 'performing', 'romance', 'science', 'special', 'sports',
       'suspense', 'television', 'western'], dtype=object)

In [15]:
# Fit the vectorizer and transform the genre strings into a TF-IDF matrix
tfidf_matrix = tf.fit_transform(movies['Genre']) 
 
# Inspect the shape of the TF-IDF matrix
tfidf_matrix.shape 

(1007, 27)

## Cosine Similarity

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF rows of all movies.
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.]])

In [17]:
# Label both axes of the similarity matrix with the movie titles so that
# similarities can be looked up by name.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=movies['Title'],
    columns=movies['Title'],
)
print('Shape:', cosine_sim_df.shape)

# Show a random 10 x 10 excerpt: sample the columns first, then the rows.
cosine_sim_df.sample(n=10, axis='columns').sample(n=10, axis='index')

Shape: (1007, 1007)


Title,The Nightmare Before Christmas (1993),Man on Wire (2008),Snow White and the Seven Dwarfs (1937),Every Little Step (2009),101 Dalmatians (1961),Shrek 2 (2004),Wild Wild West (1999),Marley (2012),Host (2020),The Wolf Man (1941)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Howards End (1992),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Monsoon Wedding (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
California Typewriter (2017),0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Prophet (Un prophete) (2010),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Crouching Tiger, Hidden Dragon (2001)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Slay the Dragon (2020),0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bus 174 (Ônibus 174) (2003),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ponyo (2009),1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
Dust to Glory (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1961),1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


Function for recommending movies

In [18]:
def MovieRecommendations(movies_title, similarity_data=None,
                         items=None, k=10):
    """Return up to ``k`` movies that are most similar to ``movies_title``.

    Parameters
    ----------
    movies_title : str
        Title to look up; it must be a column label of ``similarity_data``.
    similarity_data : pandas.DataFrame, optional
        Similarity matrix whose columns are movie titles. Row positions are
        mapped back to column labels, so rows must follow the column order.
        When omitted, the notebook-level ``cosine_sim_df`` is used; it is
        looked up at call time so re-running earlier cells is picked up.
    items : pandas.DataFrame, optional
        Frame merged onto the selected titles to attach their metadata.
        When omitted, ``movies[['Title', 'Genre']]`` is used (also resolved
        at call time).
    k : int, default 10
        Maximum number of recommendations; values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame
        At most ``k`` rows, ordered from most to least similar, never
        containing ``movies_title`` itself.

    Raises
    ------
    KeyError
        If ``movies_title`` is not a column of ``similarity_data``.
    """
    if similarity_data is None:
        similarity_data = cosine_sim_df
    if items is None:
        items = movies[['Title', 'Genre']]

    k = max(k, 0)
    scores = similarity_data.loc[:, movies_title].to_numpy()

    # The queried title is usually its own best match and is dropped below, so
    # k + 1 candidates are required. Every one of those positions must be
    # passed to argpartition, because only the requested positions are
    # guaranteed to end up in sorted order. The count is capped at the number
    # of available movies so that a large k does not raise.
    n_candidates = min(k + 1, scores.size)
    if n_candidates > 0:
        order = scores.argpartition(list(range(-n_candidates, 0)))
        # The last n_candidates entries are sorted ascending; reverse them to
        # get the best match first.
        closest = similarity_data.columns[order[::-1][:n_candidates]]
    else:
        closest = similarity_data.columns[:0]

    # Drop the queried title so it is never recommended to itself.
    closest = closest.drop(movies_title, errors='ignore')

    return pd.DataFrame(closest).merge(items).head(k)

In [19]:
# Sanity check: look up the genre recorded for a single movie
find_title = movies.loc[movies['Title'].eq('1917 (2020)')]
find_title

Unnamed: 0,Title,Genre
654,1917 (2020),Drama


In [20]:
# Request recommendations for one title and display the result.
movie_title = '1917 (2020)'
movie_recomend = MovieRecommendations(movies_title=movie_title)
movie_recomend

Unnamed: 0,Title,Genre
0,12 Years a Slave (2013),Drama
1,Pain and Glory (Dolor y gloria) (2019),Drama
2,Nomadland (2021),Drama
3,Sound of Metal (2020),Drama
4,Schindler's List (1993),Drama
5,Never Rarely Sometimes Always (2020),Drama
6,Call Me by Your Name (2018),Drama
7,The Florida Project (2017),Drama
8,"The Godfather, Part II (1974)",Drama
9,Widows (2018),Drama
