**1. Memuat Data**

In [275]:
#mengimport library yang diperlukan
import pandas as pd
import numpy as np

In [276]:
#Memuat dataset dari file CSV ke dalam DataFrame
df = pd.read_csv('/content/movie_sample_dataset.csv')

# **2. Memeriksa data**

In [277]:
#Menampilkan beberapa baris pertama dari dataset untuk memahami struktur data
df.head()

Unnamed: 0,color,director_name,duration,gross,genres,movie_title,title_year,language,country,budget,imdb_score,actors,movie_facebook_likes
0,Color,Martin Scorsese,240,116866727.0,Biography|Comedy|Crime|Drama,The Wolf of Wall Street,2013,English,USA,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000
1,Color,Shane Black,195,408992272.0,Action|Adventure|Sci-Fi,Iron Man 3,2013,English,USA,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000
2,color,Quentin Tarantino,187,54116191.0,Crime|Drama|Mystery|Thriller|Western,The Hateful Eight,2015,English,USA,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000
3,Color,Kenneth Lonergan,186,46495.0,Drama,Margaret,2011,English,usa,14000000.0,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",0
4,Color,Peter Jackson,186,258355354.0,Adventure|Fantasy,The Hobbit: The Desolation of Smaug,2013,English,USA,225000000.0,7.9,"Aidan Turner,Adam Brown,James Nesbitt",83000


In [278]:
#Menampilkan informasi umum tentang dataset yaitu jumlah baris dan kolom
df.shape

(99, 13)

In [279]:
#Menampilkan tipe data tiap kolomnya
df.dtypes

Unnamed: 0,0
color,object
director_name,object
duration,int64
gross,float64
genres,object
movie_title,object
title_year,int64
language,object
country,object
budget,float64


In [280]:
# replace "?" menjadi NaN
df.replace("?", np.nan, inplace = True)

# cek banyaknya missing value pada masing-masing kolom
print(df.isnull().sum())

color                   11
director_name           11
duration                 0
gross                    8
genres                   1
movie_title              0
title_year               0
language                 0
country                  0
budget                   4
imdb_score               0
actors                   0
movie_facebook_likes     0
dtype: int64


**3. Membersihkan Data**

Penanganan baris missing data di kolom "Gross" dan "Budget" berdasarkan row

In [281]:
# simply drop whole row with NaN in "budget" column
df.dropna(subset=["budget"], axis=0, inplace=True)
# simply drop whole row with NaN in "gross" column
df.dropna(subset=["gross"], axis=0, inplace=True)

In [282]:
print(df.head())

    color      director_name  duration        gross  \
0   Color    Martin Scorsese       240  116866727.0   
1   Color        Shane Black       195  408992272.0   
2  color   Quentin Tarantino       187   54116191.0   
3   Color   Kenneth Lonergan       186      46495.0   
4   Color      Peter Jackson       186  258355354.0   

                                 genres                          movie_title  \
0          Biography|Comedy|Crime|Drama              The Wolf of Wall Street   
1               Action|Adventure|Sci-Fi                           Iron Man 3   
2  Crime|Drama|Mystery|Thriller|Western                    The Hateful Eight   
3                                 Drama                             Margaret   
4                     Adventure|Fantasy  The Hobbit: The Desolation of Smaug   

   title_year language country       budget  imdb_score  \
0        2013  English     USA  100000000.0         8.2   
1        2013  English     USA  200000000.0         7.2   
2        20

In [283]:
# cek banyaknya missing value pada masing-masing kolom setelah membersihkan missing value kolom "gross" dan "budget"
print(df.isnull().sum())

color                   10
director_name           10
duration                 0
gross                    0
genres                   1
movie_title              0
title_year               0
language                 0
country                  0
budget                   0
imdb_score               0
actors                   0
movie_facebook_likes     0
dtype: int64


In [284]:
#cek baris dan kolom setelah pembersihan missing value kolom "gross" dan "budget"
df.shape

(89, 13)

In [285]:
#Mengatasi nilai tidak konsisten pada kolom "color"
df["color"] = df["color"].str.strip().str.lower()

#Mengecek nilai kolom "color"
print(df["color"].value_counts())

color
color              78
black and white     1
Name: count, dtype: int64


In [286]:
#Menghapus nilai-nilai yang tidak standar
df = df.dropna()

In [287]:
# cek banyaknya missing value pada masing-masing kolom setelah penghapusan nilai tidak standar
print(df.isnull().sum())

color                   0
director_name           0
duration                0
gross                   0
genres                  0
movie_title             0
title_year              0
language                0
country                 0
budget                  0
imdb_score              0
actors                  0
movie_facebook_likes    0
dtype: int64


**4. Transformasi Data**

In [288]:
#Cek kesesuaian tipe data kolom
df.dtypes

Unnamed: 0,0
color,object
director_name,object
duration,int64
gross,float64
genres,object
movie_title,object
title_year,int64
language,object
country,object
budget,float64


In [289]:
#tipe data sudah sesuai

In [290]:
#Memisahkan genre yang bergabung dalam satu kolom menjadi beberapa kolom terpisah
# buat dummy variable untuk semua genre
df_genres_ohe = df['genres'].str.get_dummies(sep='|')

# gabungkan ke dataframe asli, sehingga selanjutnya akan menggunakan data ini
df_ohe = pd.concat([df, df_genres_ohe], axis=1)

print(df_ohe.head(10))

    color      director_name  duration        gross  \
0   color    Martin Scorsese       240  116866727.0   
1   color        Shane Black       195  408992272.0   
2   color  Quentin Tarantino       187   54116191.0   
3   color   Kenneth Lonergan       186      46495.0   
4   color      Peter Jackson       186  258355354.0   
6   color      Peter Jackson       -50  303001229.0   
8   color        Joss Whedon       173  623279547.0   
9   color        Joss Whedon       173  623279547.0   
11  color               Null       158  102515793.0   
13  color  Christopher Nolan       169  187991439.0   

                                  genres                          movie_title  \
0           Biography|Comedy|Crime|Drama              The Wolf of Wall Street   
1                Action|Adventure|Sci-Fi                           Iron Man 3   
2   Crime|Drama|Mystery|Thriller|Western                    The Hateful Eight   
3                                  Drama                             M

In [291]:
#Normalisasi teks untuk memastikan konsistensi, yaitu mengubah semua teks menjadi huruf kecil
for col in df_ohe.select_dtypes(include='object').columns:
    df_ohe[col] = df_ohe[col].str.strip().str.lower()
    df_ohe.columns = df_ohe.columns.str.strip().str.lower()

print(df_ohe.head(10))

    color      director_name  duration        gross  \
0   color    martin scorsese       240  116866727.0   
1   color        shane black       195  408992272.0   
2   color  quentin tarantino       187   54116191.0   
3   color   kenneth lonergan       186      46495.0   
4   color      peter jackson       186  258355354.0   
6   color      peter jackson       -50  303001229.0   
8   color        joss whedon       173  623279547.0   
9   color        joss whedon       173  623279547.0   
11  color               null       158  102515793.0   
13  color  christopher nolan       169  187991439.0   

                                  genres                          movie_title  \
0           biography|comedy|crime|drama              the wolf of wall street   
1                action|adventure|sci-fi                           iron man 3   
2   crime|drama|mystery|thriller|western                    the hateful eight   
3                                  drama                             m

In [292]:
# cek banyaknya missing value pada masing-masing kolom, setelah semua proses
print(df.isnull().sum())

color                   0
director_name           0
duration                0
gross                   0
genres                  0
movie_title             0
title_year              0
language                0
country                 0
budget                  0
imdb_score              0
actors                  0
movie_facebook_likes    0
dtype: int64


**5. Penyimpanan Data**

In [293]:
#Menyimpan data yang telah diproses ke dalam file CSV baru
df_ohe.to_csv('movie_dataset_cleaned.csv')