<a href="https://colab.research.google.com/github/yumnazakkiya/TMDB_PROJECT/blob/main/Preprocessing/pre_processing_TMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bagian 1: Setup dan Upload CSV
import os
from google.colab import files

# Target path for the dataset
file_path = '/content/tmdb_movies_with_country_final.csv'

# Ask for an upload only when the target file does not exist yet, so that
# re-running this cell does not prompt for the file again.
if not os.path.exists(file_path):
    uploaded = files.upload()
    if not uploaded:
        # Nothing came back from the upload (e.g. the dialog was dismissed):
        # stop here with a clear message instead of failing later when the
        # CSV is read.
        raise FileNotFoundError(
            f'No file was uploaded; expected a CSV to store at {file_path}'
        )
    # Every uploaded file would be renamed to the same target path, so with
    # several uploads each rename would overwrite the previous one. Only the
    # first uploaded file is used; the others are left where they were saved.
    filename, *ignored = uploaded.keys()
    if ignored:
        print(f'Multiple files uploaded; using {filename!r}, ignoring {ignored}')
    os.rename(filename, file_path)
    print(f'File saved as: {file_path}')
else:
    print("File sudah ada, lanjut ke pemrosesan.")

Saving tmdb_movies_with_country_final.csv to tmdb_movies_with_country_final.csv
File saved as: /content/tmdb_movies_with_country_final.csv


In [None]:
# Bagian 2: Import Library
import pandas as pd
import re
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [None]:
# Part 3: Download the NLTK stopword corpus and load the dataset

# Download stopwords
nltk.download('stopwords')

# Load CSV
df = pd.read_csv(file_path)

# Show the initial structure of the frame. DataFrame.info() prints its report
# itself and returns None, so wrapping it in print() adds a stray "None" line.
df.info()
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10500 entries, 0 to 10499
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            10500 non-null  int64  
 1   title         10500 non-null  object 
 2   overview      10469 non-null  object 
 3   popularity    10500 non-null  float64
 4   vote_average  10500 non-null  float64
 5   vote_count    10500 non-null  int64  
 6   release_date  10485 non-null  object 
 7   country       10500 non-null  object 
 8   label         10500 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 738.4+ KB
None


Unnamed: 0,id,title,overview,popularity,vote_average,vote_count,release_date,country,label
0,1233413,Sinners,"Trying to leave their troubled lives behind, t...",499.7936,7.5,1163,2025-04-16,United States of America,horror
1,574475,Final Destination Bloodlines,"Plagued by a violent recurring nightmare, coll...",265.4529,7.0,585,2025-05-14,United States of America,horror
2,1284120,The Ugly Stepsister,In a fairy-tale kingdom where beauty is a brut...,192.4116,7.1,128,2025-03-07,Denmark,horror
3,568770,The Containment,A girl is possessed by a dark and mysterious e...,113.7875,7.778,9,2025-06-05,Mexico,horror
4,1232546,Until Dawn,One year after her sister Melanie mysteriously...,128.4101,6.509,585,2025-04-23,United States of America,horror


In [None]:
# Part 4: Missing-value handling
missing_before = df.isna().sum()
print("Missing values sebelum ditangani:\n", missing_before)

# A missing 'overview' becomes an empty string, so those rows are kept
df['overview'] = df['overview'].fillna('')

# Rows that still have a missing value in any other column are dropped
df.dropna(axis=0, how='any', inplace=True)

missing_after = df.isna().sum()
print("Missing values setelah ditangani:\n", missing_after)

Missing values sebelum ditangani:
 id               0
title            0
overview        31
popularity       0
vote_average     0
vote_count       0
release_date    15
country          0
label            0
dtype: int64
Missing values setelah ditangani:
 id              0
title           0
overview        0
popularity      0
vote_average    0
vote_count      0
release_date    0
country         0
label           0
dtype: int64


In [None]:
# Bagian 5: Pembersihan & Normalisasi Teks
def clean_text(text):
    """Normalise a piece of text.

    Steps, in order: lowercase, delete digit runs, delete the ASCII
    punctuation characters in ``string.punctuation``, collapse whitespace
    runs into a single space and trim both ends.

    Punctuation is deleted rather than replaced by a space, so the two halves
    of a hyphenated word are merged ("fairy-tale" -> "fairytale").

    Args:
        text: The raw string. ``None`` and float NaN (how pandas represents a
            missing value in an object column) are treated as empty text.

    Returns:
        The cleaned string; ``''`` when the input is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = text.lower()  # lowercase
    text = re.sub(r'\d+', '', text)  # remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # collapse extra whitespace
    return text

# Run the cleaner over every overview, then show raw and cleaned text side by side
df['cleaned_overview'] = df['overview'].map(clean_text)
df[['overview', 'cleaned_overview']].head()

Unnamed: 0,overview,cleaned_overview
0,"Trying to leave their troubled lives behind, t...",trying to leave their troubled lives behind tw...
1,"Plagued by a violent recurring nightmare, coll...",plagued by a violent recurring nightmare colle...
2,In a fairy-tale kingdom where beauty is a brut...,in a fairytale kingdom where beauty is a bruta...
3,A girl is possessed by a dark and mysterious e...,a girl is possessed by a dark and mysterious e...
4,One year after her sister Melanie mysteriously...,one year after her sister melanie mysteriously...


In [None]:
# Part 6: Stopword removal
# English stopwords from NLTK, held in a set for membership tests
stop_words = {*stopwords.words('english')}

def remove_stopwords(text, stopword_set=None):
    """Drop stopwords from whitespace-separated text.

    Words are compared exactly as they appear (case-sensitive, no
    normalisation), so the text is expected to be lowercased already if the
    stopword collection is lowercase.

    Args:
        text: String whose whitespace-separated words are filtered.
        stopword_set: Optional collection of words to drop. When omitted, the
            notebook-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces.
    """
    if stopword_set is None:
        # Resolved at call time so the default follows the notebook's set.
        stopword_set = stop_words
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Filter stopwords out of every cleaned overview, then compare before and after
df['no_stopwords'] = df['cleaned_overview'].map(remove_stopwords)
df[['cleaned_overview', 'no_stopwords']].head()

Unnamed: 0,cleaned_overview,no_stopwords
0,trying to leave their troubled lives behind tw...,trying leave troubled lives behind twin brothe...
1,plagued by a violent recurring nightmare colle...,plagued violent recurring nightmare college st...
2,in a fairytale kingdom where beauty is a bruta...,fairytale kingdom beauty brutal business elvir...
3,a girl is possessed by a dark and mysterious e...,girl possessed dark mysterious entity fight el...
4,one year after her sister melanie mysteriously...,one year sister melanie mysteriously disappear...


In [None]:
# Part 7: Check for rows that are duplicated across all columns
is_duplicate = df.duplicated()
duplicates_all = df.loc[is_duplicate]
print(f"Jumlah duplikat (semua kolom): {len(duplicates_all)}")

Jumlah duplikat (semua kolom): 0


In [None]:
# Part 8: Tokenisation
# Split each stopword-free text on whitespace into a list of tokens
df['tokens'] = df['no_stopwords'].map(str.split)
df[['no_stopwords', 'tokens']].head()

Unnamed: 0,no_stopwords,tokens
0,trying leave troubled lives behind twin brothe...,"[trying, leave, troubled, lives, behind, twin,..."
1,plagued violent recurring nightmare college st...,"[plagued, violent, recurring, nightmare, colle..."
2,fairytale kingdom beauty brutal business elvir...,"[fairytale, kingdom, beauty, brutal, business,..."
3,girl possessed dark mysterious entity fight el...,"[girl, possessed, dark, mysterious, entity, fi..."
4,one year sister melanie mysteriously disappear...,"[one, year, sister, melanie, mysteriously, dis..."


In [None]:
# Part 9: Join tokens back into text

# Final overview column: each token list re-joined with single spaces
df['final_overview'] = df['tokens'].map(' '.join)

# Inspect every processing stage for the first (at most) three rows
preview = df.head(3)
stages = zip(
    preview['overview'],
    preview['cleaned_overview'],
    preview['tokens'],
    preview['final_overview'],
)
for i, (original, cleaned, tokens, final) in enumerate(stages):
    print(f"Judul {i+1}:")
    print("Original Overview:\n", original)
    print("Cleaned Overview:\n", cleaned)
    print("Tokens:\n", tokens)
    print("Final Overview:\n", final)
    print("-" * 50)

Judul 1:
Original Overview:
 Trying to leave their troubled lives behind, twin brothers return to their hometown to start again, only to discover that an even greater evil is waiting to welcome them back.
Cleaned Overview:
 trying to leave their troubled lives behind twin brothers return to their hometown to start again only to discover that an even greater evil is waiting to welcome them back
Tokens:
 ['trying', 'leave', 'troubled', 'lives', 'behind', 'twin', 'brothers', 'return', 'hometown', 'start', 'discover', 'even', 'greater', 'evil', 'waiting', 'welcome', 'back']
Final Overview:
 trying leave troubled lives behind twin brothers return hometown start discover even greater evil waiting welcome back
--------------------------------------------------
Judul 2:
Original Overview:
 Plagued by a violent recurring nightmare, college student Stefanie heads home to track down the one person who might be able to break the cycle and save her family from the grisly demise that inevitably awai

In [None]:
# Part 10: TF-IDF vectorisation
# Vocabulary is capped at 5000 features via max_features
corpus = df['no_stopwords']
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(corpus)

feature_names = tfidf.get_feature_names_out()
print("Shape matrix TF-IDF:", tfidf_matrix.shape)
print("Contoh fitur:", feature_names[:10])

Shape matrix TF-IDF: (10485, 5000)
Contoh fitur: ['aaron' 'abandoned' 'abducted' 'abigail' 'abilities' 'ability' 'able'
 'aboard' 'abroad' 'absence']


In [None]:
# Part 11: Save to CSV
# The preprocessed frame, written without its index column
df.to_csv("preprocessed_tmdb.csv", index=False)

# The TF-IDF matrix, converted to a dense array with one column per feature name
pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
).to_csv("tfidf_matrix.csv", index=False)

print("Preprocessing selesai. File disimpan:")
print("- preprocessed_tmdb.csv")
print("- tfidf_matrix.csv")

Preprocessing selesai. File disimpan:
- preprocessed_tmdb.csv
- tfidf_matrix.csv
