In [6]:
import re
import unicodedata
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# ===============================
# 1. Case Folding
# ===============================
def case_folding(teks: str) -> str:
    """Normalize raw text into lowercase, punctuation-free form.

    Steps: Unicode NFC normalization, ``str.casefold``, replacement of every
    character that is neither a word character nor whitespace with a space,
    then collapsing of whitespace runs and trimming of both ends.

    Args:
        teks: The text to normalize. ``None`` and float NaN (which pandas
            produces for empty CSV cells) are treated as missing; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The normalized text, or ``""`` when the input is missing.
    """
    if teks is None:
        return ""
    if not isinstance(teks, str):
        # Missing CSV cells arrive from pandas as float NaN, not None, and
        # would make unicodedata.normalize raise TypeError. NaN is the only
        # float that is not equal to itself.
        if isinstance(teks, float) and teks != teks:
            return ""
        teks = str(teks)
    teks = unicodedata.normalize("NFC", teks)
    teks = teks.casefold()
    teks = re.sub(r"[^\w\s]", " ", teks)  # replace punctuation with a space
    teks = re.sub(r"\s+", " ", teks).strip()  # collapse whitespace
    return teks

# ===============================
# 2. Parsing
# ===============================
def parsing(teks: str):
    """Split text into a list of word tokens.

    A token is a maximal run of word characters (``\\w``) delimited by word
    boundaries; everything else is discarded.
    """
    return [hit.group(0) for hit in re.finditer(r"\b\w+\b", teks)]

# ===============================
# 3. Stemming
# ===============================
# Build the Sastrawi stemmer once at module level; stemming() reuses this
# single instance for every token instead of creating a new one per call.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(tokens):
    """Return a new list with each token passed through ``stemmer.stem``.

    The order and number of tokens are preserved.
    """
    hasil = []
    for kata in tokens:
        hasil.append(stemmer.stem(kata))
    return hasil

# ===============================
# 4. Stopword Removal
# ===============================
# Load Sastrawi's stop word list once and keep it as a set, which is what
# stopword_removal() tests each token against.
stop_factory = StopWordRemoverFactory()
stopword_list = set(stop_factory.get_stop_words())

def stopword_removal(tokens):
    """Return the tokens that are not in ``stopword_list``, in original order."""
    tersisa = []
    for kata in tokens:
        if kata in stopword_list:
            continue  # drop stop words
        tersisa.append(kata)
    return tersisa

# ===============================
# 5. Baca Dataset
# ===============================
file_path = "../dataset_youtube_comment.csv"  # make sure this file exists
df_input = pd.read_csv(file_path)

# ===============================
# 6. Preprocessing
# ===============================
# Each comment goes through: case folding -> parsing -> stemming ->
# stopword removal, and the surviving tokens are joined back with spaces.
hasil_comment = []

for teks in df_input["Comment"]:
    tokens = parsing(case_folding(teks))
    tokens_filtered = stopword_removal(stemming(tokens))
    hasil_comment.append(" ".join(tokens_filtered))

# ===============================
# 7. Replace the Comment column
# ===============================
df_input["Comment"] = hasil_comment

# ===============================
# 8. Save to CSV
# ===============================
output_file = "hasil_preprocessing.csv"
df_input.to_csv(output_file, index=False)
print(f"✅ File '{output_file}' berhasil dibuat!")


✅ File 'hasil_preprocessing.csv' berhasil dibuat!
