In [None]:
import pandas as pd
import re, string
from bs4 import BeautifulSoup
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from spellchecker import SpellChecker
from collections import Counter
from tqdm import tqdm

# Setup tqdm untuk apply pandas
tqdm.pandas()

# Kamus slang
contractions_dict = {
    "gak": "tidak", "ga": "tidak", "nggak": "tidak", "enggak": "tidak", "ngga": "tidak", "gk": "tidak",
    "gue": "saya", "gw": "saya", "gua": "saya", "lu": "kamu", "loe": "kamu",
    "dah": "sudah", "udah": "sudah", "aja": "saja", "yg": "yang", "utk": "untuk",
    "tp": "tetapi", "tapi": "tetapi", "bgt": "sekali", "lg": "lagi"
}

stop_words = set([
    "yang","di","ke","dan","dari","ini","itu","pada","untuk",
    "dengan","sebagai","adalah","merupakan","dalam","yaitu",
    "suatu","sebuah","akan","telah","bisa","agar","oleh",
    "bahwa","juga","atau","tidak","namun","tetapi","kemudian"
])

# Stemmer dan SpellChecker
factory = StemmerFactory()
stemmer = factory.create_stemmer()
spell = SpellChecker()

# --- Fungsi Preprocessing ---
def clean_base_text(text):
    if not isinstance(text, str): return ''
    text = text.lower()
    words = text.split()
    words = [contractions_dict.get(w, w) for w in words]
    text = ' '.join(words)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    text = BeautifulSoup(text, "html.parser").get_text()
    return text

def tokenize(text): return text.split()
def remove_stopwords(tokens): return [w for w in tokens if w not in stop_words]
def stemming(tokens): return stemmer.stem(' '.join(tokens)).split()
def correct(tokens): return [spell.correction(w) or w for w in tokens]

# --- Main Preprocessing ---
def preprocess():
    print("ðŸ“‚ Membaca file CSV...")
    df = pd.read_csv("pta_manajemen_raw.csv")
    print(f"âœ… Dataset terbaca, jumlah data: {len(df)} baris\n")

    print("ðŸ”„ Tahap 1: Cleaning...")
    df["abstrak_indonesia_clean"] = df["abstrak_indonesia"].progress_apply(clean_base_text)

    print("ðŸ”„ Tahap 2: Tokenizing...")
    df["abstrak_indonesia_tokens"] = df["abstrak_indonesia_clean"].progress_apply(tokenize)

    print("ðŸ”„ Tahap 3: Remove Stopwords...")
    df["abstrak_indonesia_stopwords"] = df["abstrak_indonesia_tokens"].progress_apply(remove_stopwords)

    print("ðŸ”„ Tahap 4: Stemming...")
    df["abstrak_indonesia_stemmed"] = df["abstrak_indonesia_stopwords"].progress_apply(stemming)

    print("ðŸ”„ Tahap 5: Spell Correction...")
    df["abstrak_indonesia_corrected"] = df["abstrak_indonesia_stemmed"].progress_apply(correct)

    print("ðŸ”„ Tahap 6: Hitung Frekuensi Kata...")
    df["frekuensi_kata_indonesia"] = df["abstrak_indonesia_stemmed"].progress_apply(lambda x: dict(Counter(x)))

    print("\nðŸ’¾ Menyimpan hasil preprocessing ke pta_manajemen_preprocessed.csv...")
    df.to_csv("pta_manajemen_preprocessed.csv", index=False)
    print("âœ… Preprocessing selesai.")

if __name__ == "__main__":
    preprocess()

In [1]:
import pandas as pd
import re, string
from bs4 import BeautifulSoup
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from spellchecker import SpellChecker
from collections import Counter
from tqdm import tqdm

# Setup tqdm untuk apply pandas
tqdm.pandas()

# Kamus slang
contractions_dict = {
    "gak": "tidak", "ga": "tidak", "nggak": "tidak", "enggak": "tidak", "ngga": "tidak", "gk": "tidak",
    "gue": "saya", "gw": "saya", "gua": "saya", "lu": "kamu", "loe": "kamu",
    "dah": "sudah", "udah": "sudah", "aja": "saja", "yg": "yang", "utk": "untuk",
    "tp": "tetapi", "tapi": "tetapi", "bgt": "sekali", "lg": "lagi"
}

stop_words = set([
    "yang","di","ke","dan","dari","ini","itu","pada","untuk",
    "dengan","sebagai","adalah","merupakan","dalam","yaitu",
    "suatu","sebuah","akan","telah","bisa","agar","oleh",
    "bahwa","juga","atau","tidak","namun","tetapi","kemudian"
])

# Stemmer dan SpellChecker
factory = StemmerFactory()
stemmer = factory.create_stemmer()
spell = SpellChecker()

# --- Fungsi Preprocessing ---
def clean_base_text(text):
    if not isinstance(text, str): return ''
    text = text.lower()
    words = text.split()
    words = [contractions_dict.get(w, w) for w in words]
    text = ' '.join(words)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    text = BeautifulSoup(text, "html.parser").get_text()
    return text

def tokenize(text): return text.split()
def remove_stopwords(tokens): return [w for w in tokens if w not in stop_words]
def stemming(tokens): return stemmer.stem(' '.join(tokens)).split()
def correct(tokens): return [spell.correction(w) or w for w in tokens]

# --- Main Preprocessing ---
def preprocess():
    print("ðŸ“‚ Membaca file CSV...")
    df = pd.read_csv("pta_manajemen_raw.csv")
    print(f"âœ… Dataset terbaca, jumlah data: {len(df)} baris\n")

    print("ðŸ”„ Tahap 1: Cleaning...")
    df["abstrak_indonesia_clean"] = df["abstrak_indonesia"].progress_apply(clean_base_text)

    print("ðŸ”„ Tahap 2: Tokenizing...")
    df["abstrak_indonesia_tokens"] = df["abstrak_indonesia_clean"].progress_apply(tokenize)

    print("ðŸ”„ Tahap 3: Remove Stopwords...")
    df["abstrak_indonesia_stopwords"] = df["abstrak_indonesia_tokens"].progress_apply(remove_stopwords)

    print("ðŸ”„ Tahap 4: Stemming...")
    df["abstrak_indonesia_stemmed"] = df["abstrak_indonesia_stopwords"].progress_apply(stemming)

    print("ðŸ”„ Tahap 5: Spell Correction...")
    df["abstrak_indonesia_corrected"] = df["abstrak_indonesia_stemmed"].progress_apply(correct)

    print("ðŸ”„ Tahap 6: Hitung Frekuensi Kata...")
    df["frekuensi_kata_indonesia"] = df["abstrak_indonesia_stemmed"].progress_apply(lambda x: dict(Counter(x)))

    print("\nðŸ’¾ Menyimpan hasil preprocessing ke pta_manajemen_preprocessed.csv...")
    df.to_csv("pta_manajemen_preprocessed.csv", index=False)
    print("âœ… Preprocessing selesai.")

if __name__ == "__main__":
    preprocess()

ðŸ“‚ Membaca file CSV...
âœ… Dataset terbaca, jumlah data: 1031 baris

ðŸ”„ Tahap 1: Cleaning...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1031/1031 [00:00<00:00, 3436.48it/s]


ðŸ”„ Tahap 2: Tokenizing...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1031/1031 [00:00<00:00, 41933.28it/s]


ðŸ”„ Tahap 3: Remove Stopwords...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1031/1031 [00:00<00:00, 45354.05it/s]


ðŸ”„ Tahap 4: Stemming...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1031/1031 [09:35<00:00,  1.79it/s]


ðŸ”„ Tahap 5: Spell Correction...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1031/1031 [9:05:44<00:00, 31.76s/it]  


ðŸ”„ Tahap 6: Hitung Frekuensi Kata...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1031/1031 [00:00<00:00, 53527.52it/s]



ðŸ’¾ Menyimpan hasil preprocessing ke pta_manajemen_preprocessed.csv...
âœ… Preprocessing selesai.
