In [1]:
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from rapidfuzz import process, fuzz
import nltk
import multiprocessing as mp
from functools import partial
import numpy as np
from tqdm import tqdm
import time

# Fetch the Punkt tokenizer data that word_tokenize relies on.
# Newer NLTK releases (3.8.2 and later) look up the 'punkt_tab' resource
# instead of the pickled 'punkt' models, so request both to keep the notebook
# runnable on a fresh environment regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Input data: the merged tweet dataset (its 'tweet' column is preprocessed
# below) and the slang lookup table (its 'slang' and 'meaning' columns feed
# the slang dictionary).
df = pd.read_csv(filepath_or_buffer='dataset/processed_dataset/final_merged_dataset.csv')
slang_words = pd.read_csv(filepath_or_buffer='dataset/additional/slang_words.csv')

In [3]:
# Slang dictionary: maps each slang token to the text that replaces it.
# Rows with a missing 'slang' or 'meaning' cell are skipped. Without this,
# str() would turn the NaN into the literal text "nan", which would then be
# used as a lookup key or injected into tweets as a replacement word.
slang_dict = {
    str(slang): str(meaning)
    for slang, meaning in slang_words[['slang', 'meaning']]
    .dropna()
    .itertuples(index=False, name=None)
}

# Load the KBBI word list as a plain list of words: one entry per line,
# surrounding whitespace trimmed, blank lines skipped.
with open('dataset/additional/list_kbbi.txt', 'r', encoding='utf-8') as f:
    list_kbbi = [entry for entry in map(str.strip, f) if entry]

In [4]:
# Inisialisasi stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [5]:
def has_repeated_chars(token):
    """Return True if ``token`` has a letter repeated three or more times in a row.

    Only letters count. Runs of punctuation ("...") or digits ("1000") are not
    elongated words, so they must not be flagged for dictionary correction.

    Args:
        token: The string to inspect.

    Returns:
        bool: True when some letter occurs at least three times consecutively.
    """
    # [^\W\d_] is "a word character that is neither a digit nor an underscore",
    # i.e. a letter; \1{2,} then requires two or more further copies of it.
    return bool(re.search(r'([^\W\d_])\1{2,}', token))

# Memo of already-corrected tokens. The fuzzy lookup scans the whole KBBI word
# list, and the same elongated spellings recur across many tweets.
# Assumes list_kbbi is not modified after the first call; re-run this cell to
# reset the cache if it is.
_kbbi_correction_cache = {}


def correct_with_kbbi(token):
    """Correct an elongated token against the KBBI word list.

    Tokens for which ``has_repeated_chars`` is False are returned unchanged.
    Otherwise every run of three or more identical characters is shortened to
    two, and the result is fuzzy-matched (``fuzz.ratio``) against
    ``list_kbbi``.

    Args:
        token: A single token.

    Returns:
        str: The best KBBI match when its score is at least 70; otherwise the
        shortened token (or the original token if nothing was elongated).
    """
    if not has_repeated_chars(token):
        return token

    if token in _kbbi_correction_cache:
        return _kbbi_correction_cache[token]

    # Collapse elongated runs first (keep at most two of the same character).
    collapsed = re.sub(r'(.)\1{2,}', r'\1\1', token)

    # score_cutoff makes extractOne return None when the best score is below
    # 70, and lets RapidFuzz skip candidates that cannot reach the threshold.
    match = process.extractOne(collapsed, list_kbbi, scorer=fuzz.ratio, score_cutoff=70)
    corrected = match[0] if match else collapsed

    _kbbi_correction_cache[token] = corrected
    return corrected

def clean_and_normalize_text(text):
    """Normalise one tweet into a cleaned, stemmed, space-joined string.

    Steps, in order:
      1. lower-case and tokenize;
      2. replace tokens found in ``slang_dict``; pass every other token
         through ``correct_with_kbbi``;
      3. remove digits;
      4. drop every character that is not alphanumeric, whitespace, or one of
         ``!?.,…``;
      5. collapse whitespace;
      6. re-tokenize and stem the word tokens.

    Args:
        text: The tweet as a ``str``.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    text = text.lower()

    # Slang normalisation takes priority; the KBBI correction only acts on
    # tokens with elongated characters.
    normalized_tokens = [
        slang_dict[token] if token in slang_dict else correct_with_kbbi(token)
        for token in word_tokenize(text)
    ]
    text = " ".join(normalized_tokens)

    # Remove digits.
    text = re.sub(r'\d+', '', text)

    # Keep only the punctuation considered meaningful.
    allowed_punct = "!?.,…"
    text = ''.join(ch for ch in text if ch.isalnum() or ch.isspace() or ch in allowed_punct)

    # Collapse repeated whitespace.
    text = re.sub(r'\s+', ' ', text).strip()

    # Re-tokenize and stem. Only tokens containing a letter are stemmed:
    # punctuation-only tokens have nothing to stem, and passing them through
    # untouched guarantees the punctuation kept above survives.
    # NOTE(review): Sastrawi's stemmer appears to strip non-alphanumeric
    # characters while normalising its input, which would turn such tokens
    # into empty strings — confirm against the installed version.
    processed_tokens = []
    for token in word_tokenize(text):
        if any(ch.isalpha() for ch in token):
            token = stemmer.stem(token)
        if token:  # skip anything the stemmer reduced to an empty string
            processed_tokens.append(token)

    return " ".join(processed_tokens)

In [None]:
# Apply the preprocessing to every tweet (missing values become empty strings
# so the cleaner always receives a str), then save the augmented frame.
df['tweet_processed'] = (
    df['tweet']
    .fillna('')
    .astype(str)
    .apply(clean_and_normalize_text)
)
df.to_csv(path_or_buf='dataset/preprocessed_data_final.csv', index=False)