# Notebook cell In [10]:
import mysql.connector
from mysql.connector import Error
import string
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Import Stemmer dari Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# --- CONFIGURATION ---
# Keyword arguments for the MySQL connection.
DB_CONFIG = dict(
    host='localhost',
    user='root',
    password='',
    database='twitter',
)

# Main data table, its id column, and the column holding the raw tweet text.
TABLE_NAME, ID_COLUMN, TEXT_COLUMN = 'trending', 'id', 'isi_twit'

# Slang dictionary table: one column with the slang word, one with its
# formal replacement.
SLANG_TABLE_NAME = 'slangword'
SLANG_COLUMN = 'kata_tbaku'
FORMAL_COLUMN = 'kata_baku'

# Output column for every preprocessing step, keyed by step name.
# The insertion order is the order in which the steps are applied.
STEP_COLUMNS = dict(
    lower='lowercase',
    no_punct='no_punctuation',
    tokenized='tokenized',
    no_stopwords='no_stopwords',
    no_slang='no_slang',
    stemmed='stemmed',
    final='final_processed_text',
)

# --- LOAD THE SLANG DICTIONARY FROM THE DATABASE ---
def load_slang_dictionary(conn):
    """
    Read the slang table and return it as a ``{slang: formal}`` dictionary.

    Args:
        conn: Open database connection; only ``conn.cursor()`` is called on it.

    Returns:
        dict mapping each whitespace-stripped slang word to its
        whitespace-stripped formal replacement. Rows with an empty slang
        word or a NULL formal word are skipped. An empty dict is returned
        when a database error occurs.
    """
    # Bound before the try block so the finally clause can always test it,
    # even when conn.cursor() itself raises.
    cursor = None
    try:
        cursor = conn.cursor()
        query = f"SELECT {SLANG_COLUMN}, {FORMAL_COLUMN} FROM {SLANG_TABLE_NAME}"
        cursor.execute(query)

        slang_dict = {}
        for slang, formal in cursor.fetchall():
            # A NULL formal value has nothing to normalise to; calling
            # .strip() on it would raise AttributeError, so skip the row.
            if not slang or formal is None:
                continue
            slang_dict[slang.strip()] = formal.strip()

        print(f"Berhasil memuat {len(slang_dict)} kata dari kamus slang.")
        return slang_dict

    except Error as e:
        print(f"Error saat memuat kamus slang: {e}")
        return {}  # Empty dictionary signals failure to the caller
    finally:
        if cursor is not None:
            cursor.close()

# --- TEXT PREPROCESSING ---
# Translation table that deletes every ASCII punctuation character.
# Built once at import time because it never changes between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def process_and_get_steps(text, slang_dict, stemmer, stop_words_combined):
    """
    Run the preprocessing pipeline on one text and return every intermediate result.

    Steps, in order: lowercase, strip ASCII punctuation, tokenize, drop
    stopwords and non-alphabetic tokens, replace slang words, stem, and
    join the stemmed tokens into the final text.

    Args:
        text: Raw text. Anything that is not a ``str`` yields empty results.
        slang_dict: Mapping of slang word -> formal replacement.
        stemmer: Object exposing ``stem(word)`` that returns the stemmed word.
        stop_words_combined: Container of stopwords, tested with ``in``.

    Returns:
        dict keyed by the column names in ``STEP_COLUMNS``. The lowercase,
        no-punctuation and final columns hold plain strings; the tokenized,
        no-stopwords, no-slang and stemmed columns hold JSON-encoded lists.
    """
    if not isinstance(text, str):
        # Keep the empty result shaped like a real one: string columns get
        # '' and every list column gets an empty JSON list, so downstream
        # json.loads() works on all list columns.
        string_columns = {
            STEP_COLUMNS['lower'],
            STEP_COLUMNS['no_punct'],
            STEP_COLUMNS['final'],
        }
        empty_list = json.dumps([])
        return {
            col: '' if col in string_columns else empty_list
            for col in STEP_COLUMNS.values()
        }

    results = {}

    # 1. Lowercase
    lower_text = text.lower()
    results[STEP_COLUMNS['lower']] = lower_text

    # 2. Remove punctuation
    no_punct_text = lower_text.translate(_PUNCTUATION_TABLE)
    results[STEP_COLUMNS['no_punct']] = no_punct_text

    # 3. Tokenize
    tokens = word_tokenize(no_punct_text)
    results[STEP_COLUMNS['tokenized']] = json.dumps(tokens)

    # 4. Remove stopwords (also drops tokens that are not purely alphabetic)
    no_stopwords_tokens = [
        word for word in tokens
        if word not in stop_words_combined and word.isalpha()
    ]
    results[STEP_COLUMNS['no_stopwords']] = json.dumps(no_stopwords_tokens)

    # 5. Normalise slang words; unknown words pass through unchanged
    no_slang_tokens = [slang_dict.get(word, word) for word in no_stopwords_tokens]
    results[STEP_COLUMNS['no_slang']] = json.dumps(no_slang_tokens)

    # 6. Stemming
    stemmed_tokens = [stemmer.stem(word) for word in no_slang_tokens]
    results[STEP_COLUMNS['stemmed']] = json.dumps(stemmed_tokens)

    # 7. Final text
    results[STEP_COLUMNS['final']] = ' '.join(stemmed_tokens)

    return results

# --- DATABASE SCHEMA HELPER ---
def setup_database_columns(conn, table_name):
    """
    Make sure every column listed in ``STEP_COLUMNS`` exists on ``table_name``.

    The table is inspected with ``DESCRIBE`` and each missing column is
    added as ``TEXT NULL``. A database error is printed and followed by
    ``conn.rollback()``; it is not re-raised.

    Args:
        conn: Open database connection.
        table_name: Name of the table to inspect and extend.
    """
    db_cursor = conn.cursor(buffered=True)
    print(f"Memeriksa struktur tabel '{table_name}'...")
    try:
        db_cursor.execute(f"DESCRIBE {table_name};")
        # The first field of each DESCRIBE row is the column name.
        present = [description[0] for description in db_cursor.fetchall()]
        missing = [name for name in STEP_COLUMNS.values() if name not in present]

        for name in missing:
            print(f"Kolom '{name}' tidak ditemukan. Menambahkan...")
            db_cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {name} TEXT NULL")
            print(f"Kolom '{name}' berhasil ditambahkan.")
        conn.commit()
    except Error as err:
        print(f"Error saat setup kolom: {err}")
        conn.rollback()
    finally:
        db_cursor.close()

# --- MAIN ENTRY POINT ---
def main():
    """
    Preprocess every unprocessed row of the data table and store the results.

    Connects to MySQL, loads the slang dictionary, builds the stemmer and
    the combined Indonesian/English stopword set, ensures the output
    columns exist, then processes each row whose text is not NULL and
    whose final column is still NULL. All step results are written back
    with a single ``executemany`` UPDATE followed by a commit.

    Database errors are printed, not raised. The cursor and the connection
    are always closed before returning.
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        if not conn.is_connected():
            print("Gagal terhubung ke database.")
            return

        print("Berhasil terhubung ke database MySQL.")

        # One-time initialisation of everything the per-row step needs.
        # 1. Slang dictionary from the database
        slang_dict = load_slang_dictionary(conn)
        if not slang_dict:
            print("Kamus slang kosong atau gagal dimuat. Proses dihentikan.")
            return

        # 2. Stemmer
        stemmer = StemmerFactory().create_stemmer()

        # 3. Stopwords (Indonesian + English)
        stop_words_combined = (
            set(stopwords.words('indonesian')) | set(stopwords.words('english'))
        )

        # Make sure the output columns exist before selecting on them.
        setup_database_columns(conn, TABLE_NAME)

        cursor = conn.cursor()

        query = f"SELECT {ID_COLUMN}, {TEXT_COLUMN} FROM {TABLE_NAME} WHERE {TEXT_COLUMN} IS NOT NULL AND {STEP_COLUMNS['final']} IS NULL"
        cursor.execute(query)
        rows = cursor.fetchall()

        if not rows:
            print("Tidak ada data baru untuk diproses.")
            return

        print(f"Ditemukan {len(rows)} baris data baru untuk diproses.")

        # Derive both the SET clause and the value tuples from STEP_COLUMNS
        # so the column order cannot drift between the two.
        step_columns = list(STEP_COLUMNS.values())

        update_data = []
        for tweet_id, original_text in rows:
            processed_steps = process_and_get_steps(
                original_text, slang_dict, stemmer, stop_words_combined
            )
            # The row id goes last to match the trailing WHERE placeholder.
            update_data.append(
                tuple(processed_steps[col] for col in step_columns) + (tweet_id,)
            )

        print("Menyimpan hasil ke database...")
        assignments = ", ".join(f"{col} = %s" for col in step_columns)
        update_query = (
            f"UPDATE {TABLE_NAME} SET {assignments} WHERE {ID_COLUMN} = %s"
        )
        cursor.executemany(update_query, update_data)
        conn.commit()
        print(f"{cursor.rowcount} baris berhasil diperbarui di database.")

    except Error as e:
        print(f"Error pada fungsi utama: {e}")
    finally:
        # Nested try/finally: the connection is closed even if closing the
        # cursor raises.
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if conn is not None and conn.is_connected():
                conn.close()
                print("Koneksi MySQL ditutup.")

if __name__ == '__main__':
    # Make sure the NLTK resources 'punkt' and 'stopwords' are installed,
    # downloading whichever one cannot be found locally.
    # NOTE(review): recent NLTK releases may also need 'punkt_tab' for
    # word_tokenize; confirm against the installed version.
    nltk_resources = {
        'punkt': 'tokenizers/punkt',
        'stopwords': 'corpora/stopwords',
    }
    for package, resource_path in nltk_resources.items():
        try:
            nltk.data.find(resource_path)
        except LookupError:
            print(f"Corpus '{package}' tidak ditemukan. Mengunduh...")
            nltk.download(package)

    main()

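# Recorded cell output: ModuleNotFoundError: No module named 'Sastrawi'
# (the Sastrawi package must be installed, e.g. `pip install Sastrawi`, before running this cell)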