# 1.1 Data Preprocessing for the Baseline Models (Logistic Regression and Naive Bayes)

## 1.1.1 Load dataset

In [1]:
# -------------------------------
# Preprocessing for the Baseline Models
# -------------------------------

import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# -------------------------------
# STEP 1: Load Dataset
# -------------------------------
df = pd.read_csv("../dataset/dataset.csv")

# The rest of the notebook reads the 'tweet' and 'sentimen' columns.
# Verify they exist right after loading so a wrong or renamed file fails
# here, before the slow cleaning/stemming pass, instead of at the save step.
required_columns = {"tweet", "sentimen"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"dataset.csv is missing required column(s): {sorted(missing_columns)}"
    )


## 1.1.2 Setup Tools

In [2]:
# -------------------------------
# STEP 2: Setup Tools
# -------------------------------
# Stemmer produced by the Sastrawi factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stopword list shipped with Sastrawi, held as a set for fast membership tests.
stop_factory = StopWordRemoverFactory()
default_stopwords = set(stop_factory.get_stop_words())

# Extra informal stopwords and common abbreviations; every entry appears once.
# NOTE(review): a few entries (e.g. 'enak', 'bangsat', 'makasih') look
# sentiment-bearing - confirm that dropping them is intended for this task.
custom_stopwords = {
    # abbreviations / texting shorthand
    'dsb', 'dst', 'dll', 'gmn', 'jd', 'krn', 'dr', 'dgn', 'lg', 'tp', 'utk',
    'pdhl', 'sm', 'sy', 'lgsg', 'blm', 'udh', 'tdk', 'bgt', 'smua', 'skrg',
    'td', 'trs', 'cmn', 'bkn', 'dl', 'nggk', 'btw', 'thx', 'aplg', 'bgmnpn',
    # informal first/second-person pronouns
    'gw', 'gue', 'gwa', 'lo', 'loe', 'elu', 'luh',
    # informal particles, interjections and slang
    'ok', 'oh', 'ya', 'tuh', 'gitu', 'gimana', 'nggak', 'ga', 'gak', 'aja',
    'nih', 'sih', 'dong', 'deh', 'lah', 'nya', 'yuk', 'bro', 'sis', 'wkwk',
    'haha', 'hehe', 'anjir', 'bangsat', 'yaudah', 'yaudahh', 'okeh', 'sip',
    'yoi', 'coy', 'bodo', 'fix', 'relate', 'halah', 'lol', 'makasih',
    'makasi', 'thanks',
    # ordinary words treated as noise
    'mau', 'punya', 'buat', 'trus', 'sama', 'enak', 'cepat', 'terlalu',
    'biasa', 'banget', 'seharusnya', 'sebetulnya', 'pasti', 'setidaknya',
    'saja', 'tentu', 'walau', 'tolong', 'apalagi', 'bagaimanapun',
}

# Full stopword vocabulary: built-in list plus the custom extras.
stopwords_id = default_stopwords | custom_stopwords


## 1.1.3 Emoji Mapping

In [3]:
# -------------------------------
# STEP 3: Emoji Mapping
# -------------------------------
# Emoji -> Indonesian sentiment word.
emoji_map = {
    "😠": "marah", "😡": "marah",
    "😢": "sedih", "😭": "sedih",
    "😂": "lucu", "🤣": "lucu",
    "😊": "senang", "😁": "senang", "😃": "senang", "😄": "senang",
    "😍": "cinta",
    # The red heart occurs both with and without the emoji variation
    # selector (U+FE0F); map both spellings so neither is lost.
    "\u2764\ufe0f": "cinta", "\u2764": "cinta",
    "😒": "kesal", "😞": "kecewa",
    "😎": "bangga", "👍": "setuju", "👎": "tidak_setuju"
}


def replace_emojis(text):
    """Replace every emoji listed in ``emoji_map`` with its mapped word.

    Each replacement is padded with one space on both sides so the word
    never fuses with neighbouring characters; callers are expected to
    collapse repeated whitespace afterwards.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with mapped emojis substituted. Characters that are
        not keys of ``emoji_map`` are left untouched.
    """
    # Longest keys first: a multi-code-point sequence (heart + U+FE0F) must
    # be consumed as a whole before its one-code-point prefix is tried,
    # otherwise a stray variation selector would be left behind.
    for emoji in sorted(emoji_map, key=len, reverse=True):
        text = text.replace(emoji, f" {emoji_map[emoji]} ")
    return text

## 1.1.4 Clean Function

In [4]:
# -------------------------------
# STEP 4: Clean Function
# -------------------------------
def clean_text(text):
    """Turn one raw tweet into a lowercase, stopword-free, stemmed string.

    Pipeline, in order: lowercase -> emoji-to-word (``replace_emojis``) ->
    strip URLs, @mentions, punctuation and digits -> collapse whitespace ->
    drop tokens found in ``stopwords_id`` -> stem with ``stemmer``.

    Args:
        text: Raw tweet. Non-string values are converted with ``str()``;
            ``None`` and float NaN (e.g. an empty cell in the loaded CSV)
            are treated as an empty tweet.

    Returns:
        str: The cleaned tweet; ``""`` when the input is missing.
    """
    # A missing value would otherwise be stringified into the bogus token
    # "nan" / "none". NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = replace_emojis(text)

    # URLs. "www" has to start a token and be followed by a dot so that
    # elongated words such as "awwww" are not mistaken for a link.
    text = re.sub(r"http\S+|\bwww\.\S+", " ", text)
    # Mentions.
    text = re.sub(r"@\w+", " ", text)
    # Hash signs and apostrophes are dropped without leaving a gap, so
    # "#topik" -> "topik" and "jum'at" -> "jumat".
    text = re.sub(r"[#'’]", "", text)
    # Any other symbol becomes a space, which keeps the words around it
    # separate ("bagus,mantap" -> "bagus mantap", "anak-anak" -> "anak anak").
    # The underscore counts as a word character and is kept.
    text = re.sub(r"[^\w\s]", " ", text)
    # Digits.
    text = re.sub(r"\d+", "", text)
    # Collapse the whitespace runs introduced above.
    text = re.sub(r"\s+", " ", text).strip()

    # Stopword removal
    text = " ".join(word for word in text.split() if word not in stopwords_id)

    # Stemming
    return stemmer.stem(text)

## 1.1.5 Apply Preprocessing

In [5]:
# -------------------------------
# STEP 5: Apply Preprocessing
# -------------------------------
# Run clean_text on every tweet and store the result next to the raw text.
df["cleaned_tweet"] = df["tweet"].map(clean_text)

## 1.1.6 Save Cleaned Dataset

In [6]:
# -------------------------------
# STEP 6: Save / Continue to Modeling
# -------------------------------
# Keep the destination in one place so the confirmation message always
# names the file that was actually written.
output_path = "../dataset/cleaned_dataset_baseline_model.csv"

# Persist the raw tweet, its label and the cleaned text (no index column).
df[["tweet", "sentimen", "cleaned_tweet"]].to_csv(output_path, index=False)
print(f"✅ Dataset berhasil dibersihkan dan disimpan sebagai {output_path}")

✅ Dataset berhasil dibersihkan dan disimpan sebagai cleaned_dataset.csv
