In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [8]:
PATH_MANUAL = "dataset_manual_label.csv"
PATH_EMPTY  = "dataset_empty_label.csv"


def _read_csv_utf8_first(path, **read_csv_kwargs):
    """Read a CSV as UTF-8, falling back to latin1 if the bytes are not valid UTF-8.

    latin1 can decode any byte sequence, so forcing it never fails but turns
    multi-byte UTF-8 characters (e.g. emoji) into mojibake such as "ð...".
    Trying UTF-8 first keeps those characters intact; the fallback preserves
    the previous behaviour for files that really are latin1.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    **read_csv_kwargs
        Forwarded unchanged to ``pd.read_csv`` (e.g. ``sep``, ``on_bad_lines``).

    Returns
    -------
    pandas.DataFrame
    """
    try:
        return pd.read_csv(path, encoding="utf-8", **read_csv_kwargs)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="latin1", **read_csv_kwargs)


# The two files use different delimiters: ';' for the labelled set, ',' for the unlabelled one.
df_manual = _read_csv_utf8_first(PATH_MANUAL, sep=';')
# NOTE(review): on_bad_lines='skip' silently drops malformed rows, so the shape
# printed below may be smaller than the number of lines in the file.
df_empty  = _read_csv_utf8_first(PATH_EMPTY, sep=',', on_bad_lines='skip')

print("df_manual shape:", df_manual.shape)
print("df_empty shape:", df_empty.shape)

df_manual.head()

df_manual shape: (250, 3)
df_empty shape: (748, 3)


Unnamed: 0,text,text_cleaned_full,manual
0,kepo bgt adekkkk??????????????,kepo adekkkk,negatif
1,"buah jatuh se pohon "" nya????",buah jatuh se pohon nya,negatif
2,Cushion Nya kegelapan ga sihhh??,cushion nya kegelapan sihhh,negatif
3,LU APAIN PONI BOCIL ITU KAK JENNN?!!! KENAPA L...,lu apain poni bocil kak jennn kenapa lu buat s...,negatif
4,mayi gamau kalah cantik sama mamanya????,mayi gamau kalah cantik sama mamanya,negatif


In [9]:
# Column names the rest of the notebook relies on.
required_text_col = "text_cleaned_full"
required_label_col = "manual"

# The labelled frame must carry both the text column and the label column;
# stop at the first one that is missing.
for col in (required_text_col, required_label_col):
    if col in df_manual.columns:
        continue
    raise ValueError(f"Kolom '{col}' tidak ada di df_manual. Kolom tersedia: {list(df_manual.columns)}")

# The unlabelled frame only needs the text column.
empty_has_text_col = required_text_col in df_empty.columns
if not empty_has_text_col:
    raise ValueError(f"Kolom '{required_text_col}' tidak ada di df_empty. Kolom tersedia: {list(df_empty.columns)}")

print("OK: kolom yang dibutuhkan tersedia.")

# Show at most the first 20 distinct non-null labels as a quick sanity check.
unique_labels = df_manual[required_label_col].dropna().unique()
print("Label unik di df_manual:", unique_labels[:20])

OK: kolom yang dibutuhkan tersedia.
Label unik di df_manual: ['negatif' 'netral' 'positif']


In [10]:
# Build the training frame from a copy so df_manual stays exactly as loaded.
train_df = df_manual.copy()

# Drop rows whose label is missing (NaN).
train_df = train_df.dropna(subset=[required_label_col])

# Force both columns to plain strings. Labels are also stripped so that
# "negatif" and "negatif " are not treated as two different classes.
train_df[required_text_col] = train_df[required_text_col].fillna("").astype(str)
train_df[required_label_col] = train_df[required_label_col].astype(str).str.strip()

# Keep only rows that have both non-blank text and a non-blank label.
# dropna() above only catches NaN; an empty or whitespace-only label would
# otherwise survive and be learned as a class of its own.
has_text = train_df[required_text_col].str.strip() != ""
has_label = train_df[required_label_col] != ""
train_df = train_df[has_text & has_label]

print("Training data shape (after clean):", train_df.shape)
train_df.head()


Training data shape (after clean): (250, 3)


Unnamed: 0,text,text_cleaned_full,manual
0,kepo bgt adekkkk??????????????,kepo adekkkk,negatif
1,"buah jatuh se pohon "" nya????",buah jatuh se pohon nya,negatif
2,Cushion Nya kegelapan ga sihhh??,cushion nya kegelapan sihhh,negatif
3,LU APAIN PONI BOCIL ITU KAK JENNN?!!! KENAPA L...,lu apain poni bocil kak jennn kenapa lu buat s...,negatif
4,mayi gamau kalah cantik sama mamanya????,mayi gamau kalah cantik sama mamanya,negatif


In [11]:
# Hold-out split settings.
VALID_FRACTION = 0.2
SPLIT_SEED = 42

X = train_df[required_text_col].values
y = train_df[required_label_col].values

# Stratify only when there are at least two classes AND every class has at
# least two samples; train_test_split raises a ValueError when asked to
# stratify on a class with a single member.
class_labels, class_counts = np.unique(y, return_counts=True)
can_stratify = len(class_labels) > 1 and class_counts.min() >= 2
stratify_arg = y if can_stratify else None

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=VALID_FRACTION,
    random_state=SPLIT_SEED,
    stratify=stratify_arg
)

# TF-IDF over unigrams and bigrams feeding a logistic-regression classifier.
model = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=2,      # ignore terms that appear in fewer than 2 documents
        max_df=0.95    # ignore terms that appear in more than 95% of documents
    )),
    ("clf", LogisticRegression(
        max_iter=2000,
        # n_jobs is intentionally left at its default; the original passed
        # n_jobs=None explicitly, which is that same default.
        class_weight="balanced"  # reweight classes inversely to their frequency
    ))
])

model


In [12]:
# Fit the whole pipeline (vectorizer + classifier) on the training split only,
# so the validation split stays unseen.
model.fit(X=X_train, y=y_train)
print("Training selesai.")

Training selesai.


In [13]:
# Score the held-out validation split.
y_pred = model.predict(X_valid)

# Compute both metrics first, then print them.
acc = accuracy_score(y_true=y_valid, y_pred=y_pred)
report_text = classification_report(y_true=y_valid, y_pred=y_pred)

print("Accuracy:", acc)
print("\nClassification report:\n")
print(report_text)

Accuracy: 0.58

Classification report:

              precision    recall  f1-score   support

     negatif       0.56      0.75      0.64        20
      netral       0.29      0.20      0.24        10
     positif       0.75      0.60      0.67        20

    accuracy                           0.58        50
   macro avg       0.53      0.52      0.51        50
weighted avg       0.58      0.58      0.57        50



In [14]:
# Work on a copy so df_empty stays exactly as loaded.
empty_df = df_empty.copy()
empty_df[required_text_col] = empty_df[required_text_col].fillna("").astype(str)

# The same text array feeds both the label and the probability prediction.
texts_to_label = empty_df[required_text_col].values

# Predicted class per row.
pred_label = model.predict(texts_to_label)

# Confidence = highest class probability per row.
proba = model.predict_proba(texts_to_label)
pred_conf = np.max(proba, axis=1)

empty_df["manual_pred"] = pred_label
empty_df["manual_pred_conf"] = pred_conf

empty_df.head()

Unnamed: 0,text,text_cleaned_full,manual,manual_pred,manual_pred_conf
0,"""mayi minta"" ð«´",mayi minta,,negatif,0.40384
1,itu pengasuhnya kamari yang dulu kemana sih ka...,pengasuhnya kamari dulu kemana kalo boleh tauu...,,negatif,0.365054
2,siapa yang motong poni kamari ko agak miring d...,siapa motong poni kamari ko agak miring dikit,,negatif,0.651147
3,poni nya kapan tumbuh lagi mayi lama bgt dah p...,poni nya kapan tumbuh lagi mayi lama dah peras...,,negatif,0.685353
4,"maap jen, tp cantikan mayi ðð­",maap jen cantikan mayi,,negatif,0.392658


In [15]:
# If the frame already has a 'manual' column, keep existing labels and fill
# only the empty ones with the model prediction.
if "manual" in empty_df.columns:
    manual_existing = empty_df["manual"]
    # "Empty" means NaN, or a string that is blank after stripping whitespace.
    is_empty = manual_existing.isna() | (manual_existing.astype(str).str.strip() == "")

    # Cast to object before mixing in string labels. When 'manual' holds only
    # NaN it is a float column, and writing strings into a float column through
    # .loc is a deprecated incompatible-dtype assignment in recent pandas.
    # mask() replaces the rows where is_empty is True with the prediction and
    # leaves the others untouched.
    empty_df["manual_filled"] = manual_existing.astype("object").mask(is_empty, empty_df["manual_pred"])
else:
    # No 'manual' column at all: the filled label is simply the prediction.
    empty_df["manual_filled"] = empty_df["manual_pred"]

empty_df[["manual_pred", "manual_pred_conf", "manual_filled"]].head(10)

 'negatif' 'negatif' 'negatif' 'positif' 'positif' 'positif' 'positif'
 'positif' 'positif' 'positif' 'netral' 'netral' 'negatif' 'negatif'
 'positif' 'positif' 'positif' 'negatif' 'negatif' 'negatif' 'negatif'
 'netral' 'negatif' 'positif' 'positif' 'negatif' 'positif' 'positif'
 'negatif' 'positif' 'negatif' 'positif' 'positif' 'netral' 'positif'
 'positif' 'positif' 'positif' 'positif' 'negatif' 'negatif' 'positif'
 'negatif' 'negatif' 'positif' 'negatif' 'netral' 'negatif' 'netral'
 'positif' 'negatif' 'negatif' 'positif' 'positif' 'positif' 'positif'
 'positif' 'positif' 'positif' 'negatif' 'positif' 'positif' 'positif'
 'negatif' 'positif' 'negatif' 'netral' 'netral' 'negatif' 'positif'
 'positif' 'positif' 'positif' 'positif' 'positif' 'negatif' 'netral'
 'positif' 'netral' 'positif' 'positif' 'positif' 'negatif' 'negatif'
 'positif' 'negatif' 'positif' 'positif' 'negatif' 'negatif' 'positif'
 'negatif' 'positif' 'positif' 'positif' 'positif' 'positif' 'positif'
 'positif' 'posi

Unnamed: 0,manual_pred,manual_pred_conf,manual_filled
0,negatif,0.40384,negatif
1,negatif,0.365054,negatif
2,negatif,0.651147,negatif
3,negatif,0.685353,negatif
4,negatif,0.392658,negatif
5,negatif,0.404987,negatif
6,negatif,0.351181,negatif
7,negatif,0.351181,negatif
8,negatif,0.560499,negatif
9,negatif,0.382891,negatif


In [19]:
# Destination for the model-labelled frame.
OUT_PATH = "dataset_empty_labeled_by_model.csv"

# index=False keeps the row index out of the written file.
empty_df.to_csv(path_or_buf=OUT_PATH, index=False)
print("Saved:", OUT_PATH)

Saved: dataset_empty_labeled_by_model.csv
