In [1]:
# -*- coding: utf-8 -*-
# Analisis Sentimen MBG - Save Model Version (Final Fixed)

import pandas as pd
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from joblib import dump

warnings.filterwarnings("ignore")

# =========================
# 1. Muat dataset
# =========================
data = pd.read_csv("dc_tweets_mbg_fix.csv")

print("ðŸ“Š Jumlah data awal:", len(data))

data = data.dropna(subset=['text'])
data = data[data['text'].str.strip() != '']
print("âœ… Jumlah data setelah pembersihan:", len(data))

# =========================
# 2. Fitur & Label
# =========================
X_text = data['text']
y = data['Sentimen_Title'].astype(str)

# =========================
# 3. Split Train & Test
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.1, random_state=42, stratify=y
)

# =========================
# 4. TF-IDF awal untuk SMOTE
# =========================
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)

# =========================
# 5. SMOTE balancing
# =========================
smote = SMOTE(random_state=32)
X_train_res, y_train_res = smote.fit_resample(X_train_tfidf.toarray(), y_train)

# =========================
# 6. Train model SVM
# =========================
model = SVC(kernel="linear", probability=True, random_state=42)
model.fit(X_train_res, y_train_res)

# =========================
# 7. Evaluasi
# =========================
X_test_tfidf = tfidf.transform(X_test)
y_pred = model.predict(X_test_tfidf)

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

print(f"ðŸŽ¯ Accuracy: {accuracy_score(y_test, y_pred):.4f}")

# =========================
# 8. Final pipeline (TF-IDF + SVM)
# =========================
pipeline = Pipeline([
    ("tfidf", tfidf),
    ("svm", model)
])

# =========================
# 9. Save Model
# =========================
dump(pipeline, "svm_mbg_sentiment.pkl")
print("\nðŸ’¾ Model berhasil disimpan sebagai svm_mbg_sentiment.pkl")


ðŸ“Š Jumlah data awal: 4899
âœ… Jumlah data setelah pembersihan: 4885


ValueError: cannot use sparse input in 'SVC' trained on dense data