Import Library

In [1]:
import pandas as pd
import numpy as np

# Preprocessing
import re
import string

# ML
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Evaluasi
from sklearn.metrics import classification_report, accuracy_score


Load Dataset

In [3]:
# Load the dataset in this cell so it runs on a fresh kernel: previously the
# cell referenced `df` before any cell in notebook order had created it.
df = pd.read_csv('Dataset_Sentimen_Emosi.csv', sep=',', engine='python')

# Normalise header names (trim surrounding whitespace, lower-case) so column
# lookups do not depend on how the CSV header was typed.
df.columns = df.columns.str.strip().str.lower()
print(df.columns)


Index(['text,label'], dtype='object')


In [6]:
# Read the raw sentiment/emotion CSV with the pure-Python parser. The column
# listing recorded below shows it parses into one combined 'text,label' column.
DATASET_PATH = 'Dataset_Sentimen_Emosi.csv'
df = pd.read_csv(DATASET_PATH, engine='python', sep=',')


In [7]:
# Show the parsed header to check how the CSV columns were detected.
print(df.columns)


Index(['text,label'], dtype='object')


In [8]:
# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,"text,label"
0,"saya sangat senang hari ini,senang"
1,"pelayanan toko ini membuat saya bahagia,senang"
2,"aku merasa puas dengan hasilnya,senang"
3,"senyumku tidak hilang sepanjang hari,senang"
4,"ini pengalaman yang menyenangkan,senang"


In [12]:
import re
import string

def clean_text(text):
    """Normalise a raw sentence before it is vectorised.

    The value is lower-cased, runs of digits are removed, ASCII punctuation
    (``string.punctuation``) is removed, and finally every whitespace run is
    collapsed to a single space with leading/trailing whitespace trimmed.

    Args:
        text: The raw value. Missing values (``None`` / NaN / ``pd.NA``)
            produce an empty string instead of raising; any other non-string
            scalar is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        # A blank cell in the source data reaches `.apply` as None/NaN; treat
        # it as empty text rather than failing on `.lower()`.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # Punctuation is deleted (not replaced by a space), so "a-b" becomes "ab".
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# The CSV loads as a single combined 'text,label' column (see the column
# listing printed earlier), so `df['text']` does not exist on a fresh run.
# Split on the LAST comma so commas inside a sentence stay part of the text.
# The guard makes the cell safe to re-run once the split has happened.
if 'text' not in df.columns:
    df = df['text,label'].str.rsplit(',', n=1, expand=True)
    df.columns = ['text', 'label']

# Add a normalised copy of each sentence; the raw text column is kept.
df['text_clean'] = df['text'].apply(clean_text)
df.head()


Unnamed: 0,text,label,text_clean
0,saya sangat senang hari ini,senang,saya sangat senang hari ini
1,pelayanan toko ini membuat saya bahagia,senang,pelayanan toko ini membuat saya bahagia
2,aku merasa puas dengan hasilnya,senang,aku merasa puas dengan hasilnya
3,senyumku tidak hilang sepanjang hari,senang,senyumku tidak hilang sepanjang hari
4,ini pengalaman yang menyenangkan,senang,ini pengalaman yang menyenangkan


In [13]:
from sklearn.model_selection import train_test_split

# Features are the cleaned sentences; targets are the emotion labels.
X, y = df['text_clean'], df['label']

# Hold out 20% for testing. The split is stratified on the label so each class
# keeps its share, and seeded so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition.
for caption, subset in (("Train:", X_train), ("Test :", X_test)):
    print(caption, subset.shape)


Train: (40,)
Test : (10,)


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF over unigrams and bigrams (vocabulary capped at 5000 terms) feeding a
# multinomial Naive Bayes classifier.
pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1,2),
        max_features=5000
    )),
    ('nb', MultinomialNB())
])

pipeline_nb.fit(X_train, y_train)

y_pred_nb = pipeline_nb.predict(X_test)

print("=== NAIVE BAYES ===")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
# zero_division=0 reports 0.0 for a class that is never predicted, the same
# value the default yields, without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred_nb, zero_division=0))


=== NAIVE BAYES ===
Accuracy: 0.7
              precision    recall  f1-score   support

       marah       1.00      1.00      1.00         2
      netral       1.00      0.50      0.67         2
       sedih       0.00      0.00      0.00         2
      senang       0.40      1.00      0.57         2
       takut       1.00      1.00      1.00         2

    accuracy                           0.70        10
   macro avg       0.68      0.70      0.65        10
weighted avg       0.68      0.70      0.65        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Same TF-IDF features as the Naive Bayes pipeline, classified by a 5-nearest-
# neighbour vote using cosine distance between document vectors.
pipeline_knn = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1,2),
        max_features=5000
    )),
    ('knn', KNeighborsClassifier(
        n_neighbors=5,
        metric='cosine'
    ))
])

pipeline_knn.fit(X_train, y_train)

y_pred_knn = pipeline_knn.predict(X_test)

print("=== KNN ===")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
# zero_division=0 reports 0.0 for a class that is never predicted, the same
# value the default yields, without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred_knn, zero_division=0))


=== KNN ===
Accuracy: 0.3
              precision    recall  f1-score   support

       marah       0.40      1.00      0.57         2
      netral       1.00      0.50      0.67         2
       sedih       0.00      0.00      0.00         2
      senang       0.00      0.00      0.00         2
       takut       0.00      0.00      0.00         2

    accuracy                           0.30        10
   macro avg       0.28      0.30      0.25        10
weighted avg       0.28      0.30      0.25        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [16]:
import pandas as pd

# Side-by-side accuracy of both models on the shared held-out test set.
prediksi_per_model = {
    'Naive Bayes': y_pred_nb,
    'KNN': y_pred_knn,
}

hasil = pd.DataFrame({
    'Model': list(prediksi_per_model),
    'Accuracy': [
        accuracy_score(y_test, prediksi)
        for prediksi in prediksi_per_model.values()
    ],
})

hasil


Unnamed: 0,Model,Accuracy
0,Naive Bayes,0.7
1,KNN,0.3


In [17]:
# Ad-hoc sentences for a quick sanity check of both fitted pipelines.
kalimat = [
    "aku sangat bahagia hari ini",
    "aku kesal dengan pelayanan ini",
    "hari ini biasa saja"
]

# Both pipelines were fitted on text passed through clean_text, so new input
# must go through the same cleaning to match the training distribution.
kalimat_clean = [clean_text(k) for k in kalimat]

print("NB :", pipeline_nb.predict(kalimat_clean))
print("KNN:", pipeline_knn.predict(kalimat_clean))


NB : ['senang' 'marah' 'netral']
KNN: ['netral' 'marah' 'netral']
