In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Remote location of the CSV file to load.
DATA_URL = "https://raw.githubusercontent.com/yovelakalista23/BPML/refs/heads/main/Proyek-Analisis-Sentimen/Combined%20Data.csv"

# Read the CSV into a DataFrame and preview its first rows.
health = pd.read_csv(DATA_URL)
health.head()

KeyboardInterrupt: 

In [None]:
# Show the dimensions of the loaded frame as (rows, columns).
health.shape

In [None]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
health.info()

In [None]:
# Count the missing values in each column (isna is the canonical name of isnull).
health.isna().sum()

In [None]:
# Count the rows that exactly repeat an earlier row, then report the total.
duplicate_count = health.duplicated().sum()
print("Number of Duplicates: ", duplicate_count)

In [None]:
# Display descriptive statistics as produced by DataFrame.describe() with its default settings.
health.describe()

In [None]:
# Drop every row that has a missing value in any column, then renumber the
# remaining rows 0..n-1. Without the reset, the surviving rows keep their
# original labels (with gaps), so Series taken from this frame would not share
# an index with frames that are built later with a default RangeIndex.
health = health.dropna().reset_index(drop=True)

In [None]:
# Print the frame summary again to confirm the row and non-null counts after dropping missing values.
health.info()

In [None]:
# Separate the frame into model inputs and labels:
# the 'statement' column is the feature (X) and the 'status' column is the target (y).
X, y = health['statement'], health['status']

In [None]:
# Configure the TF-IDF vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,      # Keep at most the 5000 most frequent terms in the corpus
    min_df=5,               # Ignore terms that occur in fewer than 5 documents
    max_df=0.85,            # Ignore terms that occur in more than 85% of the documents
    ngram_range=(1, 2)      # Use unigrams and bigrams
)

# Learn the vocabulary and IDF weights from X and build the sparse TF-IDF matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(X)

# Convert the result to a DataFrame with one column per term.
# index=X.index keeps the row labels identical to those of X (and therefore of
# the target taken from the same frame); a default RangeIndex would not match
# once rows have been dropped from the source frame.
# NOTE(review): toarray() materializes the full dense matrix
# (rows x up to 5000 columns), which can be memory-heavy on a large corpus.
X_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=X.index,
)

# Preview the TF-IDF features.
X_tfidf.head()

In [None]:
# Configure the CountVectorizer used for the Bag-of-Words representation.
bow_vectorizer = CountVectorizer(
    max_features=5000,      # Keep at most the 5000 most frequent terms in the corpus
    min_df=5,               # Ignore terms that occur in fewer than 5 documents
    max_df=0.85,            # Ignore terms that occur in more than 85% of the documents
    ngram_range=(1, 2)      # Use unigrams and bigrams
)

# Learn the vocabulary from X and build the sparse term-count matrix.
bow_matrix = bow_vectorizer.fit_transform(X)

# Convert the result to a DataFrame with one column per term.
# index=X.index keeps the row labels identical to those of X (and therefore of
# the target taken from the same frame); a default RangeIndex would not match
# once rows have been dropped from the source frame.
# NOTE(review): toarray() materializes the full dense matrix
# (rows x up to 5000 columns), which can be memory-heavy on a large corpus.
X_bow = pd.DataFrame(
    bow_matrix.toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
    index=X.index,
)

# Preview the Bag-of-Words features.
X_bow.head()

In [None]:
def _split_then_vectorize(template, texts, labels, test_size, seed=42):
    """Split the raw texts first, then fit a fresh vectorizer on the training part only.

    Fitting a text vectorizer on the whole corpus before splitting lets the
    test documents influence the vocabulary and document-frequency statistics
    (data leakage), which makes the test accuracy optimistic. Here the
    vectorizer only ever sees the training documents; the test documents are
    transformed with the vocabulary learned from training.

    The row assignment is the same as when splitting pre-computed features with
    the same ``test_size`` and ``random_state``, because it depends only on the
    number of rows.

    Parameters
    ----------
    template : a configured scikit-learn text vectorizer. Only its
        hyper-parameters (``get_params()``) are reused, never its fitted state.
    texts : raw documents (a pandas Series in this notebook).
    labels : targets, same length as ``texts``.
    test_size : fraction of rows held out for testing.
    seed : ``random_state`` forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the X parts are dense
        DataFrames with one column per term, indexed like the matching labels.
    """
    raw_train, raw_test, y_tr, y_te = train_test_split(
        texts, labels, test_size=test_size, random_state=seed
    )

    # Unfitted copy with the same settings, so the template is left untouched.
    vectorizer = type(template)(**template.get_params())
    train_matrix = vectorizer.fit_transform(raw_train)   # learn from training text only
    test_matrix = vectorizer.transform(raw_test)         # reuse the training vocabulary
    feature_names = vectorizer.get_feature_names_out()

    X_tr = pd.DataFrame(
        train_matrix.toarray(),
        columns=feature_names,
        index=getattr(raw_train, "index", None),
    )
    X_te = pd.DataFrame(
        test_matrix.toarray(),
        columns=feature_names,
        index=getattr(raw_test, "index", None),
    )
    return X_tr, X_te, y_tr, y_te


# 80/20 TF-IDF
X_train1, X_test1, y_train1, y_test1 = _split_then_vectorize(tfidf_vectorizer, X, y, test_size=0.2)

# 70/30 TF-IDF
X_train2, X_test2, y_train2, y_test2 = _split_then_vectorize(tfidf_vectorizer, X, y, test_size=0.3)

# 80/20 BoW
X_train3, X_test3, y_train3, y_test3 = _split_then_vectorize(bow_vectorizer, X, y, test_size=0.2)

In [None]:
# Model 1: Support Vector Machine with a linear kernel, trained on the
# TF-IDF features of the 80/20 split. fit() returns the estimator itself,
# so construction and training are chained.
# NOTE(review): SVC training time grows quickly with the number of samples;
# consider LinearSVC if this cell is too slow - confirm before switching.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train1, y_train1)

# Predict the held-out rows and measure the share of correct labels.
y_pred1 = svm_model.predict(X_test1)
accuracy_svm_1 = accuracy_score(y_true=y_test1, y_pred=y_pred1)

In [None]:
# Model 2: Random Forest with 100 trees, trained on the TF-IDF features of
# the 70/30 split. fit() returns the estimator itself, so construction and
# training are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train2, y_train2)

# Predict the held-out rows and measure the share of correct labels.
y_pred2 = rf_model.predict(X_test2)
accuracy_rf_2 = accuracy_score(y_true=y_test2, y_pred=y_pred2)

In [None]:
# Model 3: Random Forest with 100 trees, trained on the Bag-of-Words
# features of the 80/20 split. This rebinds rf_model, so any estimator
# previously stored under that name is replaced.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train3, y_train3)

# Predict the held-out rows and measure the share of correct labels.
y_pred3 = rf_model.predict(X_test3)
accuracy_rf_3 = accuracy_score(y_true=y_test3, y_pred=y_pred3)


In [None]:
# Report the accuracy of each experiment, one line per model, with four decimals.
accuracy_report = [
    f"Akurasi SVM dengan TF-IDF (80/20): {accuracy_svm_1:.4f}",
    f"Akurasi RF dengan TF-IDF (70/30): {accuracy_rf_2:.4f}",
    f"Akurasi RF dengan BoW (80/20): {accuracy_rf_3:.4f}",
]
print("\n".join(accuracy_report))