Modeling Prediksi

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load the pre-processed dataset from the Excel workbook.
dataset_path = 'pre_processed_data_alpha(2).xlsx'
df = pd.read_excel(dataset_path)

# Step 2: Load the spaCy pipeline that is used for lemmatization below.
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)

# Tokenization and lemmatization
def tokenize_and_lemmatize(text):
    """Return the lemmas of every non-punctuation token in ``text``.

    Args:
        text: The raw text to process. Anything that is not a ``str``
            (for example the float NaN that pandas uses for an empty
            spreadsheet cell, or ``None``) is treated as "no text".

    Returns:
        A list of lemma strings, in token order. The list is empty when
        ``text`` is not a string.
    """
    # Missing cells arrive as float NaN; the spaCy pipeline only accepts text,
    # so bail out early instead of letting one empty row abort the whole apply.
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_punct]

# Lemmatize every abstract in the 'Abstrak publikasi' column.
df["lemmatized"] = df["Abstrak publikasi"].apply(tokenize_and_lemmatize)

# The vectorizer works on strings, so glue each token list back together
# with single spaces before fitting.
lemmatized_texts = df["lemmatized"].map(" ".join)

# Fit TF-IDF on the lemmatized abstracts.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(lemmatized_texts)

# Dense DataFrame view of the TF-IDF matrix, one column per vocabulary term.
vocabulary = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vocabulary)

# Store each document's TF-IDF vector as a list in a 'tf-idf' column.
df["tf-idf"] = tfidf_matrix.toarray().tolist()

# Step 3: Extract the TF-IDF vectors as a 2-D NumPy array and pick the target.
X = np.array(df["tf-idf"].tolist())
y = df["Rating (1-5)"]

# Step 4: Hold out the test set BEFORE any resampling or fitting.
# Resampling or fitting the SVD on the full data would leak test information
# into training (synthetic SMOTE points are interpolated from real rows), which
# inflates every metric computed on the test set. Stratifying keeps every
# rating present in both folds.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 5: Balance the rating classes with SMOTE, on the training fold only.
# SMOTE needs more samples per class than k_neighbors (library default: 5), so
# shrink k when the rarest training class is small.
smallest_class = int(y_train_raw.value_counts().min())
smote = SMOTE(
    k_neighbors=max(1, min(5, smallest_class - 1)),
    random_state=42,
)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Step 6: Dimensionality reduction with Truncated SVD.
# The projection is learned from the (resampled) training fold and then merely
# applied to the untouched test fold.
svd = TruncatedSVD(n_components=1500, random_state=42)
X_reduced = svd.fit_transform(X_resampled)
X_train, y_train = X_reduced, y_resampled
X_test = svd.transform(X_test_raw)

def _print_scores(title, y_true, y_pred):
    """Print ``title`` followed by the MAE, MSE and R² of ``y_pred`` vs ``y_true``."""
    print(title)
    print("Mean Absolute Error (MAE):", mean_absolute_error(y_true, y_pred))
    print("Mean Squared Error (MSE):", mean_squared_error(y_true, y_pred))
    print("R² Score:", r2_score(y_true, y_pred))


# Step 7: Train and evaluate XGBoost (histogram tree method on a CUDA device).
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 10,
    "tree_method": "hist",
    "device": "cuda",
    "random_state": 42,
}
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
_print_scores("\U0001F534 Kinerja XGBoost (GPU):", y_test, y_pred_xgb)

# Step 8: Train and evaluate Random Forest (n_jobs=-1 uses all CPU cores).
rf_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
_print_scores("\n\U0001F535 Kinerja Random Forest:", y_test, y_pred_rf)

# Step 9: Train and evaluate SVR with an RBF kernel.
svr_model = SVR(kernel='rbf', C=10, epsilon=0.1)
svr_model.fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)
_print_scores("\n\U0001F535 Kinerja SVR:", y_test, y_pred_svr)

# Step 10: Train and evaluate KNN; its metrics are also kept in variables.
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

mae_knn = mean_absolute_error(y_test, y_pred_knn)
mse_knn = mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

print("\n\U0001F7E2 Kinerja KNN:")
print("Mean Absolute Error (MAE):", mae_knn)
print("Mean Squared Error (MSE):", mse_knn)
print("R² Score:", r2_knn)


Menghitung Accuracy

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# Map a numeric rating onto one of three relevance groups.
def group_ratings(rating):
    """Return the relevance group label for ``rating``.

    Ratings equal to 4 or 5 map to "Relevan", a rating equal to 3 maps to
    "Netral", and every other value maps to "Tidak Relevan".
    """
    if rating in (4, 5):
        return "Relevan"
    return "Netral" if rating == 3 else "Tidak Relevan"

def _group_predictions(raw_predictions):
    """Round predictions, clip them to the 1-5 range and map them to groups."""
    rounded = np.round(raw_predictions)
    bounded = np.clip(rounded, 1, 5)
    return pd.Series(bounded).apply(group_ratings)


# Relevance groups of the actual ratings.
actual_grouped = pd.Series(y_test).apply(group_ratings)

# Relevance groups of each model's predicted ratings.
predicted_grouped_xgb = _group_predictions(y_pred_xgb)
predicted_grouped_rf = _group_predictions(y_pred_rf)
predicted_grouped_svr = _group_predictions(y_pred_svr)
predicted_grouped_knn = _group_predictions(y_pred_knn)

# Share of test samples whose predicted group equals the actual group.
accuracy_grouped_xgb = accuracy_score(actual_grouped, predicted_grouped_xgb)
accuracy_grouped_rf = accuracy_score(actual_grouped, predicted_grouped_rf)
accuracy_grouped_svr = accuracy_score(actual_grouped, predicted_grouped_svr)
accuracy_grouped_knn = accuracy_score(actual_grouped, predicted_grouped_knn)

# Report the grouped accuracy of every model as a percentage.
print("🔵 Grouped Accuracy for Each Model:")
print(f"XGBoost Regressor: {accuracy_grouped_xgb:.2%}")
print(f"Random Forest Regressor: {accuracy_grouped_rf:.2%}")
print(f"Support Vector Regressor (SVR): {accuracy_grouped_svr:.2%}")
print(f"K-Nearest Neighbors Regressor (KNN): {accuracy_grouped_knn:.2%}")
