Import Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV

Load Dataset

In [2]:
df = pd.read_csv("data_clustering_inverse.csv")
df.head()

Unnamed: 0,TransactionAmount,TransactionDate,TransactionType,Location,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,PreviousTransactionDate,Target,Cluster
0,14.09,680,1,36,0,70.0,0,81.0,1.0,5112.21,105,1,1
1,376.24,1178,1,15,0,68.0,0,141.0,1.0,13758.91,192,2,2
2,126.29,1262,1,23,2,19.0,3,56.0,1.0,1122.35,41,2,2
3,184.5,818,1,33,2,26.0,3,25.0,1.0,8569.06,163,1,1
4,92.15,635,1,28,0,18.0,3,172.0,1.0,781.68,13,1,1


Data Splitting

In [3]:
X = df.drop(columns=['Target', 'Cluster'])
y = df['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Model

In [6]:
knn = KNeighborsClassifier().fit(X_train, y_train)
dt = DecisionTreeClassifier().fit(X_train, y_train)
rf = RandomForestClassifier().fit(X_train, y_train)
svm = SVC().fit(X_train, y_train)
nb = GaussianNB().fit(X_train, y_train)
print("Model training selesai.")

Model training selesai.


In [7]:
# Classification Report

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    results = {
        'Confusion Matrix': cm,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision (macro)': precision_score(y_test, y_pred, average='macro'),
        'Recall (macro)': recall_score(y_test, y_pred, average='macro'),
        'F1-Score (macro)': f1_score(y_test, y_pred, average='macro'),
        'Classification Report': classification_report(y_test, y_pred)
    }
    return results

results = {
    'K-Nearest Neighbors (KNN)': evaluate_model(knn, X_test, y_test),
    'Decision Tree (DT)': evaluate_model(model, X_test, y_test),
    'Random Forest (RF)': evaluate_model(rf, X_test, y_test),
    'Support Vector Machine (SVM)': evaluate_model(svm, X_test, y_test),
    'Naive Bayes (NB)': evaluate_model(nb, X_test, y_test)
}

rows = []
for model_name, metrics in results.items():
    rows.append({
        'Model': model_name,
        'Accuracy': metrics['Accuracy'],
        'Precision': metrics['Precision (macro)'],
        'Recall': metrics['Recall (macro)'],
        'F1-Score': metrics['F1-Score (macro)']
    })

summary_df = pd.DataFrame(rows)
print(summary_df)

                          Model  Accuracy  Precision    Recall  F1-Score
0     K-Nearest Neighbors (KNN)  0.941667   0.941188  0.941012  0.940790
1            Decision Tree (DT)  0.997917   0.998004  0.997877  0.997934
2            Random Forest (RF)  0.997917   0.998004  0.997877  0.997934
3  Support Vector Machine (SVM)  0.979167   0.979325  0.978884  0.978825
4              Naive Bayes (NB)  0.985417   0.985390  0.985253  0.985275


In [8]:
# Saved Model

joblib.dump(knn, 'explore_KNN_classification.h5')
joblib.dump(dt, 'explore_KNN_classification.h5')
joblib.dump(rf, 'explore_RF_classification.h5')
joblib.dump(svm, 'explore_SVM_classification.h5')
joblib.dump(nb, 'explore_NB_classification.h5')

['explore_NB_classification.h5']

Hyperparameter Tuning

In [9]:
param_grid = {
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Inisialisasi GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Output hasil terbaik
print(f"Best parameters (Grid Search): {grid_search.best_params_}")
best_model_grid = grid_search.best_estimator_

# Evaluasi performa model pada test set
grid_search_score = best_model_grid.score(X_test, y_test)
print(f"Accuracy after Grid Search: {grid_search_score:.2f}")

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters (Grid Search): {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy after Grid Search: 1.00


In [10]:
# Evaluation
y_pred_tuned = best_model_grid.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_tuned)
precision = precision_score(y_test, y_pred_tuned, average='macro')
recall = recall_score(y_test, y_pred_tuned, average='macro')
f1 = f1_score(y_test, y_pred_tuned, average='macro')

print("Evaluasi Model Setelah Hyperparameter Tuning")
print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)
print("F1-Score :", f1)

Evaluasi Model Setelah Hyperparameter Tuning
Accuracy : 0.9979166666666667
Precision: 0.9980039920159681
Recall   : 0.9978768577494691
F1-Score : 0.9979340362726944


In [11]:
# Saved Tuning
joblib.dump(best_model_grid, 'tuning_classification.h5')

['tuning_classification.h5']