# **1. Import Library**

Pada tahap ini, Anda perlu mengimpor beberapa pustaka (library) Python yang dibutuhkan untuk analisis data dan pembangunan model machine learning.

In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder


# **2. Memuat Dataset dari Hasil Clustering**

Memuat dataset hasil clustering dari file CSV ke dalam variabel DataFrame.

In [2]:
# Mount Google Drive at /content/drive so the dataset CSV read below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the clustering-result CSV from the mounted Drive into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/dataset (1)/cluster_dataset.csv')

In [4]:
# Preview the first rows of the loaded dataset.
data.head()

Unnamed: 0,TransactionType,Channel,TransactionAmount,CustomerAge,TransactionDuration,AccountBalance,Cluster
0,Debit,ATM,14.09,70.0,81.0,5112.21,3
1,Debit,ATM,376.24,68.0,141.0,13758.91,3
2,Debit,Online,126.29,19.0,56.0,1122.35,0
3,Debit,Online,184.5,26.0,25.0,8569.06,0
4,Credit,Online,13.45,26.0,198.0,7429.4,4


In [5]:
# Show column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2399 entries, 0 to 2398
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   TransactionType      2399 non-null   object 
 1   Channel              2399 non-null   object 
 2   TransactionAmount    2399 non-null   float64
 3   CustomerAge          2399 non-null   float64
 4   TransactionDuration  2399 non-null   float64
 5   AccountBalance       2399 non-null   float64
 6   Cluster              2399 non-null   int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 131.3+ KB


In [6]:
# Summary statistics of the numeric columns.
data.describe()

Unnamed: 0,TransactionAmount,CustomerAge,TransactionDuration,AccountBalance,Cluster
count,2399.0,2399.0,2399.0,2399.0,2399.0
mean,255.236257,44.704043,119.551063,5140.073097,1.856607
std,216.869261,17.755356,70.057469,3902.404953,1.390824
min,0.26,18.0,10.0,101.25,0.0
25%,78.385,27.0,63.0,1516.58,1.0
50%,197.49,45.0,112.0,4795.65,2.0
75%,374.555,59.0,162.0,7702.82,3.0
max,903.19,80.0,300.0,14977.99,4.0


In [7]:
# Count missing values per column.
data.isnull().sum()

Unnamed: 0,0
TransactionType,0
Channel,0
TransactionAmount,0
CustomerAge,0
TransactionDuration,0
AccountBalance,0
Cluster,0


# **3. Data Splitting**

Tahap Data Splitting bertujuan untuk memisahkan dataset menjadi dua bagian: data latih (training set) dan data uji (test set).

In [8]:
# Categorical columns that need to be converted to integer codes
categorical_columns = ['TransactionType', 'Channel']

# Encode each categorical column with its OWN LabelEncoder. Refitting one shared
# encoder on every column overwrites its learned classes, so the codes of the
# earlier columns could no longer be mapped back to their original labels.
# Usage: label_encoders['Channel'].inverse_transform(codes)
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Backward compatibility: `label_encoder` still refers to the encoder fitted on
# the last categorical column, as it did when a single instance was reused.
label_encoder = label_encoders[categorical_columns[-1]]

# Show the first rows to confirm the encoding was applied
data.head()

Unnamed: 0,TransactionType,Channel,TransactionAmount,CustomerAge,TransactionDuration,AccountBalance,Cluster
0,1,0,14.09,70.0,81.0,5112.21,3
1,1,0,376.24,68.0,141.0,13758.91,3
2,1,2,126.29,19.0,56.0,1122.35,0
3,1,2,184.5,26.0,25.0,8569.06,0
4,0,2,13.45,26.0,198.0,7429.4,4


In [9]:
# Numeric columns to be normalised
numeric_columns = ['TransactionAmount', 'CustomerAge', 'TransactionDuration', 'AccountBalance']

# Learn each column's min/max, then write the rescaled values back into `data`.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed further down, so test rows influence the scaling
# statistics. Consider fitting on the training rows only.
scaler = MinMaxScaler().fit(data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])

data.head()

Unnamed: 0,TransactionType,Channel,TransactionAmount,CustomerAge,TransactionDuration,AccountBalance,Cluster
0,1,0,0.015317,0.83871,0.244828,0.336832,3
1,1,0,0.4164,0.806452,0.451724,0.918055,3
2,1,2,0.139579,0.016129,0.158621,0.068637,0
3,1,2,0.204047,0.129032,0.051724,0.569198,0
4,0,2,0.014608,0.129032,0.648276,0.492591,4


In [10]:
# Separate the features (X) from the target (y)
X = data.drop(columns=['Cluster'])
y = data['Cluster']

# Hold out 20% of the rows as the test set. `stratify=y` keeps the proportion of
# every cluster label the same in the training and test sets, so the multi-class
# metrics are not distorted by a class being under-represented by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Print the shapes of both sets to confirm the split
print(f"Training set shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Test set shape: X_test={X_test.shape}, y_test={y_test.shape}")

Training set shape: X_train=(1919, 6), y_train=(1919,)
Test set shape: X_test=(480, 6), y_test=(480,)


# **4. Membangun Model Klasifikasi**


## **a. Membangun Model Klasifikasi**

Setelah memilih algoritma klasifikasi yang sesuai, langkah selanjutnya adalah melatih model menggunakan data latih.

Berikut adalah rekomendasi tahapannya.
1. Pilih algoritma klasifikasi yang sesuai, seperti Logistic Regression, Decision Tree, Random Forest, atau K-Nearest Neighbors (KNN).
2. Latih model menggunakan data latih.

In [11]:
# Part 1: model training
# Fit each classifier separately. A fixed random_state (same seed as the
# train/test split) makes the fitted models, and therefore the reported
# metrics, reproducible across Restart & Run All.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

print("Model training selesai.")

Model training selesai.


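Tulis narasi atau penjelasan algoritma yang Anda gunakan.

- **Decision Tree**: membagi data secara bertingkat berdasarkan nilai ambang fitur hingga setiap daun didominasi oleh satu kelas. Model ini mudah diinterpretasikan, tetapi rentan terhadap overfitting.
- **Random Forest**: menggabungkan banyak decision tree yang masing-masing dilatih pada sampel bootstrap dan subset fitur acak, lalu menentukan kelas melalui suara terbanyak. Hasilnya umumnya lebih stabil dibandingkan satu pohon tunggal.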

## **b. Evaluasi Model Klasifikasi**

In [14]:
# Evaluation helper that supports both binary and multi-class classification
def evaluate_model(model, X_test, y_test):
    """Evaluate a fitted classifier on a test set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : feature matrix passed to ``model.predict``.
    y_test : true labels for ``X_test``.

    Returns
    -------
    dict
        'Confusion Matrix' (rows = true label, columns = predicted label),
        'True Positive (TP)', 'False Positive (FP)', 'False Negative (FN)',
        'True Negative (TN)', 'Accuracy', and macro-averaged 'Precision',
        'Recall' and 'F1-Score'. For a 2x2 confusion matrix the four counts
        are scalars; otherwise each is an array with one one-vs-rest count
        per class, in the confusion matrix's class order.
    """
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    if cm.shape == (2, 2):
        # Binary case: ravel() yields the counts in (tn, fp, fn, tp) order
        tn, fp, fn, tp = cm.ravel()
    else:
        # Multi-class case: derive one-vs-rest counts for every class instead
        # of returning a placeholder string.
        tp = np.diag(cm)              # correctly predicted samples per class
        fp = cm.sum(axis=0) - tp      # predicted as the class but actually another
        fn = cm.sum(axis=1) - tp      # belong to the class but predicted as another
        tn = cm.sum() - (tp + fp + fn)

    results = {
        'Confusion Matrix': cm,
        'True Positive (TP)': tp,
        'False Positive (FP)': fp,
        'False Negative (FN)': fn,
        'True Negative (TN)': tn,
        'Accuracy': accuracy_score(y_test, y_pred),
        # Macro average: unweighted mean of the per-class scores
        'Precision': precision_score(y_test, y_pred, average='macro'),
        'Recall': recall_score(y_test, y_pred, average='macro'),
        'F1-Score': f1_score(y_test, y_pred, average='macro')
    }
    return results

# Score both baseline classifiers on the held-out test set
results = {
    name: evaluate_model(estimator, X_test, y_test)
    for name, estimator in (
        ('Decision Tree (DT)', dt),
        ('Random Forest (RF)', rf),
    )
}

# One summary row per model, keeping only the scalar metrics
summary_df = pd.DataFrame([
    {
        'Model': name,
        **{key: scores[key] for key in ('Accuracy', 'Precision', 'Recall', 'F1-Score')},
    }
    for name, scores in results.items()
])

# Print the summary table
print(summary_df)

                Model  Accuracy  Precision  Recall  F1-Score
0  Decision Tree (DT)       1.0        1.0     1.0       1.0
1  Random Forest (RF)       1.0        1.0     1.0       1.0


Berikut adalah **rekomendasi** tahapannya.
1. Lakukan prediksi menggunakan data uji.
2. Hitung metrik evaluasi seperti Accuracy dan F1-Score (Opsional: Precision dan Recall).
3. Buat confusion matrix untuk melihat detail prediksi benar dan salah.

Tulis hasil evaluasi algoritma yang digunakan, jika Anda menggunakan 2 algoritma, maka bandingkan hasilnya.

**Analisis dan Perbandingan**

1. Akurasi = 1.0 berarti semua model memprediksi seluruh data uji dengan benar.

2. Precision, Recall, dan F1-score = 1.0 menunjukkan bahwa tidak ada kesalahan klasifikasi (false positive & false negative = 0).

## **c. Tuning Model Klasifikasi (Optional)**

Gunakan GridSearchCV, RandomizedSearchCV, atau metode lainnya untuk mencari kombinasi hyperparameter terbaik

In [17]:
# Hyperparameter candidates explored for the Decision Tree
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Pass a fresh, seeded estimator to GridSearchCV instead of rebinding `dt`:
# overwriting `dt` with an unfitted tree would make the baseline evaluation
# cell fail if it were re-run, and the fixed seed keeps the search reproducible.
dt_grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
dt_grid.fit(X_train, y_train)

# Estimator with the best cross-validated accuracy
best_dt = dt_grid.best_estimator_

print("Best DT parameters:", dt_grid.best_params_)

Best DT parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [18]:
# Hyperparameter candidates explored for the Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Pass a fresh, seeded estimator to GridSearchCV instead of rebinding `rf`:
# overwriting `rf` with an unfitted forest would make the baseline evaluation
# cell fail if it were re-run, and the fixed seed keeps the search reproducible.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

# Estimator with the best cross-validated accuracy
best_rf = rf_grid.best_estimator_

# Test-set predictions of the tuned forest (kept as a module-level name; the
# evaluation helper recomputes predictions on its own).
y_pred = best_rf.predict(X_test)

print("Best RF parameters:", rf_grid.best_params_)

Best RF parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


## **d. Evaluasi Model Klasifikasi setelah Tuning (Optional)**

Berikut adalah rekomendasi tahapannya.
1. Gunakan model dengan hyperparameter terbaik.
2. Hitung ulang metrik evaluasi untuk melihat apakah ada peningkatan performa.


In [19]:
# Re-use the `evaluate_model` helper defined in the baseline evaluation section.
# Redefining it here would silently shadow the first definition; that helper
# already returns every metric needed below.
results = {
    'Decision Tree (DT)': evaluate_model(best_dt, X_test, y_test),
    'Random Forest (RF)': evaluate_model(best_rf, X_test, y_test),
}

# One summary row per tuned model, built in a single pass
rows = [
    {
        'Model': model_name,
        'Accuracy': metrics['Accuracy'],
        'Precision': metrics['Precision'],
        'Recall': metrics['Recall'],
        'F1-Score': metrics['F1-Score']
    }
    for model_name, metrics in results.items()
]

summary_df = pd.DataFrame(rows)

print(summary_df)

                Model  Accuracy  Precision  Recall  F1-Score
0  Decision Tree (DT)       1.0        1.0     1.0       1.0
1  Random Forest (RF)       1.0        1.0     1.0       1.0


## **e. Analisis Hasil Evaluasi Model Klasifikasi**

Berikut adalah **rekomendasi** tahapannya.
1. Bandingkan hasil evaluasi sebelum dan setelah tuning (jika dilakukan).

    * Sebelum:
      - Model mampu mengklasifikasikan seluruh data dengan benar
    * Setelah:
      - Tidak terjadi perubahan signifikan setelah tuning

2. Identifikasi kelemahan model:
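  - Kemungkinan overfitting: akurasi sempurna (1.0) patut dicurigai karena model sangat mungkin hanya menghafal pola data (overfit). Perlu dicatat bahwa notebook ini hanya mengukur performa pada data uji; akurasi pada data latih belum dihitung, sehingga dugaan ini masih perlu diverifikasi. Selain itu, karena label `Cluster` berasal dari hasil clustering, kemungkinan kelas-kelasnya memang mudah dipisahkan oleh fitur yang tersedia.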

3. Berikan rekomendasi tindakan lanjutan, seperti mengumpulkan data tambahan atau mencoba algoritma lain jika hasil belum memuaskan.
  - Untuk memastikan hasil evaluasi model lebih stabil dan tidak bias, disarankan untuk menggunakan k-fold cross-validation, misalnya 5-fold. Selain itu, penting untuk menguji model pada data yang belum pernah dilihat sebelumnya, seperti data uji eksternal atau data dunia nyata, guna mengetahui seberapa baik model dapat melakukan generalisasi.