- SMOTE-ENN
- REMOVE MULTICOLLINEARITY
- BPSO VANILLA (BASED RF)

In [None]:
import pandas as pd

# Load the raw dataset from the project-relative CSV file.
data = pd.read_csv("data/dataset-tbp.csv")

# Preview the first rows as a quick sanity check of the load.
data.head()

# Eksplorasi Data

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Drop the " Net Income Flag" column (the leading space is part of the name).
# This cell overwrites `data`, so without errors="ignore" a second execution
# would raise KeyError because the column is already gone; ignoring a missing
# label keeps the cell safe to re-run.
data = data.drop(columns=[" Net Income Flag"], errors="ignore")

## Distribusi Variabel Target

In [None]:
# Class frequencies of the target, computed once and reused for both outputs.
target_counts = data['Bankrupt?'].value_counts()

# Numeric view
print(target_counts)

# Bar chart of the same counts
target_counts.plot(kind='bar', title='Distribusi Bankrupt?')

## Distribusi Variabel Fitur

# Preprocessing

## Balancing dengan SMOTE-ENN

In [None]:
# The hold-out split is made before any balancing so that the test set keeps
# the original class distribution.
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors.
y = data['Bankrupt?']
X = data.drop(columns=['Bankrupt?'])

# Stratified 80/20 split with a fixed seed.
(X_train_original, X_test,
 y_train_original, y_test) = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the shapes first, then the class counts of each partition.
print("Data sebelum oversampling:")
for label, frame in (
    ("X_train_original shape:", X_train_original),
    ("X_test shape:", X_test),
):
    print(label, frame.shape)
for label, target in (
    ("y_train_original distribution:", y_train_original),
    ("y_test distribution:", y_test),
):
    print(label)
    print(target.value_counts())

In [None]:
import imblearn
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours


# Build the combined resampler from its two stages: SMOTE over-sampling
# followed by Edited Nearest Neighbours cleaning.
# NOTE(review): an explicitly passed EditedNearestNeighbours() keeps its own
# default sampling strategy, which may differ from the one SMOTEENN uses when
# `enn` is left unset -- confirm this is the intended cleaning behaviour.
oversampler = SMOTE(random_state=42)
cleaner = EditedNearestNeighbours()
smote_enn = SMOTEENN(smote=oversampler, enn=cleaner, random_state=42)

# Resample the training partition only; the test partition is never resampled.
X_train, y_train = smote_enn.fit_resample(X_train_original, y_train_original)

# Same four-line summary for the resampled training set and the raw test set.
partition_summaries = (
    ("Data setelah SMOTE-ENN pada training set:",
     "X_train shape:", X_train, "y_train distribution:", y_train),
    ("\nData test tetap tidak tersentuh (mencegah data leakage):",
     "X_test shape:", X_test, "y_test distribution:", y_test),
)
for heading, shape_label, features, dist_label, target in partition_summaries:
    print(heading)
    print(shape_label, features.shape)
    print(dist_label)
    print(target.value_counts())

In [None]:
# Shapes of all four partitions after balancing the training set.
print("Final data shapes:")
for label, obj in (
    ("X_train shape (after SMOTE-ENN):", X_train),
    ("X_test shape (original, no balancing):", X_test),
    ("y_train shape (after SMOTE-ENN):", y_train),
    ("y_test shape (original, no balancing):", y_test),
):
    print(label, obj.shape)

# Class counts: balanced training target versus untouched test target.
print("\nFinal class distribution:")
for heading, target in (
    ("Training set (setelah SMOTE-ENN):", y_train),
    ("\nTest set (original distribution):", y_test),
):
    print(heading)
    print(target.value_counts())

## Seleksi Fitur Berdasarkan Korelasi

Hapus multikolinearitas

hapus fitur yang memiliki nilai yang sama

In [None]:
# Pairwise correlation matrix of the (resampled) training features.
correlation_matrix = X_train.corr()

print("Matriks korelasi shape:", correlation_matrix.shape)
print("\nMencari pasangan fitur dengan korelasi tinggi (>0.9 atau <-0.9)...")

# Walk the strict upper triangle so that every unordered pair is visited once
# and collect those whose absolute correlation exceeds 0.9.
column_names = list(correlation_matrix.columns)
high_corr_pairs = []
for row_pos, left_name in enumerate(column_names):
    for col_pos in range(row_pos + 1, len(column_names)):
        pair_corr = correlation_matrix.iloc[row_pos, col_pos]
        # Written as "not >" so that NaN correlations are skipped as well.
        if not abs(pair_corr) > 0.9:
            continue
        high_corr_pairs.append(dict(
            feature1=left_name,
            feature2=column_names[col_pos],
            correlation=pair_corr,
        ))

# Report the findings.
if not high_corr_pairs:
    print("\nTidak ditemukan pasangan fitur dengan korelasi > 0.9 atau < -0.9")
else:
    high_corr_df = pd.DataFrame(high_corr_pairs)
    print(f"\nDitemukan {len(high_corr_pairs)} pasangan fitur dengan korelasi tinggi:")
    print(high_corr_df)

In [None]:
# Resolve every highly correlated pair by dropping one of its two features.
features_to_remove = set()

if high_corr_pairs:
    # Correlation of each training feature with the target, used to decide
    # which member of a pair is kept. The target is attached to a copy of the
    # training features so a single corr() call yields the needed column.
    train_data_with_target = X_train.copy()
    train_data_with_target['Bankrupt?'] = y_train
    correlation_matrix_target = train_data_with_target.corr()['Bankrupt?'].drop('Bankrupt?')

    for pair in high_corr_pairs:
        feature1, feature2 = pair['feature1'], pair['feature2']

        # Once either member has been dropped the pair is no longer present in
        # the retained feature set. Processing it anyway would drop a feature
        # because of a partner that is already gone (over-removal).
        if feature1 in features_to_remove or feature2 in features_to_remove:
            continue

        # Absolute correlation with the target for both members of the pair.
        corr_target1 = abs(correlation_matrix_target[feature1])
        corr_target2 = abs(correlation_matrix_target[feature2])

        # Drop the member that is less correlated with the target
        # (feature2 is dropped on ties and when the comparison involves NaN).
        if corr_target1 < corr_target2:
            features_to_remove.add(feature1)
            print(f"Menghapus {feature1} (korelasi dengan target: {corr_target1:.4f}) dan mempertahankan {feature2} (korelasi dengan target: {corr_target2:.4f})")
        else:
            features_to_remove.add(feature2)
            print(f"Menghapus {feature2} (korelasi dengan target: {corr_target2:.4f}) dan mempertahankan {feature1} (korelasi dengan target: {corr_target1:.4f})")

    # Surviving columns, in their original order.
    features_to_keep = [col for col in X_train.columns if col not in features_to_remove]

    print(f"\nFitur yang akan dihapus: {list(features_to_remove)}")
    print(f"Jumlah fitur sebelum seleksi korelasi: {X_train.shape[1]}")
    print(f"Jumlah fitur setelah seleksi korelasi: {len(features_to_keep)}")

    # Apply the same column selection to the training and the test partition.
    X_train_filtered = X_train[features_to_keep]
    X_test_filtered = X_test[features_to_keep]

    print(f"\nShape setelah filtering:")
    print(f"X_train_filtered: {X_train_filtered.shape}")
    print(f"X_test_filtered: {X_test_filtered.shape}")

else:
    # Nothing to remove: carry both partitions forward unchanged.
    print("Tidak ada fitur yang perlu dihapus karena korelasi.")
    X_train_filtered = X_train.copy()
    X_test_filtered = X_test.copy()

## Binary Particle Swarm Optimization (BPSO) untuk Seleksi Fitur

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

class BinaryPSO:
    """Binary Particle Swarm Optimization (BPSO) for wrapper feature selection.

    Every particle is a 0/1 mask over the feature columns. Velocities follow
    the standard PSO update and are turned into per-bit selection
    probabilities by a sigmoid transfer function. The fitness of a mask is
    the mean 3-fold cross-validated accuracy of the evaluation model on the
    selected columns, minus a penalty of 0.1 times the fraction of features
    kept. Masks that violate the feature-count bounds score 0.

    Attributes set by ``fit``:
    - gbest_position: best 0/1 mask found (scores 0 if no feasible mask was found)
    - gbest_fitness: fitness of that mask
    - fitness_history: best fitness after initialisation and after each iteration
    - max_features_: upper bound on the number of features used during the fit
    """

    def __init__(self, n_particles=30, n_iterations=50, w=0.5, c1=1.5, c2=1.5,
                 min_features=5, max_features=None, random_state=None):
        """
        Store the search configuration.

        Parameters:
        - n_particles: number of particles in the swarm
        - n_iterations: number of PSO iterations
        - w: inertia weight
        - c1, c2: cognitive and social acceleration coefficients
        - min_features: minimum number of selected features for a feasible mask
        - max_features: maximum number of selected features; None lets ``fit``
          derive min(30, n_features // 2) from the data
        - random_state: optional seed for a private random generator; None
          draws from NumPy's global random state
        """
        self.n_particles = n_particles
        self.n_iterations = n_iterations
        self.w = w
        self.c1 = c1
        self.c2 = c2
        self.min_features = min_features
        self.max_features = max_features
        self.random_state = random_state

    def sigmoid(self, x):
        """Sigmoid transfer function; the input is clipped to avoid overflow in exp."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def _feature_limit(self):
        """Return the configured upper bound, else the one derived by the last fit."""
        if self.max_features is not None:
            return self.max_features
        return getattr(self, "max_features_", None)

    def evaluate_fitness(self, X, y, selected_features, model):
        """
        Score one feature mask.

        Parameters:
        - X: 2-D array of features (columns are indexed with the mask)
        - y: target values
        - selected_features: 0/1 array with one entry per column of X
        - model: estimator handed to cross_val_score

        Returns:
        - 0 when no feature is selected, when the selection is outside the
          [min_features, max_features] bounds, or when cross-validation fails;
          otherwise mean CV accuracy minus 0.1 * (selected / total features).
        """
        if np.sum(selected_features) == 0:
            return 0

        X_selected = X[:, selected_features.astype(bool)]
        n_selected = X_selected.shape[1]

        # Fewer features than the required minimum
        if n_selected < self.min_features:
            return 0

        # More features than the allowed maximum (a falsy limit disables the check)
        limit = self._feature_limit()
        if limit and n_selected > limit:
            return 0

        try:
            scores = cross_val_score(model, X_selected, y, cv=3, scoring='accuracy')

            # Fitness = accuracy - penalty for keeping many features
            fitness = np.mean(scores) - (n_selected / X.shape[1]) * 0.1
            return fitness
        except Exception as exc:
            # Keep the best-effort behaviour (a failing mask scores 0) but make
            # the failure visible, and let KeyboardInterrupt/SystemExit through.
            import warnings
            warnings.warn(f"BinaryPSO fitness evaluation failed, scoring 0: {exc!r}")
            return 0

    def fit(self, X, y, model=None):
        """
        Run BPSO and store the best feature mask found.

        Parameters:
        - X: 2-D array of features
        - y: target values
        - model: evaluation estimator (default: RandomForestClassifier with
          50 trees and random_state=42)

        Returns:
        - self, with gbest_position, gbest_fitness, fitness_history and
          max_features_ set
        """
        if model is None:
            model = RandomForestClassifier(n_estimators=50, random_state=42)

        n_features = X.shape[1]

        # Derive the effective upper bound without overwriting the user-facing
        # max_features, so a later fit on different data derives it afresh.
        if self.max_features is None:
            self.max_features_ = min(30, n_features // 2)
        else:
            self.max_features_ = self.max_features

        # Private generator when seeded; otherwise NumPy's global random state.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Initial binary positions
        positions = rng.randint(0, 2, (self.n_particles, n_features))

        # Initial velocities
        velocities = rng.uniform(-1, 1, (self.n_particles, n_features))

        # Initial fitness of every particle
        fitness = np.array([self.evaluate_fitness(X, y, pos, model) for pos in positions])

        # Personal bests
        pbest_positions = positions.copy()
        pbest_fitness = fitness.copy()

        # Global best
        gbest_idx = np.argmax(fitness)
        gbest_position = positions[gbest_idx].copy()
        gbest_fitness = fitness[gbest_idx]

        # Best fitness per step, kept for plotting
        fitness_history = [gbest_fitness]

        print(f"Inisialisasi BPSO:")
        print(f"Jumlah partikel: {self.n_particles}")
        print(f"Jumlah fitur: {n_features}")
        print(f"Fitness awal terbaik: {gbest_fitness:.4f}")
        print(f"Jumlah fitur terpilih awal: {np.sum(gbest_position)}")

        # Main PSO loop
        for iteration in range(self.n_iterations):
            for i in range(self.n_particles):
                # Velocity update: inertia + cognitive pull + social pull
                r1, r2 = rng.random_sample(n_features), rng.random_sample(n_features)

                velocities[i] = (self.w * velocities[i] +
                               self.c1 * r1 * (pbest_positions[i] - positions[i]) +
                               self.c2 * r2 * (gbest_position - positions[i]))

                # Position update: each bit is set with probability sigmoid(velocity)
                sigmoid_v = self.sigmoid(velocities[i])
                positions[i] = (rng.random_sample(n_features) < sigmoid_v).astype(int)

                # Score the new position
                current_fitness = self.evaluate_fitness(X, y, positions[i], model)

                # Update personal best
                if current_fitness > pbest_fitness[i]:
                    pbest_positions[i] = positions[i].copy()
                    pbest_fitness[i] = current_fitness

                # Update global best
                if current_fitness > gbest_fitness:
                    gbest_position = positions[i].copy()
                    gbest_fitness = current_fitness

            fitness_history.append(gbest_fitness)

            # Progress report every 10 iterations
            if (iteration + 1) % 10 == 0:
                print(f"Iterasi {iteration + 1}: Fitness terbaik = {gbest_fitness:.4f}, "
                      f"Fitur terpilih = {np.sum(gbest_position)}")

        self.gbest_position = gbest_position
        self.gbest_fitness = gbest_fitness
        self.fitness_history = fitness_history

        return self

# Run BPSO feature selection on the correlation-filtered training set.
print("Memulai Binary Particle Swarm Optimization untuk Feature Selection...")
print("="*70)

# The swarm's initial positions, velocities and bit flips are drawn from
# NumPy's global random state. Seed it so the selected feature subset, and
# every metric computed from it below, is reproducible across runs.
np.random.seed(42)

# BinaryPSO indexes columns with a boolean mask, so hand it plain NumPy arrays.
X_train_array = X_train_filtered.values
y_train_array = y_train.values

# Swarm configuration
bpso = BinaryPSO(
    n_particles=75,
    n_iterations=100,
    w=0.5,
    c1=0.7,
    c2=0.7,
    min_features=10,
    max_features=35
)

# Run the search (every fitness evaluation is a 3-fold cross-validation).
bpso.fit(X_train_array, y_train_array)

print(f"\nHasil BPSO:")
print(f"Fitness terbaik: {bpso.gbest_fitness:.4f}")
print(f"Jumlah fitur terpilih: {np.sum(bpso.gbest_position)}")

# Map the winning 0/1 mask back to column names.
selected_feature_indices = np.where(bpso.gbest_position == 1)[0]
selected_features_bpso = X_train_filtered.columns[selected_feature_indices]

print(f"\nFitur yang terpilih oleh BPSO:")
for i, feature in enumerate(selected_features_bpso):
    print(f"{i+1}. {feature}")

In [None]:
# Import yang diperlukan
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Two-panel figure: convergence curve on the left, one unit-height bar per
# selected feature on the right.
fig, (ax_convergence, ax_selected) = plt.subplots(1, 2, figsize=(12, 5))

ax_convergence.plot(bpso.fitness_history)
ax_convergence.set_title('BPSO Convergence')
ax_convergence.set_xlabel('Iteration')
ax_convergence.set_ylabel('Best Fitness')
ax_convergence.grid(True, alpha=0.3)

n_selected = len(selected_features_bpso)
bar_positions = range(n_selected)
ax_selected.bar(bar_positions, [1] * n_selected)
ax_selected.set_title(f'Selected Features by BPSO (Total: {n_selected})')
ax_selected.set_xlabel('Feature Index')
ax_selected.set_ylabel('Selected')
ax_selected.set_xticks(bar_positions)
ax_selected.set_xticklabels([f'F{i}' for i in bar_positions], rotation=45)

fig.tight_layout()
plt.show()

# Restrict both partitions to the columns chosen by BPSO.
X_train_selected = X_train_filtered[selected_features_bpso]
X_test_selected = X_test_filtered[selected_features_bpso]

print(f"\nDataset dengan fitur terpilih BPSO:")
print(f"X_train_selected shape: {X_train_selected.shape}")
print(f"X_test_selected shape: {X_test_selected.shape}")
print(f"\nFitur yang digunakan: {list(selected_features_bpso)}")

# Compare test accuracy of the same Random Forest configuration on the full
# (correlation-filtered) feature set and on the BPSO subset. The estimator is
# refitted in place, so it ends up trained on the BPSO subset.
print(f"\nEvaluasi dengan Random Forest:")
rf_eval = RandomForestClassifier(n_estimators=100, random_state=42)

# All features remaining after the correlation filter
y_pred_all = rf_eval.fit(X_train_filtered, y_train).predict(X_test_filtered)
accuracy_all = accuracy_score(y_test, y_pred_all)

# BPSO-selected features only
y_pred_bpso = rf_eval.fit(X_train_selected, y_train).predict(X_test_selected)
accuracy_bpso = accuracy_score(y_test, y_pred_bpso)

n_features_all = X_train_filtered.shape[1]
n_features_bpso = X_train_selected.shape[1]
n_features_dropped = n_features_all - n_features_bpso

print(f"Akurasi dengan semua fitur ({n_features_all} fitur): {accuracy_all:.4f}")
print(f"Akurasi dengan fitur BPSO ({n_features_bpso} fitur): {accuracy_bpso:.4f}")
print(f"Pengurangan fitur: {n_features_dropped} fitur")
print(f"Persentase pengurangan: {(n_features_dropped / n_features_all * 100):.1f}%")

### Analisis Fitur Terpilih BPSO

In [None]:
# Closer look at the features chosen by BPSO.
print("Analisis Feature Selection dengan BPSO")
print("="*50)

# Absolute correlation of each selected feature with the target, strongest
# first. The target is re-indexed to the feature frame so rows pair up by
# position rather than by label.
target_aligned = pd.Series(y_train.values, index=X_train_selected.index)
correlation_with_target = (
    X_train_selected
    .corrwith(target_aligned)
    .abs()
    .sort_values(ascending=False)
)

print(f"\nKorelasi fitur terpilih dengan target (diurutkan):")
for feature, corr in correlation_with_target.items():
    print(f"{feature}: {corr:.4f}")

# Random Forest feature importances on the selected subset, highest first.
rf_importance = RandomForestClassifier(n_estimators=100, random_state=42)
rf_importance.fit(X_train_selected, y_train)

feature_importance = pd.DataFrame({
    'feature': selected_features_bpso,
    'importance': rf_importance.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\nFeature Importance (Random Forest) untuk fitur terpilih BPSO:")
for ranked in feature_importance.itertuples(index=False):
    print(f"{ranked.feature}: {ranked.importance:.4f}")

# Stacked horizontal bar charts: correlation on top, importance below.
fig, (ax_corr, ax_imp) = plt.subplots(2, 1, figsize=(12, 8))

corr_positions = range(len(correlation_with_target))
ax_corr.barh(corr_positions, correlation_with_target.values)
ax_corr.set_yticks(corr_positions)
ax_corr.set_yticklabels(correlation_with_target.index)
ax_corr.set_xlabel('Absolute Correlation with Target')
ax_corr.set_title('Korelasi Fitur Terpilih dengan Target')
ax_corr.invert_yaxis()

imp_positions = range(len(feature_importance))
ax_imp.barh(imp_positions, feature_importance['importance'].values)
ax_imp.set_yticks(imp_positions)
ax_imp.set_yticklabels(feature_importance['feature'].values)
ax_imp.set_xlabel('Feature Importance')
ax_imp.set_title('Feature Importance (Random Forest)')
ax_imp.invert_yaxis()

fig.tight_layout()
plt.show()

# Summary statistics of the selection.
n_available = X_train_filtered.shape[1]
n_chosen = len(selected_features_bpso)

print(f"\nStatistik Seleksi Fitur BPSO:")
print(f"Total fitur awal (setelah correlation filter): {n_available}")
print(f"Fitur terpilih oleh BPSO: {n_chosen}")
print(f"Rasio seleksi: {n_chosen/n_available:.2f}")
print(f"Rata-rata korelasi dengan target: {correlation_with_target.mean():.4f}")
print(f"Rata-rata feature importance: {feature_importance['importance'].mean():.4f}")

# Modelling

## Decision Tree

In [None]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train a decision tree on the BPSO-selected training features.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_selected, y_train)

# Predict on the hold-out set restricted to the same columns.
y_pred = dt.predict(X_test_selected)

# Evaluation: confusion matrix, per-class report, overall accuracy.
for heading, result in (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
):
    print(heading)
    print(result)
print("Accuracy:", accuracy_score(y_test, y_pred))

## Logistic Regression

In [None]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train logistic regression on the BPSO-selected training features.
# NOTE(review): the features are passed unscaled here, unlike in the SVM cell;
# confirm that this is intended.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_selected, y_train)

# Predict on the hold-out set restricted to the same columns.
y_pred_lr = lr.predict(X_test_selected)

# Evaluation: confusion matrix, per-class report, overall accuracy.
for heading, result in (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred_lr)),
    ("\nClassification Report:", classification_report(y_test, y_pred_lr)),
):
    print(heading)
    print(result)
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

## Random Forest

In [None]:
## Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train a random forest on the BPSO-selected training features.
rf = RandomForestClassifier(random_state=42).fit(X_train_selected, y_train)

# Predict on the hold-out set restricted to the same columns.
y_pred_rf = rf.predict(X_test_selected)

# Evaluation: confusion matrix, per-class report, overall accuracy.
for heading, result in (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred_rf)),
    ("\nClassification Report:", classification_report(y_test, y_pred_rf)),
):
    print(heading)
    print(result)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

## Support Vector Machine (SVM)

In [None]:
## Support Vector Machine (SVM)
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

# Standardise the BPSO-selected features: the scaler is fitted on the training
# partition only and then applied to both partitions.
scaler = StandardScaler().fit(X_train_selected)
X_train_scaled = scaler.transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# RBF-kernel SVM with probability estimates enabled.
svm = SVC(kernel='rbf', probability=True, random_state=42).fit(X_train_scaled, y_train)

# Predict on the scaled hold-out set.
y_pred_svm = svm.predict(X_test_scaled)

# Evaluation: confusion matrix, per-class report, overall accuracy.
for heading, result in (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred_svm)),
    ("\nClassification Report:", classification_report(y_test, y_pred_svm)),
):
    print(heading)
    print(result)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## XGBoost
# Train a gradient-boosted tree model on the BPSO-selected training features.
xgb = XGBClassifier(random_state=42).fit(X_train_selected, y_train)

# Predict on the hold-out set restricted to the same columns.
y_pred_xgb = xgb.predict(X_test_selected)

# Evaluation: confusion matrix, per-class report, overall accuracy.
for heading, result in (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred_xgb)),
    ("\nClassification Report:", classification_report(y_test, y_pred_xgb)),
):
    print(heading)
    print(result)
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## LightGBM
# Train a LightGBM classifier on the BPSO-selected training features.
lgbm = LGBMClassifier(random_state=42).fit(X_train_selected, y_train)

# Predict on the hold-out set restricted to the same columns.
y_pred_lgbm = lgbm.predict(X_test_selected)

# Evaluation: confusion matrix, per-class report, overall accuracy.
for heading, result in (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred_lgbm)),
    ("\nClassification Report:", classification_report(y_test, y_pred_lgbm)),
):
    print(heading)
    print(result)
print("Accuracy:", accuracy_score(y_test, y_pred_lgbm))

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# import pandas as pd

# # Hitung confusion matrix
# cm = confusion_matrix(y_test, y_pred_lgbm)

# # Plot confusion matrix
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# plt.title('Confusion Matrix')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.savefig('confusion_matrix.png')  # Simpan sebagai gambar
# plt.close()

# # Buat classification report sebagai DataFrame
# report = classification_report(y_test, y_pred_lgbm, output_dict=True)
# report_df = pd.DataFrame(report).transpose()

# # Simpan ke HTML
# with open('evaluation_report.html', 'w') as f:
#     f.write("<h1>Model Evaluation Report</h1>")
#     f.write("<h2>Confusion Matrix</h2>")
#     f.write('<img src="confusion_matrix.png" width="500"><br>')
#     f.write("<h2>Classification Report</h2>")
#     f.write(report_df.to_html())
#     f.write(f"<h2>Accuracy: {accuracy_score(y_test, y_pred_lgbm):.4f}</h2>")

# print("Laporan evaluasi telah disimpan sebagai 'evaluation_report.html'.")