# **Project UAS Kecerdasan Buatan**

## **Library**

In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import optuna

  from .autonotebook import tqdm as notebook_tqdm


## **Data**

In [2]:
data = pd.read_csv('adult.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [3]:
X = data.drop('income', axis=1)
y = data['income']

In [4]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States


## **EDA**

In [5]:
data[data == '?'] = np.nan
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


## **Preprocessing**

In [6]:
X_train, X_val, y_train, y_val = train_test_split(data.drop('income', axis=1), data['income'], test_size=0.2, random_state=42)

In [7]:
# Encode categorical features in X_train and X_val
categorical_cols = X_train.select_dtypes(include=['object']).columns

le_dict = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Fit on all data (train + val) to handle unseen labels
    all_values = pd.concat([X_train[col], X_val[col]], axis=0).astype(str)
    le.fit(all_values)
    X_train[col] = le.transform(X_train[col].astype(str))
    X_val[col] = le.transform(X_val[col].astype(str))
    le_dict[col] = le

# Encode target variable if needed
target_le = LabelEncoder()
y_train = target_le.fit_transform(y_train)
y_val = target_le.transform(y_val)

## **Modelling**

In [8]:
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_val)

print("Decision Tree Regressor Accuracy:", accuracy_score(y_val, y_pred_dt.round()))

Decision Tree Regressor Accuracy: 0.8111469368954399


In [9]:
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_val)

print("KNN Regressor Accuracy:", accuracy_score(y_val, y_pred_knn.round()))
print()

KNN Regressor Accuracy: 0.7785966528481498



In [10]:
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_val)

print("XGBoost Regressor Accuracy:", accuracy_score(y_val, y_pred_xgb.round()))
print(classification_report(y_val, y_pred_xgb.round(), target_names=target_le.classes_))

XGBoost Regressor Accuracy: 0.8701059419622293
              precision    recall  f1-score   support

       <=50K       0.90      0.94      0.92      4976
        >50K       0.77      0.65      0.70      1537

    accuracy                           0.87      6513
   macro avg       0.83      0.79      0.81      6513
weighted avg       0.87      0.87      0.87      6513



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## **Optimization**

### **Genetic Algorithm**

### **Particle Swarm Optimization**

In [12]:
import numpy as np
from pyswarm import pso
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
class PSOHyperparameterTuner:
    def __init__(self, X_train, y_train, model_type='dt', X_val=None, y_val=None, cv=3):
        """
        Inisialisasi tuner hyperparameter dengan Particle Swarm Optimization (PSO) menggunakan pyswarm
        
        Parameters
        ----------
        X_train : Data fitur training
        y_train : Label training
        model_type : Tipe model ('dt' untuk Decision Tree, 'knn' untuk K-Nearest Neighbors)
        X_val : Data fitur validasi (opsional)
        y_val : Label validasi (opsional)
        cv : Jumlah fold untuk cross-validation
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.cv = cv
        self.model_type = model_type.lower()
        self.best_solution = None
        self.best_params = None
        self.fitness_history = []
        
        # Definisikan parameter bounds berdasarkan tipe model
        if self.model_type == 'dt':
            self.param_bounds = {
                'max_depth': (3, 30, 1),              # Integer parameter
                'min_samples_split': (2, 20, 1),      # Integer parameter
                'min_samples_leaf': (1, 20, 1),       # Integer parameter
                'max_features': (0.1, 1.0, 0.05),     # Float parameter (proportion)
                'criterion': (0, 1, 1)                # Categorical: 0=gini, 1=entropy
            }
        elif self.model_type == 'knn':
            self.param_bounds = {
                'n_neighbors': (1, 30, 1),            # Integer parameter
                'weights': (0, 1, 1),                 # Categorical: 0=uniform, 1=distance
                'p': (1, 2, 1),                       # Integer parameter (Manhattan=1, Euclidean=2)
                'leaf_size': (10, 50, 1)              # Integer parameter
            }
        else:
            raise ValueError("model_type harus 'dt' atau 'knn'")
        
        # Menyimpan nama parameter berdasarkan urutan
        self.param_names = list(self.param_bounds.keys())
        self.num_params = len(self.param_bounds)
        self.lower_bounds, self.upper_bounds = self._get_bounds()
    
    def _get_bounds(self):
        # Membuat lower dan upper bound untuk setiap parameter (semua dalam float)
        lowers, uppers = [], []
        for name in self.param_names:
            min_val, max_val, _ = self.param_bounds[name]
            lowers.append(float(min_val))
            uppers.append(float(max_val))
        return np.array(lowers), np.array(uppers)
    
    def decode_solution(self, solution):
        """
        Decode solusi PSO ke parameter model yang valid
        """
        params = {}
        for i, value in enumerate(solution):
            param_name = self.param_names[i]
            min_val, max_val, step = self.param_bounds[param_name]

            if param_name == 'criterion' and self.model_type == 'dt':
                value = 'gini' if value < 0.5 else 'entropy'
            elif param_name == 'weights' and self.model_type == 'knn':
                value = 'uniform' if value < 0.5 else 'distance'
            elif isinstance(step, int) or step == 1:
                value = int(round(value))
                value = min(max(value, min_val), max_val)
            else:
                # Float parameter, dibulatkan ke step terdekat
                value = round(float(value) / step) * step
                value = min(max(value, min_val), max_val)
                value = round(value, 6)
            params[param_name] = value
        return params
    
    def fitness_func(self, solution):
        """
        Fungsi fitness untuk PSO (minimize negative accuracy)
        """
        params = self.decode_solution(solution)
        # Buat model
        if self.model_type == 'dt':
            model = DecisionTreeClassifier(
                max_depth=None if params['max_depth'] >= 30 else int(params['max_depth']),
                min_samples_split=int(params['min_samples_split']),
                min_samples_leaf=int(params['min_samples_leaf']),
                max_features=params['max_features'] if params['max_features'] < 1.0 else None,
                criterion=params['criterion'],
                random_state=42
            )
        else:  # knn
            model = KNeighborsClassifier(
                n_neighbors=int(params['n_neighbors']),
                weights=params['weights'],
                p=int(params['p']),
                leaf_size=int(params['leaf_size'])
            )
        try:
            scores = cross_val_score(model, self.X_train, self.y_train, 
                                     cv=self.cv, scoring='accuracy')
            accuracy = np.mean(scores)
            # PSO melakukan minimisasi → return -accuracy
            self.fitness_history.append(accuracy)
            return -accuracy
        except Exception as e:
            print(f"Error evaluating solution: {e}")
            self.fitness_history.append(0)
            return 1  # nilai fitness buruk (karena minimisasi)
    
    def run_optimization(self, maxiter=30, swarmsize=20, debug=True):
        """
        Jalankan optimasi PSO
        """
        print("Memulai optimasi hyperparameter dengan PSO...")
        self.fitness_history = []
        # Jalankan PSO
        best_solution, best_fitness = pso(
            self.fitness_func,
            self.lower_bounds,
            self.upper_bounds,
            swarmsize=swarmsize,
            maxiter=maxiter,
            debug=debug
        )
        # Simpan hasil terbaik
        self.best_solution = best_solution
        self.best_params = self.decode_solution(best_solution)
        print("\nOptimasi selesai!")
        print(f"Best fitness (akurasi): {-best_fitness:.6f}")
        print(f"Best hyperparameters:")
        for param, value in self.best_params.items():
            print(f"  {param}: {value}")
        return self.best_params
    
    def get_best_model(self):
        """
        Mengembalikan model yang dilatih dengan parameter terbaik
        """
        if self.best_params is None:
            print("Jalankan run_optimization() terlebih dahulu!")
            return None
        if self.model_type == 'dt':
            best_model = DecisionTreeClassifier(
                max_depth=None if self.best_params['max_depth'] >= 30 else int(self.best_params['max_depth']),
                min_samples_split=int(self.best_params['min_samples_split']),
                min_samples_leaf=int(self.best_params['min_samples_leaf']),
                max_features=self.best_params['max_features'] if self.best_params['max_features'] < 1.0 else None,
                criterion=self.best_params['criterion'],
                random_state=42
            )
        else:  # knn
            best_model = KNeighborsClassifier(
                n_neighbors=int(self.best_params['n_neighbors']),
                weights=self.best_params['weights'],
                p=int(self.best_params['p']),
                leaf_size=int(self.best_params['leaf_size'])
            )
        best_model.fit(self.X_train, self.y_train)
        return best_model

    def plot_fitness_history(self):
        """
        Plot riwayat fitness selama optimasi
        """
        import matplotlib.pyplot as plt
        plt.figure(figsize=(12, 6))
        plt.plot(range(1, len(self.fitness_history) + 1), self.fitness_history, 
                 marker='o', linestyle='-', color='#27ae60')
        plt.title(f'Progression of Best Fitness Score for {self.model_type.upper()} Across Iterations (PSO)', 
                  fontsize=14)
        plt.xlabel('Iteration', fontsize=12)
        plt.ylabel('Best Fitness (Accuracy)', fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        return plt

**OPTIMIZING DT**

In [None]:
print("=" * 80)
print("OPTIMASI MODEL DECISION TREE DENGAN PARTICLE SWARM OPTIMIZATION")
print("=" * 80)

dt_pso_tuner = PSOHyperparameterTuner(
    X_train=X_train_scaled,
    y_train=y_train_balanced,
    model_type='dt',  # Decision Tree
    cv=3  # 3-fold cross-validation
)

# Jalankan optimasi PSO untuk Decision Tree
dt_pso_best_params = dt_pso_tuner.run_optimization(
    maxiter=20,    # Jumlah iterasi (bisa disesuaikan)
    swarmsize=20,  # Ukuran swarm
    debug=True     # Output progres
)

# Dapatkan model DT terbaik hasil optimasi PSO
dt_pso_best_model = dt_pso_tuner.get_best_model()

# Evaluasi model pada data validasi
y_pred_dt_pso = dt_pso_best_model.predict(X_val_scaled)
dt_pso_accuracy = accuracy_score(y_val, y_pred_dt_pso)

print("\nHasil evaluasi Decision Tree dengan hyperparameter optimal dari PSO:")
print(f"Accuracy: {dt_pso_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred_dt_pso, target_names=target_le.classes_))

# Plot riwayat fitness
dt_pso_plot = dt_pso_tuner.plot_fitness_history()
dt_pso_plot.savefig('dt_pso_fitness_history.png', dpi=300, bbox_inches='tight')
dt_pso_plot.show()

**OPTIMIZING KNN**

In [None]:
print("\n\n")
print("=" * 80)
print("OPTIMASI MODEL KNN DENGAN PARTICLE SWARM OPTIMIZATION")
print("=" * 80)

knn_pso_tuner = PSOHyperparameterTuner(
    X_train=X_train_scaled,
    y_train=y_train_balanced,
    model_type='knn',  # K-Nearest Neighbors
    cv=3  # 3-fold cross-validation
)

# Jalankan optimasi PSO untuk KNN
knn_pso_best_params = knn_pso_tuner.run_optimization(
    maxiter=20,    # Jumlah iterasi
    swarmsize=20,  # Ukuran swarm
    debug=True     # Output progres
)

# Dapatkan model KNN terbaik hasil optimasi PSO
knn_pso_best_model = knn_pso_tuner.get_best_model()

# Evaluasi model pada data validasi
y_pred_knn_pso = knn_pso_best_model.predict(X_val_scaled)
knn_pso_accuracy = accuracy_score(y_val, y_pred_knn_pso)

print("\nHasil evaluasi KNN dengan hyperparameter optimal dari PSO:")
print(f"Accuracy: {knn_pso_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred_knn_pso, target_names=target_le.classes_))

# Plot riwayat fitness
knn_pso_plot = knn_pso_tuner.plot_fitness_history()
knn_pso_plot.savefig('knn_pso_fitness_history.png', dpi=300, bbox_inches='tight')
knn_pso_plot.show()


**Model Comparison**

In [None]:
models = ['DT Original', 'DT + PSO', 'KNN Original', 'KNN + PSO']
accuracies = [
    accuracy_score(y_val, y_pred_dt.round()),         # DT Original
    dt_pso_accuracy,                                  # DT + PSO
    accuracy_score(y_val, y_pred_knn.round()),        # KNN Original
    knn_pso_accuracy                                  # KNN + PSO
]

plt.figure(figsize=(12, 6))
colors = ['#3498db', '#e67e22', '#9b59b6', '#16a085']
ax = sns.barplot(x=models, y=accuracies, palette=colors)
plt.title('Perbandingan Akurasi Model: Original vs PSO-optimized', fontsize=14)
plt.ylabel('Accuracy', fontsize=12)
plt.ylim(0.70, max(accuracies) + 0.05)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Tambahkan label nilai akurasi di atas bar
for i, acc in enumerate(accuracies):
    ax.text(i, acc + 0.005, f'{acc:.4f}', ha='center', fontsize=10)

plt.tight_layout()
plt.savefig('model_comparison_pso.png', dpi=300, bbox_inches='tight')
plt.show()

# Simpan hasil parameter terbaik
results_pso = {
    'Decision Tree': {
        'Original Accuracy': accuracy_score(y_val, y_pred_dt.round()),
        'PSO Optimized Accuracy': dt_pso_accuracy,
        'Improvement': (dt_pso_accuracy - accuracy_score(y_val, y_pred_dt.round())) * 100,
        'Best Parameters': dt_pso_best_params
    },
    'KNN': {
        'Original Accuracy': accuracy_score(y_val, y_pred_knn.round()),
        'PSO Optimized Accuracy': knn_pso_accuracy,
        'Improvement': (knn_pso_accuracy - accuracy_score(y_val, y_pred_knn.round())) * 100,
        'Best Parameters': knn_pso_best_params
    }
}

# Tampilkan ringkasan hasil
print("\n" + "=" * 80)
print("RINGKASAN HASIL OPTIMASI (PSO)")
print("=" * 80)

for model, result in results_pso.items():
    print(f"\n{model}:")
    print(f"  Original Accuracy: {result['Original Accuracy']:.4f}")
    print(f"  PSO Optimized Accuracy: {result['PSO Optimized Accuracy']:.4f}")
    print(f"  Improvement: {result['Improvement']:.2f}%")
    print("  Best Parameters:")
    for param, value in result['Best Parameters'].items():
        print(f"    - {param}: {value}")

##  **Evaluation**