In [None]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel, wilcoxon, shapiro
from sklearn.model_selection import GridSearchCV
import gc
import itertools
from sklearn.utils import resample
import ast
import json
import re
from sklearn.base import clone
import copy
import gower

import utils 
import model_train
from constants import DATASET_CONFIGS
import particle_swarm
import concurrent.futures


%load_ext autoreload
%autoreload 2

In [None]:
# Name of the dataset this notebook works on. It is the lookup key into
# DATASET_CONFIGS and one component of the base path built further down.
DATASET_NAME: str = "YourDatasetNameHere"  # Replace with your dataset name

In [None]:
# Retrieve the configuration dictionary for the dataset selected in DATASET_NAME.
# Fail early with an actionable message (same exception type as a plain lookup)
# when the placeholder name has not been replaced or is misspelled.
if DATASET_NAME not in DATASET_CONFIGS:
    raise KeyError(
        f"Unknown dataset {DATASET_NAME!r}. "
        f"Set DATASET_NAME to one of: {list(DATASET_CONFIGS)}"
    )
config = DATASET_CONFIGS[DATASET_NAME]

# Extract the file path of the dataset
path = config["path"]

# Load the dataset into a pandas DataFrame
df = pd.read_csv(path)

# Apply the project's preprocessing step (implemented in utils.data_prep)
df = utils.data_prep(df)

# Preview the first rows only instead of rendering the whole frame
# (assumes utils.data_prep returns a DataFrame -- TODO confirm)
df.head()

In [None]:
# Classifiers to evaluate, registered as (short name, zero-argument factory)
# pairs. Each call to a factory builds a new, unfitted estimator.

def build_decision_tree():
    """Decision tree with the entropy split criterion and a fixed seed."""
    return DecisionTreeClassifier(criterion='entropy', random_state=42)


def build_naive_bayes():
    """Gaussian naive Bayes with library defaults."""
    return GaussianNB()


def build_random_forest():
    """Random forest: 4 entropy trees, depth <= 5, sqrt feature sampling."""
    forest_settings = {
        "n_estimators": 4,
        "max_depth": 5,
        "max_features": 'sqrt',
        "criterion": 'entropy',
        "n_jobs": -1,
        "warm_start": False,
        "random_state": 42,
    }
    return RandomForestClassifier(**forest_settings)


def build_logistic_regression():
    """Logistic regression fitted with lbfgs, capped at 100 iterations."""
    return LogisticRegression(solver='lbfgs', max_iter=100, random_state=42, n_jobs=-1)


def build_linear_svm():
    """Linear SVM with hinge loss and a fixed seed."""
    return LinearSVC(loss="hinge", random_state=42)


def build_lda():
    """Linear discriminant analysis with library defaults."""
    return LinearDiscriminantAnalysis()


ML_models = [
    ("DT", build_decision_tree),
    ("NB", build_naive_bayes),
    ("RF", build_random_forest),
    ("LR", build_logistic_regression),
    ("SVM", build_linear_svm),
    ("LDA", build_lda),
]

In [None]:
# ==========================================================
# Parameter dictionaries shared by every experiment run
# ==========================================================

# --- PSO parameter settings ---
PSO_PARAMETERS = dict(
    # Core PSO
    n_population=15,
    maxIter=30,
    n_bootstrap=10,

    # Phase ratios
    warmup_ratio=0.2,
    adaptive_ratio=0.6,

    # Particle reduction
    keep_ratio=0.7,
    elite_ratio=0.15,

    # Phase 2 (adaptive)
    patience_phase2=5,
    epsilon_phase2=2e-3,
    ratio_threshold=0.25,

    # Phase 3 (exploitation)
    patience_phase3=8,
    epsilon_phase3=2e-5,

    # Runtime control (seconds)
    time_budget=240,
)

# --- Anonymization parameter settings ---
ANON_PARAMETERS = dict(
    gamma=1,
    k=20,
    initial_violation_threshold=10,
    violation_decay_rate=0.5,
    penalty_weight=1,
    aggregate_function='mean',
)

# Flat merge of both groups: PSO keys first, then the anonymization keys.
# On a key collision the anonymization value would win (the two groups
# currently share no keys).
BASE_PARAMETERS = dict(PSO_PARAMETERS)
BASE_PARAMETERS.update(ANON_PARAMETERS)



In [None]:
# Base folder handed to the experiment runner, laid out as
# <base folder>/<DATASET_NAME>/Anonymized Data
base_dir = "Put your base path here"  # Replace with your base path
base_path = os.path.join(base_dir, DATASET_NAME, "Anonymized Data")

In [None]:
# Number of independent repetitions per (model, n_cluster) setting. The
# repetition index is passed to the experiment as its seed.
N_RUNS = 10


def max_pool_workers(n_tasks):
    """Return how many worker processes to start for ``n_tasks`` jobs.

    The result never exceeds the number of jobs or the number of CPUs
    reported by the OS, and is always at least 1.

    Parameters
    ----------
    n_tasks : int
        Number of jobs that will be submitted to the pool.

    Returns
    -------
    int
        A value suitable for ``ProcessPoolExecutor(max_workers=...)``.
    """
    cpu_total = os.cpu_count() or 1  # os.cpu_count() may return None
    workers = max(1, min(n_tasks, cpu_total))
    if os.name == "nt":
        # ProcessPoolExecutor raises ValueError for max_workers > 61 on Windows
        workers = min(workers, 61)
    return workers


if __name__ == "__main__":

    dataset_config = config  # alias kept for readability below

    # Experiment variant to run. Swap in
    # particle_swarm.run_particle_swarm_experiment_without_repairing to test the
    # version without repairing, or
    # particle_swarm.run_particle_swarm_experiment_QIs for the QIs-only version.
    experiment_runner = particle_swarm.run_particle_swarm_experiment_with_repairing

    for ML_name, ML_fn in ML_models:
        print(f"Training model: {ML_name}")
        print("-----" * 10)

        # Built once per model; submit() pickles its arguments, so each worker
        # process receives its own copy of the estimator.
        ML_model = ML_fn()

        for n_cluster_val in dataset_config['n_cluster']:

            print(f"Running with k = {BASE_PARAMETERS['k']}, "
                  f"n_cluster = {n_cluster_val}")

            # Build the parameter dictionary per run so BASE_PARAMETERS is
            # never mutated.
            params = {
                **BASE_PARAMETERS,
                'n_cluster': n_cluster_val
            }

            with concurrent.futures.ProcessPoolExecutor(
                max_workers=max_pool_workers(N_RUNS)
            ) as executor:

                # Map each future to its seed so a failure can be attributed.
                futures = {
                    executor.submit(
                        particle_swarm.run_single_experiment,
                        seed,
                        df,
                        DATASET_NAME,
                        ML_name,
                        ML_model,
                        params,
                        dataset_config,
                        base_path,
                        experiment_runner=experiment_runner
                    ): seed
                    for seed in range(N_RUNS)
                }

                for future in concurrent.futures.as_completed(futures):
                    try:
                        # result() re-raises any exception from the worker
                        future.result()
                    except Exception:
                        print(f"Run failed: model = {ML_name}, "
                              f"n_cluster = {n_cluster_val}, "
                              f"seed = {futures[future]}")
                        raise