In [1]:
import pandas as pd
import numpy as np
import os
import glob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel, wilcoxon, shapiro
from sklearn.model_selection import GridSearchCV
import gc
import itertools
from sklearn.utils import resample
import ast
import json
import re

import utils 
import model_train
from constants import *
import particle_swarm

%load_ext autoreload
%autoreload 2

In [2]:
# Load the dataset from the single CSV file at `path` into a DataFrame.
# NOTE(review): `path` is not defined in this notebook; presumably it is provided by
# `from constants import *` — confirm.
df = pd.read_csv(path)
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,credit_risk
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12,existing paid,furniture/equipment,1736,<100,4<=X<7,3,female div/dep/mar,none,...,real estate,31,none,own,1,unskilled resident,1,none,yes,good
996,<0,30,existing paid,used car,3857,<100,1<=X<4,4,male div/sep,none,...,life insurance,40,none,own,1,high qualif/self emp/mgmt,1,yes,yes,good
997,no checking,12,existing paid,radio/tv,804,<100,>=7,4,male single,none,...,car,38,none,own,1,skilled,1,none,yes,good
998,<0,45,existing paid,radio/tv,1845,<100,1<=X<4,4,male single,none,...,no known property,23,none,for free,1,skilled,1,yes,yes,bad


In [3]:
# Run the project's preprocessing helper; `df` is rebound to whatever it returns.
# NOTE(review): re-running this cell feeds the already-prepared frame back into
# data_prep — confirm that is harmless, or re-run the load cell first.
df = utils.data_prep(df)

# Values derived from the prepared frame; all three are passed to the experiment call below.
# NOTE(review): NQIs / CQIs are not defined in this notebook; presumably they are
# column-name lists coming from `constants` — confirm.
bounds = utils.get_nqi_bounds(df, NQIs)
levels = utils.get_cqi_levels(df, CQIs)
# Mean of each selected column (assuming NQIs is a list of column names — TODO confirm).
nqi_means = df[NQIs].mean()

In [4]:
# Define a list of ML models
models = [
    ("DT", DecisionTreeClassifier(criterion='entropy', random_state=42)),
    # ("LR", LogisticRegression(solver='lbfgs', max_iter=100, random_state=42, n_jobs=-1)),
    # ("NB", GaussianNB()),
    # ("NN", MLPClassifier(random_state=42)),
    # ("RF", RandomForestClassifier(criterion='entropy', random_state=42)),
    # ("SVM", LinearSVC(random_state=42))   
]

In [5]:
# Parameter grid: each key maps to the list of candidate values for that parameter.
parameters_dic = dict(
    gamma=[1],
    k=[20],
    n_cluster=[20],
    initial_violation_threshold=[10],
    violation_decay_rate=[0.5],
    penalty_weight=[1],
)

# Cartesian product of the value lists: one tuple per combination, with the values
# in the same order as the keys above.
param_combinations = [combo for combo in itertools.product(*parameters_dic.values())]

In [None]:
# Launch the experiment with the models and parameter combinations defined above and
# keep whatever the helper returns in `results`.
results = particle_swarm.run_particle_swarm_experiment(
    # data and the models to train on it
    df=df,
    models=models,
    # feature lists together with the values precomputed from the prepared frame
    NQIs=NQIs,
    CQIs=CQIs,
    bounds=bounds,
    levels=levels,
    nqi_means=nqi_means,
    # parameter combinations to run
    param_combinations=param_combinations,
    # run-size settings
    n_population=100,
    maxIter=100,
    n_bootstrap=100,
    # directory handed to the helper
    filedirectory='/Users/yusiwei/Library/CloudStorage/OneDrive-Personal/research/Fourth Year Paper/Experiments/2nd experiments/Experiment results/V2/Anonymized Data',
)

Running with k = 20, n_cluster = 20,  initial_violation_threshold = 10, violation_decay_rate = 0.5, penalty_weight = 1
Training model: DT
Iteration: 0


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 1


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 2


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 3


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 4


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 5


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 6


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 7


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 8


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 9


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 10


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 11


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 12


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 13


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 14


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 15


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 16


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 17


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 18


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 19


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 20


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 21


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 22


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 23


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 24


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 25


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 26


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 27


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 28


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 29


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 30


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 31


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 32


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 33


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 34


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 35


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 36


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 37


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 38


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 39


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 40


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 41


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 42


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 43


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 44


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 45


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 46


  particle_numeric[mask_low, col_idx] = column_means[col_idx]
  particle_numeric[mask_high, col_idx] = column_means[col_idx]


Iteration: 47


In [7]:
# Save the experiment results to a CSV file.
# NOTE(review): `results` must already exist in the kernel (it is assigned by the
# experiment cell above); on a fresh kernel this cell raises NameError until that cell has run.
results_df = pd.DataFrame(results)
# NOTE(review): absolute, user-specific path — consider moving it to a shared config constant.
filedirectory = '/Users/yusiwei/Library/CloudStorage/OneDrive-Personal/research/Fourth Year Paper/Experiments/2nd experiments/Experiment results/V2/Iteration Tracking Info'
# Create the target directory if it is missing; to_csv does not create parent folders.
os.makedirs(filedirectory, exist_ok=True)
# The file name has no placeholders, so a plain string literal is used.
filename = os.path.join(filedirectory, "DT_Anonymization_1.csv")
# index=False keeps the DataFrame's row index out of the file.
results_df.to_csv(filename, index=False)
        