In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, entries in os.walk('/kaggle/input'):
    for entry in entries:
        print(os.path.join(root, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Packages

In [None]:
from collections import Counter
import math
import pandas as pd
import numpy as np
from scipy import stats
# import matplotlib
import matplotlib.pyplot as plt
# import seaborn
import seaborn as sns
%matplotlib inline

# Functions

In [None]:
def learning_curve(best_estimator, name):
    """Plot and save a ROC-AUC learning curve for ``best_estimator``.

    The curve is computed on the notebook-level ``x_train`` / ``y_train``
    with a 5-split ``StratifiedKFold`` at ten training-set fractions
    (0.1, 0.2, ..., 1.0). Both the training and the test score are drawn.

    Parameters
    ----------
    best_estimator : estimator
        Passed to ``LearningCurveDisplay.from_estimator``.
    name : str
        Used in the plot title and as the prefix of the saved file
        (``"{name}_learningcurve"``).
    """
    figure, axis = plt.subplots(nrows=1, ncols=1, figsize=(10, 6), sharey=True)

    LearningCurveDisplay.from_estimator(
        best_estimator,
        X=x_train,
        y=y_train,
        train_sizes=np.arange(0.1, 1.1, 0.1),
        cv=StratifiedKFold(n_splits=5),
        score_type="both",
        n_jobs=-1,
        line_kw={"marker": "o"},
        std_display_style="fill_between",
        scoring="roc_auc",
        ax=axis,
    )

    # Keep only the first two legend entries and give them readable names.
    legend_handles, _ = axis.get_legend_handles_labels()
    axis.legend(legend_handles[:2], ["Training Score", "Test Score"])
    axis.set_title(f"Learning Curve for {name}")
    plt.savefig(f"{name}_learningcurve")
    
def model_eval(clf, x, y, filename):
    """Summarise a fitted hyper-parameter search and save its ROC curve.

    Prints the best CV score, the best parameters, the best candidate's entry
    from every ``cv_results_`` array and a classification report of
    ``clf.predict(x)`` against ``y``; then plots the ROC curve of ``clf`` on
    ``(x, y)`` and writes it to ``"{filename}.png"``.

    Parameters
    ----------
    clf : fitted search object exposing ``best_score_``, ``best_params_``,
        ``best_index_``, ``cv_results_`` and ``predict``.
    x, y : features and true labels to evaluate on.
    filename : str
        Output path without the ``.png`` extension.
    """
    print("Best score (CV score=%0.3f):" % clf.best_score_)
    print(clf.best_params_)
    print(f"Best Results: {clf.cv_results_['params'][clf.best_index_]}")
    print("\n")
    # Every cv_results_ entry is indexed by candidate; show the best candidate's value.
    for key, per_candidate in clf.cv_results_.items():
        print(f"{key} : {per_candidate[clf.best_index_]}")

    print(classification_report(y, clf.predict(x)))

    # ROC curve. RocCurveDisplay obtains the scores itself, so no separate
    # predict_proba call is needed here.
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    # Diagonal reference line, drawn in axes coordinates so it spans corner to corner.
    ax.plot([0, 1], [0, 1], transform=ax.transAxes)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_title("ROC-AUC Training Set")
    RocCurveDisplay.from_estimator(clf, x, y, ax=ax)
    plt.savefig(f"{filename}.png")
    
def validation_curved(cv_results, parameters):
    """Draw one validation-curve panel per hyper-parameter from search results.

    For each parameter the candidates in ``cv_results`` are grouped by that
    parameter's value. Mean train / test scores are averaged within each group
    and the per-candidate standard deviations are pooled, then drawn as a line
    with a shaded +/- one pooled-std band.

    Parameters
    ----------
    cv_results : dict
        Search results containing ``mean_train_score``, ``mean_test_score``,
        ``std_train_score``, ``std_test_score`` and a ``param_<name>`` entry
        for every plotted parameter.
    parameters : dict
        Maps parameter name (as it appears after ``param_``) to the sequence
        of x-values to plot for it.
        NOTE(review): groupby orders groups by value, so each sequence is
        assumed to be given in that same order - confirm at the call site.
    """
    results_df = pd.DataFrame(cv_results)
    score_columns = ['mean_test_score',
                     'mean_train_score',
                     'std_test_score',
                     'std_train_score']

    # https://en.wikipedia.org/wiki/Pooled_variance#Pooled_standard_deviation
    def pooled_std(stds):
        n = 5  # size of each group
        # sum((n-1) * s_i^2) / (k * (n-1)); the denominator must be parenthesised,
        # otherwise the quotient is multiplied by (n-1) instead of divided by it.
        return np.sqrt(((n - 1) * (stds ** 2)).sum() / (len(stds) * (n - 1)))

    # squeeze=False keeps a 2-D array of axes even for a single parameter,
    # so indexing below works for any number of panels.
    fig, axes_grid = plt.subplots(1, len(parameters),
                                  figsize=(5 * len(parameters), 7),
                                  sharey='row',
                                  squeeze=False)
    axes = axes_grid[0]
    axes[0].set_ylabel("Score", fontsize=25)
    lw = 2

    for idx, (param_name, param_range) in enumerate(parameters.items()):
        grouped_df = results_df.groupby(f'param_{param_name}')[score_columns]\
            .agg({'mean_train_score': 'mean',
                  'mean_test_score': 'mean',
                  'std_train_score': pooled_std,
                  'std_test_score': pooled_std})

        panel = axes[idx]
        panel.set_xlabel(param_name, fontsize=30)
        panel.set_ylim(0.0, 1.1)
        for prefix, label, color in (
            ('train', "Training score", "darkorange"),
            ('test', "Cross-validation score", "navy"),
        ):
            mean = grouped_df[f'mean_{prefix}_score']
            std = grouped_df[f'std_{prefix}_score']
            panel.plot(param_range, mean, label=label, color=color, lw=lw)
            panel.fill_between(param_range, mean - std, mean + std,
                               alpha=0.2, color=color, lw=lw)

    handles, labels = axes[0].get_legend_handles_labels()
    fig.suptitle('Validation curves', fontsize=40)
    fig.legend(handles, labels, loc=8, ncol=2, fontsize=20)

    fig.subplots_adjust(bottom=0.25, top=0.85)
    plt.show()

# Read Data

In [None]:
# Load the Adult income CSV. The file's own header row (header=0) is replaced by the
# explicit column names below, and "?" cells are read as missing values.
df = pd.read_csv(
    "../input/adult-income-dataset/adult.csv",
    sep=",",
    skipinitialspace=True,
    header=0,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ],
    na_values='?',
)

# Clean Data

In [None]:
# Turn any "?" into NaN, then discard every row that has a missing value.
df = df.replace("?", np.nan).dropna(axis=0)

# fnlwgt is dropped because (a) it is ordinal-like and (b) its scale is believed to
# differ across states, so it cannot be related to the target from this data alone.
# education-num is dropped because it is redundant with education.
df = df.drop(columns=['fnlwgt', 'education-num'])

# Collapse native-country into three groups: United-States, developed, developing.
developed = ['Canada', 'England', 'France', 'Germany', 'Greece', 'Holand-Netherlands', 'Hungary', 'Ireland', 'Italy', 'Japan', 'Poland', 'Portugal', 'Scotland', 'Taiwan', 'Hong', 'South']
df = df.assign(**{
    'country-status': np.where(
        df['native-country'] == 'United-States',
        "United-States",
        np.where(df['native-country'].isin(developed), 'developed', 'developing'),
    )
}).drop(columns=['native-country'])

# Remove the capital-gain outliers (values of 99999 and above).
ix = df[df['capital-gain'] >= 99999].index
df = df.drop(index=ix)

# Remove duplicate rows and renumber the index from zero.
df = df.drop_duplicates(ignore_index=True)

# Model Packages

In [None]:
from sklearn.model_selection import StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold, train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, LearningCurveDisplay, validation_curve as vc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, classification_report, auc, roc_curve, RocCurveDisplay
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.inspection import permutation_importance
from sklearn.exceptions import ConvergenceWarning

# Model Preprocessing

In [None]:
# Features: every column except the target.
x = df.drop(columns='income').copy()
# Target: 1 for '>50K', 0 otherwise.
y = np.where(df['income'].values == '>50K', 1, 0)

# Class counts; the negative/positive ratio is used as XGBoost's scale_pos_weight.
counter = Counter(y)
target_ratio = counter[0] / counter[1]
print(counter)

# Column groups consumed by the ColumnTransformer in the modelling cells.
categorical_features = list(x.select_dtypes(include=['object', 'bool']).columns)
numerical_features = list(x.select_dtypes(include=['int64', 'float64']).columns)

# Stratified 70/30 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
    shuffle=True,
)

# Distinct categorical values seen in the training split plus the numeric column count.
n_features = len(np.unique(x_train[categorical_features])) + len(numerical_features)

# Dummy Classifier

In [None]:
# Preprocessing: one-hot encode the categorical columns, standard-scale the numeric ones.
transformations = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('ss', StandardScaler(), numerical_features),
])
# Reference model using the 'most_frequent' strategy.
model = DummyClassifier(strategy='most_frequent')
steps = [('tf', transformations), ('baseline', model)]
pipeline = Pipeline(steps)

# Score the baseline with 10x repeated 10-fold cross-validation.
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
scores = cross_val_score(
    pipeline,
    x_train,
    y_train,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)

print(f"Baseline {np.mean(scores)}")

# XGBoost Model

In [None]:
# Stage 1: search max_depth and min_child_weight with learning rate and tree count held fixed.
model1 = XGBClassifier(
    scale_pos_weight=target_ratio,
    eval_metric='auc',
    seed=42,
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=1000,
)
transformations = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('ss', StandardScaler(), numerical_features),
])
steps = [('tf', transformations), ('xgboost', model1)]
pipeline = Pipeline(steps)
skf = StratifiedKFold(n_splits=5)
parameters1 = {
    "xgboost__max_depth": [3, 4, 5],  # the maximum depth of a tree
    "xgboost__min_child_weight": [1, 5, 10, 25],  # the minimum sum of instance weight needed in a leaf
}
xgboost_clf1 = GridSearchCV(
    pipeline,
    param_grid=parameters1,
    cv=skf,
    scoring='roc_auc',
    verbose=2,
    n_jobs=-1,
    error_score='raise',
    return_train_score=True,
)
xgboost_clf1.fit(x_train, y_train)

In [None]:
# Display the (name, value) pairs of the best parameters found by xgboost_clf1.
xgboost_clf1.best_params_.items()

In [None]:
# Stage 2: carry over the stage-1 winners (pipeline prefix stripped) and search the
# row / column sampling fractions.
params = {
    qualified_name.split("__")[1]: best_value
    for qualified_name, best_value in xgboost_clf1.best_params_.items()
}
model2 = XGBClassifier(
    scale_pos_weight=target_ratio,
    eval_metric='auc',
    seed=42,
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=1000,
    **params,
)
transformations = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('ss', StandardScaler(), numerical_features),
])
steps = [('tf', transformations), ('xgboost', model2)]
pipeline = Pipeline(steps)

parameters2 = {
    "xgboost__subsample": [0.25, 0.5, 1.0],  # samples used
    "xgboost__colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8],  # columns used
}
xgboost_clf2 = GridSearchCV(
    pipeline,
    param_grid=parameters2,
    cv=skf,
    scoring='roc_auc',
    verbose=2,
    n_jobs=-1,
    error_score='raise',
    return_train_score=True,
)
xgboost_clf2.fit(x_train, y_train)

In [None]:
# Stage 3: fix the winners of stages 1 and 2, then search tree count and learning rate.
params2 = {
    qualified_name.split("__")[1]: best_value
    for qualified_name, best_value in xgboost_clf2.best_params_.items()
}
# Stage-2 values take precedence on any shared key.
params3 = {**params, **params2}
model3 = XGBClassifier(
    scale_pos_weight=target_ratio,
    eval_metric='auc',
    seed=42,
    objective='binary:logistic',
    **params3,
)
transformations = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('ss', StandardScaler(), numerical_features),
])
steps = [('tf', transformations), ('xgboost', model3)]
pipeline = Pipeline(steps)

parameters3 = {
    "xgboost__n_estimators": [50, 100, 250, 500, 1000],
    "xgboost__learning_rate": [0.05, 0.10, 0.20, 0.5],
}
xgboost_clf = GridSearchCV(
    pipeline,
    param_grid=parameters3,
    cv=skf,
    scoring='roc_auc',
    verbose=2,
    n_jobs=-1,
    error_score='raise',
    return_train_score=True,
)
xgboost_clf.fit(x_train, y_train)

In [None]:
# Report the final XGBoost search on the training split and save its ROC plot.
model_eval(
    clf=xgboost_clf,
    x=x_train,
    y=y_train,
    filename="xgboost1_rocauc",
)

In [None]:
def create_validation(x, y, clf, key, values):
    """Compute ROC-AUC validation-curve scores for one XGBoost pipeline parameter.

    The curve is computed around ``clf.best_estimator_`` when the search was
    refitted, so every other setting of the tuned pipeline is kept while
    ``key`` is varied. This also works for a ``key`` that was not part of
    ``clf``'s own grid (previously that raised ``KeyError``).

    Parameters
    ----------
    x, y : features and labels forwarded to ``validation_curve``.
    clf : fitted search object exposing ``best_params_`` and, when refitted,
        ``best_estimator_``.
    key : str
        Pipeline-qualified parameter name to vary, e.g. ``"xgboost__max_depth"``.
    values : sequence
        Values of ``key`` to evaluate.

    Returns
    -------
    tuple of two one-element lists
        ``([train_scores], [test_scores])``, each holding the array returned
        by ``validation_curve``.
    """
    base_estimator = getattr(clf, "best_estimator_", None)
    if base_estimator is None:
        # No refitted pipeline available: rebuild one from best_params_ with the
        # pipeline prefix stripped, leaving out the parameter that is being varied.
        fixed_params = {
            name.split("__")[1]: value for name, value in clf.best_params_.items()
        }
        fixed_params.pop(key.split("__")[1], None)
        base_estimator = Pipeline([
            ('tf', ColumnTransformer(transformers=[
                ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
                ('ss', StandardScaler(), numerical_features),
            ])),
            ('xgboost', XGBClassifier(scale_pos_weight=target_ratio, eval_metric='auc',
                                      seed=42, objective='binary:logistic',
                                      **fixed_params)),
        ])

    train_scores, test_scores = vc(
        estimator=base_estimator,
        X=x,
        y=y,
        param_name=key,
        param_range=values,
        scoring="roc_auc",
        verbose=0,
    )
    # Callers index the result with [0], so keep the one-element list wrapping.
    return [train_scores], [test_scores]

### Tree Depth Validation

In [None]:
# Validation-curve scores for max_depth over 3, 4 and 5.
(
    xgboost_train_scores_tree_depth,
    xgboost_test_scores_tree_depth,
) = create_validation(
    x=x_train,
    y=y_train,
    clf=xgboost_clf,
    key="xgboost__max_depth",
    values=[3, 4, 5],
)

In [None]:
# Reduce each score array along axis 1 to a mean and a standard deviation per parameter value.
xgboost_train_scores_mean_tree_depth = np.mean(xgboost_train_scores_tree_depth[0], axis=1)
xgboost_train_scores_std_tree_depth = np.std(xgboost_train_scores_tree_depth[0], axis=1)
xgboost_test_scores_mean_tree_depth = np.mean(xgboost_test_scores_tree_depth[0], axis=1)
xgboost_test_scores_std_tree_depth = np.std(xgboost_test_scores_tree_depth[0], axis=1)
xgboost_param_range_tree_depth = np.array([3, 4, 5])

fig, ax = plt.subplots()
ax.set_title("Validation Curve - XGBoost Max Tree Depth")
# Plain-text label: inside $...$ mathtext the spaces between words are dropped.
ax.set_xlabel("Max Tree Depth")
ax.set_ylabel("ROC-AUC")
ax.set_ylim(0.8, 1.1)
for _mean, _std, _label, _color in (
    (xgboost_train_scores_mean_tree_depth, xgboost_train_scores_std_tree_depth, "Training score", "r"),
    (xgboost_test_scores_mean_tree_depth, xgboost_test_scores_std_tree_depth, "Cross-validation score", "g"),
):
    ax.plot(xgboost_param_range_tree_depth, _mean, label=_label, color=_color)
    # Shaded band: mean +/- one standard deviation.
    ax.fill_between(xgboost_param_range_tree_depth, _mean - _std, _mean + _std, alpha=0.2, color=_color)
ax.legend(loc="best")
fig.savefig("xgboost1_maxtreedepth")

### XGBoost Min Child Weight

In [None]:
# Validation-curve scores for min_child_weight over 1, 5, 10 and 25.
(
    xgboost_train_scores_min_child_weight,
    xgboost_test_scores_min_child_weight,
) = create_validation(
    x=x_train,
    y=y_train,
    clf=xgboost_clf,
    key="xgboost__min_child_weight",
    values=[1, 5, 10, 25],
)

In [None]:
# Reduce each score array along axis 1 to a mean and a standard deviation per parameter value.
xgboost_train_scores_mean_min_child_weight = np.mean(xgboost_train_scores_min_child_weight[0], axis=1)
xgboost_train_scores_std_min_child_weight = np.std(xgboost_train_scores_min_child_weight[0], axis=1)
xgboost_test_scores_mean_min_child_weight = np.mean(xgboost_test_scores_min_child_weight[0], axis=1)
xgboost_test_scores_std_min_child_weight = np.std(xgboost_test_scores_min_child_weight[0], axis=1)
xgboost_param_range_min_child_weight = np.array([1, 5, 10, 25])

fig, ax = plt.subplots()
ax.set_title("Validation Curve - XGBoost Min Child Weight")
# Plain-text label: inside $...$ mathtext the spaces between words are dropped.
ax.set_xlabel("Min Child Weight")
ax.set_ylabel("ROC-AUC")
ax.set_ylim(0.8, 1.1)
for _mean, _std, _label, _color in (
    (xgboost_train_scores_mean_min_child_weight, xgboost_train_scores_std_min_child_weight, "Training score", "r"),
    (xgboost_test_scores_mean_min_child_weight, xgboost_test_scores_std_min_child_weight, "Cross-validation score", "g"),
):
    ax.plot(xgboost_param_range_min_child_weight, _mean, label=_label, color=_color)
    # Shaded band: mean +/- one standard deviation.
    ax.fill_between(xgboost_param_range_min_child_weight, _mean - _std, _mean + _std, alpha=0.2, color=_color)
ax.legend(loc="best")
fig.savefig("xgboost1_minchildweight")

### XGBoost Col Sample Used

In [None]:
# Validation-curve scores for colsample_bytree over 0.3 ... 0.8.
(
    xgboost_train_scores_colsample_bytree,
    xgboost_test_scores_colsample_bytree,
) = create_validation(
    x=x_train,
    y=y_train,
    clf=xgboost_clf,
    key="xgboost__colsample_bytree",
    values=[0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
)

In [None]:
# Reduce each score array along axis 1 to a mean and a standard deviation per parameter value.
xgboost_train_scores_mean_colsample_bytree = np.mean(xgboost_train_scores_colsample_bytree[0], axis=1)
xgboost_train_scores_std_colsample_bytree = np.std(xgboost_train_scores_colsample_bytree[0], axis=1)
xgboost_test_scores_mean_colsample_bytree = np.mean(xgboost_test_scores_colsample_bytree[0], axis=1)
xgboost_test_scores_std_colsample_bytree = np.std(xgboost_test_scores_colsample_bytree[0], axis=1)
xgboost_param_range_colsample_bytree = np.array([0.3, 0.4, 0.5 , 0.6, 0.7, 0.8])

fig, ax = plt.subplots()
ax.set_title("Validation Curve - XGBoost Col Samples Used")
# Plain-text label: inside $...$ mathtext the spaces between words are dropped.
ax.set_xlabel("Col Samples Used")
ax.set_ylabel("ROC-AUC")
ax.set_ylim(0.8, 1.1)
for _mean, _std, _label, _color in (
    (xgboost_train_scores_mean_colsample_bytree, xgboost_train_scores_std_colsample_bytree, "Training score", "r"),
    (xgboost_test_scores_mean_colsample_bytree, xgboost_test_scores_std_colsample_bytree, "Cross-validation score", "g"),
):
    ax.plot(xgboost_param_range_colsample_bytree, _mean, label=_label, color=_color)
    # Shaded band: mean +/- one standard deviation.
    ax.fill_between(xgboost_param_range_colsample_bytree, _mean - _std, _mean + _std, alpha=0.2, color=_color)
ax.legend(loc="best")
fig.savefig("xgboost1_colused")

### XGBoost Row Samples Used

In [None]:
# Validation-curve scores for subsample over 0.25, 0.5 and 1.0.
(
    xgboost_train_scores_subsample,
    xgboost_test_scores_subsample,
) = create_validation(
    x=x_train,
    y=y_train,
    clf=xgboost_clf,
    key="xgboost__subsample",
    values=[0.25, 0.5, 1.0],
)

In [None]:
# Reduce each score array along axis 1 to a mean and a standard deviation per parameter value.
xgboost_train_scores_mean_subsample = np.mean(xgboost_train_scores_subsample[0], axis=1)
xgboost_train_scores_std_subsample = np.std(xgboost_train_scores_subsample[0], axis=1)
xgboost_test_scores_mean_subsample = np.mean(xgboost_test_scores_subsample[0], axis=1)
xgboost_test_scores_std_subsample = np.std(xgboost_test_scores_subsample[0], axis=1)
xgboost_param_range_subsample = np.array([0.25, 0.5, 1.0])

fig, ax = plt.subplots()
ax.set_title("Validation Curve - XGBoost Samples Used")
# Plain-text label: inside $...$ mathtext the spaces between words are dropped.
ax.set_xlabel("Samples Used")
ax.set_ylabel("ROC-AUC")
ax.set_ylim(0.8, 1.1)
for _mean, _std, _label, _color in (
    (xgboost_train_scores_mean_subsample, xgboost_train_scores_std_subsample, "Training score", "r"),
    (xgboost_test_scores_mean_subsample, xgboost_test_scores_std_subsample, "Cross-validation score", "g"),
):
    ax.plot(xgboost_param_range_subsample, _mean, label=_label, color=_color)
    # Shaded band: mean +/- one standard deviation.
    ax.fill_between(xgboost_param_range_subsample, _mean - _std, _mean + _std, alpha=0.2, color=_color)
ax.legend(loc="best")
fig.savefig("xgboost1_samplesused")

### XGBoost No. Estimators

In [None]:
# Validation-curve scores for n_estimators over 50 ... 1000.
(
    xgboost_train_scores_n_estimators,
    xgboost_test_scores_n_estimators,
) = create_validation(
    x=x_train,
    y=y_train,
    clf=xgboost_clf,
    key="xgboost__n_estimators",
    values=[50, 100, 250, 500, 1000],
)

In [None]:
# Reduce each score array along axis 1 to a mean and a standard deviation per parameter value.
xgboost_train_scores_mean_n_estimators = np.mean(xgboost_train_scores_n_estimators[0], axis=1)
xgboost_train_scores_std_n_estimators = np.std(xgboost_train_scores_n_estimators[0], axis=1)
xgboost_test_scores_mean_n_estimators = np.mean(xgboost_test_scores_n_estimators[0], axis=1)
xgboost_test_scores_std_n_estimators = np.std(xgboost_test_scores_n_estimators[0], axis=1)
xgboost_param_range_n_estimators = np.array([50, 100, 250, 500, 1000])

fig, ax = plt.subplots()
ax.set_title("Dataset 1 XGBoost No. Estimators")
# Plain-text label: inside $...$ mathtext the spaces between words are dropped.
ax.set_xlabel("No. Estimators")
ax.set_ylabel("ROC-AUC")
ax.set_ylim(0.8, 1.1)
for _mean, _std, _label, _color in (
    (xgboost_train_scores_mean_n_estimators, xgboost_train_scores_std_n_estimators, "Training score", "r"),
    (xgboost_test_scores_mean_n_estimators, xgboost_test_scores_std_n_estimators, "Cross-validation score", "g"),
):
    ax.plot(xgboost_param_range_n_estimators, _mean, label=_label, color=_color)
    # Shaded band: mean +/- one standard deviation.
    ax.fill_between(xgboost_param_range_n_estimators, _mean - _std, _mean + _std, alpha=0.2, color=_color)
ax.legend(loc="best")
fig.savefig("xgboost1_estimator")

### XGBoost Learning Rate

In [None]:
# Validation-curve scores for learning_rate over 0.001, 0.01, 0.1 and 1.
(
    xgboost_train_scores_learning_rate,
    xgboost_test_scores_learning_rate,
) = create_validation(
    x=x_train,
    y=y_train,
    clf=xgboost_clf,
    key="xgboost__learning_rate",
    values=[0.001, 0.01, 0.1, 1.],
)

In [None]:
# Reduce each score array along axis 1 to a mean and a standard deviation per parameter value.
xgboost_train_scores_mean_learning_rate = np.mean(xgboost_train_scores_learning_rate[0], axis=1)
xgboost_train_scores_std_learning_rate = np.std(xgboost_train_scores_learning_rate[0], axis=1)
xgboost_test_scores_mean_learning_rate = np.mean(xgboost_test_scores_learning_rate[0], axis=1)
xgboost_test_scores_std_learning_rate = np.std(xgboost_test_scores_learning_rate[0], axis=1)
xgboost_param_range_learning_rate = np.array([0.001, 0.01, 0.1, 1.])

fig, ax = plt.subplots()
ax.set_title("Dataset 1 XGBoost Learning Rate")
# Plain-text label: inside $...$ mathtext the spaces between words are dropped.
ax.set_xlabel("Learning Rate")
ax.set_ylabel("ROC-AUC")
ax.set_ylim(0.8, 1.1)
for _mean, _std, _label, _color in (
    (xgboost_train_scores_mean_learning_rate, xgboost_train_scores_std_learning_rate, "Training score", "r"),
    (xgboost_test_scores_mean_learning_rate, xgboost_test_scores_std_learning_rate, "Cross-validation score", "g"),
):
    # The rates span several orders of magnitude, hence the logarithmic x-axis.
    ax.semilogx(xgboost_param_range_learning_rate, _mean, label=_label, color=_color)
    # Shaded band: mean +/- one standard deviation.
    ax.fill_between(xgboost_param_range_learning_rate, _mean - _std, _mean + _std, alpha=0.2, color=_color)
ax.legend(loc="best")
fig.savefig("xgboost1_learningrate")

In [None]:
# Learning curve of the refitted best XGBoost pipeline.
learning_curve(
    best_estimator=xgboost_clf.best_estimator_,
    name="xgboost",
)

# Decision Tree Model

In [None]:
# Seeded so that the searches over max_features "log2" / "sqrt", which sub-sample
# features at random, give the same result on every run.
model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
transformations = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('ss', StandardScaler(), numerical_features),
])
steps = [('tf', transformations), ('decision_tree', model)]
pipeline = Pipeline(steps)

# Cost-complexity pruning path of the tree on the transformed training data;
# it supplies the candidate ccp_alpha values for the grid.
path = pipeline[-1].cost_complexity_pruning_path(
    pipeline[:-1].fit_transform(x_train),
    y_train,
)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# Thin the path to roughly five evenly spaced alphas. The step is floored at 1
# because a path shorter than five entries would otherwise give range() a step of 0.
alpha_step = max(1, len(ccp_alphas) // 5)
ccp_alphas_filtered = [ccp_alphas[i] for i in range(0, len(ccp_alphas), alpha_step)]
parameters = {
    "decision_tree__ccp_alpha": ccp_alphas_filtered,
    "decision_tree__min_samples_split": [2, 5, 20, 100],
    "decision_tree__min_samples_leaf": [1, 2, 5, 20, 100],
    "decision_tree__max_features": [n_features, "log2", "sqrt"],
}
skf = StratifiedKFold(n_splits=5)
dt_clf = GridSearchCV(
    pipeline,
    param_grid=parameters,
    cv=skf,
    scoring='roc_auc',
    verbose=3,
    n_jobs=-1,
    error_score='raise',
    return_train_score=True,
)
dt_clf.fit(x_train, y_train)

In [None]:
# Report the decision-tree search on the training split and save its ROC plot.
model_eval(
    clf=dt_clf,
    x=x_train,
    y=y_train,
    filename="decision_tree1",
)

In [None]:
def create_validation_dt(x, y, clf, key, values):
    """Compute ROC-AUC validation-curve scores for one decision-tree pipeline parameter.

    The curve is computed around ``clf.best_estimator_`` when the search was
    refitted, so every other tuned setting is kept while ``key`` is varied.
    A ``key`` that is missing from ``clf.best_params_`` no longer raises
    ``KeyError``.

    Parameters
    ----------
    x, y : features and labels forwarded to ``validation_curve``.
    clf : fitted search object exposing ``best_params_`` and, when refitted,
        ``best_estimator_``.
    key : str
        Pipeline-qualified parameter name to vary, e.g.
        ``"decision_tree__ccp_alpha"``.
    values : sequence
        Values of ``key`` to evaluate.

    Returns
    -------
    tuple of two one-element lists
        ``([train_scores], [test_scores])``, each holding the array returned
        by ``validation_curve``.
    """
    base_estimator = getattr(clf, "best_estimator_", None)
    if base_estimator is None:
        # No refitted pipeline available: rebuild one from best_params_ with the
        # pipeline prefix stripped, leaving out the parameter that is being varied.
        fixed_params = {
            name.split("__")[1]: value for name, value in clf.best_params_.items()
        }
        fixed_params.pop(key.split("__")[1], None)
        base_estimator = Pipeline([
            ('tf', ColumnTransformer(transformers=[
                ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
                ('ss', StandardScaler(), numerical_features),
            ])),
            ('decision_tree', DecisionTreeClassifier(class_weight='balanced', **fixed_params)),
        ])

    train_scores, test_scores = vc(
        estimator=base_estimator,
        X=x,
        y=y,
        param_name=key,
        param_range=values,
        scoring="roc_auc",
        verbose=0,
    )
    # Callers index the result with [0], so keep the one-element list wrapping.
    return [train_scores], [test_scores]

### Decision Tree Post Pruning Alpha

In [None]:
# Validation-curve scores for ccp_alpha over the thinned pruning-path alphas.
(
    dt_train_scores_alpha,
    dt_test_scores_alpha,
) = create_validation_dt(
    x=x_train,
    y=y_train,
    clf=dt_clf,
    key="decision_tree__ccp_alpha",
    values=ccp_alphas_filtered,
)

In [None]:
# Reduce each score array along axis 1 to a mean and a standard deviation per alpha.
dt_train_scores_mean_alpha = np.mean(dt_train_scores_alpha[0], axis=1)
dt_train_scores_std_alpha = np.std(dt_train_scores_alpha[0], axis=1)
dt_test_scores_mean_alpha = np.mean(dt_test_scores_alpha[0], axis=1)
dt_test_scores_std_alpha = np.std(dt_test_scores_alpha[0], axis=1)
dt_param_range_alpha = np.array(ccp_alphas_filtered)

fig, ax = plt.subplots()
ax.set_title("Validation Curve - Decision Tree Alpha")
ax.set_xlabel("$Alpha$")
ax.set_ylabel("ROC-AUC")
ax.set_ylim(0.5, 1.1)
for _mean, _std, _label, _color in (
    (dt_train_scores_mean_alpha, dt_train_scores_std_alpha, "Training score", "r"),
    (dt_test_scores_mean_alpha, dt_test_scores_std_alpha, "Cross-validation score", "g"),
):
    ax.plot(dt_param_range_alpha, _mean, label=_label, color=_color)
    # Shaded band: mean +/- one standard deviation.
    ax.fill_between(dt_param_range_alpha, _mean - _std, _mean + _std, alpha=0.2, color=_color)
ax.legend(loc="best")
fig.savefig("dt1_alpha")

### Decision Tree Min Samples Split

In [None]:
# Validation-curve scores for min_samples_split over 2, 5, 20 and 100.
(
    dt_train_scores_min_samples_split,
    dt_test_scores_min_samples_split,
) = create_validation_dt(
    x=x_train,
    y=y_train,
    clf=dt_clf,
    key="decision_tree__min_samples_split",
    values=[2, 5, 20, 100],
)

In [None]:
# Reduce each score array along axis 1 to a mean and a standard deviation per parameter value.
dt_train_scores_mean_min_samples_split = np.mean(dt_train_scores_min_samples_split[0], axis=1)
dt_train_scores_std_min_samples_split = np.std(dt_train_scores_min_samples_split[0], axis=1)
dt_test_scores_mean_min_samples_split = np.mean(dt_test_scores_min_samples_split[0], axis=1)
dt_test_scores_std_min_samples_split = np.std(dt_test_scores_min_samples_split[0], axis=1)
dt_param_range_min_samples_split = np.array([2, 5, 20, 100])

fig, ax = plt.subplots()
ax.set_title("Validation Curve - Decision Tree Min Samples Split")
# Plain-text label: inside $...$ mathtext the spaces between words are dropped.
ax.set_xlabel("Min Samples Split")
ax.set_ylabel("ROC-AUC")
ax.set_ylim(0.8, 1.1)
for _mean, _std, _label, _color in (
    (dt_train_scores_mean_min_samples_split, dt_train_scores_std_min_samples_split, "Training score", "r"),
    (dt_test_scores_mean_min_samples_split, dt_test_scores_std_min_samples_split, "Cross-validation score", "g"),
):
    ax.plot(dt_param_range_min_samples_split, _mean, label=_label, color=_color)
    # Shaded band: mean +/- one standard deviation.
    ax.fill_between(dt_param_range_min_samples_split, _mean - _std, _mean + _std, alpha=0.2, color=_color)
ax.legend(loc="best")
fig.savefig("dt1_min_sample_split")

### Decision Tree Min Samples Leaf

In [None]:
# Validation-curve scores for min_samples_leaf over 1, 2, 5, 20 and 100.
(
    dt_train_scores_min_samples_leaf,
    dt_test_scores_min_samples_leaf,
) = create_validation_dt(
    x=x_train,
    y=y_train,
    clf=dt_clf,
    key="decision_tree__min_samples_leaf",
    values=[1, 2, 5, 20, 100],
)

In [None]:
# Reduce each score array along axis 1 to a mean and a standard deviation per parameter value.
dt_train_scores_mean_min_samples_leaf = np.mean(dt_train_scores_min_samples_leaf[0], axis=1)
dt_train_scores_std_min_samples_leaf = np.std(dt_train_scores_min_samples_leaf[0], axis=1)
dt_test_scores_mean_min_samples_leaf = np.mean(dt_test_scores_min_samples_leaf[0], axis=1)
dt_test_scores_std_min_samples_leaf = np.std(dt_test_scores_min_samples_leaf[0], axis=1)
dt_param_range_min_samples_leaf = np.array([1, 2, 5, 20, 100])

fig, ax = plt.subplots()
ax.set_title("Validation Curve - Decision Tree Min Samples leaf")
# Plain-text label: inside $...$ mathtext the spaces between words are dropped.
ax.set_xlabel("Min Samples leaf")
ax.set_ylabel("ROC-AUC")
ax.set_ylim(0.8, 1.1)
for _mean, _std, _label, _color in (
    (dt_train_scores_mean_min_samples_leaf, dt_train_scores_std_min_samples_leaf, "Training score", "r"),
    (dt_test_scores_mean_min_samples_leaf, dt_test_scores_std_min_samples_leaf, "Cross-validation score", "g"),
):
    ax.plot(dt_param_range_min_samples_leaf, _mean, label=_label, color=_color)
    # Shaded band: mean +/- one standard deviation.
    ax.fill_between(dt_param_range_min_samples_leaf, _mean - _std, _mean + _std, alpha=0.2, color=_color)
ax.legend(loc="best")
fig.savefig("dt1_min_sample_leaf")

### Decision Tree Max Features

In [None]:
# Validation-curve scores for max_features over "log2", "sqrt" and n_features.
(
    dt_train_scores_max_features,
    dt_test_scores_max_features,
) = create_validation_dt(
    x=x_train,
    y=y_train,
    clf=dt_clf,
    key="decision_tree__max_features",
    values=["log2", "sqrt", n_features],
)

In [None]:
# Reduce each score array along axis 1 to a mean and a standard deviation per parameter value.
dt_train_scores_mean_max_features = np.mean(dt_train_scores_max_features[0], axis=1)
dt_train_scores_std_max_features = np.std(dt_train_scores_max_features[0], axis=1)
dt_test_scores_mean_max_features = np.mean(dt_test_scores_max_features[0], axis=1)
dt_test_scores_std_max_features = np.std(dt_test_scores_max_features[0], axis=1)
# Categorical x-axis labels, one per evaluated max_features setting.
dt_param_range_max_features = np.array(["log2", "sqrt", "All Features"])

fig, ax = plt.subplots()
ax.set_title("Validation Curve - Decision Tree Max Features")
# Plain-text label: inside $...$ mathtext the spaces between words are dropped.
ax.set_xlabel("Max Features")
ax.set_ylabel("ROC-AUC")
ax.set_ylim(0.8, 1.1)
for _mean, _std, _label, _color in (
    (dt_train_scores_mean_max_features, dt_train_scores_std_max_features, "Training score", "r"),
    (dt_test_scores_mean_max_features, dt_test_scores_std_max_features, "Cross-validation score", "g"),
):
    ax.plot(dt_param_range_max_features, _mean, label=_label, color=_color)
    # Shaded band: mean +/- one standard deviation.
    ax.fill_between(dt_param_range_max_features, _mean - _std, _mean + _std, alpha=0.2, color=_color)
ax.legend(loc="best")
fig.savefig("dt1_max_features")

In [None]:
# Learning curve of the refitted best decision-tree pipeline.
learning_curve(
    best_estimator=dt_clf.best_estimator_,
    name="Decision Tree",
)

# Neural Network Model

In [None]:
# Candidate hidden-layer widths derived from the feature count: sqrt / log2 of
# n_features for a first layer, and the same function applied again for a second layer.
sqrt_n_features = int(math.sqrt(n_features))
sqrt_sqrt_n_features = int(math.sqrt(sqrt_n_features))
log2_n_features = int(math.log2(n_features))
log2_log2_n_features = int(math.log2(log2_n_features))

transformations = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ss', StandardScaler(), numerical_features),
    ]
)
# Seeded so that the network's random initialisation, and therefore the search
# result, is the same on every run.
model = MLPClassifier(max_iter=500, random_state=42)
steps = [('tf', transformations), ('mlp', model)]
pipeline = Pipeline(steps)
parameters = {
    "mlp__hidden_layer_sizes": [
        (sqrt_n_features,),
        (log2_n_features,),
        (sqrt_n_features, sqrt_sqrt_n_features),
        (log2_n_features, log2_log2_n_features),
    ],
    "mlp__activation": ["logistic", "relu", "tanh"],
    "mlp__alpha": [0.0001, 0.001, 0.01, 0.1],
}
skf = StratifiedKFold(n_splits=5)
mlp_clf = GridSearchCV(
    pipeline,
    param_grid=parameters,
    cv=skf,
    scoring='roc_auc',
    verbose=2,
    n_jobs=-1,
    error_score='raise',
    return_train_score=True,
)
mlp_clf.fit(x_train, y_train)

In [None]:
# Report the MLP search on the training split and save its ROC plot.
model_eval(
    clf=mlp_clf,
    x=x_train,
    y=y_train,
    filename="mlp1_rocauc",
)

In [None]:
def create_validation_mlp(x, y, clf, key, values):
    """Score the MLP pipeline while sweeping a single hyper-parameter.

    Every other MLP setting is fixed at the value stored in ``clf.best_params_``;
    the swept parameter is left out of that set and varied through ``values`` by
    ``vc`` (presumably scikit-learn's ``validation_curve`` -- confirm the alias).

    Args:
        x: Training features, forwarded to ``vc`` as ``X``.
        y: Training labels, forwarded to ``vc`` as ``y``.
        clf: Fitted search object exposing ``best_params_`` keyed as ``"mlp__<name>"``.
        key: Pipeline-qualified name of the parameter to sweep, e.g. ``"mlp__alpha"``.
        values: Candidate values for ``key``.

    Returns:
        ``(train_scores_list, test_scores_list)``: two one-element lists wrapping the
        train and test scores returned by ``vc`` (callers read them with ``[0]``).
    """
    swept_name = key.split("__")[1]

    # Strip the "<step>__" prefix so the values can be handed to MLPClassifier directly
    # and skip the swept parameter (vc sets it for every candidate). A swept key that
    # is not part of best_params_ is tolerated instead of raising KeyError.
    fixed_params = {}
    for qualified_name, value in clf.best_params_.items():
        name = qualified_name.split("__")[1]
        if name != swept_name:
            fixed_params[name] = value
    # Seed the network so the curve is repeatable, unless the search tuned the seed itself.
    fixed_params.setdefault("random_state", 42)

    model = MLPClassifier(max_iter=500, **fixed_params)
    transformations = ColumnTransformer(
        transformers=[
            ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('ss', StandardScaler(), numerical_features)
        ]
    )
    pipeline = Pipeline([('tf', transformations), ('mlp', model)])

    train_scores, test_scores = vc(
        estimator=pipeline,
        X=x,
        y=y,
        param_name=key,
        param_range=values,
        scoring="roc_auc",
        verbose=0
    )
    return [train_scores], [test_scores]

### MLP Hidden Layer Size

In [None]:
# Sweep the hidden-layer architecture; create_validation_mlp keeps the remaining
# MLP settings at the values in mlp_clf.best_params_.
mlp_train_scores_hidden_layer_size, mlp_test_scores_hidden_layer_size = create_validation_mlp(
    x=x_train,
    y=y_train,
    clf=mlp_clf,
    key="mlp__hidden_layer_sizes",
    values=[
        (sqrt_n_features,),
        (log2_n_features,),
        (sqrt_n_features, sqrt_sqrt_n_features),
        (log2_n_features, log2_log2_n_features),
    ],
)

In [None]:
# Mean and standard deviation of the scores along axis 1 for each architecture
# (presumably the CV folds, if vc is scikit-learn's validation_curve -- confirm).
# Element [0] is the single score matrix wrapped by create_validation_mlp.
mlp_train_scores_mean_hidden_layer_size = np.mean(mlp_train_scores_hidden_layer_size[0], axis=1)
mlp_train_scores_std_hidden_layer_size = np.std(mlp_train_scores_hidden_layer_size[0], axis=1)
mlp_test_scores_mean_hidden_layer_size = np.mean(mlp_test_scores_hidden_layer_size[0], axis=1)
mlp_test_scores_std_hidden_layer_size = np.std(mlp_test_scores_hidden_layer_size[0], axis=1)
# Tuples are turned into their string form so they can serve as categorical x positions.
mlp_param_range_hidden_layer_size = np.array([
    str(layer_sizes)
    for layer_sizes in (
        (sqrt_n_features,),
        (log2_n_features,),
        (sqrt_n_features, sqrt_sqrt_n_features),
        (log2_n_features, log2_log2_n_features),
    )
])

plt.title("Dataset 1 MLP Hidden Layer Size")
# Plain italic text instead of "$Hidden Layer Size$": inside $...$ mathtext ignores
# whitespace, so the label was rendered as "HiddenLayerSize".
plt.xlabel("Hidden Layer Size", fontstyle="italic")
plt.ylabel("ROC-AUC")
plt.ylim(0.8, 1.1)
plt.plot(mlp_param_range_hidden_layer_size, mlp_train_scores_mean_hidden_layer_size, label="Training score", color="r")
plt.fill_between(mlp_param_range_hidden_layer_size, mlp_train_scores_mean_hidden_layer_size - mlp_train_scores_std_hidden_layer_size,
                 mlp_train_scores_mean_hidden_layer_size + mlp_train_scores_std_hidden_layer_size, alpha=0.2, color="r")
plt.plot(mlp_param_range_hidden_layer_size, mlp_test_scores_mean_hidden_layer_size, label="Cross-validation score",
             color="g")
plt.fill_between(mlp_param_range_hidden_layer_size, mlp_test_scores_mean_hidden_layer_size - mlp_test_scores_std_hidden_layer_size,
                 mlp_test_scores_mean_hidden_layer_size + mlp_test_scores_std_hidden_layer_size, alpha=0.2, color="g")
plt.legend(loc="best")
plt.savefig("mlp1_hiddenlayer")

### MLP Activation

In [None]:
# Sweep the activation function with the other MLP settings taken from mlp_clf.best_params_.
mlp_train_scores_activation, mlp_test_scores_activation = create_validation_mlp(
    x=x_train,
    y=y_train,
    clf=mlp_clf,
    key="mlp__activation",
    values=["logistic", "relu", "tanh"],
)

In [None]:
# Mean / std of the scores along axis 1 for each activation; [0] unwraps the single
# score matrix returned (inside a list) by create_validation_mlp.
mlp_train_scores_mean_activation, mlp_train_scores_std_activation = (
    _stat(mlp_train_scores_activation[0], axis=1) for _stat in (np.mean, np.std)
)
mlp_test_scores_mean_activation, mlp_test_scores_std_activation = (
    _stat(mlp_test_scores_activation[0], axis=1) for _stat in (np.mean, np.std)
)
mlp_param_range_activation = np.array(["logistic", "relu", "tanh"])

plt.title("Validation Curve - MLP Activation")
plt.xlabel("$Activation$")
plt.ylabel("ROC-AUC")
plt.ylim(0.5, 1.1)

# Grouped bars: training bar shifted left of each tick, cross-validation bar shifted right.
width = 0.2
_X = np.arange(len(mlp_param_range_activation))
for _shift, _heights, _label, _color in (
    (-width, mlp_train_scores_mean_activation, "Training score", "r"),
    (width, mlp_test_scores_mean_activation, "Cross-validation score", "g"),
):
    plt.bar(_X + _shift, _heights, width=0.25, label=_label, color=_color)
plt.legend(loc="best")
plt.xticks(_X, mlp_param_range_activation)  # categorical tick labels
plt.savefig("mlp1_activation")

### MLP Alpha

In [None]:
# Sweep the alpha penalty with the other MLP settings taken from mlp_clf.best_params_.
mlp_train_scores_alpha, mlp_test_scores_alpha = create_validation_mlp(
    x=x_train,
    y=y_train,
    clf=mlp_clf,
    key="mlp__alpha",
    values=[0.0001, 0.001, 0.01, 0.1],
)

In [None]:
# Mean / std of the scores along axis 1 for each alpha; [0] unwraps the single
# score matrix returned (inside a list) by create_validation_mlp.
mlp_train_scores_mean_alpha, mlp_train_scores_std_alpha = (
    _stat(mlp_train_scores_alpha[0], axis=1) for _stat in (np.mean, np.std)
)
mlp_test_scores_mean_alpha, mlp_test_scores_std_alpha = (
    _stat(mlp_test_scores_alpha[0], axis=1) for _stat in (np.mean, np.std)
)
mlp_param_range_alpha = np.array([0.0001, 0.001, 0.01, 0.1])

plt.title("Dataset 1 MLP Alpha")
plt.xlabel("$alpha$")
plt.ylabel("ROC-AUC")
plt.ylim(0.8, 1.1)
# Log-scaled x axis; each curve gets a +/- one-std band in the same colour.
for _mean, _std, _label, _color in (
    (mlp_train_scores_mean_alpha, mlp_train_scores_std_alpha, "Training score", "r"),
    (mlp_test_scores_mean_alpha, mlp_test_scores_std_alpha, "Cross-validation score", "g"),
):
    plt.semilogx(mlp_param_range_alpha, _mean, label=_label, color=_color)
    plt.fill_between(mlp_param_range_alpha, _mean - _std, _mean + _std, alpha=0.2, color=_color)
plt.legend(loc="best")
plt.savefig("mlp1_alpha")

In [None]:
# Plot and save the ROC-AUC learning curve of the best MLP pipeline held by mlp_clf.
learning_curve(best_estimator=mlp_clf.best_estimator_, name="MLP")

In [None]:
# Refit the pipeline once for every hyper-parameter combination tried by the MLP grid
# search and overlay the training-loss curves on a single, labelled figure.
plt.title("Dataset 1 MLP Loss Curves - Grid Search Candidates")
plt.xlabel("$Epochs$")
plt.ylabel("Loss")
for p in mlp_clf.cv_results_['params']:
    transformations = ColumnTransformer(
        transformers=[
            ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('ss', StandardScaler(), numerical_features)
        ]
    )
    # Fixed seed so the curves differ only because of the hyper-parameters in p,
    # not because of a different random initialisation on every run.
    model = MLPClassifier(max_iter=500, random_state=42)
    steps = [('tf', transformations), ('mlp', model)]
    pipeline = Pipeline(steps)
    pipeline.set_params(**p)
    # fit() returns the fitted pipeline; its "mlp" step exposes loss_curve_.
    model = pipeline.fit(x_train, y_train)
    plt.plot(model["mlp"].loss_curve_)

In [None]:
# Refit the MLP pipeline with the best grid-search parameters and plot its training loss.
# Strip the "mlp__" step prefix so the values can be passed to MLPClassifier directly.
params = {name.split("__")[1]: value for name, value in mlp_clf.best_params_.items()}
# Seeded so the plotted curve is repeatable; a random_state found in params would win.
model = MLPClassifier(max_iter=500, **{"random_state": 42, **params})
transformations = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ss', StandardScaler(), numerical_features)
    ]
)
steps = [('tf', transformations), ('mlp', model)]
pipeline = Pipeline(steps)
mlp_model_loss_curve = pipeline.fit(x_train, y_train)

plt.title("Dataset 1 MLP Loss Curve")
plt.xlabel("$Epochs$")
plt.ylabel("Loss")
plt.plot(mlp_model_loss_curve["mlp"].loss_curve_)
plt.savefig("mlp1_losscurve")

# KNN

In [None]:
# First KNN search: small neighbourhoods, both distance metrics and both weightings.
transformations = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ss', StandardScaler(), numerical_features),
    ]
)
model = KNeighborsClassifier()
steps = [('tf', transformations), ('knn', model)]
pipeline = Pipeline(steps)
parameters = dict(
    knn__n_neighbors=[3, 4, 5],
    knn__p=[1, 2],  # Manhattan vs Euclidean
    knn__weights=["uniform", "distance"],
)
skf = StratifiedKFold(n_splits=5)
knn_clf = GridSearchCV(
    pipeline,
    param_grid=parameters,
    cv=skf,
    scoring='roc_auc',
    verbose=2,
    n_jobs=-1,
    error_score='raise',
    return_train_score=True,
)
knn_clf.fit(x_train, y_train)

In [None]:
# Keep a reference to the first KNN grid search; knn_clf is rebound to a second
# search (n_neighbors 3-20, Euclidean, uniform weights) in the following cell.
knn_pre = knn_clf

In [None]:
# Second KNN search: metric and weighting are fixed, only the neighbourhood size varies.
transformations = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ss', StandardScaler(), numerical_features),
    ]
)
model = KNeighborsClassifier()
steps = [('tf', transformations), ('knn', model)]
pipeline = Pipeline(steps)
parameters = dict(
    knn__n_neighbors=np.arange(3, 21, 1),
    knn__p=[2],  # Euclidean
    knn__weights=["uniform"],
)
skf = StratifiedKFold(n_splits=5)
knn_clf = GridSearchCV(
    pipeline,
    param_grid=parameters,
    cv=skf,
    scoring='roc_auc',
    verbose=3,
    n_jobs=-1,
    error_score='raise',
    return_train_score=True,
)
knn_clf.fit(x_train, y_train)

In [None]:
# Report the KNN grid-search result on the training data; "knn1_rocauc" is the output file name.
model_eval(clf=knn_clf, x=x_train, y=y_train, filename="knn1_rocauc")

In [None]:
def create_validation_knn(x, y, clf, key, values):
    """Score the KNN pipeline while sweeping a single hyper-parameter.

    The remaining KNN settings come from ``clf.best_params_``; the swept one is removed
    from that set (``KeyError`` if it is not present) and varied through ``values`` by
    ``vc`` (presumably scikit-learn's ``validation_curve`` -- confirm the alias).

    Args:
        x: Training features, forwarded to ``vc`` as ``X``.
        y: Training labels, forwarded to ``vc`` as ``y``.
        clf: Fitted search object exposing ``best_params_`` keyed as ``"knn__<name>"``.
        key: Pipeline-qualified name of the parameter to sweep, e.g. ``"knn__n_neighbors"``.
        values: Candidate values for ``key``.

    Returns:
        ``(train_scores_list, test_scores_list)``: two one-element lists wrapping the
        train and test scores returned by ``vc``.
    """
    swept_name = key.split("__")[1]

    # Drop the "<step>__" prefix so the values can be handed to KNeighborsClassifier.
    fixed_params = {
        qualified_name.split("__")[1]: value
        for qualified_name, value in clf.best_params_.items()
    }
    fixed_params.pop(swept_name)  # vc supplies this one per candidate

    preprocessing = ColumnTransformer(
        transformers=[
            ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('ss', StandardScaler(), numerical_features),
        ]
    )
    pipeline = Pipeline([
        ('tf', preprocessing),
        ('knn', KNeighborsClassifier(**fixed_params)),
    ])

    train_scores, test_scores = vc(
        estimator=pipeline,
        X=x,
        y=y,
        param_name=key,
        param_range=values,
        scoring="roc_auc",
        verbose=0,
    )
    return [train_scores], [test_scores]

### KNN No. Neighbors

In [None]:
# Sweep the neighbourhood size with the other KNN settings taken from knn_clf.best_params_.
knn_train_scores_n_neighbors, knn_test_scores_n_neighbors = create_validation_knn(
    x=x_train,
    y=y_train,
    clf=knn_clf,
    key="knn__n_neighbors",
    values=np.arange(3, 21, 1),
)

In [None]:
# Mean and standard deviation of the scores along axis 1 for each n_neighbors value
# (presumably the CV folds, if vc is scikit-learn's validation_curve -- confirm).
# Element [0] is the single score matrix wrapped by create_validation_knn.
knn_train_scores_mean_n_neighbors = np.mean(knn_train_scores_n_neighbors[0], axis=1)
knn_train_scores_std_n_neighbors = np.std(knn_train_scores_n_neighbors[0], axis=1)
knn_test_scores_mean_n_neighbors = np.mean(knn_test_scores_n_neighbors[0], axis=1)
knn_test_scores_std_n_neighbors = np.std(knn_test_scores_n_neighbors[0], axis=1)
knn_param_range_n_neighbors = np.arange(3, 21, 1)

plt.title("Validation Curve - KNN No. Neighbors")
# Plain italic text instead of "$No. Neighbors$": inside $...$ mathtext ignores
# whitespace, so the label was rendered as "No.Neighbors".
plt.xlabel("No. Neighbors", fontstyle="italic")
plt.ylabel("ROC-AUC")
plt.ylim(0.8, 1.1)
plt.plot(knn_param_range_n_neighbors, knn_train_scores_mean_n_neighbors, label="Training score", color="r")
plt.fill_between(knn_param_range_n_neighbors, knn_train_scores_mean_n_neighbors - knn_train_scores_std_n_neighbors,
                 knn_train_scores_mean_n_neighbors + knn_train_scores_std_n_neighbors, alpha=0.2, color="r")
plt.plot(knn_param_range_n_neighbors, knn_test_scores_mean_n_neighbors, label="Cross-validation score",
             color="g")
plt.fill_between(knn_param_range_n_neighbors, knn_test_scores_mean_n_neighbors - knn_test_scores_std_n_neighbors,
                 knn_test_scores_mean_n_neighbors + knn_test_scores_std_n_neighbors, alpha=0.2, color="g")
plt.legend(loc="best")
plt.savefig("knn1_neighbors")

In [None]:
# Plot and save the ROC-AUC learning curve of the best KNN pipeline held by knn_clf.
learning_curve(best_estimator=knn_clf.best_estimator_, name="KNN")

# SVM

In [None]:
# SVM grid search, run on a 5% subsample of the training data.
transformations = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ss', StandardScaler(), numerical_features)
    ]
)
model = SVC(random_state=42, class_weight='balanced', probability=True)
steps = [('tf', transformations), ('svm', model)]
pipeline = Pipeline(steps)
parameters = {
    "svm__kernel": ["rbf", "linear", "poly"],
    "svm__gamma": [0.1, 1, 10],
    "svm__C": [0.1, 1, 10],
    "svm__degree": [1, 2, 3, 4, 5]
}
skf = StratifiedKFold(n_splits=5)
svm_clf = GridSearchCV(pipeline, param_grid=parameters, cv=skf, scoring='roc_auc', verbose=3, n_jobs=-1, error_score='raise', return_train_score=True)

# reset_index returns a new frame with a 0..n-1 index, so x_train itself is untouched
# and the index of the sample below holds row *positions* within the training set.
x_train_copy = x_train.reset_index(drop=True)
x_train_sample = x_train_copy.sample(frac=0.05, random_state=42)

# Pick the matching labels by position. Plain y_train[...] on a pandas Series is a
# label lookup, which only matches these positions when y_train has a 0..n-1 index;
# .iloc is positional regardless of the index. Non-pandas arrays are already positional.
_sample_positions = x_train_sample.index.to_numpy()
if hasattr(y_train, "iloc"):
    y_train_sample = y_train.iloc[_sample_positions]
else:
    y_train_sample = y_train[_sample_positions]
svm_clf.fit(x_train_sample, y_train_sample)

In [None]:
# Report the SVM grid-search result on the full training data; "svm1_rocauc" is the output file name.
model_eval(clf=svm_clf, x=x_train, y=y_train, filename="svm1_rocauc")

In [None]:
def create_validation_svm(x, y, clf, key, values):
    """Score the SVM pipeline while sweeping a single hyper-parameter.

    The remaining SVC settings come from ``clf.best_params_``; the swept one is removed
    from that set (``KeyError`` if it is not present) and varied through ``values`` by
    ``vc`` (presumably scikit-learn's ``validation_curve`` -- confirm the alias).

    Args:
        x: Training features, forwarded to ``vc`` as ``X``.
        y: Training labels, forwarded to ``vc`` as ``y``.
        clf: Fitted search object exposing ``best_params_`` keyed as ``"svm__<name>"``.
        key: Pipeline-qualified name of the parameter to sweep, e.g. ``"svm__C"``.
        values: Candidate values for ``key``.

    Returns:
        ``(train_scores_list, test_scores_list)``: two one-element lists wrapping the
        train and test scores returned by ``vc``.
    """
    swept_name = key.split("__")[1]

    # Drop the "<step>__" prefix so the values can be handed to SVC.
    fixed_params = {
        qualified_name.split("__")[1]: value
        for qualified_name, value in clf.best_params_.items()
    }
    fixed_params.pop(swept_name)  # vc supplies this one per candidate

    preprocessing = ColumnTransformer(
        transformers=[
            ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('ss', StandardScaler(), numerical_features),
        ]
    )
    classifier = SVC(random_state=42, class_weight='balanced', probability=True, **fixed_params)
    pipeline = Pipeline([('tf', preprocessing), ('svm', classifier)])

    train_scores, test_scores = vc(
        estimator=pipeline,
        X=x,
        y=y,
        param_name=key,
        param_range=values,
        scoring="roc_auc",
        verbose=0,
    )
    return [train_scores], [test_scores]

### SVM Kernel

In [None]:
# Sweep the kernel on the training subsample; other SVC settings come from svm_clf.best_params_.
svm_train_scores_kernel, svm_test_scores_kernel = create_validation_svm(
    x=x_train_sample,
    y=y_train_sample,
    clf=svm_clf,
    key="svm__kernel",
    values=["rbf", "linear", "poly"],
)

In [None]:
# Mean / std of the scores along axis 1 for each kernel; [0] unwraps the single
# score matrix returned (inside a list) by create_validation_svm.
svm_train_scores_mean_kernel, svm_train_scores_std_kernel = (
    _stat(svm_train_scores_kernel[0], axis=1) for _stat in (np.mean, np.std)
)
svm_test_scores_mean_kernel, svm_test_scores_std_kernel = (
    _stat(svm_test_scores_kernel[0], axis=1) for _stat in (np.mean, np.std)
)
svm_param_range_kernel = ["rbf", "linear", "poly"]

plt.title("Dataset 1 SVM Kernel")
plt.xlabel("$Kernel$")
plt.ylabel("ROC-AUC")
plt.ylim(0.0, 1.1)

# Grouped bars: training bar shifted left of each tick, cross-validation bar shifted right.
width = 0.2
_X = np.arange(len(svm_param_range_kernel))
for _shift, _heights, _label, _color in (
    (-width, svm_train_scores_mean_kernel, "Training score", "r"),
    (width, svm_test_scores_mean_kernel, "Cross-validation score", "g"),
):
    plt.bar(_X + _shift, _heights, width=0.25, label=_label, color=_color)
plt.legend(loc="best")
plt.xticks(_X, svm_param_range_kernel)  # categorical tick labels
plt.savefig("svm1_kernel")

### SVM Degree

In [None]:
# Sweep the polynomial degree on the training subsample; other SVC settings come from svm_clf.best_params_.
svm_train_scores_degree, svm_test_scores_degree = create_validation_svm(
    x=x_train_sample,
    y=y_train_sample,
    clf=svm_clf,
    key="svm__degree",
    values=[1, 2, 3, 4, 5],
)

In [None]:
# Mean / std of the scores along axis 1 for each degree; [0] unwraps the single
# score matrix returned (inside a list) by create_validation_svm.
svm_train_scores_mean_degree, svm_train_scores_std_degree = (
    _stat(svm_train_scores_degree[0], axis=1) for _stat in (np.mean, np.std)
)
svm_test_scores_mean_degree, svm_test_scores_std_degree = (
    _stat(svm_test_scores_degree[0], axis=1) for _stat in (np.mean, np.std)
)
svm_param_range_degree = [1, 2, 3, 4, 5]

plt.title("Validation Curve - SVM Degree")
plt.xlabel("$Degree$")
plt.ylabel("ROC-AUC")
plt.ylim(0.5, 1.1)
# Each curve gets a +/- one-std band in the same colour.
for _mean, _std, _label, _color in (
    (svm_train_scores_mean_degree, svm_train_scores_std_degree, "Training score", "r"),
    (svm_test_scores_mean_degree, svm_test_scores_std_degree, "Cross-validation score", "g"),
):
    plt.plot(svm_param_range_degree, _mean, label=_label, color=_color)
    plt.fill_between(svm_param_range_degree, _mean - _std, _mean + _std, alpha=0.2, color=_color)
plt.legend(loc="best")
plt.savefig("svm1_degree")

### SVM C

In [None]:
# Sweep C on the training subsample; other SVC settings come from svm_clf.best_params_.
svm_train_scores_C, svm_test_scores_C = create_validation_svm(
    x=x_train_sample,
    y=y_train_sample,
    clf=svm_clf,
    key="svm__C",
    values=[0.1, 1, 10],
)

In [None]:
# Mean / std of the scores along axis 1 for each C; [0] unwraps the single
# score matrix returned (inside a list) by create_validation_svm.
svm_train_scores_mean_C, svm_train_scores_std_C = (
    _stat(svm_train_scores_C[0], axis=1) for _stat in (np.mean, np.std)
)
svm_test_scores_mean_C, svm_test_scores_std_C = (
    _stat(svm_test_scores_C[0], axis=1) for _stat in (np.mean, np.std)
)
svm_param_range_C = [0.1, 1, 10]

plt.title("Dataset 1 SVM C")
plt.xlabel("$C$")
plt.ylabel("ROC-AUC")
plt.ylim(0.5, 1.1)
# Log-scaled x axis; each curve gets a +/- one-std band in the same colour.
for _mean, _std, _label, _color in (
    (svm_train_scores_mean_C, svm_train_scores_std_C, "Training score", "r"),
    (svm_test_scores_mean_C, svm_test_scores_std_C, "Cross-validation score", "g"),
):
    plt.semilogx(svm_param_range_C, _mean, label=_label, color=_color)
    plt.fill_between(svm_param_range_C, _mean - _std, _mean + _std, alpha=0.2, color=_color)
plt.legend(loc="best")
plt.savefig("svm1_C")

### SVM Gamma

In [None]:
# Sweep gamma on the same training subsample that the SVM grid search and the kernel,
# degree and C sweeps use, so the four validation curves are computed on identical data
# (the full x_train / y_train was passed here before, unlike every other SVM sweep).
svm_train_scores_gamma, svm_test_scores_gamma = create_validation_svm(x_train_sample, y_train_sample, svm_clf, "svm__gamma", [0.1, 1, 10])

In [None]:
# Mean / std of the scores along axis 1 for each gamma; [0] unwraps the single
# score matrix returned (inside a list) by create_validation_svm.
svm_train_scores_mean_gamma, svm_train_scores_std_gamma = (
    _stat(svm_train_scores_gamma[0], axis=1) for _stat in (np.mean, np.std)
)
svm_test_scores_mean_gamma, svm_test_scores_std_gamma = (
    _stat(svm_test_scores_gamma[0], axis=1) for _stat in (np.mean, np.std)
)
svm_param_range_gamma = [0.1, 1, 10]

plt.title("Validation Curve - SVM Gamma")
plt.xlabel("$Gamma$")
plt.ylabel("ROC-AUC")
plt.ylim(0.5, 1.1)
# Log-scaled x axis; each curve gets a +/- one-std band in the same colour.
for _mean, _std, _label, _color in (
    (svm_train_scores_mean_gamma, svm_train_scores_std_gamma, "Training score", "r"),
    (svm_test_scores_mean_gamma, svm_test_scores_std_gamma, "Cross-validation score", "g"),
):
    plt.semilogx(svm_param_range_gamma, _mean, label=_label, color=_color)
    plt.fill_between(svm_param_range_gamma, _mean - _std, _mean + _std, alpha=0.2, color=_color)
plt.legend(loc="best")
plt.savefig("svm1_gamma")

In [None]:
# Plot and save the ROC-AUC learning curve of the best SVM pipeline held by svm_clf.
learning_curve(best_estimator=svm_clf.best_estimator_, name="SVM")

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Refit XGBoost on the training data with the best grid-search parameters.
# The "<step>__" prefix is stripped so the values can be passed to the estimator directly.
params = {name.split("__")[1]: value for name, value in xgboost_clf.best_params_.items()}
model = XGBClassifier(scale_pos_weight=target_ratio, eval_metric='auc', seed=42, objective='binary:logistic', **params)
transformations = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ss', StandardScaler(), numerical_features)
    ]
)
steps = [('tf', transformations), ('xgboost', model)]
pipeline = Pipeline(steps)
xgboost_model = pipeline.fit(x_train, y_train)


# Refit the MLP the same way. It is seeded (unless the search tuned random_state itself)
# so the refit model, and therefore its reported test score, is repeatable like the
# other seeded estimators in this notebook.
params = {name.split("__")[1]: value for name, value in mlp_clf.best_params_.items()}
model = MLPClassifier(max_iter=500, **{"random_state": 42, **params})
transformations = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ss', StandardScaler(), numerical_features)
    ]
)
steps = [('tf', transformations), ('mlp', model)]
pipeline = Pipeline(steps)
mlp_model = pipeline.fit(x_train, y_train)

In [None]:
# Refit the decision tree on the training data with the best grid-search parameters
# ("dt__" prefix stripped so they can be passed to the estimator directly).
params = {name.split("__")[1]: value for name, value in dt_clf.best_params_.items()}
model = DecisionTreeClassifier(class_weight='balanced', random_state=42, **params)
transformations = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ss', StandardScaler(), numerical_features),
    ]
)
steps = [('tf', transformations), ('dt', model)]
pipeline = Pipeline(steps)
dt_model = pipeline.fit(x_train, y_train)

In [None]:
# Refit KNN on the training data with the best grid-search parameters
# ("knn__" prefix stripped so they can be passed to the estimator directly).
params = {name.split("__")[1]: value for name, value in knn_clf.best_params_.items()}
model = KNeighborsClassifier(**params)
transformations = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ss', StandardScaler(), numerical_features),
    ]
)
steps = [('tf', transformations), ('knn', model)]
pipeline = Pipeline(steps)
knn_model = pipeline.fit(x_train, y_train)

In [None]:
# Refit the SVM on the full training data with the best grid-search parameters
# ("svm__" prefix stripped so they can be passed to the estimator directly).
params = {name.split("__")[1]: value for name, value in svm_clf.best_params_.items()}
model = SVC(random_state=42, class_weight='balanced', probability=True, **params)
transformations = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('ss', StandardScaler(), numerical_features),
    ]
)
steps = [('tf', transformations), ('svm', model)]
pipeline = Pipeline(steps)
svm_model = pipeline.fit(x_train, y_train)

In [None]:
# Test-set ROC-AUC of every refit model, scored on the probability in column 1 of
# predict_proba. The former bare `<model>.predict(x_test)` calls were dropped: their
# results were discarded, so they only added prediction time.
xgboost_rocauc_test = roc_auc_score(y_test, xgboost_model.predict_proba(x_test)[:, 1])
print(f"Xgboost: {xgboost_rocauc_test}")

mlp_rocauc_test = roc_auc_score(y_test, mlp_model.predict_proba(x_test)[:, 1])
print(f"MLP: {mlp_rocauc_test}")

dt_rocauc_test = roc_auc_score(y_test, dt_model.predict_proba(x_test)[:, 1])
print(f"Decision Tree: {dt_rocauc_test}")

knn_rocauc_test = roc_auc_score(y_test, knn_model.predict_proba(x_test)[:, 1])
print(f"KNN: {knn_rocauc_test}")

svm_rocauc_test = roc_auc_score(y_test, svm_model.predict_proba(x_test)[:, 1])
print(f"SVM: {svm_rocauc_test}")