In [1]:
import os
import pandas as pd
import optuna
import numpy as np
import openml
import logging
import matplotlib.pyplot as plt
import seaborn as sns
import kaleido
import plotly
import scipy.stats as stats
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from optuna.samplers import TPESampler, CmaEsSampler
from xgboost import XGBClassifier
from plotly.io import show
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Raise Optuna's logging threshold to CRITICAL so that lower-severity
# messages are not printed while the studies below run.
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

In [3]:
# OpenML dataset identifiers used by every experiment in this notebook.
# Each id is paired with the dataset's name purely for readability; only the
# ids end up in `dataset_ids`, in the order listed here.
dataset_ids = [
    openml_id
    for openml_id, _dataset_name in (
        (1590, "Adult"),
        (1510, "Breast Cancer Wisconsin (Diagnostic)"),
        (1461, "Bank Marketing"),
        (24, "Mushroom"),
        (40945, "Titanic"),
    )
]

In [4]:
def fetch_and_prepare(openml_id):
    """Download an OpenML dataset and return it as one fully numeric frame.

    Processing steps, in order:

    1. Fetch the features and the dataset's default target from OpenML.
    2. Label-encode every ``category``/``object`` feature column. Values are
       cast to ``str`` first, so a missing value becomes a label of its own.
    3. Mean-impute the remaining missing values.
    4. Min-max scale every feature.
    5. Label-encode the target when its dtype is ``category``.

    The function keeps all intermediate data local: it neither reads nor
    writes notebook-level variables.

    NOTE(review): the imputer and the scaler are fitted on the whole dataset,
    i.e. before the train/test split performed by the callers, so held-out
    rows influence the preprocessing statistics.

    Parameters
    ----------
    openml_id : int
        OpenML dataset identifier.

    Returns
    -------
    pandas.DataFrame
        The preprocessed feature columns followed by a final ``'Target'``
        column holding the (possibly label-encoded) target.
    """
    dataset = openml.datasets.get_dataset(openml_id)
    features, target, _, _ = dataset.get_data(
        target=dataset.default_target_attribute, dataset_format='dataframe'
    )

    # Encode on a copy so the frame handed back by openml is never modified,
    # in case that object is shared or cached by the library.
    features = features.copy()
    for col in features.select_dtypes(include=['category', 'object']):
        features[col] = LabelEncoder().fit_transform(features[col].astype(str))

    imputed = SimpleImputer(strategy='mean').fit_transform(features)
    scaled = MinMaxScaler().fit_transform(imputed)
    prepared = pd.DataFrame(scaled, columns=features.columns)

    if target.dtype == 'category':
        target = LabelEncoder().fit_transform(target)

    # `prepared` carries a fresh RangeIndex, so the target is attached by
    # position; assigning a Series would align on index labels instead and
    # could silently introduce NaN.
    prepared['Target'] = np.asarray(target)
    return prepared

In [5]:
def objective(trial):
    """Optuna objective for the shared ("default") KNN configuration.

    One ``n_neighbors`` value is suggested per trial and scored on every
    dataset in ``dataset_ids``: each dataset is split 70/30, the 30% part is
    left untouched, and the 70% part is scored with 5-fold cross-validated
    ROC AUC. The trial value is the mean of the per-dataset mean AUCs.

    NOTE: a later cell defines another function that is also called
    ``objective``; re-run this cell before re-running the default study.
    NOTE(review): every trial calls ``fetch_and_prepare`` again for each
    dataset, so the same data is prepared once per trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_neighbors`` from the range 10..30.

    Returns
    -------
    float
        Mean cross-validated AUC across all datasets.
    """
    # Both are independent of the dataset, so they are created once per trial.
    n_neighbors = trial.suggest_int('n_neighbors', 10, 30)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    calkowity_rezultat = []
    for openml_id in dataset_ids:
        zbior = fetch_and_prepare(openml_id)

        # Only the training part is used here; the hold-out part is reserved
        # for the final evaluation done outside this function.
        X_train, _, y_train, _ = train_test_split(
            zbior.iloc[:, :-1], zbior['Target'], test_size=0.30, random_state=42
        )

        # Plain arrays allow positional fold indexing without building
        # per-row Python tuples.
        X_values = X_train.to_numpy()
        y_values = y_train.to_numpy()

        wyniki = []
        for train_index, test_index in kf.split(X_values):
            model = KNeighborsClassifier(n_neighbors=n_neighbors)
            model.fit(X_values[train_index], y_values[train_index])

            # Column 1 is the probability of the second class; this assumes
            # a binary target.
            y_pred_proba = model.predict_proba(X_values[test_index])[:, 1]
            wyniki.append(roc_auc_score(y_values[test_index], y_pred_proba))

        calkowity_rezultat.append(np.mean(wyniki))

    return np.mean(calkowity_rezultat)

In [6]:
# Search for the single n_neighbors value that maximizes the objective, i.e.
# the shared "default" configuration. The sampler is seeded so that a fresh
# Restart & Run All samples the same sequence of trials.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
# optimize() returns None; the results are read from `study` afterwards.
study.optimize(objective, n_trials=32)

In [7]:
# Default configuration.
# Compute the hold-out AUC of KNN on every dataset, using the n_neighbors
# value stored as the best parameter of `study`.
default_n_neighbors = study.best_params["n_neighbors"]

zbior_auc_def = []
for i in dataset_ids:
    zbior = fetch_and_prepare(i)
    X_train, X_test, y_train, y_test = train_test_split(
        zbior.iloc[:, :-1], zbior['Target'], test_size=0.30, random_state=42
    )
    model = KNeighborsClassifier(n_neighbors=default_n_neighbors).fit(X_train, y_train)
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    # Store plain Python floats so the printed list stays compact.
    zbior_auc_def.append(float(auc))

print(zbior_auc_def)
# `zbior_auc_def` now holds, per dataset, the AUC that KNN reaches with the
# default configuration.

[0.8877635496498153, 0.9939006466784244, 0.7881841427222772, 0.9999794545970171, 0.9453719357565511]


## Tunowalność

In [8]:
def objective(trial):
    """Optuna objective for tuning KNN on a single dataset.

    Scores one ``n_neighbors`` value with 5-fold cross-validated ROC AUC on
    the current training split.

    The data is NOT passed in: the function reads the notebook-level
    ``X_train`` and ``y_train``, which therefore must be assigned before the
    study is run. This definition also replaces the earlier function of the
    same name in the notebook namespace.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_neighbors`` from the range 10..30.

    Returns
    -------
    float
        Mean AUC over the five folds.
    """
    n_neighbors = trial.suggest_int('n_neighbors', 10, 30)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Plain arrays allow positional fold indexing without building per-row
    # Python tuples.
    X_values = X_train.to_numpy()
    y_values = y_train.to_numpy()

    wyniki_auc = []
    for train_index, test_index in kf.split(X_values):
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_values[train_index], y_values[train_index])

        # Column 1 is the probability of the second class; this assumes a
        # binary target.
        y_pred_proba = model.predict_proba(X_values[test_index])[:, 1]
        wyniki_auc.append(roc_auc_score(y_values[test_index], y_pred_proba))

    return np.mean(wyniki_auc)

In [9]:
# Tuned configuration: run a separate study per dataset and record the
# hold-out AUC obtained with that dataset's best n_neighbors.
zbior_auc_opt = []
for i in dataset_ids:
    zbior = fetch_and_prepare(i)
    # X_train / y_train must remain notebook-level names: objective() reads
    # them instead of receiving the data as arguments.
    X_train, X_test, y_train, y_test = train_test_split(
        zbior.iloc[:, :-1], zbior['Target'], test_size=0.30, random_state=42
    )

    # A dedicated name keeps the default-configuration `study` intact, so the
    # cell that reads its best parameters can be re-run after this one.
    # The sampler is seeded to make the search reproducible.
    dataset_study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
    # optimize() returns None; the results are read from `dataset_study`.
    dataset_study.optimize(objective, n_trials=32)

    KNN = KNeighborsClassifier(n_neighbors=dataset_study.best_params["n_neighbors"])
    model = KNN.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    # Store plain Python floats so the printed list stays compact.
    zbior_auc_opt.append(float(auc))

print(zbior_auc_opt)

[0.8877635496498153, 0.9939741328630218, 0.7903717656848765, 1.0, 0.9451606086221471]


In [10]:
# Per-dataset difference: AUC with the tuned configuration minus AUC with
# the default configuration (positive means tuning helped).
roznica = [
    zbior_auc_opt[idx] - auc_default
    for idx, auc_default in enumerate(zbior_auc_def)
]
print(roznica)

[0.0, 7.348618459745992e-05, 0.0021876229625993338, 2.0545402982907746e-05, -0.00021132713440397133]
