In [52]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

In [53]:
file_path = 'Data/dane.csv'

heart_test = pd.read_csv('Data/heart_test.csv')
heart_train = pd.read_csv('Data/heart_train.csv')
diabetes_test = pd.read_csv('Data/diabetes_test.csv')
diabetes_train = pd.read_csv('Data/diabetes_train.csv')
cancer_test = pd.read_csv('Data/cancer_test.csv')
cancer_train = pd.read_csv('Data/cancer_train.csv')
alzheimer_test = pd.read_csv('Data/alzheimer_test.csv')
alzheimer_train = pd.read_csv('Data/alzheimer_train.csv')

datasets = {
    "heart": (heart_train, heart_test),
    "diabetes": (diabetes_train, diabetes_test),
    "cancer": (cancer_train, cancer_test),
    "alzheimer": (alzheimer_train, alzheimer_test)
}

In [54]:
from sklearn.model_selection import train_test_split

# Training sets for 25%, 50%, 75%
heart_train25, _ = train_test_split(heart_train, train_size=0.25, random_state=42, stratify=heart_train.iloc[:, -1])
heart_train50, _ = train_test_split(heart_train, train_size=0.50, random_state=42, stratify=heart_train.iloc[:, -1])
heart_train75, _ = train_test_split(heart_train, train_size=0.75, random_state=42, stratify=heart_train.iloc[:, -1])

diabetes_train25, _ = train_test_split(diabetes_train, train_size=0.25, random_state=42, stratify=diabetes_train.iloc[:, -1])
diabetes_train50, _ = train_test_split(diabetes_train, train_size=0.50, random_state=42, stratify=diabetes_train.iloc[:, -1])
diabetes_train75, _ = train_test_split(diabetes_train, train_size=0.75, random_state=42, stratify=diabetes_train.iloc[:, -1])

cancer_train25, _ = train_test_split(cancer_train, train_size=0.25, random_state=42, stratify=cancer_train.iloc[:, -1])
cancer_train50, _ = train_test_split(cancer_train, train_size=0.50, random_state=42, stratify=cancer_train.iloc[:, -1])
cancer_train75, _ = train_test_split(cancer_train, train_size=0.75, random_state=42, stratify=cancer_train.iloc[:, -1])

alzheimer_train25, _ = train_test_split(alzheimer_train, train_size=0.25, random_state=42, stratify=alzheimer_train.iloc[:, -1])
alzheimer_train50, _ = train_test_split(alzheimer_train, train_size=0.50, random_state=42, stratify=alzheimer_train.iloc[:, -1])
alzheimer_train75, _ = train_test_split(alzheimer_train, train_size=0.75, random_state=42, stratify=alzheimer_train.iloc[:, -1])


datasets25 = {
    "heart": (heart_train25, heart_test),
    "diabetes": (diabetes_train25, diabetes_test),
    "cancer": (cancer_train25, cancer_test),
    "alzheimer": (alzheimer_train25, alzheimer_test)
}

datasets50 = {
    "heart": (heart_train50, heart_test),
    "diabetes": (diabetes_train50, diabetes_test),
    "cancer": (cancer_train50, cancer_test),
    "alzheimer": (alzheimer_train50, alzheimer_test)
}

datasets75 = {
    "heart": (heart_train75, heart_test),
    "diabetes": (diabetes_train75, diabetes_test),
    "cancer": (cancer_train75, cancer_test),
    "alzheimer": (alzheimer_train75, alzheimer_test)
}

# Uniform

In [55]:
n_random = 100
np.random.seed(42)

from scipy.stats import randint

param_dist_knn = {
    'n_neighbors': randint(1, 31),           # liczba sąsiadów w zakresie [1, 30]
    'weights': ['uniform', 'distance'],      # sposób ważenia sąsiadów
    'p': randint(1, 3),                      # 1 = Manhattan, 2 = Euklides
}



In [56]:
all_results = []

for name, (train, test) in datasets75.items():
    print(f"Trenuję model KNN dla: {name}")

    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    model = KNeighborsClassifier()

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist_knn,
        n_iter=100,
        scoring='roc_auc',
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )

    search.fit(X_train, y_train)

    cv_results = pd.DataFrame(search.cv_results_)

    for i, params in enumerate(search.cv_results_['params']):
        tmp_model = KNeighborsClassifier(**params)
        tmp_model.fit(X_train, y_train)
        y_proba = tmp_model.predict_proba(X_test)[:, 1]
        test_auc = roc_auc_score(y_test, y_proba)
    
        result = {
            "dataset": name,
            "cv_roc_auc": search.cv_results_['mean_test_score'][i],
            "test_roc_auc": test_auc
        }
    
        for param_name, param_value in params.items():
            result[param_name] = param_value
    
        all_results.append(result)


results_df = pd.DataFrame(all_results)

Trenuję model KNN dla: heart
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Trenuję model KNN dla: diabetes
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Trenuję model KNN dla: cancer
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Trenuję model KNN dla: alzheimer
Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [57]:
best_per_dataset = (
    results_df
    .sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False])
    .groupby("dataset", as_index=False)
    .first()
)

param_cols = [col for col in results_df.columns if col not in ["dataset", "cv_roc_auc", "test_roc_auc"]]
params_df = best_per_dataset[param_cols]

aggregated_params = {}

for col in params_df.columns:
    if col.lower() in ["n_neighbors"]:
        # Średnia i zaokrąglenie do najbliższej liczby całkowitej
        aggregated_params[col] = int(round(params_df[col].mean()))
    else:
        aggregated_params[col] = params_df[col].mode().iloc[0]


mean_params = pd.Series(aggregated_params)

print("Średnie najlepsze parametry:")
print(mean_params)


Średnie najlepsze parametry:
n_neighbors          26
p                     1
weights        distance
dtype: object


In [58]:
mean_results = []

print(f"Testuję wspólne średnie parametry: {mean_params.to_dict()}")

for name, (train, test) in datasets75.items():
    print(f"\nTrenuję model KNN na średnich parametrach dla: {name}")

    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    model = KNeighborsClassifier(**mean_params.to_dict())
    model.fit(X_train, y_train)

    y_proba = model.predict_proba(X_test)[:, 1]
    mean_auc = roc_auc_score(y_test, y_proba)

    mean_results.append({
        "dataset": name,
        "star_test_roc_auc": mean_auc
    })

mean_df = pd.DataFrame(mean_results)

print("\nWyniki AUC dla wspólnych średnich parametrów na wszystkich zbiorach danych:")
print(mean_df)

Testuję wspólne średnie parametry: {'n_neighbors': 26, 'p': 1, 'weights': 'distance'}

Trenuję model KNN na średnich parametrach dla: heart

Trenuję model KNN na średnich parametrach dla: diabetes

Trenuję model KNN na średnich parametrach dla: cancer

Trenuję model KNN na średnich parametrach dla: alzheimer

Wyniki AUC dla wspólnych średnich parametrów na wszystkich zbiorach danych:
     dataset  star_test_roc_auc
0      heart           0.675536
1   diabetes           0.814090
2     cancer           0.764626
3  alzheimer           0.738581


In [59]:
results_df = results_df.merge(mean_df, on="dataset")
results_df["diff_from_star"] = results_df["star_test_roc_auc"] - results_df["test_roc_auc"]
results_df

results_df = results_df[
    ["dataset", "n_neighbors", "p", "weights", "cv_roc_auc", "test_roc_auc", "star_test_roc_auc", "diff_from_star"]
]

results_df.to_csv("Results/knn_uniform_75.csv", index=False)

In [60]:
results_df

Unnamed: 0,dataset,n_neighbors,p,weights,cv_roc_auc,test_roc_auc,star_test_roc_auc,diff_from_star
0,heart,7,2,uniform,0.633447,0.662949,0.675536,0.012587
1,heart,15,1,distance,0.666080,0.683061,0.675536,-0.007525
2,heart,29,1,uniform,0.678546,0.660894,0.675536,0.014643
3,heart,26,1,uniform,0.669010,0.667430,0.675536,0.008107
4,heart,11,1,distance,0.654988,0.684070,0.675536,-0.008533
...,...,...,...,...,...,...,...,...
395,alzheimer,17,2,uniform,0.743928,0.711494,0.738581,0.027087
396,alzheimer,27,1,distance,0.765375,0.738355,0.738581,0.000227
397,alzheimer,2,2,distance,0.628293,0.599062,0.738581,0.139519
398,alzheimer,23,1,distance,0.763424,0.732018,0.738581,0.006564


In [61]:
# Creating a short summary dataset
results_df =  pd.read_csv("Results/knn_uniform_75.csv")

# Best parameters for each dataset
best_per_dataset = (
    results_df.sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False])
    .groupby("dataset", as_index=False)
    .first()
    .drop(['cv_roc_auc', 'diff_from_star'], axis=1)
)

In [62]:
# Deafault model

default_results = []
for name, (train, test) in datasets75.items():
    
    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    model = KNeighborsClassifier()
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_proba)

    default_results.append({
        "dataset": name,
        "default_test_roc_auc": score
    })

default_df = pd.DataFrame(default_results)

In [63]:
summary_df = best_per_dataset.merge(default_df, on="dataset")

In [64]:
# STAR row
mean_row = {
    "dataset": "STAR",
    **mean_params.to_dict(),
    # "ccp_alpha" :None, "max_depth": None, "min_samples_leaf": None, "min_samples_split": None,
    "test_roc_auc": None,
    "star_test_roc_auc": mean_df["star_test_roc_auc"].mean(),
    #"star_test_roc_auc": None,
    "default_test_roc_auc": None
}

summary_df = pd.concat([summary_df, pd.DataFrame([mean_row])], ignore_index=True)
summary_df


  summary_df = pd.concat([summary_df, pd.DataFrame([mean_row])], ignore_index=True)


Unnamed: 0,dataset,n_neighbors,p,weights,test_roc_auc,star_test_roc_auc,default_test_roc_auc
0,alzheimer,30,1,distance,0.755762,0.738581,0.639776
1,cancer,30,2,uniform,0.785398,0.764626,0.701657
2,diabetes,30,1,uniform,0.819463,0.81409,0.708716
3,heart,14,1,distance,0.687386,0.675536,0.666809
4,STAR,26,1,distance,,0.748208,


In [65]:
summary_df

Unnamed: 0,dataset,n_neighbors,p,weights,test_roc_auc,star_test_roc_auc,default_test_roc_auc
0,alzheimer,30,1,distance,0.755762,0.738581,0.639776
1,cancer,30,2,uniform,0.785398,0.764626,0.701657
2,diabetes,30,1,uniform,0.819463,0.81409,0.708716
3,heart,14,1,distance,0.687386,0.675536,0.666809
4,STAR,26,1,distance,,0.748208,


In [66]:
summary_df.to_csv("Results/knn_uniform_summary_75.csv", index=False)

# Bayesian

In [67]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.metrics import roc_auc_score

In [68]:
search_spaces = {
    'n_neighbors': Integer(1, 30),                  # zakres [1, 30]
    'weights': Categorical(['uniform', 'distance']), # wybór spośród dwóch opcji
    'p': Integer(1, 2),                             # 1 = Manhattan, 2 = Euklides
}

In [69]:
from sklearn.neighbors import KNeighborsClassifier
from skopt import BayesSearchCV
from sklearn.metrics import roc_auc_score
import pandas as pd

all_results = []

for name, (train, test) in datasets75.items():
    print(f"Trenuję model KNN dla: {name}")

    X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
    X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

    model = KNeighborsClassifier()

    search = BayesSearchCV(
        estimator=model,
        search_spaces=search_spaces,
        n_iter=100,
        scoring='roc_auc',
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )

    search.fit(X_train, y_train)

    cv_results = pd.DataFrame(search.cv_results_)

    for i, params in enumerate(search.cv_results_['params']):
        tmp_model = KNeighborsClassifier(**params)
        tmp_model.fit(X_train, y_train)
        y_proba = tmp_model.predict_proba(X_test)[:, 1]
        test_auc = roc_auc_score(y_test, y_proba)

        result = {
            "dataset": name,
            "cv_roc_auc": search.cv_results_['mean_test_score'][i],
            "test_roc_auc": test_auc
        }

        result.update(params)

        all_results.append(result)

results_df = pd.DataFrame(all_results)


Trenuję model KNN dla: heart
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of



Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Trenuję model KNN dla: diabetes
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Trenuję model KNN dla: alzheimer
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for eac



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits




Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [72]:
# results_df = results_df.merge(mean_df, on="dataset")
# results_df["diff_from_mean"] = results_df["star_test_roc_auc"] - results_df["test_roc_auc"]
# results_df

results_df = results_df[
    ["dataset", "n_neighbors", "p", "weights", "cv_roc_auc", "test_roc_auc"]
]

results_df.to_csv("Results/knn_bayes_75.csv", index=False)

In [73]:
results_df

Unnamed: 0,dataset,n_neighbors,p,weights,cv_roc_auc,test_roc_auc
0,heart,13,2,distance,0.660530,0.668496
1,heart,25,2,uniform,0.670650,0.655405
2,heart,14,2,uniform,0.665603,0.667914
3,heart,25,1,distance,0.672349,0.671192
4,heart,24,1,distance,0.676315,0.670843
...,...,...,...,...,...,...
395,alzheimer,13,2,uniform,0.733250,0.691410
396,alzheimer,28,2,uniform,0.760831,0.733167
397,alzheimer,9,2,uniform,0.722941,0.678471
398,alzheimer,7,2,distance,0.713381,0.651248


In [74]:
best_per_dataset_2 = (
    results_df.sort_values(by=["dataset", "test_roc_auc"], ascending=[True, False])
    .groupby("dataset", as_index=False)
    .first()
    .drop(['cv_roc_auc'], axis=1)
)

summary_2_df = best_per_dataset_2.merge(default_df, on="dataset")

In [75]:
summary_2_df

Unnamed: 0,dataset,n_neighbors,p,weights,test_roc_auc,default_test_roc_auc
0,alzheimer,30,1,distance,0.755762,0.639776
1,cancer,30,2,uniform,0.785398,0.701657
2,diabetes,30,1,uniform,0.819463,0.708716
3,heart,14,1,distance,0.687386,0.666809


In [76]:
summary_2_df.to_csv("Results/knn_bayes_summary_75.csv", index=False)