In [1]:
import pandas as pd

from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.utils import shuffle
from sklearn.model_selection import ValidationCurveDisplay
from sklearn.neighbors import KNeighborsClassifier
from ucimlrepo import fetch_ucirepo
import warnings
warnings.filterwarnings("ignore")

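# Plan (repeated for each of the 3 datasets):
# 1. Fetch the dataset.
# 2. Shuffle X and y together using sklearn.utils.shuffle.
# 3. Encode X (if needed) and squeeze y to a 1-D target.
# 4. Set aside data for "test" so that the amount of "train" + "validate" data stays consistent across datasets.
# 5. Use ShuffleSplit(n_splits=3 -- these are the 3 trials -- trying all 3 test_size values) with a grid search to find hyperparameters: x3 test sizes, x3 classifiers.
# 6. Set the chosen parameters.
# 7. Get the test error on the held-out test set.
# 8. Evaluate.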

In [2]:
def fetch_dataset(name):
    match name:
        case "dry_bean_dataset":
            dry_bean_dataset = fetch_ucirepo(id=602) 

            # data (as pandas dataframes) 
            X = dry_bean_dataset.data.features 
            y = dry_bean_dataset.data.targets 
            y = (y == "DERMASON").squeeze()
        case "adult":
            adult = fetch_ucirepo(id=2) 

            # data (as pandas dataframes) 
            X = adult.data.features 
            y = adult.data.targets
            y = ((y == "<=50K") | (y == "<=50K.")).squeeze()
        case "bank_marketing":
            bank_marketing = fetch_ucirepo(id=222) 
  
            # data (as pandas dataframes) 
            X = bank_marketing.data.features 
            y = bank_marketing.data.targets
            y = (y == "yes").squeeze()
    return X, y

In [3]:
def encoding(X):
    """Return ``X`` converted to dummy/indicator columns.

    Thin wrapper around ``pandas.get_dummies`` called with its default
    options; the result of that call is returned unchanged.
    """
    encoded = pd.get_dummies(X)
    return encoded

In [4]:
def split_data(X, y, pool_size=5000):
    """Split ``X`` and ``y`` positionally into a train/validate pool and a test set.

    Parameters
    ----------
    X, y
        Objects supporting ``.iloc`` slicing (e.g. DataFrame / Series).
        They are sliced at the same position, so they must be row-aligned.
    pool_size : int, default 5000
        Number of leading rows kept for training + validation. Everything
        after that becomes the test set. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    X_sum, y_sum, X_test, y_test
        The first ``pool_size`` rows of X and y, followed by the remaining
        rows of X and y. If the input has ``pool_size`` rows or fewer, the
        test parts are empty.
    """
    X_sum, X_test = X.iloc[:pool_size], X.iloc[pool_size:]
    y_sum, y_test = y.iloc[:pool_size], y.iloc[pool_size:]
    return X_sum, y_sum, X_test, y_test

In [5]:
def data_setup(X, y, random_state=None):
    """Shuffle, re-index, encode and split a dataset.

    Parameters
    ----------
    X, y
        Features and target; they are shuffled together so rows stay aligned.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``sklearn.utils.shuffle``. Pass an int to get the same
        shuffle (and therefore the same pool/test partition) on every call;
        the default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    X_sum, y_sum, X_test, y_test
        The tuple produced by ``split_data`` on the shuffled, encoded data.
    """
    X, y = shuffle(X, y, random_state=random_state)
    # Drop the shuffled index so the positional split below also yields
    # clean 0..n-1 labels.
    X = encoding(X.reset_index(drop=True))
    y = y.reset_index(drop=True)
    return split_data(X, y)

In [6]:
def KNN_score(X, y, test_size, train_size, random_state=None):
    """Tune and evaluate a k-nearest-neighbours classifier on one dataset.

    The data is prepared with ``data_setup`` (shuffle, encode, split into a
    train/validate pool and a test set). ``n_neighbors`` is then selected
    from {5, 10, 15} by a grid search over 3 random ``ShuffleSplit``
    partitions of the pool.

    Parameters
    ----------
    X, y
        Raw features and target, passed to ``data_setup``.
    test_size, train_size
        Forwarded to ``ShuffleSplit``; they control how the pool is divided
        into validation and training parts in each of the 3 splits.
    random_state : int, RandomState instance or None, default None
        Seed for ``ShuffleSplit`` so the 3 splits are repeatable. The
        shuffle performed inside ``data_setup`` is not affected by this
        argument. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    train_score, validation_score, test_score, best_params
        Mean training and validation accuracy of the best parameter setting
        across the 3 splits, accuracy of the best model (fitted on the whole
        pool) on the test set, and the selected hyperparameters as a dict.
    """
    X_sum, y_sum, X_test, y_test = data_setup(X, y)
    parameters = {"n_neighbors": [5, 10, 15]}

    # Three independent random train/validation partitions of the pool.
    shuffle_split = ShuffleSplit(
        n_splits=3,
        test_size=test_size,
        train_size=train_size,
        random_state=random_state,
    )

    grid_search = GridSearchCV(
        KNeighborsClassifier(),
        param_grid=parameters,
        cv=shuffle_split,
        return_train_score=True,
    )
    grid_search.fit(X_sum, y_sum)

    best_index = grid_search.best_index_
    train_score = grid_search.cv_results_["mean_train_score"][best_index]
    validation_score = grid_search.cv_results_["mean_test_score"][best_index]

    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the full pool, so there is no need to fit it again.
    test_score = grid_search.best_estimator_.score(X_test, y_test)

    return train_score, validation_score, test_score, grid_search.best_params_

In [7]:
# Load the Dry Bean dataset; y is True where the target equals "DERMASON" (see fetch_dataset).
X, y = fetch_dataset("dry_bean_dataset")

In [8]:
# Run KNN_score once per train/validation proportion on the currently loaded
# X, y and collect the scores into one summary table.
split_fractions = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]
column_names = [
    "Train/Test Split",
    "Training Accuracy",
    "Validation Accuracy",
    "Testing Accuracy",
    "Hyperparameters",
]

records = []
for train_fraction, test_fraction in split_fractions:
    # KNN_score returns (train, validation, test, best_params) in that order.
    scores = KNN_score(X, y, train_size=train_fraction, test_size=test_fraction)
    records.append([[train_fraction, test_fraction], *scores])

df = pd.DataFrame(records, columns=column_names)
df

Unnamed: 0,Train/Test Split,Training Accuracy,Validation Accuracy,Testing Accuracy,Hyperparameters
0,"[0.2, 0.8]",0.919333,0.907083,0.897108,{'n_neighbors': 15}
1,"[0.5, 0.5]",0.914,0.899333,0.900244,{'n_neighbors': 15}
2,"[0.8, 0.2]",0.936833,0.9,0.905121,{'n_neighbors': 5}


In [9]:
# Copy the results table above to the system clipboard.
df.to_clipboard()

In [8]:
# Single run with train_size=0.2 / test_size=0.8 on the currently loaded X, y.
# NOTE(review): the saved output lists the hyperparameters first, which does not match
# KNN_score's current return order (train, validation, test, best_params) -- it looks
# stale; re-run before relying on it.
print(KNN_score(X, y, train_size=0.2, test_size=0.8))

({'n_neighbors': 15}, 0.9103333333333333, 0.8965833333333334, 0.90140517942167)


In [9]:
# Single run with train_size=0.5 / test_size=0.5 on the currently loaded X, y.
# NOTE(review): the saved output lists the hyperparameters first, which does not match
# KNN_score's current return order (train, validation, test, best_params) -- it looks
# stale; re-run before relying on it.
print(KNN_score(X, y, train_size=0.5, test_size=0.5))

({'n_neighbors': 15}, 0.9129333333333333, 0.9001333333333333, 0.9022180931366857)


In [10]:
# Single run with train_size=0.8 / test_size=0.2 on the currently loaded X, y.
# NOTE(review): the saved output lists the hyperparameters first, which does not match
# KNN_score's current return order (train, validation, test, best_params) -- it looks
# stale; re-run before relying on it.
print(KNN_score(X, y, train_size=0.8, test_size=0.2))

({'n_neighbors': 5}, 0.9358333333333334, 0.8936666666666667, 0.9076762280803623)


In [11]:
# Load the Adult dataset; y is True where the target equals "<=50K" or "<=50K." (see fetch_dataset).
X, y = fetch_dataset("adult")

In [10]:
# Run KNN_score once per train/validation proportion on the currently loaded
# X, y (expected: the Adult dataset) and collect the scores into one table.
# NOTE(review): this cell's saved execution count (10) is lower than that of
# the Adult fetch cell (11), and the saved table is close to the Dry Bean
# one -- the stored output may have been computed on the previously loaded
# data. Restart the kernel and run all cells before using these numbers.
split_fractions = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]
column_names = [
    "Train/Test Split",
    "Training Accuracy",
    "Validation Accuracy",
    "Testing Accuracy",
    "Hyperparameters",
]

records = []
for train_fraction, test_fraction in split_fractions:
    # KNN_score returns (train, validation, test, best_params) in that order.
    scores = KNN_score(X, y, train_size=train_fraction, test_size=test_fraction)
    records.append([[train_fraction, test_fraction], *scores])

df = pd.DataFrame(records, columns=column_names)
df

Unnamed: 0,Train/Test Split,Training Accuracy,Validation Accuracy,Testing Accuracy,Hyperparameters
0,"[0.2, 0.8]",0.914333,0.903167,0.900128,{'n_neighbors': 15}
1,"[0.5, 0.5]",0.912667,0.896,0.906631,{'n_neighbors': 10}
2,"[0.8, 0.2]",0.935417,0.901,0.905818,{'n_neighbors': 5}


In [11]:
# Copy the results table above to the system clipboard.
df.to_clipboard()

In [12]:
# Single run with train_size=0.2 / test_size=0.8 on the currently loaded X, y.
# NOTE(review): the saved output lists the hyperparameters first, which does not match
# KNN_score's current return order (train, validation, test, best_params) -- it looks
# stale; re-run before relying on it.
print(KNN_score(X, y, train_size=0.2, test_size=0.8))

({'n_neighbors': 15}, 0.7813333333333334, 0.7680833333333332, 0.7898362300989918)


In [13]:
# Single run with train_size=0.5 / test_size=0.5 on the currently loaded X, y.
# NOTE(review): the saved output lists the hyperparameters first, which does not match
# KNN_score's current return order (train, validation, test, best_params) -- it looks
# stale; re-run before relying on it.
print(KNN_score(X, y, train_size=0.5, test_size=0.5))

({'n_neighbors': 15}, 0.7885333333333334, 0.7747999999999999, 0.7843848364581908)


In [14]:
# Single run with train_size=0.8 / test_size=0.2 on the currently loaded X, y.
# NOTE(review): the saved output lists the hyperparameters first, which does not match
# KNN_score's current return order (train, validation, test, best_params) -- it looks
# stale; re-run before relying on it.
print(KNN_score(X, y, train_size=0.8, test_size=0.2))

({'n_neighbors': 15}, 0.7987500000000001, 0.7753333333333333, 0.7878290223986132)


In [15]:
# Load the Bank Marketing dataset; y is True where the target equals "yes" (see fetch_dataset).
X, y = fetch_dataset("bank_marketing")

In [12]:
# Run KNN_score once per train/validation proportion on the currently loaded
# X, y (expected: the Bank Marketing dataset) and collect the scores into one
# table.
# NOTE(review): this cell's saved execution count (12) is lower than that of
# the Bank Marketing fetch cell (15), and the saved table is close to the
# Dry Bean one -- the stored output may have been computed on previously
# loaded data. Restart the kernel and run all cells before using these numbers.
split_fractions = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]
column_names = [
    "Train/Test Split",
    "Training Accuracy",
    "Validation Accuracy",
    "Testing Accuracy",
    "Hyperparameters",
]

records = []
for train_fraction, test_fraction in split_fractions:
    # KNN_score returns (train, validation, test, best_params) in that order.
    scores = KNN_score(X, y, train_size=train_fraction, test_size=test_fraction)
    records.append([[train_fraction, test_fraction], *scores])

df = pd.DataFrame(records, columns=column_names)
df

Unnamed: 0,Train/Test Split,Training Accuracy,Validation Accuracy,Testing Accuracy,Hyperparameters
0,"[0.2, 0.8]",0.914333,0.89825,0.903612,{'n_neighbors': 15}
1,"[0.5, 0.5]",0.913733,0.901733,0.902915,{'n_neighbors': 10}
2,"[0.8, 0.2]",0.938667,0.916333,0.907792,{'n_neighbors': 5}


In [13]:
# Copy the results table above to the system clipboard.
df.to_clipboard()

In [16]:
# Single run with train_size=0.2 / test_size=0.8 on the currently loaded X, y.
# NOTE(review): the saved output lists the hyperparameters first, which does not match
# KNN_score's current return order (train, validation, test, best_params) -- it looks
# stale; re-run before relying on it.
print(KNN_score(X, y, train_size=0.2, test_size=0.8))

({'n_neighbors': 10}, 0.8913333333333333, 0.8821666666666667, 0.8848573773345602)


In [17]:
# Single run with train_size=0.5 / test_size=0.5 on the currently loaded X, y.
# NOTE(review): the saved output lists the hyperparameters first, which does not match
# KNN_score's current return order (train, validation, test, best_params) -- it looks
# stale; re-run before relying on it.
print(KNN_score(X, y, train_size=0.5, test_size=0.5))

({'n_neighbors': 10}, 0.8993333333333333, 0.8857333333333334, 0.8845589515306757)


In [18]:
# Single run with train_size=0.8 / test_size=0.2 on the currently loaded X, y.
# NOTE(review): the saved output lists the hyperparameters first, which does not match
# KNN_score's current return order (train, validation, test, best_params) -- it looks
# stale; re-run before relying on it.
print(KNN_score(X, y, train_size=0.8, test_size=0.2))

({'n_neighbors': 15}, 0.8965, 0.8943333333333333, 0.8840118375568874)
