In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
# import my implementation of the toolkit
import toolkit as tk
# import my implementation of the models
import models

## Loading Data and preprocessing

In [2]:
# Abalone
abalone_names = ["sex", "length", "diameter", "height", "whole weight", "shucked weight", "viscera weight", "shell weight", "rings"]
abalone = tk.load_data("../classification_datasets/abalone.data", names=abalone_names)
abalone_columns = ['sex']
tk.one_hot_encoding(abalone, abalone_columns)

# Breast cancer wisconsin
breast_cancer_names = ['sample_code', 'clump_thickness', 'uniformity_of_cell_size', 'uniformity_of_cell_shape', 'marginal_adhesion', 'single_epithelial_cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'motoses', 'class']
breast_cancer = tk.load_data("../classification_datasets/breast-cancer-wisconsin.data", names=breast_cancer_names, index_col=0)

# breast cancer: missing value denoted by ?
tk.handle_missing_values(breast_cancer)
breast_cancer_col = ['class']
tk.one_hot_encoding(breast_cancer, breast_cancer_col)

# Car
# arribute "class" is added for data loading purposes
car_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'] 
car = tk.load_data("../classification_datasets/car.data", names=car_names)

# encoding ordinal
# treating doors and persons as ordinal categorical data as well
car_columns = {
    'buying': ['low', 'med', 'high', 'vhigh'],
    'maint': ['low', 'med', 'high', 'vhigh'],
    'doors': ['2', '3', '4', '5more'], 
    'persons': ['2', '4', 'more'],
    'lug_boot': ['small', 'med', 'big'],
    'safety': ['low', 'med', 'high'],
    'class': ['unacc', 'acc', 'good', 'vgood']
}
tk.encode_ordinal(car, car_columns)

# Forest Fires
# forestfires has header
forestfires = tk.load_data("../classification_datasets/forestfires.data", header=0)
# forest fires
ff_columns = {
    'month': ['jan','feb','mar','apr','may','jun','jul','aug','sep', 'oct', 'nov', 'dec'],
    'day': ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
}
tk.encode_ordinal(forestfires, ff_columns)

# house votes
house__votes_names = ['class', 'handicapped', 'water_project_cost_sharing', 'adoption_of_the_budget_resolution', 'physician_fee_freeze', 'el_salvador_aid', 'religious_groups_in_schools', 'anti_satellite_test_ban', 'aid_to_nicaraguan_contras', 'mx_missile', 'immigration', 'synfuels_corporation_cutback', 'education_spending', 'superfund_right_to_sue', 'crime', 'duty_free_exports', 'export_administration_act_south_africa'] 
house_votes = tk.load_data("../classification_datasets/house-votes-84.data", names=house__votes_names)
house_votes_columns = house__votes_names
tk.one_hot_encoding(house_votes, house_votes_columns)

# Machine
machine_names = ['vendor', 'model', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP'] 
machine = tk.load_data("../classification_datasets/machine.data", names=machine_names)
# drop vendor and model
machine.drop(columns=['vendor', 'model'], inplace=True)

### Data Split into Five Folds

In [3]:
k = 5
model = models.KNN(k=5)
hyperparam = [{'k': 1}, {'k': 5}]
car_target = car['class'].to_numpy()
car_data = car.drop(columns=['class']).to_numpy()
tk.k_2_cv(car_data, car_target, k, hyperparam, model, classification=True)

X_train size:  1382
X_valid size:  346
y_train size:  1382
y_valid size:  346
Round 0.1 of hyperparameter choosing.
train size: 690. test size: 346
Performance of corresponding hyperparams: [0.8265895953757225, 0.9161849710982659]
Round 0.2 of hyperparameter choosing.
train size: 692. test size: 346
Performance of corresponding hyperparams: [0.8352601156069365, 0.9017341040462428]
Round 1.1 of hyperparameter choosing.
train size: 690. test size: 346
Performance of corresponding hyperparams: [0.8063583815028902, 0.8959537572254336]
Round 1.2 of hyperparameter choosing.
train size: 692. test size: 346
Performance of corresponding hyperparams: [0.7947976878612717, 0.8959537572254336]
Round 2.1 of hyperparameter choosing.
train size: 690. test size: 346
Performance of corresponding hyperparams: [0.8497109826589595, 0.9190751445086706]
Round 2.2 of hyperparameter choosing.
train size: 692. test size: 346
Performance of corresponding hyperparams: [0.861271676300578, 0.9277456647398844]
Round

({'k': 5}, 0.912448270084611)

### Calculation of Distance

In [4]:
forestfires.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,2,4,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,9,1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,9,5,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,2,4,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,2,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [5]:
# compare the first two points
x1 = [7,5,2,4,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0]
x2 = [[7,4,9,1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0]]
# cyclic data month and day
cyclic = {
    2: 12,
    3: 7,
}
models.euclidean_distances(x1, x2, cyclic)

[575.3175470989913]

### Calculation of Kernel Function

In [6]:
# assume x has nearest neighbors with xq
x = [7,5,2,4,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0]
xq = [[7,4,9,1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0],
      [7,4,9,5,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0],]
# same cyclic data 
cyclic = {
    2: 12,
    3: 7,
}
gamma = 0.1
models.gamma_kernel(x, xq, gamma, cyclic)

array([1.03341889e-25, 1.72292866e-26])

### Example of a Point Classified Using k-nn

In [7]:
car.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3,3,0,0,0,0,0
1,3,3,0,0,0,1,0
2,3,3,0,0,0,2,0
3,3,3,0,0,1,0,0
4,3,3,0,0,1,1,0


In [8]:
knn_classifier = models.KNN(k=5)  # for classification
knn_classifier.fit(car_data, car_target)
X_test = [[3,3,0,1,0,0]]
neighbor, y_pred_classification = knn_classifier.predict(X_test)
print("neighbor_y: ", neighbor[0][0])
print("neighbor_x: ", neighbor[0][1])
print("prediction: ", y_pred_classification)

neighbor_y:  [0, 0, 0, 0, 0]
neighbor_x:  [array([3, 3, 0, 1, 0, 0]), array([3, 3, 0, 0, 0, 0]), array([3, 3, 1, 1, 0, 0]), array([3, 3, 0, 2, 0, 0]), array([3, 3, 0, 1, 1, 0])]
prediction:  [0]


### Example of of a Point Regressed Using k-nn

In [9]:
abalone.head()

Unnamed: 0,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings,sex_M,sex_F,sex_I
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,1,0,0
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,1,0,0
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0,1,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,1,0,0
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,0,1


In [10]:
abalone_target = abalone['rings'].to_numpy()
abalone_data = abalone.drop(columns=['rings']).to_numpy()

gamma = 1/abalone_target.std()
knn_regressor = models.KNN(k=5, gamma=gamma)  # for regression
knn_regressor.fit(abalone_data, abalone_target)
X_test = [[0.455,0.365,0.095,0.5140,0.2245,0.1010,0.150,1,0,0]]
neighbor, y_pred_regression = knn_regressor.predict(X_test)
print("neighbor_y: ", neighbor[0][0])
print("neighbor_x: ", neighbor[0][1])
print("prediction: ", y_pred_regression)

neighbor_y:  [15, 10, 8, 8, 8]
neighbor_x:  [array([0.455 , 0.365 , 0.095 , 0.514 , 0.2245, 0.101 , 0.15  , 1.    ,
       0.    , 0.    ]), array([0.46  , 0.35  , 0.12  , 0.515 , 0.224 , 0.108 , 0.1565, 1.    ,
       0.    , 0.    ]), array([0.465 , 0.375 , 0.11  , 0.5   , 0.21  , 0.113 , 0.1505, 1.    ,
       0.    , 0.    ]), array([0.48  , 0.37  , 0.1   , 0.5135, 0.243 , 0.1015, 0.135 , 1.    ,
       0.    , 0.    ]), array([0.475 , 0.37  , 0.11  , 0.4895, 0.2185, 0.107 , 0.146 , 1.    ,
       0.    , 0.    ])]
prediction:  [9.811156501049624]


### Example edited out of the training set using edited nearest neighbor

In [11]:
abalone.head()

Unnamed: 0,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings,sex_M,sex_F,sex_I
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,1,0,0
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,1,0,0
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,0,1,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,1,0,0
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,0,1


In [12]:
X_train, X_test, y_train, y_test = tk.train_test_split(data=abalone_data, target=abalone_target, split_percentage=0.8, classification=False)
gamma = 1/y_train.std()
knn_regressor = models.EditedKNN(k=5, epsilon=0.1, gamma=gamma)  # for regression
knn_regressor.fit(X_train, y_train, X_test, y_test)
# When we predicted using the first data, the prediction if out of range of epsilon
# so we are expecting it to be removed in the updated X_train after fit
X_test = X_train[0]
print("line 1 is now at: ", np.where(np.all(knn_regressor.X_train==X_test,axis=1)))


line 1 is now at:  (array([0]),)


In [13]:
X_test = X_train[2]
print("line 3 is now at: ", np.where(np.all(knn_regressor.X_train==X_test,axis=1)))


line 3 is now at:  (array([2]),)


### Example added to the training set using condensed nearest neighbor

In [14]:
knn_regressor = models.CondensedKNN(k=3, epsilon=0.1, gamma=gamma)  # for regression
knn_regressor.fit(abalone_data, abalone_target)
# When we predicted using the first data, the prediction if out of range of epsilon
# so we are expecting add it to our condensed dataset
# X_test = [[0.455,0.365,0.095,0.5140,0.2245,0.1010,0.150,1,0,0]]
knn_regressor.X_train[0]

array([0.455 , 0.365 , 0.095 , 0.514 , 0.2245, 0.101 , 0.15  , 1.    ,
       0.    , 0.    ])

# Experiments

## KNN

Abalone

In [15]:
K=5 # 5-2 Cross Validation
abalone_target = abalone['rings'].to_numpy()
abalone_data = abalone.drop(columns=['rings']).to_numpy()
gamma = 1/abalone_target.std()
model = models.KNN(k=5, gamma=gamma)
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
for k in k_knn:
    hyperparam.append({'k': k})
tk.k_2_cv(abalone_data, abalone_target, K, hyperparam, model, classification=False)

X_train size:  3341
X_valid size:  836
y_train size:  3341
y_valid size:  836
Round 0.1 of hyperparameter choosing.
train size: 1670. test size: 836
Performance of corresponding hyperparams: [2.014354066985646, 1.67191932883871, 1.6306339722457965, 1.6177363554617126, 1.6057045123284202, 1.596310583340928, 1.6003923264298778]
Round 0.2 of hyperparameter choosing.
train size: 1671. test size: 836
Performance of corresponding hyperparams: [2.0574162679425836, 1.7118197482639033, 1.6468778401656787, 1.6241094316342495, 1.6086103888760528, 1.6032163212143433, 1.5941622911504203]
Round 1.1 of hyperparameter choosing.
train size: 1670. test size: 836
Performance of corresponding hyperparams: [1.9712918660287082, 1.6793729803267812, 1.650797966071345, 1.6084897603243218, 1.6081368458050727, 1.5994779240177908, 1.5868768834304785]
Round 1.2 of hyperparameter choosing.
train size: 1671. test size: 836
Performance of corresponding hyperparams: [1.9413875598086126, 1.7032538218319466, 1.645208748

({'k': 13}, 1.5430766182838807)

Breast Cancer

In [16]:
model = models.KNN(k=5)
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
for k in k_knn:
    hyperparam.append({'k': k})
breast_cancer_label = breast_cancer['class_2'].to_numpy()
breast_cancer_data = breast_cancer.drop(columns=['class_2']).to_numpy()
tk.k_2_cv(breast_cancer_data, breast_cancer_label, K, hyperparam, model, classification=True)

X_train size:  558
X_valid size:  141
y_train size:  558
y_valid size:  141
Round 0.1 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.9574468085106383, 0.9645390070921985, 0.950354609929078, 0.9716312056737588, 0.9645390070921985, 0.9645390070921985, 0.9645390070921985]
Round 0.2 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.9645390070921985, 0.9645390070921985, 0.9574468085106383, 0.9645390070921985, 0.9645390070921985, 0.9574468085106383, 0.9574468085106383]
Round 1.1 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.9787234042553191, 0.950354609929078, 0.9716312056737588, 0.9574468085106383, 0.950354609929078, 0.9716312056737588, 0.9716312056737588]
Round 1.2 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.9645390070921985, 0.9716312056737588, 0.9645390070921

({'k': 1}, 0.9483870967741936)

Car

In [17]:
model = models.KNN(k=5)
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
for k in k_knn:
    hyperparam.append({'k': k})
car_target = car['class'].to_numpy()
car_data = car.drop(columns=['class']).to_numpy()
tk.k_2_cv(car_data, car_target, K, hyperparam, model, classification=True)

X_train size:  1382
X_valid size:  346
y_train size:  1382
y_valid size:  346
Round 0.1 of hyperparameter choosing.
train size: 690. test size: 346
Performance of corresponding hyperparams: [0.8323699421965318, 0.8901734104046243, 0.9075144508670521, 0.9075144508670521, 0.9046242774566474, 0.9017341040462428, 0.8930635838150289]
Round 0.2 of hyperparameter choosing.
train size: 692. test size: 346
Performance of corresponding hyperparams: [0.8439306358381503, 0.8641618497109826, 0.9190751445086706, 0.9132947976878613, 0.9161849710982659, 0.9190751445086706, 0.9104046242774566]
Round 1.1 of hyperparameter choosing.
train size: 690. test size: 346
Performance of corresponding hyperparams: [0.838150289017341, 0.8959537572254336, 0.8988439306358381, 0.8959537572254336, 0.9046242774566474, 0.884393063583815, 0.8786127167630058]
Round 1.2 of hyperparameter choosing.
train size: 692. test size: 346
Performance of corresponding hyperparams: [0.8410404624277457, 0.8988439306358381, 0.9190751445

({'k': 9}, 0.9079651503727904)

Forest Fire

In [18]:
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
for k in k_knn:
    hyperparam.append({'k': k})
forestfires_target = forestfires['area'].to_numpy()
forestfires_data = forestfires.drop(columns=['area']).to_numpy()
gamma = 1/forestfires_target.std()
cyclic = {
    2: 12,
    3: 7,
}
model = models.KNN(k=5, gamma=gamma, cyclic=cyclic)
tk.k_2_cv(forestfires_data, forestfires_target, K, hyperparam, model, classification=False)

X_train size:  413
X_valid size:  104
y_train size:  413
y_valid size:  104
Round 0.1 of hyperparameter choosing.
train size: 206. test size: 104
Performance of corresponding hyperparams: [22.613076923076925, 22.814670597867583, 22.299649238459203, 22.497474402114406, 23.5610932311678, 23.42269285099136, 23.230277216852862]
Round 0.2 of hyperparameter choosing.
train size: 207. test size: 104
Performance of corresponding hyperparams: [25.20317307692308, 24.063490714558444, 26.197433337077253, 26.1275418489086, 26.707764413028393, 25.88712187404068, 25.987063839486602]
Round 1.1 of hyperparameter choosing.
train size: 206. test size: 104
Performance of corresponding hyperparams: [24.32625, 26.22183706783411, 26.00750745323019, 25.46903420408527, 26.431071542457587, 26.237446485429324, 26.396454827240195]
Round 1.2 of hyperparameter choosing.
train size: 207. test size: 104
Performance of corresponding hyperparams: [25.867596153846154, 22.611921583967096, 21.6894954151946, 21.74953102642

({'k': 7}, 19.57246141031698)

House Votes

In [19]:
model = models.KNN(k=5)
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
for k in k_knn:
    hyperparam.append({'k': k})
house_votes_target = house_votes['class_republican'].to_numpy()
house_votes_data = house_votes.drop(columns=['class_republican']).to_numpy()
tk.k_2_cv(forestfires_data, forestfires_target, K, hyperparam, model, classification=True)

X_train size:  217
X_valid size:  300
y_train size:  217
y_valid size:  300
Round 0.1 of hyperparameter choosing.
train size: 99. test size: 300
Performance of corresponding hyperparams: [0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666]
Round 0.2 of hyperparameter choosing.
train size: 118. test size: 300
Performance of corresponding hyperparams: [0.13, 0.15, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666]
Round 1.1 of hyperparameter choosing.
train size: 99. test size: 300
Performance of corresponding hyperparams: [0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666]
Round 1.2 of hyperparameter choosing.
train size: 118. test size: 300
Performance of corresponding hyperparams: [0.12666666666666668, 0.15666666666666668, 0.16666666666666666, 

({'k': 5}, 0.9144410203732238)

Machine

In [20]:
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
for k in k_knn:
    hyperparam.append({'k': k})
machine_target = machine['PRP'].to_numpy()
machine_data = machine.drop(columns=['PRP']).to_numpy()
gamma = 1/machine_target.std()
model = models.KNN(k=5, gamma=gamma)
tk.k_2_cv(machine_data, machine_target, K, hyperparam, model, classification=False)

X_train size:  167
X_valid size:  42
y_train size:  167
y_valid size:  42
Round 0.1 of hyperparameter choosing.
train size: 83. test size: 42
Performance of corresponding hyperparams: [48.333333333333336, 48.245045174774, 48.256949337350875, 49.490688352983376, 49.36784036699501, 49.36639081353333, 49.37067354532554]
Round 0.2 of hyperparameter choosing.
train size: 84. test size: 42
Performance of corresponding hyperparams: [66.11904761904762, 69.00191966769896, 68.6563278708805, 68.81393862033953, 68.75219250534185, 68.76406927824048, 68.74851747363469]
Round 1.1 of hyperparameter choosing.
train size: 83. test size: 42
Performance of corresponding hyperparams: [35.523809523809526, 32.80592194046643, 33.555603906266484, 33.453167498995505, 33.45677691831167, 33.50130191968285, 33.501071379273576]
Round 1.2 of hyperparameter choosing.
train size: 84. test size: 42
Performance of corresponding hyperparams: [55.26190476190476, 53.749273133786446, 53.96178595171397, 54.61725293119922, 54

({'k': 3}, 34.4540950113049)

## Condensed KNN

Abalone

In [21]:
K=5 # 5-2 Cross Validation
abalone_target = abalone['rings'].to_numpy()
abalone_data = abalone.drop(columns=['rings']).to_numpy()
gamma = 1/abalone_target.std()
model = models.CondensedKNN(gamma=gamma)
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
epsilon = [0.001, 0.01, 0.1, 1]
for k in k_knn:
    for e in epsilon:
        hyperparam.append({'k': k, 'epsilon': e})
tk.k_2_cv(abalone_data, abalone_target, K, hyperparam, model, classification=False)

X_train size:  3341
X_valid size:  836
y_train size:  3341
y_valid size:  836
Round 0.1 of hyperparameter choosing.
train size: 1670. test size: 836
Performance of corresponding hyperparams: [1.9892344497607655, 1.9892344497607655, 1.9892344497607655, 2.076555023923445, 1.721529475130351, 1.721529475130351, 1.721529475130351, 1.7619507511812362, 1.6064752372787032, 1.6064752372787032, 1.6064752372787032, 1.7002138539034009, 1.5918211276907774, 1.5918211276907774, 1.5918211276907774, 1.690947704938403, 1.567285424763225, 1.567285424763225, 1.567285424763225, 1.6761229152940207, 1.5652929115318535, 1.5652929115318535, 1.5652929115318535, 1.6918190470173533, 1.590180392526352, 1.590180392526352, 1.590180392526352, 1.724172841849701]
Round 0.2 of hyperparameter choosing.
train size: 1671. test size: 836
Performance of corresponding hyperparams: [2.027511961722488, 2.027511961722488, 2.027511961722488, 2.034688995215311, 1.7370940643823478, 1.7370940643823478, 1.7370940643823478, 1.76315326

Performance: 1.5200099587367941
Round 4.2 performance testing of best hyperparameter setting.
train size: 1671. test size: 1670
Performance: 1.6231065557029507
The average performance is: 1.581218978649439


({'k': 11, 'epsilon': 0.001}, 1.581218978649439)

Breast Cancer

In [22]:
K=5 # 5-2 Cross Validation
model = models.CondensedKNN()
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
#epsilon = [0.01, 0.1, 0.5, 1, 1.5]
for k in k_knn:
    #for e in epsilon:
    hyperparam.append({'k': k})
breast_cancer_label = breast_cancer['class_2'].to_numpy()
breast_cancer_data = breast_cancer.drop(columns=['class_2']).to_numpy()        
tk.k_2_cv(breast_cancer_data, breast_cancer_label, K, hyperparam, model, classification=True)


X_train size:  558
X_valid size:  141
y_train size:  558
y_valid size:  141
Round 0.1 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.7446808510638298, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462]
Round 0.2 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.6879432624113475, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462]
Round 1.1 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.6737588652482269, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462, 0.6524822695035462]
Round 1.2 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.6737588652482269, 0.6524822695035462, 0.6524822695

({'k': 1}, 0.7584229390681003)

Car

In [23]:
K=5 # 5-2 Cross Validation
model = models.CondensedKNN()
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
# epsilon = [0.01, 0.1, 0.5, 1, 1.5]
for k in k_knn:
    #for e in epsilon:
    hyperparam.append({'k': k})
        
car_target = car['class'].to_numpy()
car_data = car.drop(columns=['class']).to_numpy()
tk.k_2_cv(car_data, car_target, K, hyperparam, model, classification=True)

X_train size:  1382
X_valid size:  346
y_train size:  1382
y_valid size:  346
Round 0.1 of hyperparameter choosing.
train size: 690. test size: 346
Performance of corresponding hyperparams: [0.18497109826589594, 0.12716763005780346, 0.10404624277456648, 0.1069364161849711, 0.11271676300578035, 0.09826589595375723, 0.10115606936416185]
Round 0.2 of hyperparameter choosing.
train size: 692. test size: 346
Performance of corresponding hyperparams: [0.3208092485549133, 0.19653179190751446, 0.21098265895953758, 0.18208092485549132, 0.1907514450867052, 0.18208092485549132, 0.1676300578034682]
Round 1.1 of hyperparameter choosing.
train size: 690. test size: 346
Performance of corresponding hyperparams: [0.2514450867052023, 0.15606936416184972, 0.11271676300578035, 0.10982658959537572, 0.08670520231213873, 0.07514450867052024, 0.057803468208092484]
Round 1.2 of hyperparameter choosing.
train size: 692. test size: 346
Performance of corresponding hyperparams: [0.30057803468208094, 0.2109826589

({'k': 1}, 0.21247717181871492)

Forest Fire

In [24]:
K=5 # 5-2 Cross Validation
forestfires_target = forestfires['area'].to_numpy()
forestfires_data = forestfires.drop(columns=['area']).to_numpy()
gamma = 1/forestfires_target.std()
cyclic = {
    2: 12,
    3: 7,
}
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
epsilon = [0.001, 0.05, 0.01, 0.1, 0.5]
for k in k_knn:
    for e in epsilon:
        hyperparam.append({'k': k, 'epsilon': e})
model = models.CondensedKNN(gamma=gamma, cyclic=cyclic)
tk.k_2_cv(forestfires_data, forestfires_target, K, hyperparam, model, classification=False)

X_train size:  413
X_valid size:  104
y_train size:  413
y_valid size:  104
Round 0.1 of hyperparameter choosing.
train size: 206. test size: 104
Performance of corresponding hyperparams: [9.63701923076923, 9.63701923076923, 9.63701923076923, 9.63701923076923, 9.63451923076923, 10.893284086054056, 10.937444308324569, 10.912511891880605, 10.937444308324569, 11.824427005646516, 16.146876025100497, 16.20933801896995, 16.161775379775044, 16.217900250895134, 16.339765762978658, 15.50917935059752, 15.556874177657562, 15.502907025790702, 15.556023311363408, 15.582490626474204, 14.651875096131247, 14.680901421183023, 14.655802454822627, 14.685522020470962, 14.783653370123956, 14.91194188461433, 14.920168020808463, 14.908731729108954, 14.917857660859172, 15.104719250064958, 14.565481475039102, 14.603939280370705, 14.560156847467074, 14.62944091563593, 14.734063277508186]
Round 0.2 of hyperparameter choosing.
train size: 207. test size: 104
Performance of corresponding hyperparams: [29.164326923

Performance: 25.688519003542396
Round 0.2 performance testing of best hyperparameter setting.
train size: 207. test size: 206
Performance: 23.854677247764652
Round 1.1 performance testing of best hyperparameter setting.
train size: 206. test size: 207
Performance: 26.6179051162522
Round 1.2 performance testing of best hyperparameter setting.
train size: 207. test size: 206
Performance: 21.769497861691324
Round 2.1 performance testing of best hyperparameter setting.
train size: 206. test size: 207
Performance: 24.40005738673209
Round 2.2 performance testing of best hyperparameter setting.
train size: 207. test size: 206
Performance: 25.13300037350818
Round 3.1 performance testing of best hyperparameter setting.
train size: 206. test size: 207
Performance: 25.27618947241603
Round 3.2 performance testing of best hyperparameter setting.
train size: 207. test size: 206
Performance: 22.728050473611983
Round 4.1 performance testing of best hyperparameter setting.
train size: 206. test size: 2

({'k': 7, 'epsilon': 0.01}, 24.161401802994693)

House Vote

In [29]:
model = models.CondensedKNN()
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
# epsilon = [0.01, 0.1, 0.5, 1, 1.5]
for k in k_knn:
    # for e in epsilon:
    hyperparam.append({'k': k})
house_votes_target = house_votes['class_republican'].to_numpy()
house_votes_data = house_votes.drop(columns=['class_republican']).to_numpy()
tk.k_2_cv(house_votes_data, house_votes_target, K, hyperparam, model, classification=True)

X_train size:  347
X_valid size:  88
y_train size:  347
y_valid size:  88
Round 0.1 of hyperparameter choosing.
train size: 173. test size: 88
Performance of corresponding hyperparams: [0.8977272727272727, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635]
Round 0.2 of hyperparameter choosing.
train size: 174. test size: 88
Performance of corresponding hyperparams: [0.6931818181818182, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635]
Round 1.1 of hyperparameter choosing.
train size: 173. test size: 88
Performance of corresponding hyperparams: [0.6931818181818182, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635, 0.38636363636363635]
Round 1.2 of hyperparameter choosing.
train size: 174. test size: 88
Performance of corresponding hyperparams: [0.7727272727272727, 0.38636363636363635,

({'k': 1}, 0.7529300378712378)

Machine

In [26]:
K=5 # 5-2 Cross Validation
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
epsilon = [0.001, 0.01, 0.1, 1]
for k in k_knn:
    for e in epsilon:
        hyperparam.append({'k': k, 'epsilon': e})
machine_target = machine['PRP'].to_numpy()
machine_data = machine.drop(columns=['PRP']).to_numpy()
gamma = 1/machine_target.std()
model = models.CondensedKNN(gamma=gamma)
tk.k_2_cv(machine_data, machine_target, K, hyperparam, model, classification=False)

X_train size:  167
X_valid size:  42
y_train size:  167
y_valid size:  42
Round 0.1 of hyperparameter choosing.
train size: 83. test size: 42
Performance of corresponding hyperparams: [39.11904761904762, 39.11904761904762, 39.11904761904762, 37.57142857142857, 36.44019597543187, 36.44019597543187, 36.44019597543187, 36.44019597543187, 40.91216996596548, 40.91216996596548, 40.91216996596548, 40.91608739917359, 40.75219760193255, 40.75219760193255, 40.75219760193255, 40.75052444612079, 40.71366009661132, 40.71366009661132, 40.71366009661132, 40.71594189651256, 40.703425697324796, 40.703425697324796, 40.703425697324796, 40.68595959440594, 40.69554246044123, 40.69554246044123, 40.69554246044123, 40.69493946098972]
Round 0.2 of hyperparameter choosing.
train size: 84. test size: 42
Performance of corresponding hyperparams: [31.333333333333332, 31.333333333333332, 31.333333333333332, 31.333333333333332, 31.69953192779438, 31.69953192779438, 31.69953192779438, 31.69953192779438, 31.5252232234

({'k': 1, 'epsilon': 1}, 43.40757314974182)

## Edited KNN

Need to reload model for edited

abalone

In [5]:
K=5 # 5-2 Cross Validation
abalone_target = abalone['rings'].to_numpy()
abalone_data = abalone.drop(columns=['rings']).to_numpy()
gamma = 1/abalone_target.std()
model = models.EditedKNN(gamma=gamma)
hyperparam = []
# for faster computation, using three k values at a time
k_knn = [9, 11, 13]
epsilon = [0.001, 0.01, 0.1, 1]
for k in k_knn:
    for e in epsilon:
        hyperparam.append({'k': k, 'epsilon': e})
tk.k_2_cv(abalone_data, abalone_target, K, hyperparam, model, classification=False)

X_train size:  3341
X_valid size:  836
y_train size:  3341
y_valid size:  836
Round 0.1 of hyperparameter choosing.
train size: 1670. test size: 836
Performance of corresponding hyperparams: [1.607660444790643, 1.607660444790643, 1.607660444790643, 1.607660444790643, 1.5942473816014782, 1.5942473816014782, 1.5942473816014782, 1.5915425711256437, 1.5953184693319422, 1.5953184693319422, 1.5953184693319422, 1.5953184693319422]
Round 0.2 of hyperparameter choosing.
train size: 1671. test size: 836
Performance of corresponding hyperparams: [1.585814293571819, 1.585814293571819, 1.585814293571819, 1.5855490493388056, 1.5919610005454867, 1.5919610005454867, 1.5919610005454867, 1.5902281755307082, 1.5985170704778477, 1.5985170704778477, 1.5985170704778477, 1.5985170704778477]
Round 1.1 of hyperparameter choosing.
train size: 1670. test size: 836
Performance of corresponding hyperparams: [1.6131009610667835, 1.6131009610667835, 1.6131009610667835, 1.6131009610667835, 1.5954551425294436, 1.59545

({'k': 13, 'epsilon': 0.001}, 1.540875568661407)

Breast Cancer

In [6]:
K=5 # 5-2 Cross Validation
model = models.EditedKNN()
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
for k in k_knn:
    hyperparam.append({'k': k})

breast_cancer_label = breast_cancer['class_2'].to_numpy()
breast_cancer_data = breast_cancer.drop(columns=['class_2']).to_numpy()
tk.k_2_cv(breast_cancer_data, breast_cancer_label, K, hyperparam, model, classification=True)

X_train size:  558
X_valid size:  141
y_train size:  558
y_valid size:  141
Round 0.1 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.9432624113475178, 0.9645390070921985, 0.9645390070921985, 0.950354609929078, 0.9574468085106383, 0.9574468085106383, 0.950354609929078]
Round 0.2 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.950354609929078, 0.9645390070921985, 0.9645390070921985, 0.9574468085106383, 0.9574468085106383, 0.9645390070921985, 0.9574468085106383]
Round 1.1 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.9716312056737588, 0.9645390070921985, 0.950354609929078, 0.9574468085106383, 0.9574468085106383, 0.9574468085106383, 0.9574468085106383]
Round 1.2 of hyperparameter choosing.
train size: 279. test size: 141
Performance of corresponding hyperparams: [0.950354609929078, 0.9645390070921985, 0.964539007092198

({'k': 3}, 0.9706093189964159)

Car

In [7]:
K=5 # 5-2 Cross Validation
model = models.EditedKNN()
hyperparam = []
k_knn = [ 1, 3, 5, 7, 9, 11, 13]
for k in k_knn:
    hyperparam.append({'k': k})
car_target = car['class'].to_numpy()
car_data = car.drop(columns=['class']).to_numpy()
tk.k_2_cv(car_data, car_target, K, hyperparam, model, classification=True)

X_train size:  1382
X_valid size:  346
y_train size:  1382
y_valid size:  346
Round 0.1 of hyperparameter choosing.
train size: 690. test size: 346
Performance of corresponding hyperparams: [0.8497109826589595, 0.9132947976878613, 0.930635838150289, 0.9219653179190751, 0.9248554913294798, 0.9161849710982659, 0.9219653179190751]
Round 0.2 of hyperparameter choosing.
train size: 692. test size: 346
Performance of corresponding hyperparams: [0.8786127167630058, 0.9075144508670521, 0.9190751445086706, 0.9190751445086706, 0.9190751445086706, 0.8930635838150289, 0.8930635838150289]
Round 1.1 of hyperparameter choosing.
train size: 690. test size: 346
Performance of corresponding hyperparams: [0.8554913294797688, 0.9046242774566474, 0.9104046242774566, 0.9277456647398844, 0.9219653179190751, 0.9132947976878613, 0.9190751445086706]
Round 1.2 of hyperparameter choosing.
train size: 692. test size: 346
Performance of corresponding hyperparams: [0.8526011560693642, 0.9277456647398844, 0.936416184

({'k': 5}, 0.9104155147859597)

Forest Fire

In [8]:
K=5 # 5-2 Cross Validation
forestfires_target = forestfires['area'].to_numpy()
forestfires_data = forestfires.drop(columns=['area']).to_numpy()
gamma = 1/forestfires_target.std()
cyclic = {
    2: 12,
    3: 7,
}
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
epsilon = [0.001, 0.05, 0.01, 0.1, 1]
for k in k_knn:
    for e in epsilon:
        hyperparam.append({'k': k, 'epsilon': e})
model = models.EditedKNN(gamma=gamma, cyclic=cyclic)
tk.k_2_cv(forestfires_data, forestfires_target, K, hyperparam, model, classification=False)

X_train size:  413
X_valid size:  104
y_train size:  413
y_valid size:  104
Round 0.1 of hyperparameter choosing.
train size: 206. test size: 104
Performance of corresponding hyperparams: [35.80355769230769, 35.80355769230769, 35.80355769230769, 35.80355769230769, 35.80355769230769, 33.886878015180706, 33.886878015180706, 33.886878015180706, 33.886878015180706, 33.886878015180706, 32.08343495671282, 32.08343495671282, 32.08343495671282, 32.08343495671282, 32.08343495671282, 30.95641778961759, 30.95641778961759, 30.95641778961759, 30.95641778961759, 30.95641778961759, 30.517830118141962, 30.517830118141962, 30.517830118141962, 30.517830118141962, 30.517830118141962, 30.254352428540948, 30.254352428540948, 30.254352428540948, 30.254352428540948, 30.254352428540948, 29.904323128643476, 29.904323128643476, 29.904323128643476, 29.904323128643476, 29.904323128643476]
Round 0.2 of hyperparameter choosing.
train size: 207. test size: 104
Performance of corresponding hyperparams: [27.5516346153

Performance: 14.833522700165
Round 0.2 performance testing of best hyperparameter setting.
train size: 207. test size: 206
Performance: 15.611576390533026
Round 1.1 performance testing of best hyperparameter setting.
train size: 206. test size: 207
Performance: 14.694909549244198
Round 1.2 performance testing of best hyperparameter setting.
train size: 207. test size: 206
Performance: 15.746816483937645
Round 2.1 performance testing of best hyperparameter setting.
train size: 206. test size: 207
Performance: 13.8949679081078
Round 2.2 performance testing of best hyperparameter setting.
train size: 207. test size: 206
Performance: 15.580025167796483
Round 3.1 performance testing of best hyperparameter setting.
train size: 206. test size: 207
Performance: 15.224474863216098
Round 3.2 performance testing of best hyperparameter setting.
train size: 207. test size: 206
Performance: 15.779346758667861
Round 4.1 performance testing of best hyperparameter setting.
train size: 206. test size: 2

({'k': 13, 'epsilon': 0.001}, 15.049881619580692)

House Vote

In [9]:
K=5 # 5-2 Cross Validation
model = models.EditedKNN()
hyperparam = []
k_knn = [1, 3, 5, 7, 9, 11, 13]
for k in k_knn:
    hyperparam.append({'k': k})
house_votes_target = house_votes['class_republican'].to_numpy()
house_votes_data = house_votes.drop(columns=['class_republican']).to_numpy()
tk.k_2_cv(forestfires_data, forestfires_target, K, hyperparam, model, classification=True)

X_train size:  217
X_valid size:  300
y_train size:  217
y_valid size:  300
Round 0.1 of hyperparameter choosing.
train size: 99. test size: 300
Performance of corresponding hyperparams: [0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666]
Round 0.2 of hyperparameter choosing.
train size: 118. test size: 300
Performance of corresponding hyperparams: [0.13333333333333333, 0.16, 0.16333333333333333, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666]
Round 1.1 of hyperparameter choosing.
train size: 99. test size: 300
Performance of corresponding hyperparams: [0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666]
Round 1.2 of hyperparameter choosing.
train size: 118. test size: 300
Performance of corresponding hyperparams: [0.15333333333333332, 0.16666666666666666, 0.1666

({'k': 7}, 0.9144410203732238)

Machine

In [10]:
K=5 # 5-2 Cross Validation
hyperparam = []
k_knn = [5, 7, 9, 11, 13]
epsilon = [0.001, 0.01, 0.1, 1]
for k in k_knn:
    for e in epsilon:
        hyperparam.append({'k': k, 'epsilon': e})
machine_target = machine['PRP'].to_numpy()
machine_data = machine.drop(columns=['PRP']).to_numpy()
gamma = 1/machine_target.std()
model = models.EditedKNN(gamma=gamma)
tk.k_2_cv(machine_data, machine_target, K, hyperparam, model, classification=False)

X_train size:  167
X_valid size:  42
y_train size:  167
y_valid size:  42
Round 0.1 of hyperparameter choosing.
train size: 83. test size: 42
Performance of corresponding hyperparams: [36.27482387050867, 36.27482387050867, 36.27482387050867, 36.27482387050867, 36.133019672671544, 36.133019672671544, 36.133019672671544, 36.133019672671544, 36.36804826659661, 36.36804826659661, 36.36804826659661, 36.36804826659661, 36.68498553853577, 36.68498553853577, 36.68498553853577, 36.68498553853577, 36.740177341216025, 36.740177341216025, 36.740177341216025, 36.740177341216025]
Round 0.2 of hyperparameter choosing.
train size: 84. test size: 42
Performance of corresponding hyperparams: [32.41255089152351, 32.41255089152351, 32.41255089152351, 32.41255089152351, 32.76005989666362, 32.76005989666362, 32.76005989666362, 32.76005989666362, 32.77064325341407, 32.77064325341407, 32.77064325341407, 32.77064325341407, 32.72316310224113, 32.72316310224113, 32.72316310224113, 32.72316310224113, 32.712764763

({'k': 5, 'epsilon': 0.001}, 40.45602496347563)