In [1]:
import pandas as pd

# Load the raw penguins table from a path relative to the notebook.
penguins = pd.read_csv("../datasets/penguins.csv")

# Label to predict and the numerical features used as inputs.
target_name = "Species"
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]

# Keep only the columns of interest, then drop every row that has a
# missing value in any of them.
penguins_non_missing = penguins[[*columns, target_name]].dropna()

# Split the cleaned table into the target vector and the feature matrix.
target = penguins_non_missing[target_name]
data = penguins_non_missing[columns]

## Question 1

In [3]:
# Class balance: number of samples per species among the rows kept after dropna().
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

In [4]:
# Smallest value of each feature column (lower end of each feature's range).
data.min()

Body Mass (g)          2700.0
Flipper Length (mm)     172.0
Culmen Length (mm)       32.1
dtype: float64

In [5]:
# Largest value of each feature column (upper end of each feature's range).
data.max()

Body Mass (g)          6300.0
Flipper Length (mm)     231.0
Culmen Length (mm)       59.6
dtype: float64

## Question 2

In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Reference model: standardize the features, then classify with 5 nearest
# neighbours. The step names ("preprocessor", "classifier") are the prefixes
# used later when building the grid-search parameter grid.
model = Pipeline(
    [
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=5)),
    ]
)

In [16]:
from sklearn.model_selection import cross_validate

# Cross-validate the scaled 5-NN pipeline on the full dataset, scored by
# balanced accuracy. No `cv` is passed, so scikit-learn's default splitter is used.
res = cross_validate(model, data, target, scoring='balanced_accuracy')

# Summarize the per-fold test scores as (mean, standard deviation).
scores = res['test_score']
(scores.mean(), scores.std())

(0.9487175942337233, 0.01917307908775349)

In [10]:
# Same scaled pipeline as the reference model, but voting among 51 neighbours.
model51 = Pipeline(
    [
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=51)),
    ]
)

# Note: this rebinds `res`, replacing the results of the previous cross-validation.
res = cross_validate(model51, data, target, scoring='balanced_accuracy')

# Mean balanced accuracy across the folds.
res['test_score'].mean()

0.9393601953601953

In [14]:
# 5-NN classifier on the raw, unscaled features: the pipeline has no
# preprocessing step at all.
model2 = Pipeline([("classifier", KNeighborsClassifier(n_neighbors=5))])

# Note: this rebinds `res`, replacing the results of the previous cross-validation.
res = cross_validate(model2, data, target, scoring='balanced_accuracy')

# (mean, standard deviation) of the per-fold balanced accuracy. In the
# recorded run the mean drops to about 0.72, versus about 0.95 for the
# scaled 5-NN pipeline.
scores = res['test_score']
(scores.mean(), scores.std())

(0.7215834416479577, 0.09837706199533194)

## Question 3

In [42]:
from sklearn.model_selection import train_test_split

# Hold out a test set. The fixed random_state makes the shuffle reproducible;
# no test_size or stratify is passed, so scikit-learn's defaults apply.
(data_train, data_test,
 target_train, target_test) = train_test_split(data, target, random_state=42)

In [36]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


# Neighbour counts to try for the classifier.
n_neighbors = [5, 51, 101]

# Candidate values for the pipeline's "preprocessor" step.
all_preprocessors = [
    None,  # no preprocessing: the step is skipped
    StandardScaler(),
    MinMaxScaler(),
    # NOTE(review): n_quantiles is presumably lowered from the library default
    # because the cleaned dataset only has 342 rows - confirm.
    QuantileTransformer(n_quantiles=100),
    # Box-Cox only accepts strictly positive inputs; the feature minima shown
    # under Question 1 are all positive.
    PowerTransformer(method="box-cox"),
]

In [48]:
from sklearn.model_selection import GridSearchCV

# Search space: every candidate preprocessor is combined with every neighbour
# count. 'preprocessor' swaps the whole pipeline step, while
# 'classifier__n_neighbors' sets a parameter of the "classifier" step.
param_grid = {
    'preprocessor': all_preprocessors,
    'classifier__n_neighbors': n_neighbors,
}

# NOTE(review): no `scoring` is passed, so candidates are ranked with the
# pipeline's default score (plain accuracy for a classifier), whereas the
# Question 2 cells use 'balanced_accuracy' - confirm this is intended.
model_gs = GridSearchCV(estimator=model, param_grid=param_grid)

In [49]:
# Run the grid search on the training split only; the test split is not used in this cell.
model_gs.fit(data_train, target_train)

GridSearchCV(estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                       ('classifier', KNeighborsClassifier())]),
             param_grid={'classifier__n_neighbors': [5, 51, 101],
                         'preprocessor': [None, StandardScaler(),
                                          MinMaxScaler(),
                                          QuantileTransformer(n_quantiles=100),
                                          PowerTransformer(method='box-cox')]})

In [50]:
# Raw results of the search: a dict of arrays (fit/score timings, parameters, per-split scores).
model_gs.cv_results_

{'mean_fit_time': array([0.00189872, 0.0028986 , 0.00271564, 0.00335164, 0.00563011,
        0.0017312 , 0.00287766, 0.0027153 , 0.00333047, 0.00561948,
        0.00188813, 0.00294924, 0.00280471, 0.00345058, 0.00564713]),
 'std_fit_time': array([2.50370905e-04, 2.88333143e-05, 9.33115616e-06, 2.71865769e-05,
        3.50269484e-04, 8.44957597e-07, 4.48139162e-05, 7.28703288e-06,
        1.41026151e-05, 3.15202866e-04, 2.42858862e-04, 3.21810329e-05,
        1.79219739e-05, 4.30983773e-05, 3.57254959e-04]),
 'mean_score_time': array([0.00258398, 0.00259295, 0.00255866, 0.00271845, 0.0026854 ,
        0.00271268, 0.00283175, 0.00284185, 0.00295868, 0.00295935,
        0.00372419, 0.00316706, 0.00314837, 0.00330663, 0.00322132]),
 'std_score_time': array([1.01489275e-04, 2.49990038e-05, 5.06301374e-06, 3.61985049e-05,
        1.76806949e-05, 2.27793450e-05, 1.69249653e-05, 3.24713851e-05,
        2.63737076e-05, 2.33239182e-05, 1.30634572e-03, 1.67108125e-05,
        3.15287816e-05, 4.08

In [51]:
# Tabulate the search results and list the best-ranked candidates first.
cv_results = pd.DataFrame(model_gs.cv_results_)
cv_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.002716,9.331156e-06,0.002559,5e-06,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.980769,0.980392,0.960784,0.960784,0.960784,0.968703,0.009699,1
4,0.00563,0.0003502695,0.002685,1.8e-05,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.980769,0.980392,0.960784,0.960784,0.941176,0.964781,0.014754,2
1,0.002899,2.883331e-05,0.002593,2.5e-05,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.980769,0.980392,0.960784,0.941176,0.941176,0.96086,0.017623,3
7,0.002715,7.287033e-06,0.002842,3.2e-05,51,MinMaxScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.980769,0.980392,0.901961,0.941176,0.941176,0.949095,0.029427,4
3,0.003352,2.718658e-05,0.002718,3.6e-05,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.980769,0.960784,0.941176,0.901961,0.941176,0.945173,0.026115,5
6,0.002878,4.481392e-05,0.002832,1.7e-05,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.961538,0.980392,0.921569,0.941176,0.901961,0.941327,0.027838,6
9,0.005619,0.0003152029,0.002959,2.3e-05,51,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.942308,0.980392,0.882353,0.941176,0.941176,0.937481,0.031404,7
8,0.00333,1.410262e-05,0.002959,2.6e-05,51,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.942308,0.941176,0.901961,0.921569,0.901961,0.921795,0.017795,8
11,0.002949,3.218103e-05,0.003167,1.7e-05,101,StandardScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.865385,0.882353,0.784314,0.862745,0.862745,0.851508,0.034385,9
14,0.005647,0.000357255,0.003221,3.8e-05,101,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 101, 'preprocessor...",0.846154,0.882353,0.784314,0.862745,0.843137,0.843741,0.032832,10
