In [1]:
import pandas as pd

# Load the raw penguins table.
penguins = pd.read_csv("../datasets/penguins.csv")

# Feature columns used by every model below, and the column to predict.
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Keep only the columns of interest and drop every row that has a missing
# value in any of them.
columns_of_interest = [*columns, target_name]
penguins_non_missing = penguins.loc[:, columns_of_interest].dropna()

# Split the cleaned table into the feature matrix and the target vector.
data = penguins_non_missing.loc[:, columns]
target = penguins_non_missing.loc[:, target_name]


In [2]:
# Question 1: distinct classes in the target and how many there are
# target.unique()
# target.nunique()


In [3]:
# Question 2: summary statistics (count, mean, std, quartiles) of the features
data.describe()
# Class frequencies of the target (uncomment to inspect):
# target.value_counts()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [4]:
#Q3
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate 

# Q3 baseline: standardize the features, then classify with 5 nearest neighbors.
pipeline_steps = [
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
]
model = Pipeline(steps=pipeline_steps)

# Cross-validate with cv=10, scoring each fold with balanced accuracy.
cv_results = cross_validate(
    model,
    data,
    target,
    cv=10,
    scoring="balanced_accuracy",
)

# Report the mean first, then the individual fold scores.
fold_scores = cv_results["test_score"]
print(fold_scores.mean())
print(fold_scores)



0.9521978021978021
[1.         1.         1.         0.91880342 0.88253968 0.95238095
 0.97777778 0.93015873 0.90793651 0.95238095]


In [5]:
# List every parameter of the pipeline, including the nested `<step>__<param>` ones.
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [6]:
#Q4
from pprint import pprint
from sklearn.base import clone


def count_folds_won_by_baseline(baseline_cv_results, candidate_cv_results):
    """Count the folds where the baseline strictly outscores the candidate.

    Parameters
    ----------
    baseline_cv_results, candidate_cv_results : dict
        Outputs of ``cross_validate``; only their ``"test_score"`` entries
        are read, and they are compared fold by fold.

    Returns
    -------
    int
        Number of folds with ``baseline > candidate`` (ties are not counted).
    """
    return sum(
        1
        for baseline_score, candidate_score in zip(
            baseline_cv_results["test_score"], candidate_cv_results["test_score"]
        )
        if baseline_score > candidate_score
    )


def cross_validate_balanced(estimator):
    """Cross-validate `estimator` with the same settings as the Q3 baseline."""
    return cross_validate(estimator, data, target, cv=10, scoring="balanced_accuracy")


# Every variant is evaluated on a clone (or a fresh pipeline) so that `model`
# keeps its Q3 configuration: the cell no longer mutates shared state and gives
# the same results each time it is re-run.

# Scaled pipeline with 51 neighbors versus the 5-neighbor baseline.
model_51 = clone(model).set_params(classifier__n_neighbors=51)
cv_results_51 = cross_validate_balanced(model_51)
print(cv_results_51["test_score"].mean())
print(cv_results_51["test_score"])
print("51 comparison with 5:", count_folds_won_by_baseline(cv_results, cv_results_51))

# Scaled pipeline with 101 neighbors versus the 5-neighbor baseline.
model_101 = clone(model).set_params(classifier__n_neighbors=101)
cv_results_101 = cross_validate_balanced(model_101)
print(cv_results_101["test_score"].mean())
print(cv_results_101["test_score"])
print("101 comparison with 5:", count_folds_won_by_baseline(cv_results, cv_results_101))

# 5 neighbors without any scaling step versus the scaled 5-neighbor baseline.
# (Alternative noted by the original author:
#  model.set_params(preprocessor=None, classifier__n_neighbors=5).)
model_without_scaler = Pipeline(
    steps=[("classifier", KNeighborsClassifier(n_neighbors=5))]
)
cv_results_without_scaler = cross_validate_balanced(model_without_scaler)
print(
    "without scaler comparison with 5:",
    count_folds_won_by_baseline(cv_results, cv_results_without_scaler),
)



0.9418803418803419
[0.95238095 0.97777778 1.         0.86324786 0.88253968 0.95238095
 0.95555556 0.95238095 0.93015873 0.95238095]
51 comparison with 5: 4
0.8766422466422465
[0.85714286 0.95238095 0.94444444 0.86324786 0.83492063 0.85714286
 0.83492063 0.88253968 0.83492063 0.9047619 ]
101 comparison with 5: 10
without scaler comparison with 5: 10


In [7]:
#Q5
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV

# Q5: search jointly over the preprocessing step and the number of neighbors.
model = Pipeline(
    steps=[
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=5)),
    ]
)

# Candidate preprocessors; None stands for "no preprocessing step".
# Overview: https://scikit-learn.org/stable/modules/preprocessing.html
# The order of this list matters: the analysis cell relies on candidate positions.
all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]
neighbor_counts = [5, 51, 101]

param_grid = dict(
    preprocessor=all_preprocessors,
    classifier__n_neighbors=neighbor_counts,
)

# Every (preprocessor, n_neighbors) combination is evaluated, like a cartesian
# product of the two lists.
model_grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=10,
    n_jobs=2,
)
model_grid_search.fit(data, target)

In [21]:
# The best score is obtained with these params:
# {'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
print(model_grid_search.best_score_, model_grid_search.best_params_)

grid_cv_results = model_grid_search.cv_results_

# Positions (within the candidate list) of every candidate whose preprocessor
# is a StandardScaler. These are looked up rather than hard-coded, so the
# analysis stays correct if the grid is reordered or extended.
indexStandardScaler = [
    num
    for num, param in enumerate(grid_cv_results["params"])
    if isinstance(param["preprocessor"], StandardScaler)
]
assert indexStandardScaler, "no StandardScaler candidate found in the grid"
print(indexStandardScaler)

# Keys holding the per-fold test scores (one array entry per candidate).
# Matching on prefix and suffix avoids picking up train-score keys or
# parameter names that merely contain "split".
splits_name = [
    key
    for key in grid_cv_results
    if key.startswith("split") and key.endswith("_test_score")
]
print(splits_name)

# For each fold: take the best score reached by any StandardScaler candidate,
# count how many candidates score strictly below it, and report whether that
# count exceeds 7 (roughly half of the 15 candidates in the 5 x 3 grid above).
for split_name in splits_name:
    split_scores = grid_cv_results[split_name]
    best_score = split_scores[indexStandardScaler].max()
    count = int((split_scores < best_score).sum())
    print(count > 7)

# Fold 2 is the one fold where the check above prints False; show its raw scores.
print(grid_cv_results["split2_test_score"])

0.9521978021978021 {'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
[1, 6, 11]
['split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score']
True
True
False
True
True
True
True
True
True
True
[0.74102564 1.         1.         1.         1.         0.5965812
 1.         1.         1.         1.         0.57435897 0.94444444
 0.94444444 0.77777778 0.88888889]
