In [2]:
import pandas as pd

penguins = pd.read_csv("../datasets/penguins.csv")

columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Remove lines with missing values for the columns of interest
penguins_non_missing = penguins[columns + [target_name]].dropna()

data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]


In [3]:
#Question 1
# target.unique()
# target.nunique()


In [4]:
#question 2
data.describe()
# target.value_counts()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
count,342.0,342.0,342.0
mean,4201.754386,200.915205,43.92193
std,801.954536,14.061714,5.459584
min,2700.0,172.0,32.1
25%,3550.0,190.0,39.225
50%,4050.0,197.0,44.45
75%,4750.0,213.0,48.5
max,6300.0,231.0,59.6


In [5]:
#Q3
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate 

model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

cv_results = cross_validate(model, data, target, cv = 10, scoring = "balanced_accuracy")

print(cv_results["test_score"].mean())
print(cv_results["test_score"])



0.9521978021978021
[1.         1.         1.         0.91880342 0.88253968 0.95238095
 0.97777778 0.93015873 0.90793651 0.95238095]


In [6]:
model.get_params()

{'memory': None,
 'steps': [('preprocessor', StandardScaler()),
  ('classifier', KNeighborsClassifier())],
 'verbose': False,
 'preprocessor': StandardScaler(),
 'classifier': KNeighborsClassifier(),
 'preprocessor__copy': True,
 'preprocessor__with_mean': True,
 'preprocessor__with_std': True,
 'classifier__algorithm': 'auto',
 'classifier__leaf_size': 30,
 'classifier__metric': 'minkowski',
 'classifier__metric_params': None,
 'classifier__n_jobs': None,
 'classifier__n_neighbors': 5,
 'classifier__p': 2,
 'classifier__weights': 'uniform'}

In [7]:
#Q4
from pprint import pprint
model.set_params(classifier__n_neighbors=51) 
# pprint(model.get_params())
cv_results_51 = cross_validate(model, data, target, cv = 10, scoring = "balanced_accuracy")

print(cv_results_51["test_score"].mean())
print(cv_results_51["test_score"])

count = 0
for five, fiftyone in zip(cv_results["test_score"], cv_results_51["test_score"]):
    if (five > fiftyone):    
        count += 1
print("51 comparison with 5:", count)

model.set_params(classifier__n_neighbors=101)
# pprint(model.get_params())

cv_results_101 = cross_validate(model, data, target, cv = 10, scoring = "balanced_accuracy")

print(cv_results_101["test_score"].mean())
print(cv_results_101["test_score"])

count = 0
for five, hundredone in zip(cv_results["test_score"], cv_results_101["test_score"]):
    if (five > hundredone):    
        count += 1
print("101 comparison with 5:", count)

model.set_params(steps= [
  ('classifier', KNeighborsClassifier(n_neighbors=5))])
#or model.set_params(preprocessor=None, classifier__n_neighbors=5)
# pprint(model.get_params())
cv_results_without_scaler = cross_validate(model, data, target, cv = 10, scoring = "balanced_accuracy")
count = 0
for five, withoutScaler in zip(cv_results["test_score"], cv_results_without_scaler["test_score"]):
    if (five > withoutScaler):    
        count += 1
print("without scaler comparison with 5:", count)



0.9418803418803419
[0.95238095 0.97777778 1.         0.86324786 0.88253968 0.95238095
 0.95555556 0.95238095 0.93015873 0.95238095]
51 comparison with 5: 4
0.8766422466422465
[0.85714286 0.95238095 0.94444444 0.86324786 0.83492063 0.85714286
 0.83492063 0.88253968 0.83492063 0.9047619 ]
101 comparison with 5: 10
without scaler comparison with 5: 10


In [8]:
#Q5
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV

model = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
])

# https://scikit-learn.org/stable/modules/preprocessing.html
all_preprocessors = [
    None,
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),
]

param_grid = {
    'preprocessor': all_preprocessors,
    'classifier__n_neighbors': [5,51,101]
}

model_grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs=2, cv=10, scoring="balanced_accuracy") #like a cartesian product
model_grid_search.fit(data, target)

In [9]:
model_grid_search.cv_results_

{'mean_fit_time': array([0.00120988, 0.00216844, 0.00180464, 0.00245821, 0.00360651,
        0.0009989 , 0.00237072, 0.00157824, 0.00260139, 0.00327322,
        0.00123267, 0.00177271, 0.00197883, 0.0030021 , 0.00352814]),
 'std_fit_time': array([5.84570095e-04, 4.58864543e-04, 4.65480777e-04, 5.32138008e-04,
        7.85543883e-04, 4.59927978e-05, 6.21627406e-04, 5.03654282e-04,
        4.20739841e-04, 7.39619367e-04, 5.88915018e-04, 6.80646152e-04,
        3.59079448e-04, 9.67016364e-04, 5.31331671e-04]),
 'mean_score_time': array([0.00210569, 0.00255826, 0.00182035, 0.00190027, 0.00214913,
        0.00203035, 0.00252635, 0.00200751, 0.00213223, 0.00218549,
        0.00240526, 0.00223513, 0.00218031, 0.00299106, 0.00257175]),
 'std_score_time': array([0.00031249, 0.0007272 , 0.00047333, 0.00037443, 0.00030765,
        0.00032879, 0.00052237, 0.00037008, 0.0002787 , 0.00035519,
        0.00052521, 0.00056474, 0.00060288, 0.00093936, 0.00040305]),
 'param_classifier__n_neighbors': mask

In [40]:
#the best score is obtained with these params : {'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
# print(model_grid_search.best_score_, model_grid_search.best_params_)
# best_score = model_grid_search.best_score_ 

# print(model_grid_search.cv_results_)

# NOT VERY CLEAR THE QUESTIONS 

indexStandardScaler = []
for num, param in enumerate(model_grid_search.cv_results_["params"]):
    # If the parameters store class names or similar identifiers
    if 'StandardScaler' in str(param['preprocessor']) and param['classifier__n_neighbors'] == 5:
        indexStandardScaler.append(num)

splits_name = [x for x in model_grid_search.cv_results_.keys() if "split" in x]

for split_name in splits_name:
    score = model_grid_search.cv_results_[split_name][1]
    countMinMaxScaler = 0
    countNone = 0
    countQuantileTransformer = 0
    countPowerTransformer = 0
    for num, x in enumerate(model_grid_search.cv_results_[split_name]):
        param = model_grid_search.cv_results_["params"][num]
        if (param['classifier__n_neighbors'] == 5 and 'MinMaxScaler' in str(param['preprocessor'])):
            if (x < score):
                countMinMaxScaler += 1
        if (param['classifier__n_neighbors'] == 5 and 'None' in str(param['preprocessor'])):
            if (x < score):
                countNone += 1
        if (param['classifier__n_neighbors'] == 5 and 'QuantileTransformer' in str(param['preprocessor'])):
            if (x < score):
                countQuantileTransformer += 1
        if (param['classifier__n_neighbors'] == 5 and 'PowerTransformer' in str(param['preprocessor'])):
            if (x < score):
                countPowerTransformer += 1

# print(countMinMaxScaler > 7, countNone > 7, countQuantileTransformer > 7, countPowerTransformer > 7)
    
indexNoScaler = []
for num, param in enumerate(model_grid_search.cv_results_["params"]):
    # If the parameters store class names or similar identifiers
    if 'None' in str(param['preprocessor']):
        indexNoScaler.append(num)
        
# print(indexNoScaler)

for split_name in splits_name:
    first_no_Scaler = model_grid_search.cv_results_[split_name][0]
    second_no_Scaler = model_grid_search.cv_results_[split_name][5]
    third_no_Scaler = model_grid_search.cv_results_[split_name][10]
    better5 = True
    better51 = True
    better101 = True
    for num, x in enumerate(model_grid_search.cv_results_[split_name]):
        param = model_grid_search.cv_results_["params"][num]
        if (x < first_no_Scaler and 'None' not in str(param['preprocessor']) and param['classifier__n_neighbors'] == 5):
            better5 = False
            print(x, split_name, first_no_Scaler, model_grid_search.cv_results_["params"][num])
            break
        if (x < second_no_Scaler and 'None' not in str(param['preprocessor']) and param['classifier__n_neighbors'] == 51):
            better51 = False
            print(x, split_name, second_no_Scaler, model_grid_search.cv_results_["params"][num])
            break
        if (x < third_no_Scaler and 'None' not in str(param['preprocessor']) and param['classifier__n_neighbors'] == 101):
            better101 = False
            print(x, split_name, third_no_Scaler, model_grid_search.cv_results_["params"][num])
            break
    # print(better5,better51,better101, split_name)
    

index_5_SS = []
index_51_SS = []
index_101_SS = []
for num, param in enumerate(model_grid_search.cv_results_["params"]):
    # If the parameters store class names or similar identifiers
    if 'StandardScaler' in str(param['preprocessor']) and param['classifier__n_neighbors'] == 5:
        index_5_SS.append(num)
    if 'StandardScaler' in str(param['preprocessor']) and param['classifier__n_neighbors'] == 51:
        index_51_SS.append(num)
    if 'StandardScaler' in str(param['preprocessor']) and param['classifier__n_neighbors'] == 101:
        index_101_SS.append(num)
    
# print(index_5_SS, index_51_SS, index_101_SS)


# split_5_SS = f"split{index_5_SS[0]}_test_score"
# split_51_SS = f"split{index_51_SS[0]}_test_score"
# split_101_SS = f"split{index_101_SS[0]}_test_score"
count = 0
for split_name in splits_name:
    x = model_grid_search.cv_results_[split_name][index_5_SS[0]]
    y = model_grid_search.cv_results_[split_name][index_51_SS[0]]
    count = count + 1 if (x > y) else count 
# print(count)
count = 0
for split_name in splits_name:
    x = model_grid_search.cv_results_[split_name][index_101_SS[0]]
    y = model_grid_search.cv_results_[split_name][index_51_SS[0]]
    count = count + 1 if (x < y) else count 

# print(count)
# print(model_grid_search.cv_results_['split2_test_score'])

False False False False
True True True split0_test_score
True True True split1_test_score
True True True split2_test_score
True True True split3_test_score
True True True split4_test_score
True True True split5_test_score
True True True split6_test_score
True True True split7_test_score
True True True split8_test_score
True True True split9_test_score


In [41]:
#Q6
from sklearn.model_selection import cross_validate

cv_results = cross_validate(
    model_grid_search, data, target, cv=10, n_jobs=2, return_estimator=True, scoring="balanced_accuracy"
)

cv_results["test_score"].mean()

0.9426495726495727

In [44]:
#Q7
for model in cv_results['estimator']:
    print(model.best_score_, model.best_params_)



0.9592629592629593 {'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
0.9543567543567543 {'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
0.9473970473970474 {'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
0.952952602952603 {'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
0.951975801975802 {'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
0.955944055944056 {'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
0.9518315018315018 {'classifier__n_neighbors': 5, 'preprocessor': MinMaxScaler()}
0.9517926517926518 {'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
0.9569208569208569 {'classifier__n_neighbors': 5, 'preprocessor': StandardScaler()}
0.9567765567765569 {'classifier__n_neighbors': 5, 'preprocessor': QuantileTransformer(n_quantiles=100)}
