# 📝 Exercise M3.02

The goal is to find the best set of hyperparameters which maximize the
statistical performance on a training set.

Here again with limit the size of the training set to make computation
run faster. Feel free to increase the `train_size` value if your computer
is powerful enough.

In [1]:
import numpy as np
import pandas as pd

adult_census = pd.read_csv("../datasets/adult-census.csv")

target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])

from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42)

Create your machine learning pipeline

You should:
* preprocess the categorical columns using a `OneHotEncoder` and use a
  `StandardScaler` to normalize the numerical data.
* use a `LogisticRegression` as a predictive model.

Start by defining the columns and the preprocessing pipelines to be applied
on each columns.

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_selector as selector

In [5]:
categorical_selector = selector(dtype_include=object)
numerical_selector = selector(dtype_exclude=object)

In [8]:
categorical_cols = categorical_selector(data)
categorical_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [9]:
numerical_cols = numerical_selector(data)
numerical_cols

['age', 'capital-gain', 'capital-loss', 'hours-per-week']

In [10]:
from sklearn.compose import ColumnTransformer

In [26]:
ColumnTransformer?

In [31]:
processor = ColumnTransformer([("cat", OneHotEncoder(),categorical_cols),
                               ("num", StandardScaler(), numerical_cols)], 
                              remainder='passthrough')

In [34]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import set_config
from sklearn.linear_model import LogisticRegression
set_config(display='diagram')

In [36]:
model = make_pipeline(processor, LogisticRegression())

In [37]:
model

Subsequently, create a `ColumnTransformer` to redirect the specific columns
a preprocessing pipeline.

In [40]:
from sklearn.compose import ColumnTransformer

Finally, concatenate the preprocessing pipeline with a logistic regression.

In [41]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

Use a `RandomizedSearchCV` to find the best set of hyperparameters by tuning
the following parameters of the `model`:

- the parameter `C` of the `LogisticRegression` with values ranging from
  0.001 to 10. You can use a log-uniform distribution
  (i.e. `scipy.stats.loguniform`);
- the parameter `with_mean` of the `StandardScaler` with possible values
  `True` or `False`;
- the parameter `with_std` of the `StandardScaler` with possible values
  `True` or `False`.

Once the computation has completed, print the best combination of parameters
stored in the `best_params_` attribute.

In [42]:
from sklearn.model_selection import RandomizedSearchCV

In [52]:
from scipy.stats import loguniform

In [87]:
loguniform(0.001,10).rvs()

2.2627632805886275

```
logisticregression__C 0.001, 10 loguniform

columntransformer__num__with_mean True False

columntransformer__num__with_std True False
```

In [95]:
param_distributions = {
    'logisticregression__C' : loguniform(0.001,10),
    'columntransformer__num__with_mean': [True, False],
    'columntransformer__num__with_std': [True, False]    
}

In [137]:
RandomizedSearchCV?

In [123]:
model_rand_search = RandomizedSearchCV(model, \
                    param_distributions=param_distributions,\
                  n_iter=100, n_jobs=10)

In [124]:
from sklearn.model_selection import cross_validate

In [125]:
cv_result = cross_validate(model_rand_search, \
                           data_train, target_train, cv=3,\
                           n_jobs=5, return_estimator=True )

In [131]:
cv_result['test_score']

array([0.83719597,        nan, 0.84922195])

In [127]:
for idx, est in enumerate(cv_result['estimator']):
    print(est.best_params_)

{'columntransformer__num__with_mean': True, 'columntransformer__num__with_std': True, 'logisticregression__C': 0.003405552838392023}
{'columntransformer__num__with_mean': False, 'columntransformer__num__with_std': True, 'logisticregression__C': 0.49786101042449415}
{'columntransformer__num__with_mean': True, 'columntransformer__num__with_std': True, 'logisticregression__C': 0.13349691524312013}


In [128]:
len(cv_result)

4

In [129]:
for idx, est in enumerate(cv_result['estimator']):
    print(est.cv_results_)

{'mean_fit_time': array([0.88033752, 1.99633627, 1.72507529, 2.05569563, 0.58644247,
       1.73952699, 1.61384888, 1.10881696, 2.06584625, 0.72578921,
       1.83967714, 1.17241564, 1.71748028, 1.6801507 , 1.72111006,
       1.70910048, 1.0469708 , 2.09613886, 0.82285023, 2.06160059,
       0.62452803, 1.74527245, 1.71520896, 1.71712132, 1.69940023,
       1.7375875 , 1.74973006, 1.77736411, 1.76343875, 1.76270185,
       1.72635112, 1.85845923, 1.15159149, 1.70490565, 1.7439642 ,
       1.70114427, 1.73925562, 1.802005  , 1.67078333, 1.98136253,
       0.8583354 , 1.70523171, 1.7515099 , 1.72030625, 1.75372243,
       1.85226054, 1.51570759, 1.73862596, 1.74414649, 1.17499876,
       1.69030046, 1.74022751, 1.69303098, 1.70716729, 1.68691359,
       1.77517762, 0.88836474, 1.72214537, 1.68796377, 1.71345148,
       1.75719495, 1.73635607, 1.72754755, 1.72211618, 1.74249287,
       1.72153811, 1.70400305, 1.90503516, 1.17709861, 1.78617392,
       1.7030509 , 2.07204962, 0.98808398, 1

take result of one estimator

In [132]:
est = cv_result['estimator'][0]

In [144]:
cv_results = pd.DataFrame.from_dict(est.cv_results_)

In [145]:
cv_results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_columntransformer__num__with_mean',
       'param_columntransformer__num__with_std', 'param_logisticregression__C',
       'params', 'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')

Rename columns

In [146]:
column_name_mapping = {
    "param_logisticregression__C": "C",
    "param_columntransformer__num__with_mean": "centering",
    "param_columntransformer__num__with_std": "scaling",
    "split4_test_score": "mean test accuracy",  # use split4 as acc
}

cv_results = cv_results.rename(columns=column_name_mapping)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,centering,scaling,C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean test accuracy,mean_test_score,std_test_score,rank_test_score
0,0.880338,0.024454,0.112254,0.062405,True,True,0.003406,"{'columntransformer__num__with_mean': True, 'c...",0.839066,,0.832310,0.839681,0.833538,,,1
1,1.996336,0.074733,0.096800,0.050897,False,False,0.045341,"{'columntransformer__num__with_mean': False, '...",0.842138,,0.835790,0.837838,0.840909,,,72
2,1.725075,0.068059,0.099921,0.045291,False,False,0.058248,"{'columntransformer__num__with_mean': False, '...",0.842138,,0.838657,0.842138,0.837224,,,71
3,2.055696,0.117734,0.101822,0.048415,True,False,0.37116,"{'columntransformer__num__with_mean': True, 'c...",0.846028,,0.841114,0.844390,0.821458,,,70
4,0.586442,0.032573,0.096971,0.041660,True,True,0.002498,"{'columntransformer__num__with_mean': True, 'c...",0.832105,,0.826577,0.832719,0.828010,,,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.787988,0.102990,0.114971,0.048992,True,True,0.002424,"{'columntransformer__num__with_mean': True, 'c...",0.832924,,0.826167,0.832310,0.828419,,,29
96,1.504343,0.101859,0.058355,0.021101,True,True,0.058722,"{'columntransformer__num__with_mean': True, 'c...",0.853808,,0.847256,0.849304,0.847256,,,28
97,1.478877,0.067019,0.136215,0.014572,True,True,0.067981,"{'columntransformer__num__with_mean': True, 'c...",0.855242,,0.847461,0.849509,0.847256,,,27
98,1.471200,0.124224,0.033503,0.020104,False,True,0.046246,"{'columntransformer__num__with_mean': False, '...",0.852785,,0.846437,0.849918,0.846028,,,87


In [147]:
cv_results = cv_results[column_name_mapping.values()].sort_values(
    "mean test accuracy", ascending=False)

In [148]:
cv_results

Unnamed: 0,C,centering,scaling,mean test accuracy
78,2.957724,False,True,0.850328
93,9.084115,False,True,0.850123
89,1.927475,False,True,0.849918
26,7.387324,False,True,0.849918
38,4.061186,True,True,0.849713
...,...,...,...,...
3,0.37116,True,False,0.821458
79,0.892836,True,False,0.820844
42,0.001437,True,False,0.820229
23,0.001532,False,False,0.817977


In [150]:
column_scaler = ["centering", "scaling"]
cv_results[column_scaler] = cv_results[column_scaler].astype(np.int64)

In [159]:
cv_results["logC"] = np.log10(cv_results.C.astype(float))

In [160]:
import plotly.express as px

fig = px.parallel_coordinates(
    cv_results,
    color="mean test accuracy",
    dimensions=["logC", "centering", "scaling", "mean test accuracy"],
    color_continuous_scale=px.colors.diverging.Tealrose,
)
fig.show()