
<br>
<br>

# Automatically selecting the imputation strategy

<br>
<br>


In [49]:

import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


In [50]:

# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("dataset/train.csv")
# Preview two random rows. The fixed seed keeps the displayed rows identical
# every time the notebook is restarted and re-run.
df.sample(2, random_state=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
704,705,0,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
845,846,0,3,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S


In [51]:

# Remove the identifier / free-text columns that are not used further on.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op instead of raising KeyError.
# `columns=` already names the axis, so the redundant `axis=1` is omitted.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")
# Seeded preview so the displayed rows are reproducible.
df.sample(2, random_state=0)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
816,0,3,female,23.0,0,0,7.925,S
40,0,3,female,40.0,1,0,9.475,S


In [52]:
# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
# Features are every column after the first; the target is the first column,
# kept as a single-column DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:],
    df.iloc[:, :1],
    test_size=0.2,
    random_state=2,
)

In [53]:
# Number of missing values in each training feature column.
X_train.isna().sum()

Pclass        0
Sex           0
Age         148
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [54]:
# Peek at the last five training rows.
X_train.tail(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
534,3,female,30.0,0,0,8.6625,S
584,3,male,,0,0,8.7125,C
493,1,male,71.0,0,0,49.5042,C
527,1,male,,0,0,221.7792,S
168,1,male,,0,0,25.925,S


In [72]:
# Column groups handled by each preprocessing branch.
numerical_feature = ["Age", "Fare"]
categorical_feature = ["Sex", "Embarked"]

# Numeric branch: fill missing values with the column median, then standardize.
numerical_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch (a pipeline ultimately
# expects a transformer at this step).
# NOTE(review): columns not listed above (e.g. Pclass, SibSp, Parch) never reach
# the classifier, since ColumnTransformer drops the remainder by default —
# confirm this is intended.
preprocessor = ColumnTransformer([
    ("num", numerical_pipe, numerical_feature),
    ("cat", categorical_pipe, categorical_feature),
])

# Final estimator: preprocessing followed by logistic regression. The step
# names form the `step__param` paths used when tuning hyperparameters.
cls = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression()),
])
cls

<br>
<br>
<br>

# Apply GridSearchCV to find the best imputation strategy

<br>
<br>
<br>

In [78]:
# Search space: the imputation strategy of each preprocessing branch plus the
# logistic-regression parameter C. Keys follow the nested
# <step>__<sub-step>__<parameter> naming of the pipeline.
param_grid = {
    "preprocessor__num__impute__strategy": ["mean", "median"],
    "preprocessor__cat__impute__strategy": ["most_frequent", "constant"],
    "classifier__C": [0.1, 1.0, 10, 100],
}

# Exhaustive search over every combination (2 x 2 x 4 = 16) with 10-fold CV.
grid_search = GridSearchCV(estimator=cls, param_grid=param_grid, cv=10)

In [86]:
# Passing y_train as-is produces a warning, so it is converted to a NumPy array
# and flattened to 1-D before fitting.
grid_search.fit(X_train, y_train.to_numpy().ravel())


In [87]:
# Parameter combination selected by the grid search as the best one.
grid_search.best_params_

{'classifier__C': 1.0,
 'preprocessor__cat__impute__strategy': 'most_frequent',
 'preprocessor__num__impute__strategy': 'mean'}

In [88]:
# Raw cross-validation results: a dict of arrays with one entry per parameter
# combination (verbose; the tabular view is easier to read).
grid_search.cv_results_

{'mean_fit_time': array([0.02296994, 0.01459424, 0.01414762, 0.01609907, 0.01814027,
        0.01859612, 0.01627891, 0.01895523, 0.0164777 , 0.01650448,
        0.01554198, 0.02575059, 0.01856418, 0.01553636, 0.01743989,
        0.01772652]),
 'std_fit_time': array([0.00551733, 0.00189535, 0.00151856, 0.00289995, 0.00491131,
        0.003338  , 0.00174046, 0.00275593, 0.00159585, 0.00166457,
        0.00164351, 0.00842847, 0.00550209, 0.00117039, 0.00146955,
        0.00128786]),
 'mean_score_time': array([0.00786259, 0.00557415, 0.0057312 , 0.00621481, 0.0076236 ,
        0.00661342, 0.00597529, 0.00684326, 0.00649741, 0.00632391,
        0.00574532, 0.0110105 , 0.00770209, 0.00618806, 0.00650716,
        0.00650198]),
 'std_score_time': array([0.00220166, 0.00091996, 0.00107386, 0.00156264, 0.00246467,
        0.00205979, 0.00150213, 0.00215636, 0.00130702, 0.00194397,
        0.00098874, 0.00628707, 0.00250123, 0.00150653, 0.00118079,
        0.00138614]),
 'param_classifier__C': ma

In [92]:
# Put the cross-validation results into a DataFrame and list its columns.
result = pd.DataFrame(data=grid_search.cv_results_)
result.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_classifier__C', 'param_preprocessor__cat__impute__strategy',
       'param_preprocessor__num__impute__strategy', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'split5_test_score',
       'split6_test_score', 'split7_test_score', 'split8_test_score',
       'split9_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score'],
      dtype='object')

In [96]:

# Rank the parameter combinations by mean cross-validated score, best first.
(
    result[
        [
            "param_classifier__C",
            "param_preprocessor__cat__impute__strategy",
            "param_preprocessor__num__impute__strategy",
            "mean_test_score",
        ]
    ]
    .sort_values(by="mean_test_score", ascending=False)
)



Unnamed: 0,param_classifier__C,param_preprocessor__cat__impute__strategy,param_preprocessor__num__impute__strategy,mean_test_score
4,1.0,most_frequent,mean,0.787852
5,1.0,most_frequent,median,0.787852
6,1.0,constant,mean,0.787852
7,1.0,constant,median,0.787852
8,10.0,most_frequent,mean,0.787852
9,10.0,most_frequent,median,0.787852
10,10.0,constant,mean,0.787852
11,10.0,constant,median,0.787852
12,100.0,most_frequent,mean,0.787852
13,100.0,most_frequent,median,0.787852
