In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the Titanic training data; the path is relative to the notebook's
# working directory.
df = pd.read_csv(r'../Datasets/titanic_train.csv')

# Show the first five rows as a quick sanity check of the schema.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Remove the columns that are not used as model inputs.
#
# This cell rebinds `df`, so running it a second time used to raise KeyError
# (the columns are already gone). errors='ignore' makes the cell idempotent:
# columns that are no longer present are simply skipped.
# NOTE: the trade-off is that a misspelled column name is also skipped
# silently, so check the preview below after editing this list.
df = df.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin'],
    errors='ignore',
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [4]:
# Separate the label from the predictors: `y` is the 'Survived' column,
# `x` is every remaining column of `df`.
y = df['Survived']
x = df.drop(columns=['Survived'])

In [5]:
# Preview the predictor frame (first five rows).
x.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [6]:
# Hold out 20% of the rows for testing; random_state=2 makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [12]:
# Columns routed to the numeric (impute + scale) branch: numeric dtype AND
# more than 20 distinct values.
#
# The dtype test uses is_numeric_dtype instead of `dtype != 'object'`, because
# "not object" is also true for string-dtype, categorical and datetime
# columns, which must never reach the median imputer / scaler.
# Numeric columns with 20 or fewer distinct values are intentionally left out
# here; they fall under the ColumnTransformer's `remainder` setting.
numerical_features = [
    col
    for col in x.columns
    if pd.api.types.is_numeric_dtype(x[col]) and len(x[col].unique()) > 20
]
numerical_features

['Age', 'Fare']

In [13]:
# Columns routed to the categorical (impute + one-hot) branch.
#
# Checking only `dtype == 'object'` misses text columns stored with pandas'
# dedicated string dtype and columns stored as `category`; those would then
# be passed through untouched and reach the classifier as raw labels.
# Accept all three representations of non-numeric categorical data.
categorical_features = [
    col
    for col in x.columns
    if pd.api.types.is_object_dtype(x[col])
    or pd.api.types.is_string_dtype(x[col])
    or isinstance(x[col].dtype, pd.CategoricalDtype)
]
categorical_features

['Sex', 'Embarked']

In [21]:
# Numeric branch: fill missing values with the column median, then
# standardize the column.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent label, then
# one-hot encode into a dense integer array. handle_unknown='ignore' is set so
# that labels not seen during fit do not raise at transform time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'encoding',
        OneHotEncoder(
            dtype=int,
            handle_unknown='ignore',
            sparse_output=False,
        ),
    ),
])

In [22]:
# Apply each branch to its own column list. Columns named in neither list are
# kept as they are (remainder='passthrough') rather than dropped.
trf_preprocess = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)

In [23]:
# End-to-end estimator: preprocessing followed by a logistic regression
# classifier with default settings. The step names ('preprocess',
# 'classifier') are the prefixes used when addressing nested parameters.
model_pipeline = Pipeline([
    ('preprocess', trf_preprocess),
    ('classifier', LogisticRegression()),
])

In [24]:
from sklearn import set_config

# Render estimators as diagrams when they are the cell's output.
set_config(display="diagram")

# Display the assembled pipeline.
model_pipeline

0,1,2
,steps,"[('preprocess', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'int'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [34]:
# Hyper-parameter grid. Keys follow the <step>__<sub-step>__<param> naming of
# the nested pipeline: 2 numeric imputers x 2 categorical imputers x 5 values
# of C = 20 candidate settings.
# NOTE: 'constant' is searched without an explicit fill_value, so the
# imputer's default placeholder is used for missing categorical entries.
param_grid = {
    'preprocess__num__imputer__strategy': ['mean', 'median'],
    'preprocess__cat__imputer__strategy': ['most_frequent', 'constant'],
    'classifier__C': [0.1, 1, 10, 50, 100],
}

In [35]:
# Exhaustive search over `param_grid` using 10-fold cross-validation; no
# `scoring` is given, so the estimator's default scorer is used.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=10,
)

In [36]:
# Run the search on the training split only; the held-out test split is not
# touched here.
grid_search.fit(x_train, y=y_train)

0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'classifier__C': [0.1, 1, ...], 'preprocess__cat__imputer__strategy': ['most_frequent', 'constant'], 'preprocess__num__imputer__strategy': ['mean', 'median']}"
,scoring,
,n_jobs,
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'int'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [37]:
# Report the parameter combination with the best mean cross-validated score.
print('Grid search best params:', grid_search.best_params_, sep='\n')

Grid search best params:
{'classifier__C': 0.1, 'preprocess__cat__imputer__strategy': 'most_frequent', 'preprocess__num__imputer__strategy': 'mean'}


In [39]:
# Mean cross-validated score of the best candidate, rounded to 3 decimals.
print("Best result:{:.3f}".format(grid_search.best_score_))

Best result:0.812


In [42]:
# Tabulate every candidate from the search, best mean test score first.
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values("mean_test_score", ascending=False)
)

# Show only the searched parameters next to their mean test score.
cv_results[
    [
        'param_classifier__C',
        'param_preprocess__cat__imputer__strategy',
        'param_preprocess__num__imputer__strategy',
        'mean_test_score',
    ]
]

Unnamed: 0,param_classifier__C,param_preprocess__cat__imputer__strategy,param_preprocess__num__imputer__strategy,mean_test_score
0,0.1,most_frequent,mean,0.811718
1,0.1,most_frequent,median,0.811718
3,0.1,constant,median,0.811718
2,0.1,constant,mean,0.810309
5,1.0,most_frequent,median,0.806084
6,1.0,constant,mean,0.806084
7,1.0,constant,median,0.806084
4,1.0,most_frequent,mean,0.804675
8,10.0,most_frequent,mean,0.804675
10,10.0,constant,mean,0.803286
