In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

In [2]:
# Location of the Titanic CSV. The default is the original local path; set the
# TITANIC_CSV environment variable to run the notebook on another machine
# without editing this cell.
TITANIC_CSV = os.environ.get("TITANIC_CSV", "/home/yash/Downloads/Titanic.csv")

# Load the raw passenger table and preview the first rows.
df = pd.read_csv(TITANIC_CSV)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Remove the columns that are not used as model inputs further down.
# Reassigning the result (rather than inplace=True) together with
# errors="ignore" keeps this cell idempotent: running it again once the
# columns are already gone is a no-op instead of a KeyError.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

In [4]:
# Preview the first five rows of the frame after the column drop.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
# Separate the predictors from the "Survived" target, then hold out 20% of the
# rows for testing. The fixed random_state makes the split reproducible.
features = df.drop(columns=["Survived"])
target = df["Survived"]

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
140,3,female,,0,2,15.2458,C
439,2,male,31.0,0,0,10.5,S
817,2,male,31.0,1,1,37.0042,C
378,3,male,20.0,0,0,4.0125,C
491,3,male,21.0,0,0,7.25,S


In [6]:
# Numeric branch: fill missing values with the column median, then standardise.
# The step names ("Imputer", "Scaler") are part of the parameter paths used by
# the grid search, so they must not be renamed.
numericals_colls = ["Age", "Fare"]
numeric_steps = [
    ("Imputer", SimpleImputer(strategy="median")),
    ("Scaler", StandardScaler()),
]
numericals_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" means a category that was not seen
# during fit is encoded as all zeros instead of raising an error.
categorical_colls = ["Embarked", "Sex"]
categorical_steps = [
    ("Imputer", SimpleImputer(strategy="most_frequent")),
    ("OHE", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [7]:
# Route each group of columns through its own pipeline and concatenate the
# results into a single feature matrix.
# remainder="drop" is the ColumnTransformer default, spelled out here: columns
# not listed in either group (Pclass, SibSp, Parch) are discarded and never
# reach the classifier.
column_branches = [
    ("num", numericals_transformer, numericals_colls),
    ("cat", categorical_transformer, categorical_colls),
]
preprocess = ColumnTransformer(transformers=column_branches, remainder="drop")

In [8]:
# Display the (not yet fitted) ColumnTransformer so its structure can be inspected.
preprocess

In [9]:
# End-to-end estimator: the ColumnTransformer (imputation + scaling/encoding)
# runs first and its output feeds a LogisticRegression classifier.
# The step names "preprocess" and "classifier" are the prefixes used by the
# grid-search parameter names, so they must stay as they are.
model_steps = [
    ("preprocess", preprocess),
    ("classifier", LogisticRegression()),
]
clf = Pipeline(steps=model_steps)

In [10]:
# Display the full preprocessing + classifier pipeline.
clf

In [11]:
# Hyper-parameter grid. Keys follow the "<step>__<sub-step>__<param>" path
# through the nested pipeline: 2 numeric imputers x 2 categorical imputers x
# 4 regularisation strengths = 16 candidate configurations.
param_grid = {
    "preprocess__num__Imputer__strategy": ["mean", "median"],
    "preprocess__cat__Imputer__strategy": ["most_frequent", "constant"],
    "classifier__C": [0.1, 1.0, 10, 100],
}

# Exhaustive search over the grid, scoring each candidate with 10-fold
# cross-validation.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)

In [12]:
# Run the grid search on the training split only; the test split stays unseen.
grid_search.fit(x_train, y_train)

# Report the parameter combination with the best mean cross-validated score.
best_params = grid_search.best_params_
print("Best Params : ", best_params)

Best Params :  {'classifier__C': 0.1, 'preprocess__cat__Imputer__strategy': 'most_frequent', 'preprocess__num__Imputer__strategy': 'mean'}


In [13]:
# Mean cross-validated score of the best parameter combination, rounded to
# three decimal places. Note the dot in ":.3f": a spec of " 3f" would only set
# the sign/width and still print the default six decimals.
print(f"Internal CV score : {grid_search.best_score_:.3f}")

Internal CV score :  0.782414


In [14]:
# Summarise the grid search: one row per candidate configuration, best mean
# cross-validated score first, keeping only the searched parameters and the score.
report_columns = [
    "param_classifier__C",
    "param_preprocess__cat__Imputer__strategy",
    "param_preprocess__num__Imputer__strategy",
    "mean_test_score",
]

cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
    .loc[:, report_columns]
)

cv_results


Unnamed: 0,param_classifier__C,param_preprocess__cat__Imputer__strategy,param_preprocess__num__Imputer__strategy,mean_test_score
0,0.1,most_frequent,mean,0.782414
1,0.1,most_frequent,median,0.782414
2,0.1,constant,mean,0.782414
3,0.1,constant,median,0.782414
4,1.0,most_frequent,mean,0.782414
5,1.0,most_frequent,median,0.782414
6,1.0,constant,mean,0.782414
7,1.0,constant,median,0.782414
8,10.0,most_frequent,mean,0.782414
9,10.0,most_frequent,median,0.782414
