In [29]:
import numpy as np
import pandas as pd

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2 ## score function is chi2
from sklearn.tree import DecisionTreeClassifier

In [31]:
df = pd.read_csv('titanic_train.csv')

In [32]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Make Pipeline

Use column tranformer for below steps
1. We have data missing in Age and Embarked columns
2. one hot encodign to sex and embarked columns due to categorical values
3. Scaling of columns
4. Feature Selection, k best features
5. Train model using decision tree classifier

In [33]:
## drop non useful columns
df.drop(columns=['PassengerId','Name', 'Ticket', 'Cabin'], inplace = True)

In [34]:
## TRAIN TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2, random_state=42)

In [35]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [36]:
y_train.head()

331    0
733    0
382    0
704    0
813    0
Name: Survived, dtype: int64

In [37]:
### perform first column trasnformer for imputing
## imputation transformer
## pass list of tuple in column Transformer
trf1=ColumnTransformer([
    ('impute_age', SimpleImputer(),[2]), ## nan filled with mean, rather than calling by column name we take column index for receiving of code in later stage of pipeline
    ('impute_embarked', SimpleImputer(strategy='most_frequent'),[6])
], remainder='passthrough')


In [38]:
### perform first column trasnformer for One Hot Encoding
## ONE HOT ENCODING

trf2=ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse=False, handle_unknown='ignore'), [1,6])
], remainder='passthrough')

In [39]:
## ### perform first column trasnformer for SCALING
## Scaling
trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(),slice(0,10))  ## on all columns, 10 because after OHE we get 2 columns and 3 from embarked and old age and embakrd will be dropped
])
## MinMax scaler is used because we are using F.S of Kbestfeature, chi2

In [40]:
### FEATURE SELECTION
trf4 = SelectKBest(score_func=chi2, k=8)

In [41]:
###Train the model
trf5 = DecisionTreeClassifier()

#### CREATE PIPELINE

In [42]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5),
])

### PIPELINE VS MAKE_PIPELINE

Pipeline requires naming of steps, make_pipelines does not.
(Sample applier to ColumnTransformer vs nake_column_transformer)


In [43]:
## alternative syntax
pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [48]:
#train
pipe.fit(X_train, y_train)
## becuse we have trained model in pipeline so we call .fit
## if we are not using train model in pipeline, so we call fit_transform

In [47]:
from sklearn import set_config
set_config(display='diagram')

In [49]:
## pipe object is trained now


In [51]:
pipe.named_steps

#shows steps in pipelines

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x0000018470A7C790>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [53]:
pipe.named_steps['columntransformer-1'].transformers_


[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder', 'passthrough', [0, 1, 3, 4, 5])]

In [54]:
pipe.named_steps['columntransformer-1'].transformers_[0]


('impute_age', SimpleImputer(), [2])

In [55]:
pipe.named_steps['columntransformer-1'].transformers_[0][0]


'impute_age'

In [56]:
pipe.named_steps['columntransformer-1'].transformers_[0][1]


In [57]:
pipe.named_steps['columntransformer-1'].transformers_[0][0]


'impute_age'

In [63]:
pipe.named_steps['columntransformer-1'].transformers_[0][1].statistics_
##shows mean value as selected by simple imputer

array([29.49884615])

In [64]:
pipe.named_steps['columntransformer-1'].transformers_[1][1].statistics_
##shows most frequent value as selected by simple imputer

array(['S'], dtype=object)

In [65]:
##PREDICT
y_pred=pipe.predict(X_test)

In [66]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

In [67]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6256983240223464

In [68]:
## acccuracy low as compared to wihtout pipeline
## because we have taken feature selection here
## try remving FS and run again

### CROSS VALIDATION USING PIPELINE

In [69]:
#Cross validation is we can train test split n number of times our data and find best accuracy

In [70]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=6, scoring='accuracy').mean()

0.6390471442814414

GRIDSEARCH USING PIPELINE

In [75]:
##gridsearchcv

params = {
 'decisiontreeclassifier__max_depth':[1,2,3,4,5,None]   
}
## decisiontreeclassifier__max_depth  = name of pipe step and parameter i.e max depth

In [76]:
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [77]:
grid.best_score_

0.6391214419383433

In [78]:
grid.best_params_

{'decisiontreeclassifier__max_depth': 2}

### EXPORTING THE PIPELINE

In [80]:
import pickle
pickle.dump(pipe, open('pipe.pkl','wb'))

## previously we give different OHE, CLF model training and OHE separatelt but here only pipe