<a href="https://colab.research.google.com/github/yashasangani/Machine-Learning/blob/main/Pipeline/titanic_using_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

In [18]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier


In [19]:
df = pd.read_csv("https://raw.githubusercontent.com/yashasangani/Datasets/main/train.csv",delimiter = ",")

In [103]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Let's Plan

In [21]:
df.drop(columns = ["PassengerId","Name","Ticket","Cabin"],inplace = True)
df.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S


# Step 1 : Train/Test split

In [22]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=["Survived"]),df["Survived"],test_size = 0.2,random_state = 0)
x_train.shape

(712, 7)

In [23]:
y_train.sample(5)

Unnamed: 0,Survived
514,0
745,0
512,1
506,1
719,0


In [68]:
# imputation transformer

trf1 = make_column_transformer(
                            (SimpleImputer(),[2]),
                            (SimpleImputer(strategy = 'most_frequent'),[6])
,remainder = 'passthrough')


In [69]:
# OneHotEncoding transformer

trf2 = make_column_transformer(
                              (OneHotEncoder(sparse_output=False, handle_unknown="ignore"),[1,6]),remainder = 'passthrough')

In [84]:
# Scaling

trf3 = make_column_transformer((MinMaxScaler(),slice(0,10)))

In [85]:
# Feature selection

# trf4 = SelectKBest(score_func = chi2,k=8)



In [86]:
# train the model

trf5 = DecisionTreeClassifier()

# Create Pipeline

In [95]:
pipe = Pipeline([
    ("trf1",trf1),
     ("trf2",trf2),
    ("trf3",trf3),
    # ("trf4",trf4),
    ("trf5",trf5)
])

# Pipeline Vs make_pipeline
### Pipeline requires naming of steps,make_pipeline does not

(same applies to ColumnTransformer vs make_column_transformer)

In [96]:
# Alternate syntax

pipe = make_pipeline(trf1,trf2,trf3,trf5)

In [97]:
# train
pipe.fit(x_train,y_train)

In [98]:
# Display Pipeline

from sklearn import set_config
set_config(display = 'diagram')

# Explore the Pipeline

In [99]:
#code here
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('simpleimputer-1', SimpleImputer(), [2]),
                                 ('simpleimputer-2',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('onehotencoder',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('minmaxscaler', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [100]:
# predict
y_pred = pipe.predict(x_test)

In [101]:
y_pred

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0])

In [102]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6759776536312849

#Cross Validation using Pipeline


In [104]:
#cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,x_train,y_train,cv = 5,scoring = 'accuracy').mean()


0.626484782822811

# Grid Search using Pipeline

In [105]:
# Gridsearchcv

params = {'trf5_max_depth':[1,2,3,4,5,None]}

In [106]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,params, cv = 5,scoring = 'accuracy')
grid.fit(x_train,y_train)

ValueError: Invalid parameter 'trf5_max_depth' for estimator Pipeline(steps=[('columntransformer-1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simpleimputer-1',
                                                  SimpleImputer(), [2]),
                                                 ('simpleimputer-2',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  [6])])),
                ('columntransformer-2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False),
                                                  [1, 6])])),
                ('columntransformer-3',
                 ColumnTransformer(transformers=[('minmaxscaler',
                                                  MinMaxScaler(),
                                                  slice(0, 10, None))])),
                ('decisiontreeclassifier', DecisionTreeClassifier())]). Valid parameters are: ['memory', 'steps', 'verbose'].

# Exporting  the pipeline

In [107]:
# export
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))