# Pipeline

In [1]:
# A pipeline chains multiple preprocessing steps together so that the output of each step becomes the input of the next.
# It makes it easy to apply the same steps to the train and test data, and later at deployment.

In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest,chi2

In [3]:
# Location of the training CSV. Set the TRAIN_CSV_PATH environment variable to
# point at your own copy of the file; when it is unset (or empty) the original
# hard-coded location is used, so existing setups keep working unchanged.
TRAIN_CSV_PATH=os.environ.get("TRAIN_CSV_PATH") or "//Users//udayladdha//Desktop//DataSets//train.csv"
df=pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Steps of the pipeline that we will follow
1. impute missing values in the Age and Embarked columns using a column transformer
2. apply one-hot encoding (OHE) to Sex and Embarked using a column transformer
3. scaling
4. feature selection (keep the best 8 features)
5. train the model using a decision tree

In [4]:
# Drop the columns that are not used as model inputs.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it no longer raises KeyError once the
# columns have already been removed.
df=df.drop(columns=["PassengerId","Name","Ticket","Cabin"],errors="ignore")

In [5]:
# Column dtypes and non-null counts; used to spot which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [6]:
# Preview the first rows of the columns that remain in df.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [7]:
# Separate the target ("Survived") from the predictors, then hold out 20% of
# the rows for testing. The fixed random_state keeps the split reproducible.
features=df.drop(columns="Survived")
target=df["Survived"]
x_train,x_test,y_train,y_test=train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Shapes of the four splits, in the order x_train, x_test, y_train, y_test.
tuple(part.shape for part in (x_train,x_test,y_train,y_test))

((712, 7), (179, 7), (712,), (179,))

In [9]:
# First rows of the training predictors; the column order shown here is what
# the positional indices in the transformers below refer to.
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [10]:
# Step 1 - imputation transformer.
# Columns are addressed by position rather than by name: a ColumnTransformer
# returns an array, so positional indices keep working in the later steps and
# reduce the risk of the pipeline breaking.
AGE_COL=2       # position of "Age" in x_train
EMBARKED_COL=6  # position of "Embarked" in x_train
age_imputer=SimpleImputer()  # default strategy: fill gaps with the column mean
embarked_imputer=SimpleImputer(strategy="most_frequent")
trf1=ColumnTransformer(
    transformers=[
        ("impute_age",age_imputer,[AGE_COL]),
        ("impute_embarked",embarked_imputer,[EMBARKED_COL]),
    ],
    remainder="passthrough",  # all other columns are forwarded unchanged
)

In [11]:
# NOTE: a ColumnTransformer changes the column order - the columns it transforms come first and the remaining (passthrough) columns are shifted after them, so always re-check the indices before the next step

In [12]:
# Step 2 - one-hot encoding.
# trf1 places its transformed columns first, so its output order is
# [Age, Embarked, Pclass, Sex, SibSp, Parch, Fare]: Sex is now at position 3
# and Embarked at position 1.
SEX_POS=3
EMBARKED_POS=1
encoder=OneHotEncoder(sparse_output=False,handle_unknown="ignore")
trf2=ColumnTransformer(
    transformers=[("ohe_sex_embarked",encoder,[SEX_POS,EMBARKED_POS])],
    remainder="passthrough",
)

In [13]:
# Step 3 - scaling.
# slice(0,10) selects the first ten columns of trf2's output for min-max scaling.
# NOTE(review): this is presumably every column (2 Sex + 3 Embarked one-hot
# columns followed by Age, Pclass, SibSp, Parch, Fare) - confirm the width if
# the set of encoded categories ever changes.
scaler=MinMaxScaler()
trf3=ColumnTransformer(
    transformers=[("scaling",scaler,slice(0,10))],
    remainder="passthrough",
)

In [14]:
# Step 4 - feature selection: keep the 8 columns with the highest chi-squared
# score against the target.
trf4=SelectKBest(chi2,k=8)

In [15]:
# Step 5 - the model trained on the selected features.
# random_state is fixed so that the fitted tree (and therefore its predictions)
# is the same on every run; without it, repeated fits can produce different trees.
trf5=DecisionTreeClassifier(random_state=42)

In [16]:
# Chain the five stages in execution order. The names given here are the keys
# later used to look a stage up through pipe.named_steps.
pipeline_steps=[
    ("trf1",trf1),  # imputation
    ("trf2",trf2),  # one-hot encoding
    ("trf3",trf3),  # scaling
    ("trf4",trf4),  # feature selection
    ("trf5",trf5),  # decision tree
]
pipe=Pipeline(steps=pipeline_steps)

In [17]:
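# Alternate syntax: make_pipeline builds the same chain but names the steps automatically
# (from the lowercased class names), so lookups such as pipe.named_steps["trf1"] would need those generated names instead
# pipe=make_pipeline(trf1,trf2,trf3,trf4,trf5)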

In [18]:
# Fit the whole pipeline on the training split: the steps run in the order they
# were listed, ending with the decision tree (trf5).
pipe.fit(x_train,y_train)

# Exploring pipeline

In [19]:
# Mapping of step name -> step object for every stage in the pipeline.
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [3, 1])]),
 'trf3': ColumnTransformer(remainder='passthrough',
                   transformers=[('scaling', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x13080cd30>),
 'trf5': DecisionTreeClassifier()}

In [20]:
# Look up one particular step by the name it was given when the pipeline was built.
pipe.named_steps["trf1"]


In [21]:
# List of (name, transformer, columns) tuples held by the fitted trf1,
# including the entry for the passthrough remainder columns.
pipe.named_steps["trf1"].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder', 'passthrough', [0, 1, 3, 4, 5])]

In [22]:
# First entry of trf1's transformer list: the (name, transformer, columns)
# tuple for the age imputer.
pipe.named_steps["trf1"].transformers_[0]

('impute_age', SimpleImputer(), [2])

In [23]:
# Element [0][1] is the fitted SimpleImputer for Age; statistics_ holds the
# fill value it learned (the mean of the training column).
fitted_age_imputer=pipe.named_steps["trf1"].transformers_[0][1]
fitted_age_imputer.statistics_

array([29.49884615])

In [24]:
# Element [1][1] is the fitted SimpleImputer for Embarked; statistics_ holds
# the most frequent value it learned from the training column.
fitted_embarked_imputer=pipe.named_steps["trf1"].transformers_[1][1]
fitted_embarked_imputer.statistics_

array(['S'], dtype=object)

In [25]:
# In this way we can inspect every step of the pipeline, which also helps with debugging