sklearn.pipeline.Pipeline(steps, *, memory=None, verbose=False)


Pipeline allows you to sequentially apply a list of transformers to preprocess the data and, if desired, conclude the sequence with a final predictor for predictive modeling.

# Without Using Piplines

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [3]:
df=pd.read_csv('survival.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
 df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [5]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
x_train, x_test, y_train,y_test=train_test_split(df.iloc[:,1:], df.iloc[:,0:1],test_size=0.2)
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
728,2,male,25.0,1,0,26.0000,S
447,1,male,34.0,0,0,26.5500,S
232,2,male,59.0,0,0,13.5000,S
713,3,male,29.0,0,0,9.4833,S
754,2,female,48.0,1,2,65.0000,S
...,...,...,...,...,...,...,...
559,3,female,36.0,1,0,17.4000,S
854,2,female,44.0,1,0,26.0000,S
842,1,female,30.0,0,0,31.0000,C
558,1,female,39.0,1,1,79.6500,S


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [8]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [9]:
# applying imputation
si_age=SimpleImputer()
si_embarked=SimpleImputer(strategy='most_frequent')
x_train_age=si_age.fit_transform(x_train[['Age']])
x_train_embarked=si_embarked.fit_transform(x_train[['Embarked']])

x_test_age=si_age.fit_transform(x_test[['Age']])
x_test_embarked=si_embarked.fit_transform(x_test[['Embarked']])


In [10]:
x_train_embarked

array([['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
      

In [14]:
# onehotencoding on sex and embarked
ohe_sex=OneHotEncoder(sparse=False,handle_unknown='ignore')
ohe_embarked=OneHotEncoder(sparse=False,handle_unknown='ignore')

x_train_sex=ohe_sex.fit_transform(x_train[['Sex']])
x_train_embarked=ohe_embarked.fit_transform(x_train_embarked)

x_test_sex=ohe_sex.fit_transform(x_test[['Sex']])
x_test_embarked=ohe_embarked.fit_transform(x_test_embarked)




In [16]:
x_train_sex

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [17]:
x_train_embarked

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [18]:
x_train_rem=x_train.drop(columns=['Sex','Age','Embarked'])
x_test_rem=x_test.drop(columns=['Sex','Age','Embarked'])

In [20]:
x_train_transformed=np.concatenate((x_train_rem,x_train_age,x_train_sex,x_train_embarked),axis=1)
x_test_transformed =np.concatenate((x_test_rem,x_test_age,x_test_sex,x_test_embarked),axis=1)

In [21]:
x_train_transformed

array([[2., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       [2., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 1., 1., ..., 0., 0., 1.],
       [3., 0., 2., ..., 1., 0., 0.]])

In [22]:
x_train_transformed.shape

(712, 10)

In [23]:
clf=DecisionTreeClassifier()
clf.fit(x_train_transformed,y_train)

In [24]:
y_pred=clf.predict(x_test_transformed)

In [25]:
y_pred

array([0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1], dtype=int64)

In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.7877094972067039

The pickle module implements binary protocols for serializing and de-serializing a Python object structure.

 Warning:=> The pickle module is not secure. Only unpickle data you trust.It is possible to construct malicious pickle data which will execute arbitrary code during unpickling. Never unpickle data that could have come from an untrusted source, or that could have been tampered with.

In [27]:
import pickle

In [30]:
pickle.dump(ohe_sex,open('models/ohe_sex.pk1','wb'))
pickle.dump(ohe_embarked,open('models/ohe_embarked.pk1','wb'))
pickle.dump(clf,open('models/clf.pk1','wb'))

# Use the Pickle model which we create above

In [31]:
ohe_sex=pickle.load(open('models/ohe_sex.pk1','rb'))
ohe_embarked=pickle.load(open('models/ohe_embarked.pk1','rb'))
clf=pickle.load(open('models/clf.pk1','rb'))

In [33]:
# Assume user input
# pclass/gender/age/SibSp/Parch/Fare/Embarked
test_input=np.array([2,'male',31.0,0,0,10.5,'S'],dtype=object).reshape(1,7)

In [34]:
test_input

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [37]:
test_input_sex=ohe_sex.transform(test_input[:,1].reshape(1,1))



In [38]:
test_input_sex

array([[0., 1.]])

In [40]:
test_input_embarked=ohe_embarked.transform(test_input[:,-1].reshape(1,1))
test_input_embarked

array([[0., 0., 1.]])

In [41]:
test_input_age=test_input[:,2].reshape(1,1)
test_input_age

array([[31.0]], dtype=object)

In [42]:
test_input_transformed=np.concatenate((test_input[:,[0,3,4,5]],test_input_age,test_input_sex,test_input_embarked),axis=1)

In [43]:
test_input_transformed.shape

(1, 10)

In [44]:
clf.predict(test_input_transformed)

array([0], dtype=int64)