In [36]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split

In [37]:
df=pd.read_csv('titanic.csv')

In [38]:
df.sample(8)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
316,317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24.0,1,0,244367,26.0,,S
795,796,0,2,"Otter, Mr. Richard",male,39.0,0,0,28213,13.0,,S
542,543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11.0,4,2,347082,31.275,,S
543,544,1,2,"Beane, Mr. Edward",male,32.0,1,0,2908,26.0,,S
666,667,0,2,"Butler, Mr. Reginald Fenton",male,25.0,0,0,234686,13.0,,S
654,655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18.0,0,0,365226,6.75,,Q
87,88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S
554,555,1,3,"Ohman, Miss. Velin",female,22.0,0,0,347085,7.775,,S


In [39]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# column dropping and applying train test split

In [40]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [41]:
df.sample(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
856,1,1,female,45.0,1,1,164.8667,S
756,0,3,male,28.0,0,0,7.7958,S
663,0,3,male,36.0,0,0,7.4958,S
424,0,3,male,18.0,1,1,20.2125,S
371,0,3,male,18.0,1,0,6.4958,S
9,1,2,female,14.0,1,0,30.0708,C
370,1,1,male,25.0,1,0,55.4417,C
254,0,3,female,41.0,0,2,20.2125,S
361,0,2,male,29.0,1,0,27.7208,C
410,0,3,male,,0,0,7.8958,S


In [42]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

In [43]:
X_train.sample(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
640,3,male,20.0,0,0,7.8542,S
119,3,female,2.0,4,2,31.275,S
697,3,female,,0,0,7.7333,Q
167,3,female,45.0,1,4,27.9,S
104,3,male,37.0,2,0,7.925,S
361,2,male,29.0,1,0,27.7208,C
424,3,male,18.0,1,1,20.2125,S
352,3,male,15.0,1,1,7.2292,C
348,3,male,3.0,1,1,15.9,S
377,1,male,27.0,0,2,211.5,C


# Creating pipelining nodes

In [53]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')


In [54]:
# one hot encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [55]:

# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])


In [56]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=10)

In [57]:
#Model making
trf5=DecisionTreeClassifier()

# Creating pipeline

In [59]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

In [60]:
#syntax2 using make_pipeline method
pipe2=make_pipeline(trf1,trf2,trf3,trf4,trf5)

# Exploring pipeline

In [61]:
from sklearn import set_config
set_config(display='diagram')

In [62]:
pipe.fit(X_train,y_train)

In [65]:
y_pred=pipe.predict(X_test)

In [66]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)*100

62.56983240223464

In [67]:
# export 
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))