In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df=pd.read_csv("train.csv")

In [3]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
822,823,0,1,"Reuchlin, Jonkheer. John George",male,38.0,0,0,19972,0.0,,S
138,139,0,3,"Osen, Mr. Olaf Elon",male,16.0,0,0,7534,9.2167,,S
245,246,0,1,"Minahan, Dr. William Edward",male,44.0,2,0,19928,90.0,C78,Q
712,713,1,1,"Taylor, Mr. Elmer Zebley",male,48.0,1,0,19996,52.0,C126,S
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.0,1,2,SC/Paris 2123,41.5792,,C


In [61]:
df.duplicated().sum()

0

In [62]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [63]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
df.drop(columns=["PassengerId","Name","Ticket","Cabin"],inplace=True)

In [5]:
df.sample(4)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
816,0,3,female,23.0,0,0,7.925,S
380,1,1,female,42.0,0,0,227.525,C
814,0,3,male,30.5,0,0,8.05,S
717,1,2,female,27.0,0,0,10.5,S


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train,X_test,y_train,y_test=train_test_split(df.drop(columns=["Survived"]),df["Survived"],test_size=0.2,random_state=42)

In [94]:
X_train.shape

(712, 7)

In [95]:
X_test.shape

(179, 7)

In [96]:
X_train.sample(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
789,1,male,46.0,0,0,79.2,C
353,3,male,25.0,1,0,17.8,S
763,1,female,36.0,1,2,120.0,S
473,2,female,23.0,0,0,13.7917,C
788,3,male,1.0,1,2,20.575,S


In [97]:
y_train.sample(5)

26     0
589    0
666    0
428    0
598    0
Name: Survived, dtype: int64

In [9]:
from sklearn.compose import ColumnTransformer


In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [99]:
t1=ColumnTransformer([
    ("impute_age",SimpleImputer(),[2]),
    ("impute_embarked",SimpleImputer(strategy="most_frequent"),[6])
],remainder="passthrough")

In [100]:

t2=ColumnTransformer([
    ("encode_sex_embarked",OneHotEncoder(sparse=False,handle_unknown="ignore"),[1,6])
],remainder="passthrough")

In [111]:
#scaling
t3=ColumnTransformer([
    ("scaling",MinMaxScaler(),slice(0,10))
])

In [102]:
#feature selection
t4=SelectKBest(score_func=chi2,k=8)

In [103]:
#train the model
t5=DecisionTreeClassifier()

CREATE PIPELINE

In [104]:
from sklearn.pipeline import Pipeline,make_pipeline

In [112]:
pipe=Pipeline([
    ("t1",t1),
    ("t2",t2),
    ("t3",t3),
    ("t4",t4),
    ("t5",t5)
])

# Pipeline Vs make_pipeline
Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

In [109]:
#alternate syntax
pipe=make_pipeline(t1,t2,t3,t4,t5)

In [113]:
pipe.fit(X_train,y_train)

In [115]:
y_pred=pipe.predict(X_test)

In [116]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

In [117]:
from sklearn.metrics import accuracy_score

In [118]:
accuracy_score(y_test,y_pred)

0.6256983240223464

# Exporting the pipeline

In [120]:
import pickle 


In [121]:
pickle.dump(pipe,open('pipe.pkl','wb'))

In [122]:
X_test.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')