In [2]:
import pandas as pd
import numpy as np

In [16]:
# Load the Titanic passenger table and discard the columns that the rest of
# the notebook does not use. `columns=` already names the axis, so no
# separate `axis=1` argument is needed.
data = (
    pd.read_csv('./datasets/titanic_data.csv')
    .drop(columns=['PassengerId', 'Ticket', 'Cabin'])
)
data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C


In [None]:
# Merge SibSp and Parch into a single feature, Family, so that family size is
# not spread across two columns. The +1 counts the passenger themselves.
# The guard keeps this cell safe to re-run: once SibSp/Parch have been
# replaced by Family, running it again is a no-op instead of a KeyError.
if "Family" not in data.columns:
    data = (
        data
        .assign(Family=data["SibSp"] + data["Parch"] + 1)
        .drop(columns=["SibSp", "Parch"])
    )

In [None]:
# Display the frame to confirm that Family replaced the SibSp and Parch columns.
data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,Family
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.2500,S,2
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,C,2
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.9250,S,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1000,S,2
4,0,3,"Allen, Mr. William Henry",male,35.0,8.0500,S,1
...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,13.0000,S,1
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,30.0000,S,1
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,23.4500,S,4
889,1,1,"Behr, Mr. Karl Howell",male,26.0,30.0000,C,1


In [29]:
# Separate the target (Survived) from the predictor columns.
Y = data["Survived"]
X = data.drop(columns=["Survived"])

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on Y keeps the
# survival rate comparable between the two partitions, and the fixed seed
# makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [None]:
# Sanity-check the split: every row must land in exactly one partition, and
# features and labels must stay aligned within each partition. (Merely
# checking that the two partitions differ in size says nothing useful for an
# 80/20 split.)
(
    len(X_train) + len(X_test) == len(X)
    and len(X_train) == len(Y_train)
    and len(X_test) == len(Y_test)
)


True

In [None]:
# Stratifying on Y should give train and test nearly the same survival rate.
# Compare the two rates with a small absolute tolerance. Rounding each rate
# up with np.ceil would turn any non-zero rate into 1.0, so that comparison
# would pass even for a badly unbalanced split.
np.isclose(Y_train.mean(), Y_test.mean(), atol=0.01)

np.True_

In [53]:
# Preview a few random training rows. The fixed seed makes the preview the
# same on every Restart & Run All instead of changing with each execution.
X_train.sample(9, random_state=42)

Unnamed: 0,Pclass,Name,Sex,Age,Fare,Embarked,Family
604,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35.0,26.55,C,1
623,3,"Hansen, Mr. Henry Damsgaard",male,21.0,7.8542,S,1
159,3,"Sage, Master. Thomas Henry",male,,69.55,S,11
418,2,"Matthews, Mr. William John",male,30.0,13.0,S,1
70,2,"Jenkin, Mr. Stephen Curnow",male,32.0,10.5,S,1
222,3,"Green, Mr. George Henry",male,51.0,8.05,S,1
665,2,"Hickman, Mr. Lewis",male,32.0,73.5,S,3
660,1,"Frauenthal, Dr. Henry William",male,50.0,133.65,S,3
140,3,"Boulos, Mrs. Joseph (Sultana)",female,,15.2458,C,3


In [54]:
# Column groups routed to the numeric and categorical preprocessing branches.
numeric_features = [
    "Age",
    "Fare",
    "Family",
]
categorical_features = [
    "Pclass",
    "Sex",
    "Embarked",
]

In [61]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline(
    [("imputer", SimpleImputer(strategy="median"))]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time rather than raising an error.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch; any column not listed in either
# group is dropped from the model input.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("categ", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

In [62]:
from sklearn.tree import DecisionTreeClassifier

# Full model: preprocessing followed by a decision tree. Wrapping both in one
# Pipeline means the preprocessing is fitted only on the data passed to fit().
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", DecisionTreeClassifier(random_state=42)),
    ]
)

In [63]:
# Fit on the training partition, then predict labels for the held-out rows.
# fit() returns the fitted pipeline, so the two calls can be chained.
Y_pred = clf.fit(X_train, Y_train).predict(X_test)
Y_pred

array([0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0])

In [64]:
from sklearn.metrics import accuracy_score, classification_report

# Report overall accuracy, then per-class precision / recall / F1 on the
# held-out test partition.
print("Accuracy:", accuracy_score(y_true=Y_test, y_pred=Y_pred))
print(classification_report(y_true=Y_test, y_pred=Y_pred))


Accuracy: 0.8100558659217877
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       110
           1       0.75      0.75      0.75        69

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

