# Problem Statement

Predicting Survival in the Titanic Data Set
We will be using a decision tree to make predictions about the Titanic data set from
Kaggle. This data set provides information on the Titanic passengers and can be used to
predict whether a passenger survived or not.

In [1]:
#Importing Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from pandas import Series,DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report

In [2]:
Url = 'https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv'
titanic_df = pd.read_csv(Url)

In [3]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic_df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Function for Data Preprocessing
Feature Selection(as given in the assignment)

Handling missing value

In [6]:
def data_preprocessing(df):
    df = df.loc[:,['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
    df.convert_objects(convert_numeric=True)
    df.fillna(0, inplace=True)
    return df

Function To Handle categorical/Non-numerical data

In [7]:
def handle_non_numeric_data(df):
    columns = df.columns.values
    
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
                    
            df[column] = list(map(convert_to_int, df[column]))
    
    return df

# Data Preprocessing
1. Call Function data_preprocessing on titanic dataset to select features and handle missing data
2. Call function handle_non_numeric_data on titanic dataset to handle non-numeric data

In [8]:
titanic_df1 = data_preprocessing(titanic_df)
titanic_df1.tail()

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare
886,0,male,27.0,0,0,13.0
887,1,female,19.0,0,0,30.0
888,0,female,0.0,1,2,23.45
889,1,male,26.0,0,0,30.0
890,0,male,32.0,0,0,7.75


In [9]:
titanic_df1 = data_preprocessing(titanic_df)
titanic_df1.tail()

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare
886,0,male,27.0,0,0,13.0
887,1,female,19.0,0,0,30.0
888,0,female,0.0,1,2,23.45
889,1,male,26.0,0,0,30.0
890,0,male,32.0,0,0,7.75


In [39]:
titanic_df1 = handle_non_numeric_data(titanic_df1)
print(titanic_df1.tail())

     Survived  Sex   Age  SibSp  Parch   Fare
886         0    0  27.0      0      0  13.00
887         1    1  19.0      0      0  30.00
888         0    1   0.0      1      2  23.45
889         1    0  26.0      0      0  30.00
890         0    0  32.0      0      0   7.75


In [40]:
X = np.array(titanic_df1.drop(['Survived'], 1).astype(float))
Y = np.array(titanic_df1['Survived'])

X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)
d_tree = DecisionTreeClassifier(min_samples_split=20, random_state=99)
d_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=99,
            splitter='best')

# Apply Model
*Split Dataset into training and test dataset

*Apply Machine Learning Model(Decision tree classifier)

*Fit the model on input parameters X,Y

In [41]:
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)
d_tree = DecisionTreeClassifier(min_samples_split=20, random_state=99)
d_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, presort=False, random_state=99,
            splitter='best')

In [42]:
'''
Model Evaluation
Predict outcome for test dataset using the model applied on trainning dataset
Calculate accuracy using the actual and predicted outcome
'''
Y_pred = d_tree.predict(X_test)
print("Accuracy is ", accuracy_score(Y_test,Y_pred)*100)

Accuracy is  77.61194029850746


In [43]:
# this produces a 2x2 numpy array (matrix)
confusion = metrics.confusion_matrix(Y_test, Y_pred)

TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print(f"Accuracy calculation using confusion metrics : {((TP + TN) / float(TP + TN + FP + FN))}")

Accuracy calculation using confusion metrics : 0.7761194029850746


In [44]:
print(f"classification_error using accuracy_score is : {1 - metrics.accuracy_score(Y_test, Y_pred)}")
print(f"classification_error using confusion metrics is : {(FP + FN) / float(TP + TN + FP + FN)}")

classification_error using accuracy_score is : 0.22388059701492535
classification_error using confusion metrics is : 0.22388059701492538
