# Decision Trees


In [59]:
import numpy as np
import pandas as pd
import os

In [60]:
# Load and prepare Titanic data

titanic_train = pd.read_csv("d:/student/titanic_train.csv")    # Read the data

# Impute median Age for NA Age values
new_age_var = np.where(titanic_train["Age"].isnull(), # Logical check
                       28,                       # Value if check is true
                       titanic_train["Age"])     # Value if check is false

titanic_train["Age"] = new_age_var 

In [61]:
from sklearn import tree
from sklearn import preprocessing

In [82]:
# Initialize label encoder
label_encoder = preprocessing.LabelEncoder()

# Convert Sex variable to numeric
encoded_sex = label_encoder.fit_transform(titanic_train["Sex"])

# Initialize model
tree_model = tree.DecisionTreeClassifier(criterion = "entropy")

# Train the model
tree_model.fit(X = pd.DataFrame(encoded_sex), 
               y = titanic_train["Survived"])

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [90]:
# Save tree as dot file
with open("tree1.dot", 'w') as f:
     f = tree.export_graphviz(tree_model, 
                              feature_names=["Sex"], 
                              out_file=f)
   

In [93]:
# Get survival probability
preds = tree_model.predict_proba(X = pd.DataFrame(encoded_sex))

pd.crosstab(preds[:,0], titanic_train["Sex"])              # Display image*

Sex,female,male
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.257962,314,0
0.811092,0,577


In [94]:
# Make data frame of predictors
predictors = pd.DataFrame([encoded_sex, titanic_train["Pclass"]]).T

# Train the model
tree_model.fit(X = predictors, 
               y = titanic_train["Survived"])

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [95]:
with open("tree2.dot", 'w') as f:
     f = tree.export_graphviz(tree_model, 
                              feature_names=["Sex", "Class"], 
                              out_file=f)

In [97]:
# Get survival probability
preds = tree_model.predict_proba(X = predictors)

# Create a table of predictions by sex and class
pd.crosstab(preds[:,0], columns = [titanic_train["Pclass"], 
                                   titanic_train["Sex"]])

Pclass,1,1,2,2,3,3
Sex,female,male,female,male,female,male
row_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0.031915,94,0,0,0,0,0
0.078947,0,0,76,0,0,0
0.5,0,0,0,0,144,0
0.631148,0,122,0,0,0,0
0.842593,0,0,0,108,0,0
0.864553,0,0,0,0,0,347


In [98]:
predictors = pd.DataFrame([encoded_sex,
                           titanic_train["Pclass"],
                           titanic_train["Age"],
                           titanic_train["Fare"]]).T

# Initialize model with maximum tree depth set to 8
tree_model = tree.DecisionTreeClassifier(max_depth = 8)

tree_model.fit(X = predictors, 
               y = titanic_train["Survived"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [99]:
with open("tree3.dot", 'w') as f:
     f = tree.export_graphviz(tree_model, 
                              feature_names=["Sex", "Class","Age","Fare"], 
                              out_file=f)

In [100]:
tree_model.score(X = predictors, 
                 y = titanic_train["Survived"])

0.8911335578002245

In [101]:
# Read and prepare test data
titanic_test = pd.read_csv("titanic_test.csv")    # Read the data

# Impute median Age for NA Age values
new_age_var = np.where(titanic_test["Age"].isnull(), # Logical check
                       28,                       # Value if check is true
                       titanic_test["Age"])      # Value if check is false

titanic_test["Age"] = new_age_var 

In [118]:
# Convert test variables to match model features
encoded_sex_test = label_encoder.fit_transform(titanic_test["Sex"])

test_features = pd.DataFrame([encoded_sex_test,
                              titanic_test["Pclass"],
                              titanic_test["Age"],
                              titanic_test["Fare"]]).T
np.all(np.isnan(test_features))

False

In [130]:

# Make test set predictions
test_preds = tree_model.predict(X=test_features.fillna(0))


# Create a submission for Kaggle
submission = pd.DataFrame({"PassengerId":titanic_test["PassengerId"],
                           "Survived":test_preds})

# Save submission to CSV
submission.to_csv("d:/student/tutorial_dectree_submission.csv", index=False)        # Do not save index values