# Decision Tree Exercises

### Exercises

Use the titanic data:

- 1. What is your baseline prediction? 
- 2. What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). 
- 3. When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [18]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import acquire

In [19]:
# Load the Titanic passenger data through the project-local `acquire` module.
df = acquire.get_titanic_data()

In [4]:
# Show the loaded frame to inspect its columns and a sample of rows.
df

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


In [5]:
# Tally identical rows across all columns combined (not a single column);
# the counts visible in the saved output are all 1.
# NOTE(review): DataFrame.value_counts skips rows containing NaN by default — confirm that is intended.
df.value_counts()

Unnamed: 0  passenger_id  survived  pclass  sex     age   sibsp  parch  fare      embarked  class   deck  embark_town  alone
1           1             1         1       female  38.0  1      0      71.2833   C         First   C     Cherbourg    0        1
571         571           1         1       female  53.0  2      0      51.4792   S         First   C     Southampton  0        1
577         577           1         1       female  39.0  1      0      55.9000   S         First   E     Southampton  0        1
581         581           1         1       female  39.0  1      1      110.8833  C         First   C     Cherbourg    0        1
583         583           0         1       male    36.0  0      0      40.1250   C         First   A     Cherbourg    1        1
                                                                                                                               ..
325         325           1         1       female  36.0  0      0      135.6333  C         Fir

In [15]:
# Tally the target classes to see which survival label is the most common.
df['survived'].value_counts()

0    549
1    342
Name: survived, dtype: int64

In [9]:
# Baseline prediction: always guess the most prevalent class of `survived`.
# For this data that is 0 (did not survive). Deriving it from the mode instead
# of hard-coding 0 keeps the baseline right if the class balance ever changes.
df['baseline'] = df['survived'].mode()[0]
df

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,baseline
0,0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,0
2,2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1,0
3,3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0,0
4,4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1,0
887,887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1,0
888,888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0,0
889,889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1,0


In [17]:
# Baseline accuracy: fraction of rows where the constant guess equals the true label.
baseline_accuracy = df['survived'].eq(df['baseline']).mean()

print(f'baseline accuracy: {baseline_accuracy:.2%}')

baseline accuracy: 61.62%


In [56]:
# Convert the string-valued columns to numeric codes to prepare the data for the model.
# A single nested mapping of the form {column: {old_value: new_value}} covers every column.
df.replace(
    {
        # male -> 1, female -> 0
        'sex': {'male': 1, 'female': 0},
        # First -> 1, Second -> 2, Third -> 3
        'class': {'First': 1, 'Second': 2, 'Third': 3},
        # A -> 1, B -> 2, C -> 3; any other deck value (including NaN) is left unchanged
        'deck': {'A': 1, 'B': 2, 'C': 3},
        # Cherbourg -> 1, Southampton -> 2, Queenstown -> 3
        'embark_town': {'Cherbourg': 1, 'Southampton': 2, 'Queenstown': 3},
        # C -> 1, S -> 2, Q -> 3
        'embarked': {'C': 1, 'S': 2, 'Q': 3},
    },
    inplace=True,
)






In [57]:
# Keep only the rows whose embark_town is present (discard the missing ones).
df = df[df['embark_town'].notnull()]

In [68]:
# Keep only the rows whose age is present (discard the missing ones).
df = df[df['age'].notnull()]

In [69]:
# Drop the deck column: it still contains NaN and letter codes that were never
# mapped to numbers (e.g. 'E'), so the classifier could not be fit on it.
# errors='ignore' makes the cell safe to re-run once the column is already gone.
df = df.drop(columns='deck', errors='ignore')

In [70]:
# Build indicator (dummy) columns from embark_town, omitting the first level.
# The result is stored in titanic_dummies; this cell does not attach it to df.
titanic_dummies = pd.get_dummies(df['embark_town'], drop_first=True)

In [71]:
# Inspect the frame after encoding and row filtering.
# NOTE(review): the saved output below has no `baseline` column even though an
# earlier cell adds one — the frame was presumably reloaded before this ran;
# confirm with Restart Kernel -> Run All.
df

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
0,0,0,0,3,1,22.0,1,0,7.2500,2,3,2.0,0
1,1,1,1,1,0,38.0,1,0,71.2833,1,1,1.0,0
2,2,2,1,3,0,26.0,0,0,7.9250,2,3,2.0,1
3,3,3,1,1,0,35.0,1,0,53.1000,2,1,2.0,0
4,4,4,0,3,1,35.0,0,0,8.0500,2,3,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,885,885,0,3,0,39.0,0,5,29.1250,3,3,3.0,0
886,886,886,0,2,1,27.0,0,0,13.0000,2,2,2.0,1
887,887,887,1,1,0,19.0,0,0,30.0000,2,1,2.0,1
889,889,889,1,1,1,26.0,0,0,30.0000,1,1,1.0,1


In [72]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123, test_size=0.2, validate_size=0.3):
    '''
    Split a dataframe into train, validate and test sets, stratifying both
    splits on the target column.

    Parameters
    ----------
    df : the dataframe to split.
    target : name of the target column, used for stratification.
    seed : integer passed as random_state to both splits for reproducibility.
    test_size : fraction of df held out as the test set (default 0.2).
    validate_size : fraction of the remaining (non-test) rows held out as the
        validate set (default 0.3).

    With the defaults, test is 20% of the original dataset, validate is
    .30*.80 = 24% of the original dataset, and train is .70*.80 = 56% of the
    original dataset.

    Returns
    -------
    train, validate and test dataframes, in this order.
    '''
    # First carve off the test set from the full frame.
    train_validate, test = train_test_split(df, test_size=test_size,
                                            random_state=seed,
                                            stratify=df[target])
    # Then split what is left into train and validate.
    train, validate = train_test_split(train_validate, test_size=validate_size,
                                       random_state=seed,
                                       stratify=train_validate[target])
    return train, validate, test

In [73]:
# NOTE(review): `train` is first assigned in the next cell (the split). On a fresh
# kernel this cell raises NameError unless that cell has already been run.
train.shape

(497, 13)

In [74]:
# Split the frame into train / validate / test, stratified on the target.
train, validate, test = train_validate_test_split(df, target='survived', seed=123)

# For each split, X holds every column except the target and y is the target Series.
# NOTE(review): identifier-like columns (e.g. passenger_id) stay in X — confirm they
# are meant to be used as features.
X_train, y_train = train.drop(columns='survived'), train['survived']
X_validate, y_validate = validate.drop(columns='survived'), validate['survived']
X_test, y_test = test.drop(columns='survived'), test['survived']

In [75]:
# Preview the training features (the `survived` target column has been removed).
X_train

Unnamed: 0.1,Unnamed: 0,passenger_id,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone
450,450,450,2,1,36.0,1,2,27.7500,2,2,2.0,0
543,543,543,2,1,32.0,1,0,26.0000,2,2,2.0,0
157,157,157,3,1,30.0,0,0,8.0500,2,3,2.0,1
462,462,462,1,1,47.0,0,0,38.5000,2,1,2.0,1
397,397,397,2,1,46.0,0,0,26.0000,2,2,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
820,820,820,1,0,52.0,1,1,93.5000,2,1,2.0,0
673,673,673,2,1,31.0,0,0,13.0000,2,2,2.0,1
310,310,310,1,0,24.0,0,0,83.1583,1,1,1.0,1
72,72,72,2,1,21.0,0,0,73.5000,2,2,2.0,1


In [76]:
# Train the model

# Step 1: instantiate the decision tree with the chosen hyper-parameters
# (depth capped at 3, fixed random_state).
clf = DecisionTreeClassifier(random_state=123, max_depth=3)

In [77]:
#Fit the decision tree classifier to the training data.

# model.fit(X, y)

clf = clf.fit(X_train, y_train)

In [79]:
# Graphviz: render the fitted decision tree as a diagram

import graphviz
from graphviz import Graph

# out_file=None makes export_graphviz return the DOT source as a string.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)

# Write the rendered file; view=True also opens it in the default viewer.
graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'