# Decision Tree



1. Work through these same exercises using the Telco dataset.
2. Experiment with this model on other datasets with a higher number of output classes.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.metrics
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import graphviz
from graphviz import Graph

import acquire
import prepare

## Titanic Data

- Pos/Neg Case

Pos : Did not survive (0)

Neg : Survived (1)

- outcomes

TP: Did not survive

TN: Survived

FP: Did not survive when survived

FN: Survive when did not survive

- costs

FP: Less damages

FN: More damages

- metric

recall

In [2]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)
train, validate, test = prepare.split_titanic(df)

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.



In [3]:
# Target = Survived, baseline prediction is 0 for not survived
train['baseline_prediction'] = 0
validate['baseline_prediction'] = 0
test['baseline_prediction'] = 0

In [4]:
pd.crosstab(train.survived, train.baseline_prediction)

baseline_prediction,0
survived,Unnamed: 1_level_1
0,307
1,191


In [5]:
print(f'Baseline prediction accuracy: {sklearn.metrics.accuracy_score(train.survived, train.baseline_prediction):.2%}')

Baseline prediction accuracy: 61.65%


### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)



In [6]:
# create X & y version of train, where y is a series with just the target variable and X are all the features. 

X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [7]:
clf = DecisionTreeClassifier(max_depth=3)

In [8]:
X_train.head()

Unnamed: 0,passenger_id,pclass,sibsp,parch,fare,alone,sex_male,embark_town_Queenstown,embark_town_Southampton,baseline_prediction
863,863,3,8,2,69.55,0,0,0,1,0
665,665,2,2,0,73.5,0,1,0,1,0
272,272,2,0,1,19.5,0,0,0,1,0
581,581,1,1,1,110.8833,0,0,0,0,0
604,604,1,0,0,26.55,1,1,0,0,0


In [9]:
clf = clf.fit(X_train, y_train)

In [10]:
dot_data = export_graphviz(clf, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [11]:
y_pred = clf.predict(X_train)
y_pred[0:5]

array([0, 0, 1, 1, 0])

In [12]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

array([[0.89473684, 0.10526316],
       [0.64545455, 0.35454545],
       [0.05063291, 0.94936709],
       [0.05063291, 0.94936709],
       [0.64545455, 0.35454545]])

In [13]:
y_train.value_counts()

0    307
1    191
Name: survived, dtype: int64

In [14]:
labels = sorted(y_train.unique())

pd.DataFrame(sklearn.metrics.confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,274,33
1,64,127


### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.



In [15]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.81


In [16]:
# confusion matrix
labels = sorted(y_train.unique())

pd.DataFrame(sklearn.metrics.confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,274,33
1,64,127


### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [17]:
print(sklearn.metrics.classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85       307
           1       0.79      0.66      0.72       191

    accuracy                           0.81       498
   macro avg       0.80      0.78      0.79       498
weighted avg       0.80      0.81      0.80       498



### 5. Run through steps 2-4 using a different max_depth value.



### Max Depth = 5

In [18]:
clf_five = DecisionTreeClassifier(max_depth=5)
clf_five = clf_five.fit(X_train, y_train)

In [19]:
dot_data = export_graphviz(clf_five, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [20]:
y_pred = clf_five.predict(X_train)
y_pred[0:5]

array([0, 0, 1, 1, 1])

In [21]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf_five.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.85


In [22]:
# confusion matrix
labels = sorted(y_train.unique())

pd.DataFrame(sklearn.metrics.confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,289,18
1,55,136


In [23]:
# Classification Report

print(sklearn.metrics.classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89       307
           1       0.88      0.71      0.79       191

    accuracy                           0.85       498
   macro avg       0.86      0.83      0.84       498
weighted avg       0.86      0.85      0.85       498



### No max_depth

In [24]:
clf_no = DecisionTreeClassifier()
clf_no = clf.fit(X_train, y_train)

In [25]:
dot_data = export_graphviz(clf_no, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

graph.render('titanic_decision_tree', view=True)

'titanic_decision_tree.pdf'

In [26]:
y_pred = clf_no.predict(X_train)
y_pred[0:5]

array([0, 0, 1, 1, 0])

In [27]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf_no.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.81


In [28]:
# confusion matrix
labels = sorted(y_train.unique())

pd.DataFrame(sklearn.metrics.confusion_matrix(y_train, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,274,33
1,64,127


In [29]:
# Classification Report

print(sklearn.metrics.classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85       307
           1       0.79      0.66      0.72       191

    accuracy                           0.81       498
   macro avg       0.80      0.78      0.79       498
weighted avg       0.80      0.81      0.80       498



### 6. Which model performs better on your in-sample data?



The no max_depth achieves a 100% in every score

### 7. Which model performs best on your out-of-sample data, the validate set?



In [30]:
y_pred = clf.predict(X_validate)
y_pred[0:5]

array([0, 1, 1, 0, 1])

In [31]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.81


In [32]:
# Produce y_predictions that come from the X_validate
y_pred = clf.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(sklearn.metrics.classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       132
           1       0.81      0.67      0.73        82

    accuracy                           0.81       214
   macro avg       0.81      0.79      0.79       214
weighted avg       0.81      0.81      0.81       214



In [33]:
y_pred = clf_five.predict(X_validate)
y_pred[0:5]

array([0, 1, 1, 0, 1])

In [34]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf_five.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.79


In [35]:
# Produce y_predictions that come from the X_validate
y_pred = clf_five.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(sklearn.metrics.classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.86      0.84       132
           1       0.75      0.67      0.71        82

    accuracy                           0.79       214
   macro avg       0.78      0.77      0.77       214
weighted avg       0.79      0.79      0.79       214



In [36]:
y_pred = clf_no.predict(X_validate)
y_pred[0:5]

array([0, 1, 1, 0, 1])

In [37]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf_no.score(X_validate, y_validate)))

Accuracy of Decision Tree classifier on validate set: 0.81


In [38]:
# Produce y_predictions that come from the X_validate
y_pred = clf_no.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(sklearn.metrics.classification_report(y_validate, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       132
           1       0.81      0.67      0.73        82

    accuracy                           0.81       214
   macro avg       0.81      0.79      0.79       214
weighted avg       0.81      0.81      0.81       214



# Random Forest Exercises


After making a few models, which one has the best performance (or closest metrics) on both train and validate?

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.



In [39]:
df = acquire.get_titanic_data()
df = prepare.prep_titanic(df)
train, validate, test = prepare.split_titanic(df)

In [40]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [41]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

In [42]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, random_state=123)

In [43]:
print(rf.feature_importances_)

[0.22813639 0.09211295 0.05116683 0.0565127  0.22244917 0.01718487
 0.28958355 0.01442691 0.02842662]


In [44]:
y_pred = rf.predict(X_train)

In [45]:
y_pred_proba = rf.predict_proba(X_train)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [46]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.97


In [47]:
print(confusion_matrix(y_train, y_pred))

[[307   0]
 [ 16 175]]


In [48]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       307
           1       1.00      0.92      0.96       191

    accuracy                           0.97       498
   macro avg       0.98      0.96      0.97       498
weighted avg       0.97      0.97      0.97       498



In [49]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.75


### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [50]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       307
           1       1.00      0.92      0.96       191

    accuracy                           0.97       498
   macro avg       0.98      0.96      0.97       498
weighted avg       0.97      0.97      0.97       498



### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.



### 5 x 5

In [51]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [52]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=123)

In [53]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=5, random_state=123)

In [54]:
print(rf.feature_importances_)

[0.07452447 0.13433963 0.04838373 0.0529523  0.15497064 0.02039104
 0.47419248 0.01047852 0.02976719]


In [55]:
y_pred = rf.predict(X_train)

In [56]:
y_pred_proba = rf.predict_proba(X_train)

In [57]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.84


In [58]:
print(confusion_matrix(y_train, y_pred))

[[286  21]
 [ 57 134]]


In [59]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.75


### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



The second model has a higher score on the validation set while the first has a higher score on the training set.

The first model performs better on in-sample data. It samples one at a time at a depth of 10, giving it a lot of small insights. The small insights are going to be more accurate for the data used to create the model.

### Some more models

### 10 x 5

In [60]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [61]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=10,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=123)

In [62]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=10, random_state=123)

In [63]:
print(rf.feature_importances_)

[0.06175171 0.14844966 0.03541981 0.05098651 0.14184823 0.02115127
 0.50965588 0.00666552 0.0240714 ]


In [64]:
y_pred = rf.predict(X_train)

In [65]:
y_pred_proba = rf.predict_proba(X_train)

In [66]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.84


In [67]:
print(confusion_matrix(y_train, y_pred))

[[284  23]
 [ 58 133]]


In [68]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       307
           1       0.85      0.70      0.77       191

    accuracy                           0.84       498
   macro avg       0.84      0.81      0.82       498
weighted avg       0.84      0.84      0.83       498



In [69]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.76


### 15 x 1

In [70]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [71]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=15,
                            n_estimators=100,
                            max_depth=1, 
                            random_state=123)

In [72]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=1, min_samples_leaf=15, random_state=123)

In [73]:
print(rf.feature_importances_)

[0.   0.21 0.05 0.1  0.13 0.07 0.39 0.   0.05]


In [74]:
y_pred = rf.predict(X_train)

In [75]:
y_pred_proba = rf.predict_proba(X_train)

In [76]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.78


In [77]:
print(confusion_matrix(y_train, y_pred))

[[276  31]
 [ 81 110]]


In [78]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.90      0.83       307
           1       0.78      0.58      0.66       191

    accuracy                           0.78       498
   macro avg       0.78      0.74      0.75       498
weighted avg       0.78      0.78      0.77       498



In [79]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.73


### 3 x 8

In [80]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [81]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=8, 
                            random_state=123)

In [82]:
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, min_samples_leaf=3, random_state=123)

In [83]:
print(rf.feature_importances_)

[0.14382192 0.11448805 0.05395882 0.0548723  0.17680403 0.01954309
 0.39515165 0.01342214 0.02793802]


In [84]:
y_pred = rf.predict(X_train)

In [85]:
y_pred_proba = rf.predict_proba(X_train)

In [86]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.90


In [87]:
print(confusion_matrix(y_train, y_pred))

[[295  12]
 [ 40 151]]


In [88]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92       307
           1       0.93      0.79      0.85       191

    accuracy                           0.90       498
   macro avg       0.90      0.88      0.89       498
weighted avg       0.90      0.90      0.89       498



In [89]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

Accuracy of random forest classifier on test set: 0.77


### Conclusion

I believe the 10 x 5 model works best as it has a balance between the samples tested per depth, not using too many samples per depth and just enough depth to be not too specific.