# Decision Tree



1. Work through these same exercises using the Telco dataset.
2. Experiment with this model on other datasets with a higher number of output classes.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.impute import SimpleImputer

import sklearn.metrics
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import graphviz
from graphviz import Graph

import acquire
import prepare

## Titanic Data

- Pos/Neg Case

Pos : Did not survive (0)

Neg : Survived (1)

- outcomes

TP: Did not survive

TN: Survived

FP: Predicted did not survive, but actually survived

FN: Predicted survived, but actually did not survive

- costs

FP: Less damages

FN: More damages

- metric

recall

In [None]:
# Pull the Titanic data, run it through prepare.prep_titanic, then split it
# into the train / validate / test frames used by the cells below.
df = prepare.prep_titanic(acquire.get_titanic_data())
train, validate, test = prepare.split_titanic(df)

### 1. What is your baseline prediction? What is your baseline accuracy? Remember: your baseline prediction for a classification problem is predicting the most prevalent class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.



In [None]:
# Target = survived. The baseline predicts the most prevalent class (the mode)
# of the *training* target for every row, so derive it from train instead of
# hard-coding it; the same class is then applied to validate and test.
baseline_class = train.survived.mode()[0]

train['baseline_prediction'] = baseline_class
validate['baseline_prediction'] = baseline_class
test['baseline_prediction'] = baseline_class

In [None]:
pd.crosstab(train.survived, train.baseline_prediction)

In [None]:
# Share of training rows where the constant baseline matches the actual target.
baseline_accuracy = sklearn.metrics.accuracy_score(train.survived, train.baseline_prediction)
print(f'Baseline prediction accuracy: {baseline_accuracy:.2%}')

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)



In [None]:
# Create X & y versions of each split: y is a series with just the target
# variable and X holds the features.
# 'baseline_prediction' is the constant helper column added for the baseline
# accuracy check; it is not a real feature, so it is excluded from X as well.
# errors='ignore' keeps this cell working if that column was never added.
non_feature_cols = ['survived', 'baseline_prediction']

X_train = train.drop(columns=non_feature_cols, errors='ignore')
y_train = train.survived

X_validate = validate.drop(columns=non_feature_cols, errors='ignore')
y_validate = validate.survived

X_test = test.drop(columns=non_feature_cols, errors='ignore')
y_test = test.survived

In [None]:
# Depth-limited tree. random_state is fixed (same seed as the other models in
# this notebook) so that re-running the notebook reproduces the same tree.
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [None]:
X_train.head()

In [None]:
clf = clf.fit(X_train, y_train)

In [None]:
# Export the fitted tree as DOT source (out_file=None returns it as a string),
# wrap it in a graphviz Source and render it to disk, opening the viewer.
dot_data = export_graphviz(
    clf,
    out_file=None,
    feature_names=X_train.columns,
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)

graph.render('titanic_decision_tree', view=True)

In [None]:
y_pred = clf.predict(X_train)
y_pred[0:5]

In [None]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[0:5]

In [None]:
y_train.value_counts()

In [None]:
# Confusion matrix for the training predictions, labeled with the sorted
# target classes on both axes.
labels = sorted(y_train.unique())
train_cm = sklearn.metrics.confusion_matrix(y_train, y_pred)

pd.DataFrame(train_cm, index=labels, columns=labels)

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.



In [None]:
# Mean accuracy of the tree on the data it was trained on.
train_accuracy = clf.score(X_train, y_train)
print(f'Accuracy of Decision Tree classifier on training set: {train_accuracy:.2f}')

In [None]:
# confusion matrix (in-sample), with the sorted target classes as row/column labels
labels = sorted(y_train.unique())

pd.DataFrame(
    confusion_matrix(y_train, y_pred),
    index=labels,
    columns=labels,
)

### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
print(sklearn.metrics.classification_report(y_train, y_pred))

### 5. Run through steps 2-4 using a different max_depth value.



### Max Depth = 5

In [None]:
# Deeper tree (max_depth=5); random_state is fixed so re-runs are reproducible.
clf_five = DecisionTreeClassifier(max_depth=5, random_state=123)
clf_five = clf_five.fit(X_train, y_train)

In [None]:
# Export the max_depth=5 tree as DOT source and render it.
dot_data = export_graphviz(clf_five, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

# Use a model-specific file name so this render does not overwrite the
# 'titanic_decision_tree' output produced for the max_depth=3 tree.
graph.render('titanic_decision_tree_depth_5', view=True)

In [None]:
y_pred = clf_five.predict(X_train)
y_pred[0:5]

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf_five.score(X_train, y_train)))

In [None]:
# confusion matrix
labels = sorted(y_train.unique())

pd.DataFrame(sklearn.metrics.confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
# Classification Report

print(sklearn.metrics.classification_report(y_train, y_pred))

### No max_depth

In [None]:
# Unconstrained tree (no max_depth). Fit the new estimator itself: the previous
# code called clf.fit(...), which re-fit the max_depth=3 model and made clf_no
# an alias of it. random_state is fixed so re-runs are reproducible.
clf_no = DecisionTreeClassifier(random_state=123)
clf_no = clf_no.fit(X_train, y_train)

In [None]:
# Export the unconstrained tree as DOT source and render it.
dot_data = export_graphviz(clf_no, feature_names= X_train.columns, rounded=True, filled=True, out_file=None)
graph = graphviz.Source(dot_data) 

# Use a model-specific file name so this render does not overwrite the
# 'titanic_decision_tree' output produced by the earlier tree.
graph.render('titanic_decision_tree_no_max_depth', view=True)

In [None]:
y_pred = clf_no.predict(X_train)
y_pred[0:5]

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf_no.score(X_train, y_train)))

In [None]:
# confusion matrix
labels = sorted(y_train.unique())

pd.DataFrame(sklearn.metrics.confusion_matrix(y_train, y_pred), index=labels, columns=labels)

In [None]:
# Classification Report

print(sklearn.metrics.classification_report(y_train, y_pred))

### 6. Which model performs better on your in-sample data?



The model with no max_depth performs best on the in-sample data: it achieves 100% on every metric.

### 7. Which model performs best on your out-of-sample data, the validate set?



In [None]:
y_pred = clf.predict(X_validate)
y_pred[0:5]

In [None]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf.score(X_validate, y_validate)))

In [None]:
# Predict on the out-of-sample validate features with `clf`, then compare the
# actual validate targets against those predictions.
y_pred = clf.predict(X_validate)

validate_report = sklearn.metrics.classification_report(y_validate, y_pred)
print(validate_report)

In [None]:
y_pred = clf_five.predict(X_validate)
y_pred[0:5]

In [None]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf_five.score(X_validate, y_validate)))

In [None]:
# Produce y_predictions that come from the X_validate
y_pred = clf_five.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(sklearn.metrics.classification_report(y_validate, y_pred))

In [None]:
y_pred = clf_no.predict(X_validate)
y_pred[0:5]

In [None]:
print('Accuracy of Decision Tree classifier on validate set: {:.2f}'
     .format(clf_no.score(X_validate, y_validate)))

In [None]:
# Produce y_predictions that come from the X_validate
y_pred = clf_no.predict(X_validate)

# Compare actual y values (from validate) to predicted y_values from the model run on X_validate
print(sklearn.metrics.classification_report(y_validate, y_pred))

# Random Forest Exercises


After making a few models, which one has the best performance (or closest metrics) on both train and validate?

### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.



In [None]:
# Re-acquire and re-split the data so this section starts from fresh
# train / validate / test frames.
df = prepare.prep_titanic(acquire.get_titanic_data())
train, validate, test = prepare.split_titanic(df)

In [None]:
# Separate each split into its feature matrix (everything except the target)
# and its target series.
X_train, y_train = train.drop(columns=['survived']), train.survived
X_validate, y_validate = validate.drop(columns=['survived']), validate.survived
X_test, y_test = test.drop(columns=['survived']), test.survived

In [None]:
# Random forest for exercise 1: 100 trees, min_samples_leaf = 1, max_depth = 10,
# seeded for reproducibility.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=1,
    criterion='gini',
    bootstrap=True,
    class_weight=None,
    random_state=123,
)

In [None]:
rf.fit(X_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(X_train)

In [None]:
y_pred_proba = rf.predict_proba(X_train)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
# Out-of-sample check: this scores the validate split, so label it as such
# (the message previously said "test set").
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
# The positive class is 0 ("did not survive"), per the problem framing at the
# top of the notebook. With labels=[0, 1] the confusion matrix rows (actual)
# and columns (predicted) are ordered [positive, negative], so it unpacks as
# [[TP, FN], [FP, TN]].
(tp, fn), (fp, tn) = confusion_matrix(y_train, y_pred, labels=[0, 1])

print(f'Accuracy: {(tp + tn) / (tp + fn + fp + tn):.2f}')
print(f'True positive rate: {tp / (tp + fn):.2f}')
print(f'False positive rate: {fp / (fp + tn):.2f}')
print(f'True negative rate: {tn / (tn + fp):.2f}')
print(f'False negative rate: {fn / (fn + tp):.2f}')

# Precision, recall, f1-score and support for each class.
print(classification_report(y_train, y_pred))

### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.



### 5 x 5

In [None]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=123)

In [None]:
rf.fit(X_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(X_train)

In [None]:
y_pred_proba = rf.predict_proba(X_train)

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
# Out-of-sample check: this scores the validate split, so label it as such
# (the message previously said "test set").
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



The second model has a higher score on the validation set while the first has a higher score on the training set.

The first model performs better on in-sample data. With min_samples_leaf = 1 and max_depth = 10, its trees can keep splitting until a leaf holds a single observation, so they fit the training data very closely. That makes the model more accurate on the data used to build it, but also more prone to overfitting.

### Some more models

### 10 x 5

In [None]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=10,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=123)

In [None]:
rf.fit(X_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(X_train)

In [None]:
y_pred_proba = rf.predict_proba(X_train)

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
# Out-of-sample check: this scores the validate split, so label it as such
# (the message previously said "test set").
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

### 15 x 1

In [None]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=15,
                            n_estimators=100,
                            max_depth=1, 
                            random_state=123)

In [None]:
rf.fit(X_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(X_train)

In [None]:
y_pred_proba = rf.predict_proba(X_train)

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
# Out-of-sample check: this scores the validate split, so label it as such
# (the message previously said "test set").
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

### 3 x 8

In [None]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=8, 
                            random_state=123)

In [None]:
rf.fit(X_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(X_train)

In [None]:
y_pred_proba = rf.predict_proba(X_train)

In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
# Out-of-sample check: this scores the validate split, so label it as such
# (the message previously said "test set").
print('Accuracy of random forest classifier on validate set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))

### Conclusion

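I believe the 10 x 5 model (min_samples_leaf = 10, max_depth = 5) works best because it strikes a balance: requiring at least 10 samples per leaf keeps the trees from fitting individual observations, while a depth of 5 is enough to capture structure without becoming too specific to the training data.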

# KNN Exercises


### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [None]:
# Re-acquire and re-split the data so the KNN section starts from fresh
# train / validate / test frames.
df = prepare.prep_titanic(acquire.get_titanic_data())
train, validate, test = prepare.split_titanic(df)

In [None]:
X_train = train.drop(columns=['survived'])
y_train = train.survived

X_validate = validate.drop(columns=['survived'])
y_validate = validate.survived

X_test = test.drop(columns=['survived'])
y_test = test.survived

In [None]:
knn = KNeighborsClassifier(n_neighbors = 5, weights = 'uniform')

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_train)

In [None]:
y_pred_proba = knn.predict_proba(X_train)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.



In [None]:
# Use fixed-point ':.2f' (as the other accuracy prints do); ':.2' is a
# general-precision format that drops trailing zeros (e.g. prints 0.8).
print(f'Accuracy of KNN classifier on training set: {knn.score(X_train, y_train):.2f}')

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
# Out-of-sample check: this scores the validate split, so label it as such
# (the message previously said "test set").
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



### 4. Run through steps 2-4 setting k to 10



In [None]:
knn = KNeighborsClassifier(n_neighbors = 10, weights = 'uniform')

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_train)

In [None]:
y_pred_proba = knn.predict_proba(X_train)

In [None]:
# Use fixed-point ':.2f' (as the other accuracy prints do); ':.2' is a
# general-precision format that drops trailing zeros (e.g. prints 0.8).
print(f'Accuracy of KNN classifier on training set: {knn.score(X_train, y_train):.2f}')

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
# Out-of-sample check: this scores the validate split, so label it as such
# (the message previously said "test set").
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

### 5. Run through steps 2-4 setting k to 20



In [None]:
knn = KNeighborsClassifier(n_neighbors = 20)

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred = knn.predict(X_train)

In [None]:
y_pred_proba = knn.predict_proba(X_train)

In [None]:
# Use fixed-point ':.2f' (as the other accuracy prints do); ':.2' is a
# general-precision format that drops trailing zeros (e.g. prints 0.8).
print(f'Accuracy of KNN classifier on training set: {knn.score(X_train, y_train):.2f}')

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
# Out-of-sample check: this scores the validate split, so label it as such
# (the message previously said "test set").
print('Accuracy of KNN classifier on validate set: {:.2f}'
     .format(knn.score(X_validate, y_validate)))

In [None]:
# Sweep k and plot out-of-sample accuracy for each value.
# - range(1, 21) covers k = 1..20 inclusive, so the k = 20 model from the
#   exercise (and the last x tick) is actually part of the sweep.
# - Accuracy is measured on the validate split: choosing k is model tuning,
#   and the test split should stay untouched until the final model is picked.
k_range = range(1, 21)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    scores.append(knn.score(X_validate, y_validate))
plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.scatter(k_range, scores)
plt.xticks([0,5,10,15,20])
plt.show()

### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



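The model with k set to 5 performs best on the in-sample data. A smaller k follows the training data more closely, which raises training accuracy but also makes the model more prone to overfitting than the larger-k models.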

### 7. Which model performs best on our out-of-sample data from validate?

The model with k set to 20 is slightly (0.01) better than the other two. This is likely because voting over a larger set of neighbors smooths out noise, which gives better estimates on unseen data.

# Logistic Regression Exercises

### 1. Create a model that includes age in addition to fare and pclass. Does this model perform better than your baseline?


4. Use your best 3 models to predict and evaluate on your validate sample.

5. Choose your best model from the validation performance, and evaluate it on the test dataset. How do the performance metrics compare to validate? to train?

### Baseline Model

In [63]:
### impute age
# missing_values=np.nan marks NaN entries as the values to fill (it was None).
# The imputer is fit on train only and then applied to validate and test, so
# the fill value is learned from training data alone.
# NOTE(review): assumes an `age` column is still present in the splits returned
# by prepare.split_titanic — confirm. Also note the next cell re-creates
# train/validate/test, which discards this imputation.
imputer = SimpleImputer(missing_values = np.nan, strategy='most_frequent')
imputer = imputer.fit(train[['age']])
train[['age']] = imputer.transform(train[['age']])

validate[['age']] = imputer.transform(validate[['age']])

test[['age']] = imputer.transform(test[['age']])


NameError: name 'SimpleImputer' is not defined

In [32]:
train, validate, test = prepare.split_titanic(prepare.prep_titanic(acquire.get_titanic_data()))

In [33]:
base_X_train = train.drop(columns=['survived'])
base_y_train = train.survived

base_X_validate = validate.drop(columns=['survived'])
base_y_validate = validate.survived

base_X_test = test.drop(columns=['survived'])
base_y_test = test.survived

In [34]:
# Baseline logistic regression on all features. random_state is fixed to match
# the other models in this notebook (and the estimator repr recorded below).
base_logit = LogisticRegression(C=1, random_state=123)

In [35]:
# Fit the baseline model on the training features/target.
# NOTE(review): this fits on X_train / y_train rather than the base_X_train /
# base_y_train defined above; these are presumably the same rows only if
# prepare.split_titanic is deterministic — confirm.
base_logit.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123)

In [36]:
# Inspect the fitted baseline model. The estimator in this notebook is named
# `base_logit`; `logit` is never defined here and only works from stale kernel state.
print('Coefficient: \n', base_logit.coef_)
print('Intercept: \n', base_logit.intercept_)

Coefficient: 
 [[-7.13867494e-05 -5.99195592e-01 -3.02111933e-01 -1.04905030e-01
   5.19823749e-03  2.15685005e-03 -2.54599268e+00  2.99159443e-01
  -6.34319369e-02]]
Intercept: 
 [2.48506217]


In [37]:
# Class predictions from the baseline model (`base_logit`; `logit` is never
# defined in this notebook).
y_pred = base_logit.predict(X_train)

In [38]:
# Class probabilities from the baseline model (`base_logit`; `logit` is never
# defined in this notebook).
y_pred_proba = base_logit.predict_proba(X_train)

In [39]:
# Score the baseline model (`base_logit`; `logit` is never defined in this
# notebook) and print with fixed-point ':.2f' like the other accuracy prints.
print(f'Accuracy of Logistic Regression classifier on training set: {base_logit.score(X_train, y_train):.2f}')

Accuracy of Logistic Regression classifier on training set: 0.8


In [40]:
print(confusion_matrix(y_train, y_pred))

[[269  38]
 [ 62 129]]


In [41]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       307
           1       0.77      0.68      0.72       191

    accuracy                           0.80       498
   macro avg       0.79      0.78      0.78       498
weighted avg       0.80      0.80      0.80       498



### Model Pclass/Age/Fare (PAF)

In [42]:
paf = LogisticRegression(C=0.1, random_state=123)

In [43]:
# Columns left out of the PAF model; the same list is dropped from every split
# so train / validate / test keep identical columns.
paf_drop_cols = [
    'sex_male',
    'passenger_id',
    'sibsp',
    'parch',
    'alone',
    'embark_town_Queenstown',
    'embark_town_Southampton',
]

paf_train = train.drop(columns=paf_drop_cols)
paf_validate = validate.drop(columns=paf_drop_cols)
paf_test = test.drop(columns=paf_drop_cols)

In [44]:
paf_X_train = paf_train.drop(columns=['survived'])
paf_y_train = paf_train.survived

paf_X_validate = paf_validate.drop(columns=['survived'])
paf_y_validate = paf_validate.survived

paf_X_test = paf_test.drop(columns=['survived'])
paf_y_test = paf_test.survived

In [45]:
paf.fit(paf_X_train, paf_y_train)

LogisticRegression(C=0.1, random_state=123)

In [46]:
print('Coefficient: \n', paf.coef_)
print('Intercept: \n', paf.intercept_)

Coefficient: 
 [[-0.472109    0.00812017]]
Intercept: 
 [0.3574087]


In [47]:
paf_y_pred = paf.predict(paf_X_train)

In [48]:
paf_y_pred_proba = paf.predict_proba(paf_X_train)

In [49]:
# Use fixed-point ':.2f' (as the other accuracy prints do); ':.2' is a
# general-precision format that drops trailing zeros (e.g. prints 0.8).
print(f'Accuracy of Logistic Regression classifier on training set: {paf.score(paf_X_train, paf_y_train):.2f}')

Accuracy of Logistic Regression classifier on training set: 0.68


In [50]:
print(confusion_matrix(paf_y_train, paf_y_pred))

[[264  43]
 [118  73]]


In [51]:
print(classification_report(paf_y_train, paf_y_pred))

              precision    recall  f1-score   support

           0       0.69      0.86      0.77       307
           1       0.63      0.38      0.48       191

    accuracy                           0.68       498
   macro avg       0.66      0.62      0.62       498
weighted avg       0.67      0.68      0.65       498



### 2. Include sex in your model as well. Note that you'll need to encode or create a dummy variable of this feature before including it in a model. (pafs)

In [52]:
pafs = LogisticRegression(C=0.5, random_state=123)

In [53]:
# Columns left out of the PAFS model. Unlike the PAF model, 'sex_male' is kept.
# The same list is dropped from every split so they keep identical columns.
pafs_drop_cols = [
    'passenger_id',
    'sibsp',
    'parch',
    'alone',
    'embark_town_Queenstown',
    'embark_town_Southampton',
]

pafs_train = train.drop(columns=pafs_drop_cols)
pafs_validate = validate.drop(columns=pafs_drop_cols)
pafs_test = test.drop(columns=pafs_drop_cols)

In [54]:
pafs_X_train = paf_train.drop(columns=['survived'])
pafs_y_train = paf_train.survived

pafs_X_validate = paf_validate.drop(columns=['survived'])
pafs_y_validate = paf_validate.survived

pafs_X_test = paf_test.drop(columns=['survived'])
pafs_y_test = paf_test.survived

In [55]:
pafs.fit(paf_X_train, paf_y_train)

LogisticRegression(C=0.5, random_state=123)

In [56]:
print('Coefficient: \n', pafs.coef_)
print('Intercept: \n', pafs.intercept_)

Coefficient: 
 [[-0.54545038  0.0071105 ]]
Intercept: 
 [0.55408139]


In [57]:
pafs_y_pred = pafs.predict(pafs_X_train)

In [58]:
pafs_y_pred_proba = pafs.predict_proba(pafs_X_train)

In [59]:
print(f'Accuracy of Logistic Regression classifier on training set: {pafs.score(paf_X_train, pafs_y_train):.2}')

Accuracy of Logistic Regression classifier on training set: 0.67


In [60]:
print(confusion_matrix(pafs_y_train, pafs_y_pred))

[[261  46]
 [118  73]]


In [61]:
print(classification_report(pafs_y_train, pafs_y_pred))

              precision    recall  f1-score   support

           0       0.69      0.85      0.76       307
           1       0.61      0.38      0.47       191

    accuracy                           0.67       498
   macro avg       0.65      0.62      0.62       498
weighted avg       0.66      0.67      0.65       498



### 3. Try out other combinations of features and models.

