# Trees

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# % matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree


In [None]:
# Load scikit-learn's bundled breast cancer dataset. Later cells read its
# data, target, feature_names, target_names and DESCR attributes.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [None]:
# Print the description text that ships with the dataset object.
print(cancer.DESCR)

In [None]:
# Hold out part of the data for evaluation. No test_size is passed, so
# scikit-learn's default split proportion applies; random_state=0 makes the
# shuffle repeatable.
split = train_test_split(cancer.data, cancer.target, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
# Show the feature names (used below to label the tree plots).
cancer.feature_names

In [None]:
# Show the class names (used below as class_names in the tree plots).
cancer.target_names

# Tree Visualization

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a depth-2 tree: shallow enough to read in full once it is plotted.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split (even with the default "best" splitter), so ties
# between equally good splits could otherwise resolve differently from run to
# run and change the tree and its reported accuracy.
clf = DecisionTreeClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

In [None]:
# clf.score returns the mean accuracy on the held-out split as a fraction;
# scale it to a percentage before reporting.
acc = 100 * clf.score(X_test, y_test)
print("Test Accuracy {:.2f}%".format(acc))

In [None]:

# Draw the fitted tree on a small, high-resolution figure. The axes created
# here are handed to plot_tree explicitly; the trailing semicolon suppresses
# the notebook's echo of the returned annotations.
fig, axes = plt.subplots(figsize=(4, 4), dpi=300)
tree.plot_tree(
    clf,
    feature_names=list(cancer.feature_names),
    class_names=list(cancer.target_names),
    filled=True,
    ax=axes,
);

# Parameter Tuning

In [None]:
# Fit a depth-3 tree and draw it on a wide figure.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so without it the fitted tree (and the accuracy
# reported for it) can differ between runs.
clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)
plt.figure(figsize=(15, 5))
tree.plot_tree(
    clf,
    feature_names=list(cancer.feature_names),
    class_names=list(cancer.target_names),
    filled=True,
);

In [None]:
# Held-out accuracy of the current clf, reported as a percentage.
acc = 100 * clf.score(X_test, y_test)
print("Test Accuracy {:.2f}%".format(acc))

In [None]:
# Limit the tree by leaf count instead of depth: at most 8 leaves.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so without it the fitted tree (and the accuracy
# reported for it) can differ between runs.
clf = DecisionTreeClassifier(max_leaf_nodes=8, random_state=0).fit(X_train, y_train)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
tree.plot_tree(
    clf,
    feature_names=list(cancer.feature_names),
    class_names=list(cancer.target_names),
    filled=True,
);

In [None]:
# Held-out accuracy of the current clf, reported as a percentage.
acc = 100 * clf.score(X_test, y_test)
print("Test Accuracy {:.2f}%".format(acc))

In [None]:
# Limit the tree by node size: a node is only split further if it holds at
# least 50 training samples.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so without it the fitted tree (and the accuracy
# reported for it) can differ between runs.
clf = DecisionTreeClassifier(min_samples_split=50, random_state=0).fit(X_train, y_train)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
tree.plot_tree(
    clf,
    feature_names=list(cancer.feature_names),
    class_names=list(cancer.target_names),
    filled=True,
);

In [None]:
# Held-out accuracy of the current clf, reported as a percentage.
acc = 100 * clf.score(X_test, y_test)
print("Test Accuracy {:.2f}%".format(acc))

In [None]:
from sklearn.model_selection import GridSearchCV

# Cross-validated search over tree depths 1..6 using 10 folds on the
# training split only; the test split is never seen by the search.
param_grid = dict(max_depth=range(1, 7))
grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=param_grid,
    cv=10,
)
grid.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV

# Same depth search (1..6, 10 folds), this time also recording the scores on
# the training folds so train and validation curves can be compared.
param_grid = dict(max_depth=range(1, 7))
grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
grid.fit(X_train, y_train)

In [None]:
# Tabulate the search results and plot mean train vs. mean validation score
# against the depth that was tried; the legend is placed outside the axes.
scores = pd.DataFrame(data=grid.cv_results_)
scores.plot(
    x='param_max_depth',
    y=['mean_train_score', 'mean_test_score'],
    ax=plt.gca(),
)
plt.legend(loc=(1, 0))

In [None]:
from sklearn.model_selection import GridSearchCV

# Tune the leaf budget instead of the depth: try 2..19 leaves with 10-fold
# cross-validation, keeping the training-fold scores for the plot below.
param_grid = dict(max_leaf_nodes=range(2, 20))
grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
grid.fit(X_train, y_train)

# Mean train vs. mean validation score for each leaf budget.
scores = pd.DataFrame(data=grid.cv_results_)
scores.plot(
    x='param_max_leaf_nodes',
    y=['mean_train_score', 'mean_test_score'],
    ax=plt.gca(),
)
plt.legend(loc=(1, 0))

In [None]:
# Show the parameter setting selected by the most recent grid search.
grid.best_params_

In [None]:
# Draw the estimator that the grid search refit with its best parameters.
# The axes created here are handed to plot_tree explicitly.
fig, axes = plt.subplots(figsize=(4, 4), dpi=300)
tree.plot_tree(
    grid.best_estimator_,
    feature_names=list(cancer.feature_names),
    class_names=list(cancer.target_names),
    filled=True,
    ax=axes,
);

In [None]:
# Horizontal bar chart of the best tree's feature importances, one bar per
# feature name.
pd.Series(
    data=grid.best_estimator_.feature_importances_,
    index=cancer.feature_names,
).plot(kind="barh")

## Exercise: Use another classifier

In [None]:
# Exercise placeholder: this line is intentionally incomplete and will not
# parse until a model is assigned. The following cells call clf.score(...)
# and tree.plot_tree(clf, ...), so clf must be a fitted tree model.
clf = # TODO: write your own model here

In [None]:
# Held-out accuracy of the current clf, reported as a percentage.
acc = 100 * clf.score(X_test, y_test)
print("Test Accuracy {:.2f}%".format(acc))

In [None]:
# Draw the current clf on a wide figure, using the axes of the figure that
# was just opened.
plt.figure(figsize=(15, 5))
tree.plot_tree(
    clf,
    feature_names=list(cancer.feature_names),
    class_names=list(cancer.target_names),
    filled=True,
    ax=plt.gca(),
);

In [None]:
# Exercise placeholder: the assignment below is intentionally incomplete and
# will not parse until a fitted tree model is supplied.
clf = # TODO: try again with different parameters
# Draw the new tree on a wide figure.
plt.figure(figsize=(15, 5))
tree.plot_tree(clf,
               feature_names = list(cancer.feature_names), 
               class_names=list(cancer.target_names),
               filled = True);

In [None]:
# Held-out accuracy of the current clf, reported as a percentage.
acc = 100 * clf.score(X_test, y_test)
print("Test Accuracy {:.2f}%".format(acc))

In [None]:
# TODO: repeat the exercise above, using GridSearchCV to choose the parameters.