In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current pyplot figure under IMAGES_PATH.

    The file is written to ``IMAGES_PATH/<fig_id>.<fig_extension>``.

    Parameters:
        fig_id: base name of the output file (without extension).
        tight_layout: when true, call ``plt.tight_layout()`` before saving.
        fig_extension: file extension, also passed to ``savefig`` as the format.
        resolution: value passed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, dpi=resolution, format=fig_extension)

Prepare the data: load the breast cancer dataset.

In [None]:
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

# Peek at the labels: the first 20 targets and the class names they index into.
print(cancer.target[:20])
print([*cancer.target_names])

# Peek at the features: the first 5 rows and the corresponding column names.
print(cancer.data[:5])
print([*cancer.feature_names])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
['malignant', 'benign']
[[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
  1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
  6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
  1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
  4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 1.326e+03 8.474e-02 7.864e-02 8.690e-02
  7.017e-02 1.812e-01 5.667e-02 5.435e-01 7.339e-01 3.398e+00 7.408e+01
  5.225e-03 1.308e-02 1.860e-02 1.340e-02 1.389e-02 3.532e-03 2.499e+01
  2.341e+01 1.588e+02 1.956e+03 1.238e-01 1.866e-01 2.416e-01 1.860e-01
  2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 1.203e+03 1.096e-01 1.599e-01 1.974e-01
  1.279e-01 2.069e-01 5.999e-02 7.456e-01 7.869e-01 4.585e+00 9.403e+01
  6.150e-03 4.006e-02 3.832e-02 2.058e-02 2.250e-02 4.571e-03 2.357e+01
  2.553e+01 1.525e+02 1.709e+03 1.444e-01 4.245e-01 4.504e-01 2.430e-01
  3.613e-01 8.758e-02]
 

# Problem 1. 
Split the data into training (80%) and testing (20%) sets. Fit a decision tree classification model with a maximum depth of 2. Plot the tree and calculate the accuracy rate. Predict the target using all features, and don't forget to set the random seed to 42.

In [None]:
# Starting point
import os
import random

import numpy as np
import pydotplus
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Feature matrix and class labels from the breast cancer dataset loaded above.
X = cancer.data
y = cancer.target

# Seed Python's `random` AND NumPy's global generator. scikit-learn draws its
# randomness from NumPy (or from an explicit `random_state=` argument), so
# seeding only the stdlib `random` module would not make the results
# reproducible. Passing `random_state=42` to the splitter and the estimator is
# still the most reliable option.
random.seed(42)
np.random.seed(42)

# Problem 2
Fit an unrestricted tree on the training data and evaluate it on the testing data. Find the two most important features and create a scatter plot of malignant and benign tumors, using the two most important features as the axes. Hint: for an example of such a graph, see:
https://stackoverflow.com/questions/12487060/matplotlib-color-according-to-class-labels
Do you think the data need rotation?

# Problem 3

Report the accuracy using the top 2 features from Problem 2 on the full data. Loop over 10000 random numbers between -1 and 1 (from -3 to 3 radians) to find an optimal rotation angle. Report the accuracy improvement over the unrotated data.

In [2]:
import numpy as np

# Re-seed NumPy's global generator so the random draws made in this cell
# are the same on every run.
np.random.seed(seed=42)


# Problem 4
In the main data, drop the variables used in Problem 3 and add the rotated variables in their place (substitution). 
Estimate the accuracy score using a tree classifier with max depth = 2 (same as in Problem 1). How much did we gain from rotation?

# Problem 5
Generate moons samples of 100, 10,000 and 100,000 observations using the code below. Set the random seed to 42. Split the data into training and testing sets. Separately estimate Logistic Regression, Random Forest, SVC and the hard voting classifier. What happens to the accuracy score as you increase the number of observations? Measure and report the time it takes for each estimation.

In [2]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Starter sample: 500 noisy "moons" points with a fixed seed.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42,
)

# No test_size is given, so scikit-learn's default split proportion applies.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

Answer: As we increase the number of observations, the quality of prediction using Logistic goes down, while SVC takes the lead. SVC, though, is very slow. Although the Voting classifier generally performs better, it is inferior to SVC with a large number of data points.

# Problem 6
Generate data using the code provided below. Using testing accuracy as the metric, estimate a bagging classifier of decision trees with 200 estimators. Try different numbers of samples: 10, 30, 50, and 200. What is the optimal number of samples to use? <br>
BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=5,
    max_samples= ?, bootstrap=True, n_jobs=-1, random_state=42)

In [None]:
# Noisier moons sample (noise=0.40) for the bagging experiment, fixed seed.
X, y = make_moons(
    n_samples=500,
    noise=0.40,
    random_state=42,
)

# No test_size is given, so scikit-learn's default split proportion applies.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

# Problem 7
Find the optimal learning rate, number of estimators and maximum depth for GradientBoostingClassifier using randomized search. Set the search ranges: learning rate from 0.01 to 3, number of estimators from 1 to 20, and maximum depth from 1 to 10. Try 300 iterations. Example for RandomizedSearchCV:

rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=?, cv=5, scoring='neg_mean_squared_error', random_state=42)
gbrt = GradientBoostingClassifier(max_depth=?, n_estimators=?, learning_rate = ?, random_state=42)  

Which estimator was the best? What was the accuracy of the best estimator?


In [None]:
# 2000 noisy moons points (noise=0.40) with a fixed seed for the boosting search.
X, y = make_moons(
    n_samples=2000,
    noise=0.40,
    random_state=42,
)