# Model selection

## Bagging


![title](9.png)

![title](10.png)

## Grid search and random search

In [None]:
import numpy as np

from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# Load the digits dataset and separate the feature matrix from the labels.
digits = load_digits()
X = digits.data
y = digits.target

# Baseline random forest (20 trees) whose hyper-parameters are tuned below.
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Args:
        results: mapping with the keys ``'rank_test_score'``,
            ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``
            (the script passes a search object's ``cv_results_``).
            ``'rank_test_score'`` is compared element-wise against each
            rank, so it is expected to be a NumPy array.
        n_top: highest rank to print; ranks ``1..n_top`` are reported.

    Every candidate tied on a rank is printed, so more than ``n_top``
    entries may appear.
    """
    for rank in range(1, n_top + 1):
        # Positions of all candidates sharing this rank (ties included).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for row in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][row]
            std_score = results['std_test_score'][row]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            chosen = results['params'][row]
            print("Parameters: {0}".format(chosen))
            print("")


# Search space for the randomized search: plain lists are candidate values,
# sp_randint(low, high) objects are integer distributions to draw from.
param_dist = dict(
    max_depth=[3, None],
    max_features=sp_randint(1, 11),
    min_samples_split=sp_randint(2, 11),
    min_samples_leaf=sp_randint(1, 11),
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

# Randomized search: evaluate n_iter_search sampled parameter settings.
n_iter_search = 20
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
)

# Time the whole fit, then summarise the best candidates.
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

# Exhaustive grid: every combination of the listed values is evaluated
# (2 * 3 * 3 * 3 * 2 * 2 = 216 parameter settings).
param_grid = dict(
    max_depth=[3, None],
    max_features=[1, 3, 10],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

# Grid search over the same classifier; the timer starts right before fit.
grid_search = GridSearchCV(
    clf,
    param_grid=param_grid,
)
start = time()

grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
report(grid_search.cv_results_)

# Homework

In [None]:
class RandomForest:
    """Bagging ensemble: trees fitted on bootstrap samples, majority vote.

    Args:
        max_depth: stored on the instance.
        each_tree_data: size of each tree's bootstrap sample, interpreted
            as a percentage of the training rows.
            NOTE(review): the original TODO did not say whether this is a
            percentage or a row count; percentage was chosen to mirror
            ``blend.train_percent`` -- confirm.
        tree_count: number of trees in the ensemble.
        criterion: stored on the instance.
        tree_factory: zero-argument callable returning an unfitted tree
            that exposes ``fit(X, y)`` and ``predict(X)``. Defaults to
            ``DecisionTree``, which must be defined by the time the forest
            is constructed.
        random_state: seed (or ``None``) for the bootstrap sampling.

    NOTE(review): ``max_depth`` and ``criterion`` are not forwarded to the
    trees because the constructor signature of ``DecisionTree`` is not
    visible in this notebook; pass a ``tree_factory`` that applies them
    (e.g. ``functools.partial``) once that signature is confirmed.
    """

    def __init__(self,
                 max_depth=4,
                 each_tree_data=70,
                 tree_count=5,
                 criterion="gini",
                 tree_factory=None,
                 random_state=None):
        self.max_depth = max_depth
        self.each_tree_data = each_tree_data
        self.tree_count = tree_count
        self.criterion = criterion
        self.random_state = random_state
        if tree_factory is None:
            # Resolved lazily so a custom factory never needs DecisionTree.
            tree_factory = DecisionTree
        self.trees = [tree_factory() for _ in range(tree_count)]

    def fit(self, X, y):
        """Fit every tree on its own bootstrap sample of ``(X, y)``.

        Returns:
            ``self``, so ``RandomForest().fit(X, y)`` yields the estimator.

        Raises:
            ValueError: if the data is empty, ``X`` and ``y`` differ in
                length, or ``each_tree_data`` is not positive.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_rows = len(X)
        if n_rows == 0:
            raise ValueError("cannot fit a forest on an empty dataset")
        if len(y) != n_rows:
            raise ValueError("X and y must contain the same number of rows")
        if self.each_tree_data <= 0:
            raise ValueError("each_tree_data must be positive")

        sample_size = max(1, int(round(n_rows * self.each_tree_data / 100)))
        rng = np.random.default_rng(self.random_state)
        for tree in self.trees:
            # Sampling with replacement is what decorrelates the trees.
            rows = rng.integers(0, n_rows, size=sample_size)
            tree.fit(X[rows], y[rows])
        return self

    def predict(self, X):
        """Return the majority-vote label of the trees for each row of ``X``.

        Ties go to the smallest label (first maximum over sorted labels).

        Raises:
            ValueError: if the forest contains no trees.
        """
        if not self.trees:
            raise ValueError("the forest has no trees to vote with")
        # votes[t, s] is the label tree t assigns to sample s.
        votes = np.asarray([np.asarray(tree.predict(X))
                            for tree in self.trees])
        n_samples = votes.shape[1]
        if n_samples == 0:
            return votes[0]

        labels, codes = np.unique(votes, return_inverse=True)
        codes = codes.reshape(votes.shape)
        tally = np.zeros((len(labels), n_samples), dtype=int)
        columns = np.arange(n_samples)
        for tree_codes in codes:
            # One vote per (label, sample) pair; pairs are unique per tree.
            tally[tree_codes, columns] += 1
        return labels[tally.argmax(axis=0)]

In [None]:
# Plot the RandomForest decision surface for every pair of iris features.
# The cell stays best-effort (it never raises), but it now reports which
# error occurred instead of hiding it behind a bare ``except``.
try:
    # Imported here because no other cell of this notebook imports them.
    import matplotlib.pyplot as plt
    from sklearn.datasets import load_iris

    iris = load_iris()
    for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                    [1, 2], [1, 3], [2, 3]]):
        # Train on two features at a time so the surface can be drawn in 2-D.
        X = iris.data[:, pair]
        y = iris.target

        # fit() is called on its own line so the cell does not depend on
        # fit() returning the estimator.
        clf = RandomForest()
        clf.fit(X, y)

        plt.figure(figsize=(16, 9))
        plt.subplot()
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                             np.arange(y_min, y_max, 0.02))
        plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

        # Classify every grid point, then fold the labels back onto the grid.
        Z = np.asarray(clf.predict(np.c_[xx.ravel(), yy.ravel()]))
        Z = Z.reshape(xx.shape)

        cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)
        plt.xlabel(iris.feature_names[pair[0]])
        plt.ylabel(iris.feature_names[pair[1]])

        # Overlay the training points, one colour per class. ``cmap`` is
        # dropped because it is ignored when ``c`` is a literal colour.
        for i, color in zip(range(3), "ryb"):
            mask = y == i
            plt.scatter(X[mask, 0], X[mask, 1], c=color,
                        label=iris.target_names[i],
                        edgecolor='black', s=130)
    plt.show()
except Exception as error:
    print("Hmm, something is wrong :[")
    # Name the failure so an unfinished homework class can be debugged.
    print("{0}: {1}".format(type(error).__name__, error))

## Stacking and blending

![title](11.png)

In [None]:
class blend:
    """Blending ensemble: a blender model learns from hold-out predictions.

    The base models are fitted on ``train_percent`` percent of the data;
    their predictions on the remaining rows are the features on which the
    blender is fitted.

    Args:
        models: estimator classes (callables) for the base models.
        model_params: one keyword-argument dict per entry of ``models``.
        blender_model: estimator class (callable) for the blender.
        blender_param: keyword arguments for ``blender_model``.
        train_percent: percentage of rows used to fit the base models.
        random_state: seed (or ``None``) for the shuffled split.

    Raises:
        ValueError: if ``models`` and ``model_params`` differ in length.
    """

    def __init__(self, models: list, model_params: list,
                 blender_model, blender_param: dict,
                 train_percent=80, random_state=None):
        if len(models) != len(model_params):
            raise ValueError(
                "models and model_params must have the same length")
        self.models = [model(**params)
                       for model, params in zip(models, model_params)]
        self.blender = blender_model(**blender_param)
        self.train_percent = train_percent
        self.random_state = random_state

    def _meta_features(self, X):
        """Stack the base-model predictions for ``X`` as columns."""
        return np.column_stack([np.asarray(model.predict(X))
                                for model in self.models])

    def fit(self, X, y):
        """Fit the base models on the train split, the blender on the rest.

        Returns:
            ``self``.

        Raises:
            ValueError: if ``X`` and ``y`` differ in length or the split
                would leave the train or the hold-out part empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_rows = len(X)
        if len(y) != n_rows:
            raise ValueError("X and y must contain the same number of rows")
        n_train = int(n_rows * self.train_percent / 100)
        if n_train < 1 or n_train >= n_rows:
            raise ValueError(
                "train_percent must leave at least one row in both the "
                "train and the hold-out split")

        # Shuffle first: label-sorted data would otherwise give a hold-out
        # split containing only the last classes.
        order = np.random.default_rng(self.random_state).permutation(n_rows)
        train_rows = order[:n_train]
        holdout_rows = order[n_train:]

        for model in self.models:
            model.fit(X[train_rows], y[train_rows])
        self.blender.fit(self._meta_features(X[holdout_rows]),
                         y[holdout_rows])
        return self

    def predict(self, X):
        """Predict by feeding the base-model predictions to the blender."""
        return self.blender.predict(self._meta_features(X))

In [None]:
class stack:
    """Stacking ensemble: the blender is fitted on out-of-fold predictions.

    Args:
        folds: number of cross-validation folds (an integer >= 2).
        models: estimator classes (callables) for the base models.
        model_params: one keyword-argument dict per entry of ``models``.
        blender_model: estimator class (callable) for the blender.
        blender_param: keyword arguments for ``blender_model``.
        random_state: seed (or ``None``) for the shuffled fold assignment.

    Raises:
        ValueError: if ``models`` and ``model_params`` differ in length.
    """

    def __init__(self, folds,
                 models: list, model_params: list,
                 blender_model, blender_param: dict,
                 random_state=None):
        if len(models) != len(model_params):
            raise ValueError(
                "models and model_params must have the same length")
        self.blender = blender_model(**blender_param)
        self.folds = folds
        self.models = models
        self.model_params = model_params
        self.random_state = random_state
        # Base models refitted on the full data; filled in by fit().
        self.fitted_models = []

    def _build_models(self):
        """Instantiate a fresh, unfitted copy of every base model."""
        return [model(**params)
                for model, params in zip(self.models, self.model_params)]

    @staticmethod
    def _meta_features(models, X):
        """Stack the predictions of ``models`` for ``X`` as columns."""
        return np.column_stack([np.asarray(model.predict(X))
                                for model in models])

    def fit(self, X, y):
        """Fit the blender on out-of-fold predictions, then the base models.

        For each fold, fresh base models are fitted on the other folds and
        predict the held-out rows, so every row of the blender's training
        matrix comes from models that never saw that row. The base models
        are finally refitted on all rows for use in ``predict``.

        Returns:
            ``self``.

        Raises:
            ValueError: if ``X`` and ``y`` differ in length or ``folds`` is
                not between 2 and the number of rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_rows = len(X)
        if len(y) != n_rows:
            raise ValueError("X and y must contain the same number of rows")
        if not 2 <= self.folds <= n_rows:
            raise ValueError(
                "folds must be between 2 and the number of rows")

        order = np.random.default_rng(self.random_state).permutation(n_rows)
        blocks = []
        for holdout_rows in np.array_split(order, self.folds):
            in_fold = np.ones(n_rows, dtype=bool)
            in_fold[holdout_rows] = False
            fold_models = self._build_models()
            for model in fold_models:
                model.fit(X[in_fold], y[in_fold])
            blocks.append(self._meta_features(fold_models, X[holdout_rows]))

        # The blocks follow the shuffled order; scatter them back so that
        # row i of the meta-features lines up with y[i].
        shuffled = np.concatenate(blocks)
        meta = np.empty_like(shuffled)
        meta[order] = shuffled
        self.blender.fit(meta, y)

        self.fitted_models = self._build_models()
        for model in self.fitted_models:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Predict by feeding the base-model predictions to the blender.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if not self.fitted_models:
            raise RuntimeError("call fit() before predict()")
        return self.blender.predict(
            self._meta_features(self.fitted_models, X))