In [1]:
from sklearn.datasets import load_iris, make_moons
from sklearn.tree import DecisionTreeClassifier, export_graphviz, DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, ShuffleSplit
from sklearn.metrics import accuracy_score, mean_squared_error
from scipy.stats import reciprocal, uniform

from graphviz import Source
import numpy as np
import pandas as pd
from scipy.stats import mode

# EXERCISE 7

In [2]:
X, y = make_moons(n_samples=1000, noise=0.4, random_state=23)


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=34)

In [4]:
tree_clf = DecisionTreeClassifier(random_state=23)
# tree_clf.fit(X_train, y_train)


In [5]:
print([range(2, 20)])

[range(2, 20)]


In [6]:
param_grid = {
    'criterion':['gini', "entropy"],
    "max_depth": list(range(2, 10)),
    'max_leaf_nodes': list(range(2, 200)),
    'min_samples_split': list(range(2,15))
}

tree_search = GridSearchCV(tree_clf, param_grid, scoring='neg_log_loss', 
    return_train_score=True)
tree_search.fit(X_train,y_train)

In [7]:
tree_search.best_estimator_

In [8]:
-cross_val_score(tree_search.best_estimator_, X_train, y_train, scoring='neg_log_loss')

array([0.58798867, 0.39537259, 0.35264846, 0.32033878, 0.43242762])

In [9]:
y_prd = tree_search.best_estimator_.predict(X_test)
rmse = mean_squared_error(y_test, y_prd, squared=True)
rmse

0.156

In [10]:
accuracy_score(y_test, y_prd)

0.844

# EXERCISE 8

In [11]:
X, y = make_moons(n_samples=1000, noise=0.4, random_state=23)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=34)
rs = ShuffleSplit(n_splits=1000, random_state=23, train_size=0.1)
tree_clf = DecisionTreeClassifier(random_state=23, max_depth=3, max_leaf_nodes=6)
best_estmiator = {
    'max_depth': 3,
    'max_leaf_nodes': 6  
}

In [12]:
X_split = X[:50]
y_split = y[:50]
ss = ShuffleSplit(n_splits=10, random_state=23, train_size=0.1)
for train_index, test_index in ss.split(X_split):
    print(train_index, test_index)

[47  9 40 38 19] [14 18 17 32 13 33 20  8 36 28 29 10 44  3 22 48 30 35 24 23  4 42  7  1
 49 41 45 15 46 16 34 37  0  5 21 11  2 43  6 25 26 39 27 12 31]
[14 30 31 38 22] [29 40 46  3 13 49 15 39 17  9  1 37 12 44 36 26  8 28  4 33  5 35 24  7
 10 11 20 25 27 23 43 18  2 42 16 19 48 47 45 41 34 32  6 21  0]
[29 42 23 35 32] [10 18  3 16 12 13 47 48  2 25  1 15  8 39 41 40 30  7 28  6  5  9 14 17
 26 24 22 37 20 19 44 11 36  0 34 31 43 33 27 21 49 45  4 38 46]
[14  1 25 40  2] [ 5  0 26 15 45 39 13 32 47 22 12 30  8 37 18 19 28 10  7 42 33 29 36  4
 11 48 27 17 34 44 46 35  9  3 31 21 24 43 41 49  6 23 20 38 16]
[ 9  0 30 17 42] [27 15 33 44 26 11  7  6 14 36 19 45 23 28 16 10 31 34 49  8 47 39 46 48
 22  5 25 37 12 40 21 43  3 18 13 35  1 20  2  4 24 32 29 41 38]
[17 23 33 42 24] [ 0  2  1 27 38 19 11 14 32 29 25 18 12  6 43 37 34 22 47  4 36 48 39 35
 13 30 45 46 10 40  7 21 28 49 44  5  3 31  9  8 15 16 20 26 41]
[34 13 28  0 19] [ 3 14  2 18 26  6 24 30 31  7 25 32 43 35 23 42 15  

In [13]:
indicies = [13, 24, 48,  13, 35]

X[np.array(indicies)]

array([[ 0.25810903,  0.57928944],
       [ 0.36627942, -0.56118022],
       [ 0.69085725, -0.48529975],
       [ 0.25810903,  0.57928944],
       [-1.30290366,  0.76896536]])

In [14]:
forrest_accuracy = list()

for train_index, test_index in rs.split(X_train):
    X_temp = X_train[np.array(train_index)]
    y_temp = y_train[np.array(train_index)]
    tree_clf.fit(X_temp, y_temp)
    y_pred =tree_clf.predict(X_test)
    # rmse = mean_squared_error(y_test, y_prd, squared=True)
    forrest_accuracy.append(accuracy_score(y_test, y_pred))
    
print(np.mean(forrest_accuracy), min(forrest_accuracy), max(forrest_accuracy))

0.8068759999999999 0.616 0.868


In [15]:
forrest_lst = list()

for instance in X_test:
    instance = instance
    instance_lst = list()
    for train_index, test_index in ss.split(X_train):
        X_temp = X_train[np.array(train_index)]
        y_temp = y_train[np.array(train_index)]
        tree_clf.fit(X_temp, y_temp)
        y_pred = tree_clf.predict(instance.reshape(1, -1))
        instance_lst.append(y_pred)

    instance_mode = mode(instance_lst).mode[0][0]
    forrest_lst.append(instance_mode)

  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mode[0][0]
  instance_mode = mode(instance_lst).mod

In [16]:
accuracy_score(y_test, forrest_lst)

0.844