# Markdown

## Excercise 7: Train and fine-tune decision tree on moons dataset

In [22]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd

In [6]:
make_moons(10, noise=0.4, random_state=42)

(array([[ 1.8187233 ,  0.19709933],
        [ 0.47475553,  0.49703886],
        [-0.22855207,  0.13036686],
        [-0.04501961,  0.38014787],
        [ 1.32657803, -1.1095504 ],
        [-0.17121843,  0.70303727],
        [ 1.71866248, -0.35584826],
        [-1.25178998,  0.23908819],
        [ 0.31668843,  0.86479999],
        [ 0.34178088, -0.41328105]]),
 array([1, 0, 1, 0, 1, 0, 1, 0, 0, 1], dtype=int64))

In [7]:
X, y = make_moons(10000, noise=0.4, random_state=42)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
params = {'max_depth': [1, 2, 3], 'max_leaf_nodes': [2, 3, 4, 5, 10, None]}
dtc = DecisionTreeClassifier(criterion='gini')
dtc_gs = GridSearchCV(dtc, param_grid=params, scoring='accuracy', cv=3)
dtc_gs.fit(X_train, y_train)

In [20]:
pd.DataFrame(dtc_gs.cv_results_).sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_leaf_nodes,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
8,0.002999,1.123916e-07,0.001001,1.6e-05,2,4.0,"{'max_depth': 2, 'max_leaf_nodes': 4}",0.852614,0.855122,0.855122,0.854286,0.001183,1
15,0.003987,1.643648e-05,0.001002,1e-06,3,5.0,"{'max_depth': 3, 'max_leaf_nodes': 5}",0.852614,0.855122,0.855122,0.854286,0.001183,1
14,0.004004,8.574971e-06,0.000668,0.000472,3,4.0,"{'max_depth': 3, 'max_leaf_nodes': 4}",0.852614,0.855122,0.855122,0.854286,0.001183,1
11,0.003001,2.144295e-06,0.001004,9e-06,2,,"{'max_depth': 2, 'max_leaf_nodes': None}",0.852614,0.855122,0.855122,0.854286,0.001183,1
10,0.003085,0.0009226766,0.000332,0.000469,2,10.0,"{'max_depth': 2, 'max_leaf_nodes': 10}",0.852614,0.855122,0.855122,0.854286,0.001183,1


In [21]:
dtc_gs.best_estimator_.fit(X_train, y_train)

In [23]:
preds = dtc_gs.best_estimator_.predict(X_test)
accuracy_score(y_test, preds)

0.856

## Exercise 8: Grow a forest
Train decision trees on subset of training samples. Classification happens based on each decision tree's guess -> voting

In [25]:
from sklearn.model_selection import ShuffleSplit
from scipy.stats import mode

In [57]:
rs = ShuffleSplit(n_splits=1000, random_state=42, test_size=6900) # each split should hold 100 samples -> testsize 6900 as 7000 training samples available

In [58]:
splits = []
for train_idx, test_idx in rs.split(X_train):
    splits.append((X_train[train_idx], y_train[train_idx]))

In [73]:
forest = []
mean_accuracy = 0
preds_forest = []
for split in splits:
    # train decision tree on each split
    tree = DecisionTreeClassifier(max_depth=2, max_leaf_nodes=4)
    tree.fit(split[0], split[1])
    forest.append(tree)
    preds = tree.predict(X_test)
    mean_accuracy += accuracy_score(y_test, preds)
    preds_forest.append(preds)
print('Mean accuracy: {}'.format(mean_accuracy/1000))

Mean accuracy: 0.8270186666666665


In [74]:
import numpy as np

In [79]:
# voting based classification
preds_mode = np.apply_along_axis(mode, axis=0, arr=preds_forest)

  res = asanyarray(func1d(inarr_view[ind0], *args, **kwargs))
  buff[ind] = asanyarray(func1d(inarr_view[ind], *args, **kwargs))


In [80]:
accuracy_score(y_test, preds_mode[0][0])

0.8606666666666667