In [1]:
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)


In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
X_train

array([[-0.56413534,  0.29283681],
       [-1.16033479,  0.96512577],
       [-0.06598769, -0.15191052],
       ...,
       [ 0.38876425, -0.78662881],
       [ 2.50492832,  0.21133631],
       [ 0.35428745,  0.74582457]])

In [4]:
y_train

array([0, 0, 1, ..., 1, 1, 0])

In [5]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_depth=3)
dt_clf.fit(X_train, y_train)

y_pred = dt_clf.predict(X_train)

from sklearn.metrics import accuracy_score, recall_score, f1_score
accuracy_score(y_train, y_pred)

0.8555

In [6]:
# Validate the model on test data
y_pred = dt_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.863

In [7]:
# Use GridSearchCV to search forr optimal hyperparameter
from sklearn.model_selection import GridSearchCV

dt_clf = DecisionTreeClassifier(max_depth=3)
parameters = {'max_leaf_nodes':list(range(2, 100)), 'min_samples_split': [2, 3, 4], 'max_depth': [2, 3, 4, 6]}
clf = GridSearchCV(dt_clf, parameters)
clf.fit(X_train, y_train)
clf.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=17,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [8]:
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8695

In [24]:
# Generate 1000 subsets of the training set,
# each containing 100 instances selected randomly

from sklearn.model_selection import ShuffleSplit
X, y = make_moons(n_samples=10000, noise=0.4, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rs = ShuffleSplit(n_splits=100, test_size=.99, random_state=0)
rs.get_n_splits(X)

100

In [25]:
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)


TRAIN: [4305 7316 2787 6738 9123 7742 6242 6521 5819 6167 3455 2418 8393 4464
  973 1634 2435 9008 3762 6001 8448 7365 7263 1472 5795 9797 8752 4469
 2957 7098 1684 9359 3918 9396 7012 5251 1071 7108 4420 2251 2956 9781
 2659 4187 7250 8622 4353 4984 1641 8948 3560 3622 6021 7221 8994 6172
 1207 8343 2292  714 6687 8736 4735 2745 3337 7456 8615 3219  659  797
  755 8291 2496 7599 1871 2046 7877 4851 5072 2163 6036 6921 6216  537
 9893 2897 7768 2222 2599  705 3468 6744 5874 4373 7891 9225 4859 3264
 9845 2732] TEST: [9394  898 2398 ... 6926 5531 2610]
TRAIN: [1945 4287 3644 4521 6405 7599 8303 4132 1704 2264 7905 7790 6625 9725
 1653 4589 1380 3127  790 3552 6025 7400  800 5294 7502 2745 9598 6906
  475  855 8875 6890 3954 9898 3384  149 7830 4768 3066 2770 1666 6422
 3672 7410 4494 4362 4641 3386 5894 5491  939 7941 4460 4358 9791 4922
 1176 4126 2331 6824 2378 6493 8117 6668 4760 3270 6299 9922  109 4421
 4561 5519 6012 1729  504 8948 7304 5182 6247 6231 1405 6404 5015 6325
 7608 236

In [26]:
dt_clf = DecisionTreeClassifier(max_depth=6, max_leaf_nodes=17, min_samples_split=2)

mini_sets = []

for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train_sub = X[train_index]
    y_train_sub = y[train_index]
    mini_sets.append((X_train_sub, y_train_sub))
    


TRAIN: [4305 7316 2787 6738 9123 7742 6242 6521 5819 6167 3455 2418 8393 4464
  973 1634 2435 9008 3762 6001 8448 7365 7263 1472 5795 9797 8752 4469
 2957 7098 1684 9359 3918 9396 7012 5251 1071 7108 4420 2251 2956 9781
 2659 4187 7250 8622 4353 4984 1641 8948 3560 3622 6021 7221 8994 6172
 1207 8343 2292  714 6687 8736 4735 2745 3337 7456 8615 3219  659  797
  755 8291 2496 7599 1871 2046 7877 4851 5072 2163 6036 6921 6216  537
 9893 2897 7768 2222 2599  705 3468 6744 5874 4373 7891 9225 4859 3264
 9845 2732] TEST: [9394  898 2398 ... 6926 5531 2610]
TRAIN: [1945 4287 3644 4521 6405 7599 8303 4132 1704 2264 7905 7790 6625 9725
 1653 4589 1380 3127  790 3552 6025 7400  800 5294 7502 2745 9598 6906
  475  855 8875 6890 3954 9898 3384  149 7830 4768 3066 2770 1666 6422
 3672 7410 4494 4362 4641 3386 5894 5491  939 7941 4460 4358 9791 4922
 1176 4126 2331 6824 2378 6493 8117 6668 4760 3270 6299 9922  109 4421
 4561 5519 6012 1729  504 8948 7304 5182 6247 6231 1405 6404 5015 6325
 7608 236

In [27]:
len(mini_sets)

100

In [31]:
accuracy_scores = []
for X_train_sub, y_train_sub in mini_sets:
    dt_clf.fit(X_train_sub, y_train_sub)
    
    y_pred = dt_clf.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

import numpy as np
np.mean(accuracy_scores)


0.803985

In [32]:
predictions = []
for X_train_sub, y_train_sub in mini_sets:
    dt_clf.fit(X_train_sub, y_train_sub)
    
    y_pred = dt_clf.predict(X_test)
    predictions.append(y_pred)

import numpy as np
from scipy import stats
a = np.array([[6, 8, 3, 0],
              [3, 2, 1, 7],
              [8, 1, 8, 4],
              [5, 3, 0, 5],
              [4, 7, 5, 9]])
stats.mode(a)

ModeResult(mode=array([[3, 1, 0, 0]]), count=array([[1, 1, 1, 1]]))

In [33]:
result = stats.mode(a)

In [34]:
result.mode

array([[3, 1, 0, 0]])

In [35]:
result = stats.mode(predictions)

In [36]:
result.mode

array([[1, 1, 0, ..., 0, 0, 0]])

In [38]:
accuracy_score(y_test, result.mode.flatten())

0.8735