In [115]:
from sklearn.datasets import make_moons
import numpy as np
X,y = make_moons(n_samples = 10000, noise = 0.4)

In [160]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_test.shape

(2000,)

In [138]:
from sklearn.tree import export_graphviz, DecisionTreeClassifier as dtc
tree_clf = dtc()
tree_clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [139]:
#printing the decision tree model
def print_tree(tree, features, classes, file):
    export_graphviz(
            tree,
            out_file= "./"+file+".dot",
            feature_names=features,
            class_names=classes,
            rounded=True,
            filled=True
        )
print_tree(
    tree_clf,
    ["X", "Y"],
    ["Top", "Bottom"],
    "moons",
)

In [140]:
#measuring accuracy
from sklearn.metrics import precision_recall_fscore_support as prfs
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(tree_clf, X_train, y_train, cv=5)
prfs(y_train, y_train_pred)

(array([0.80487186, 0.80068983]),
 array([0.79678473, 0.80865887]),
 array([0.80080788, 0.80465462]),
 array([3981, 4019]))

In [141]:
tree_clf.get_params

<bound method BaseEstimator.get_params of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')>

In [142]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': range(1,10,1),
    'max_leaf_nodes': range(2,100,5),
}

search = GridSearchCV(tree_clf, param_grid, cv=5, scoring="accuracy")

In [143]:
#gridsearch searching
search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': range(1, 10), 'max_leaf_nodes': range(2, 100, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [144]:
print("Acheaved score of:\t", search.best_score_)
print("With following paramaters:\t", search.best_params_)
print("The model:", search.best_estimator_)

Acheaved score of:	 0.86175
With following paramaters:	 {'max_depth': 5, 'max_leaf_nodes': 17}
The model: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=17, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [145]:
final_model = DecisionTreeClassifier(max_leaf_nodes=17, max_depth = 6)
final_model.fit(X_train,y_train)
y_test_pred = final_model.predict(X_test)

In [146]:
prfs(y_test, y_test_pred)

(array([0.84836852, 0.85908142]),
 array([0.86751717, 0.83893986]),
 array([0.857836  , 0.84889118]),
 array([1019,  981]))

In [151]:
#developing own 
from sklearn.model_selection import ShuffleSplit
rs = ShuffleSplit(n_splits=1000, test_size=0.2, random_state=42)
mini_sets = []
for mini_train_index, mini_test_index in rs.split(X):
    X_mini_train = X[mini_train_index]
    y_mini_train = y[mini_train_index]
    mini_sets.append((X_mini_train, y_mini_train))

In [161]:
from sklearn.metrics import accuracy_score
from sklearn.base import clone

#getting the forest
forest = [clone(DecisionTreeClassifier(max_leaf_nodes=17, max_depth = 6)) for _ in range(1000)]
accuracy_scores = []

for tree, (X_mini, y_mini) in zip(forest, mini_sets):
    tree.fit(X_mini, y_mini)
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))
np.mean(accuracy_scores)

0.872487

In [163]:
Y_pred = np.empty([1000, len(X_test)], dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

In [164]:
from scipy.stats import mode

y_pred_majority_votes, n_votes = mode(Y_pred, axis=0)
accuracy_score(y_test, y_pred_majority_votes.reshape([-1]))

0.8755