<a href="https://colab.research.google.com/github/ykykyk112/machine_learning/blob/main/sklearn_exercise/Ensemble_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

# Load a single category file on its own and print its shape to inspect the array layout.
test = np.load('/content/drive/MyDrive/Colab_Notebooks/quickdraw/bird.npy')
print(test.shape)

In [None]:
# Draw 10000 distinct row indices from the loaded array. Passing the row count
# directly samples from range(n), so no temporary index array has to be built
# and deleted afterwards.
rnd_idx = np.random.choice(test.shape[0], 10000, replace=False)
t = test[rnd_idx]
# Keep the shape as the cell's last expression so it is actually displayed.
t.shape

In [None]:
def make_ndarray(target_list, each_size, data_dir='/content/drive/MyDrive/Colab_Notebooks/quickdraw', seed=None) :
    """Build a shuffled, class-balanced dataset from per-category ``.npy`` files.

    For every name in ``target_list`` the file ``<data_dir>/<name>.npy`` is
    loaded, ``each_size`` distinct rows are drawn from it at random, and those
    rows are labelled with the name's position in ``target_list``. The
    combined rows and labels are then shuffled with one shared permutation.

    Parameters
    ----------
    target_list : sequence of str
        Category names; each needs a matching ``<name>.npy`` file.
    each_size : int
        Number of rows sampled (without replacement) from every category.
    data_dir : str, optional
        Directory holding the ``.npy`` files. Defaults to the Drive folder
        this notebook was written against.
    seed : int or None, optional
        When given, sampling and shuffling use a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` (default) NumPy's global random state is used.

    Returns
    -------
    data : ndarray, shape (len(target_list) * each_size, 784)
        Sampled rows, one flattened bitmap per row.
    target : ndarray, shape (len(target_list) * each_size,)
        Label of each row (index of its category in ``target_list``).

    Raises
    ------
    ValueError
        If a category file holds fewer than ``each_size`` rows.
    """
    # 784 means bitmap size
    n_features = 784
    n_total = len(target_list) * each_size
    # seed=None falls back to the module-level generator, so existing callers
    # (including ones that call np.random.seed themselves) behave as before.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = np.empty((n_total, n_features))
    target = np.empty(n_total)
    for label, filename in enumerate(target_list) :
        npy = np.load('{}/{}.npy'.format(data_dir, filename))
        if npy.shape[0] < each_size :
            # Same exception type the sampler would raise, but naming the file.
            raise ValueError('{}.npy has only {} rows, cannot sample {} without replacement'.format(
                filename, npy.shape[0], each_size))
        idx = rng.choice(npy.shape[0], each_size, replace=False)
        rows = slice(label * each_size, (label + 1) * each_size)
        data[rows] = npy[idx]
        target[rows] = label
        print('{} is Complete'.format(filename))
    # One permutation applied to both arrays keeps rows and labels aligned.
    s_idx = rng.permutation(n_total)
    data = data[s_idx]
    target = target[s_idx]
    print('result shape : {0} & {1}'.format(data.shape, target.shape))
    return data, target

target_list = ['ant', 'bicycle', 'bird', 'butterfly', 'camel', 'clock', 'cow', 'diamond']
# make_ndarray samples and shuffles through NumPy's global random generator.
# Seed it here so the dataset is the same on every Restart & Run All, matching
# the random_state=42 used by the split and the models below.
np.random.seed(42)
data, target = make_ndarray(target_list=target_list, each_size=10000)

In [None]:
def show_image(index) :
    """Draw sample ``index`` of ``data`` as a 28x28 bitmap, titled with its entry in ``target``."""
    bitmap = data[index].reshape(28, 28)
    plt.imshow(bitmap, cmap='binary')
    plt.title(target[index])

show_image(21)

In [None]:
from sklearn import model_selection
# Hold out 20% of the samples for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier # Neural network

In [None]:
# Instantiate the four base classifiers, each with random_state=42
dt_clf, random_clf, mlp_clf, ext_clf = [
    estimator_cls(random_state=42)
    for estimator_cls in (tree.DecisionTreeClassifier,
                          RandomForestClassifier,
                          MLPClassifier,
                          ExtraTreesClassifier)
]

In [None]:
# Fit every base classifier on the same training split
for estimator in (dt_clf, random_clf, mlp_clf, ext_clf) :
    estimator.fit(X_train, y_train)
# Echo the last fitted model so the cell output matches a bare ext_clf.fit(...) call
ext_clf

In [None]:
# Test-set accuracy, one line per model: decision tree, random forest, MLP, extra trees
for estimator in (dt_clf, random_clf, mlp_clf, ext_clf) :
    print(estimator.score(X_test, y_test))

In [None]:
from sklearn.ensemble import VotingClassifier
# Pair each fitted model with the short name the voting ensemble will use for it
named_estimators = list(zip(['rf', 'mlp', 'ext', 'dt'],
                            [random_clf, mlp_clf, ext_clf, dt_clf]))

In [None]:
# Majority-vote ('hard') ensemble over the named base models
voting_clf = VotingClassifier(named_estimators, voting='hard')
voting_clf.fit(X_train, y_train)

In [None]:
# Score the voting ensemble on the held-out split; the bare name on the
# last line echoes the value as the cell output.
voting_score = voting_clf.score(X_test, y_test)
voting_score

In [None]:
# Inspect the fitted sub-estimators stored on the voting ensemble.
voting_clf.estimators_

In [None]:
# Re-score the already fitted ensemble in soft-voting mode without refitting.
# The mode is switched only for the duration of the scoring call and then put
# back, so re-running earlier cells that use voting_clf still measures the
# voting mode it was created with.
original_voting = voting_clf.voting
voting_clf.voting = 'soft'
try :
    new_voting_score = voting_clf.score(X_test, y_test)
finally :
    voting_clf.voting = original_voting
print('old score : {0}, new score : {1}'.format(voting_score, new_voting_score))

In this case, the hard voting method performs better than soft voting.

In [None]:
from sklearn.ensemble import BaggingClassifier
# Bagging ensemble with bootstrap sampling; oob_score=True also records an out-of-bag estimate
bagging_clf = BaggingClassifier(
    bootstrap=True,
    oob_score=True,
    random_state=42,
).fit(X_train, y_train)
bagging_score, bagging_score_oob = bagging_clf.score(X_test, y_test), bagging_clf.oob_score_

In [None]:
# Report the test-set accuracy next to the out-of-bag estimate stored earlier.
print('bagging score :', bagging_score, 'oob_score :', bagging_score_oob)

In [None]:
# Reconfigure the existing bagging ensemble: 100 estimators, 30% of the samples each, all cores
params = dict(n_estimators=100, max_samples=0.3, n_jobs=-1)
bagging_clf.set_params(**params)

In [None]:
# Refit with the updated parameters, then report both accuracy figures
bagging_clf.fit(X_train, y_train)
print('bagging score :', bagging_clf.score(X_test, y_test),
      'oob_score :', bagging_clf.oob_score_)

In [None]:
# Random forest with out-of-bag scoring enabled, for comparison with its test-set accuracy
rnd_clf = RandomForestClassifier(oob_score=True, random_state=42).fit(X_train, y_train)
print('method score :', rnd_clf.score(X_test, y_test),
      'oob_score :', rnd_clf.oob_score_)

In [None]:
from sklearn.model_selection import GridSearchCV
# Search space for the random forest: forest size, tree depth and split criterion
grid_param = dict(
    n_estimators=[10, 100, 200],
    max_depth=[None, 3, 5],
    criterion=['gini', 'entropy'],
)
grid_search = GridSearchCV(rnd_clf, grid_param, n_jobs=-1, verbose=2)

In [None]:
# Run the search over every combination in grid_param on the training split.
# NOTE(review): that is 3 * 3 * 2 = 18 combinations, each fitted once per CV fold
# (cv is left at the GridSearchCV default), so expect this cell to be slow.
grid_search.fit(X_train, y_train)