# Naive approach
- Init bags of equal size, and build a tree for each bag
- If the tree built by a bag is working well, add more data to that bag to challenge it
  Else, keep only the part that it is predicting well

In [21]:
from sklearn.datasets import *
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import BaggingClassifier
import numpy as np
import pandas as pd
import random
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [22]:
def load_data(dataset_name, test_size=0.2, random_state=1):
    class Object(object):
        pass
    if dataset_name == 'mnist':
        data = load_digits()
    elif dataset_name == 'breast_cancer':
        data = load_breast_cancer()
    elif dataset_name == 'iris':
        data = load_iris()
    elif dataset_name == 'wine':
        data = load_wine()
    elif dataset_name == 'abalone':
        df = pd.read_csv('data/abalone.csv')
        df['Sex'] = df['Sex'].apply(lambda x: 0 if x=='M' else 1)
        labels = []
        for r in df['Rings']:
            if 0<=r and r<=7:
                label = 1
            elif 8<=r and r<=10:
                label = 2
            elif 11<=r and r<=15:
                label = 3
            elif r>15:
                label = 4
            labels.append(label)
        data = Object()
        data.data = np.asarray(df.loc[:, df.columns!='Rings'])
        data.target = np.asarray(labels)
    elif dataset_name == 'synthetic':
        df = pd.read_csv('data/synth_data.csv')
        df = pd.get_dummies(df, prefix=['a', 'b', 'c'])
        data = Object()
        data.data = np.asarray(df.loc[:, df.columns!='label'])
        data.target = np.asarray(df['label'])
    X = data.data
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    print('Train: ', X_train.shape, ' | Test: ', X_test.shape)
    print('Train labels: ', np.unique(y_train, return_counts=True))
    print('Test labels: ', np.unique(y_test, return_counts=True))
    return X_train, X_test, y_train, y_test

In [79]:
def run_experiment(dataset_name,
                   n_iter=50,
                   acc_threshold=0.8,
                   add_size=50,
                   n_biomes=10,
                   initial_size=100,
                   biome_max_depth=10,
                   biome_min_samples_leaf=10):
    X_train, X_test, y_train, y_test = load_data(dataset_name, test_size=0.2)
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    
    # decision tree performance
    dec_tree_accs = []
    for i in range(10):
        clf = DecisionTreeClassifier()
        clf.fit(X_train, np.asarray(y_train))
        preds = clf.predict(X_test)
        dec_tree_accs.append(accuracy_score(np.asarray(y_test), preds))
    print('Decision tree accuracy: ', round(sum(dec_tree_accs)*100/10, 2)) 
    print('Decision tree depth: ', clf.get_depth())
    
    # bagging performance
    bagging_accs = []
    for i in range(10):
        clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=biome_max_depth),
                                n_estimators=n_biomes)
        clf.fit(X_train, np.asarray(y_train))
        preds = clf.predict(X_test)
        bagging_accs.append(accuracy_score(np.asarray(y_test), preds))
    print('Bagging accuracy: ', round(sum(bagging_accs)*100/10, 2))
    
    # experiment
    pop_size = X_train.shape[0]
    # create initial biomes
    biomes = []
    permuted_indices = np.random.permutation(pop_size)
    for i in range(n_biomes):
        biome_idx = random.choices(y_train.index, k=initial_size)
        temp_X = X_train.loc[biome_idx, :]
        temp_y = y_train.loc[biome_idx, :]
        biomes.append({'X': temp_X, 'y': temp_y})
    all_iter_acc = []
    for it in range(n_iter):
        iter_acc = []
        iter_bag_size = []
        for i, biome in enumerate(biomes):
            class_weight = {}
            iter_bag_size.append(biome['X'].shape[0])
            if len(np.unique(biome['y'][0])) == len(np.unique(y_test)):
                for j in np.unique(y_test):
                    if j!= i:
                        class_weight[j] = 1
                    else:
                        class_weight[j] = 2
            biome['clf'] = DecisionTreeClassifier(max_depth=biome_max_depth, 
                                                  min_samples_leaf=biome_min_samples_leaf, 
                                                  class_weight=class_weight)      
            biome['clf'].fit(biome['X'], np.asarray(biome['y'][0]))
            preds = biome['clf'].predict(biome['X'])
            acc = accuracy_score(np.asarray(biome['y'][0]), preds)
            # extract good part and bad part
            good_part_X = biome['X'].loc[preds == biome['y'][0], :]
            good_part_y = biome['y'].loc[preds == biome['y'][0], :]
            bad_part_X = biome['X'].loc[preds != biome['y'][0], :]
            bad_part_y = biome['y'].loc[preds != biome['y'][0], :]
            # get new part
            if acc > acc_threshold and len(set(biome['X'].index)) < pop_size/2:
                biome_complement_idx = list(set(X_train.index) - set(biome['X'].index))
                new_idx = random.choices(biome_complement_idx, k = add_size)
                new_part_X = X_train.loc[new_idx, :]
                new_part_y = y_train.loc[new_idx, :]
                biome['X'] = pd.concat([biome['X'], new_part_X])
                biome['y'] = pd.concat([biome['y'], new_part_y])
            elif acc > acc_threshold and len(set(biome['X'].index)) >= pop_size/2:
                pass
            else:
                biome['X'] = good_part_X
                biome['y'] = good_part_y
            iter_acc.append(round(acc*100, 1))
        print(it, ' Bag size: ', iter_bag_size)
        print(it, ' Accuracy: ', iter_acc)
        all_iter_acc.append(iter_acc)
    print('==========================================================================')
    final_acc = []
    final_depth = []
    final_leaves = []
    final_biome_size = []
    idx_set = set()
    class_acc = np.zeros((n_biomes, len(np.unique(y_test))))
    for i, biome in enumerate(biomes):
        preds = biome['clf'].predict(X_test)
        acc = accuracy_score(y_test, preds)
        final_acc.append(round(acc*100, 1))
        cm = confusion_matrix(y_test, preds)
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        class_acc[i, :] = cm.diagonal()
        final_depth.append(biome['clf'].get_depth())
        final_leaves.append(biome['clf'].get_n_leaves())
        final_biome_size.append(biome['X'].shape[0])
        idx_set.update(biome['X'].index)
    print('Final size:  ', final_biome_size)
    print('Final acc:   ', final_acc)
    print('Final depth: ', final_depth)
    print('Final leaves:', final_leaves)
    print('Cover: ', len(idx_set)/pop_size)
    preds_list = []
    for biome in biomes:
        preds_list.append(biome['clf'].predict(X_test))
    scores = np.zeros((len(preds_list), len(preds_list)))
    for i in range(len(preds_list)):
        for j in range(len(preds_list)):
            scores[i, j] = round(sum(preds_list[i] == preds_list[j])/len(preds_list[0]), 2)
    print('Similarity between tree prediction: ')
    print(scores)
    from scipy import stats
    temp_preds = np.stack(preds_list)
    final_preds = stats.mode(temp_preds).mode[0]
    voting_acc = accuracy_score(y_test, final_preds)
    print('Voting accuracy: ', round(voting_acc*100, 2))
    # stacking
    from sklearn.linear_model import LogisticRegression
    meta_X_train = np.zeros((len(y_train), n_biomes))
    meta_X_test = np.zeros((len(y_test), n_biomes))
    for i, biome in enumerate(biomes):
        meta_X_train[:, i] = biome['clf'].predict(X_train)
        meta_X_test[:, i] = biome['clf'].predict(X_test)
    meta_clf = LogisticRegression()
    meta_clf.fit(meta_X_train, np.asarray(y_train))
    # print(meta_clf.get_depth(), meta_clf.get_n_leaves())
    meta_preds = meta_clf.predict(meta_X_test)
    meta_acc = accuracy_score(np.asarray(y_test), meta_preds)
    print('Stacking accuracy: ', round(meta_acc*100, 2))

In [80]:
run_experiment('mnist', n_iter=200, acc_threshold=0.85, add_size=20)

Train:  (1437, 64)  | Test:  (360, 64)
Train labels:  (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([135, 147, 141, 142, 143, 152, 144, 142, 145, 146]))
Test labels:  (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([43, 35, 36, 41, 38, 30, 37, 37, 29, 34]))
Decision tree accuracy:  85.81
Decision tree depth:  13


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Bagging accuracy:  94.97
0  Bag size:  [100, 100, 100, 100, 100, 100, 100, 100, 100, 100]
0  Accuracy:  [67.0, 75.0, 57.0, 72.0, 66.0, 72.0, 66.0, 65.0, 73.0, 63.0]
1  Bag size:  [67, 75, 57, 72, 66, 72, 66, 65, 73, 63]
1  Accuracy:  [79.1, 92.0, 73.7, 75.0, 77.3, 77.8, 75.8, 75.4, 78.1, 76.2]
2  Bag size:  [53, 95, 42, 54, 51, 56, 50, 49, 57, 48]
2  Accuracy:  [83.0, 86.3, 95.2, 87.0, 86.3, 83.9, 88.0, 65.3, 71.9, 85.4]
3  Bag size:  [44, 115, 62, 74, 71, 47, 70, 32, 41, 68]
3  Accuracy:  [100.0, 84.3, 80.6, 85.1, 78.9, 93.6, 82.9, 75.0, 100.0, 82.4]
4  Bag size:  [64, 97, 50, 94, 56, 67, 58, 24, 61, 56]
4  Accuracy:  [78.1, 84.5, 100.0, 76.6, 89.3, 85.1, 89.7, 100.0, 80.3, 96.4]
5  Bag size:  [50, 82, 70, 72, 76, 87, 78, 44, 49, 76]
5  Accuracy:  [100.0, 98.8, 78.6, 97.2, 80.3, 73.6, 82.1, 70.5, 100.0, 85.5]
6  Bag size:  [70, 102, 55, 92, 61, 64, 64, 31, 69, 96]
6  Accuracy:  [82.9, 92.2, 94.5, 90.2, 91.8, 89.1, 95.3, 87.1, 81.2, 81.2]
7  Bag size:  [58, 122, 75, 112, 81, 84, 84, 51

59  Bag size:  [598, 750, 580, 728, 486, 735, 606, 522, 512, 712]
59  Accuracy:  [87.3, 85.9, 94.3, 86.1, 95.7, 85.6, 88.0, 87.0, 89.1, 88.2]
60  Bag size:  [618, 750, 600, 728, 506, 735, 626, 542, 532, 732]
60  Accuracy:  [87.2, 85.9, 92.2, 86.1, 94.5, 85.6, 88.0, 88.4, 89.5, 88.3]
61  Bag size:  [638, 750, 620, 728, 526, 735, 646, 562, 552, 732]
61  Accuracy:  [87.6, 85.9, 93.4, 86.1, 94.7, 85.6, 87.9, 88.3, 89.7, 88.3]
62  Bag size:  [658, 750, 640, 728, 546, 735, 666, 582, 572, 732]
62  Accuracy:  [87.8, 85.9, 93.3, 86.1, 94.1, 85.6, 87.8, 88.3, 89.2, 88.3]
63  Bag size:  [678, 750, 660, 728, 566, 735, 686, 602, 592, 732]
63  Accuracy:  [87.9, 85.9, 93.0, 86.1, 92.6, 85.6, 88.0, 88.4, 88.7, 88.3]
64  Bag size:  [698, 750, 680, 728, 586, 735, 706, 622, 612, 732]
64  Accuracy:  [88.4, 85.9, 91.5, 86.1, 90.1, 85.6, 88.1, 87.1, 88.1, 88.3]
65  Bag size:  [718, 750, 700, 728, 606, 735, 726, 642, 632, 732]
65  Accuracy:  [88.9, 85.9, 91.9, 86.1, 90.3, 85.6, 88.4, 86.6, 88.0, 88.3]
66  Ba

119  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
119  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
120  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
120  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
121  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
121  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
122  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
122  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
123  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
123  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
124  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
124  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
125  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
125  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7

176  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
176  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
177  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
177  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
178  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
178  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
179  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
179  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
180  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
180  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
181  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
181  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7, 88.3]
182  Bag size:  [738, 750, 740, 728, 746, 735, 746, 742, 752, 732]
182  Accuracy:  [88.1, 85.9, 90.1, 86.1, 88.5, 85.6, 87.7, 86.0, 86.7

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
run_experiment('breast_cancer', acc_threshold=0.95, initial_size=30, add_size=5, n_iter=200)

Train:  (455, 30)  | Test:  (114, 30)
Train labels:  (array([0, 1]), array([170, 285]))
Test labels:  (array([0, 1]), array([42, 72]))
Decision tree accuracy:  94.3
Decision tree depth:  5


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Bagging accuracy:  95.09
0  Bag size:  [30, 30, 30, 30, 30, 30, 30, 30, 30, 30]
0  Accuracy:  [96.7, 100.0, 96.7, 96.7, 96.7, 100.0, 90.0, 93.3, 93.3, 100.0]
1  Bag size:  [35, 35, 35, 35, 35, 35, 27, 28, 28, 35]
1  Accuracy:  [97.1, 100.0, 97.1, 97.1, 97.1, 100.0, 92.6, 92.9, 100.0, 97.1]
2  Bag size:  [40, 40, 40, 40, 40, 40, 25, 26, 33, 40]
2  Accuracy:  [95.0, 97.5, 97.5, 97.5, 95.0, 100.0, 92.0, 92.3, 100.0, 97.5]
3  Bag size:  [38, 45, 45, 45, 38, 45, 23, 24, 38, 45]
3  Accuracy:  [100.0, 97.8, 97.8, 95.6, 100.0, 100.0, 91.3, 91.7, 100.0, 97.8]
4  Bag size:  [43, 50, 50, 50, 43, 50, 21, 22, 43, 50]
4  Accuracy:  [100.0, 94.0, 98.0, 96.0, 100.0, 100.0, 90.5, 90.9, 97.7, 98.0]
5  Bag size:  [48, 47, 55, 55, 48, 55, 19, 20, 48, 55]
5  Accuracy:  [100.0, 100.0, 98.2, 94.5, 97.9, 100.0, 57.9, 90.0, 97.9, 98.2]
6  Bag size:  [53, 52, 60, 52, 53, 60, 11, 18, 53, 60]
6  Accuracy:  [98.1, 100.0, 98.3, 100.0, 98.1, 98.3, 100.0, 55.6, 96.2, 96.7]
7  Bag size:  [58, 57, 65, 57, 58, 65, 16, 1

60  Bag size:  [234, 232, 230, 234, 231, 234, 42, 28, 232, 234]
60  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 95.2, 100.0, 96.6, 97.4]
61  Bag size:  [234, 232, 230, 234, 231, 234, 47, 33, 232, 234]
61  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 93.6, 97.0, 96.6, 97.4]
62  Bag size:  [234, 232, 230, 234, 231, 234, 44, 38, 232, 234]
62  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 100.0, 89.5, 96.6, 97.4]
63  Bag size:  [234, 232, 230, 234, 231, 234, 49, 34, 232, 234]
63  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 91.8, 100.0, 96.6, 97.4]
64  Bag size:  [234, 232, 230, 234, 231, 234, 45, 39, 232, 234]
64  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 100.0, 92.3, 96.6, 97.4]
65  Bag size:  [234, 232, 230, 234, 231, 234, 50, 36, 232, 234]
65  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 96.0, 100.0, 96.6, 97.4]
66  Bag size:  [234, 232, 230, 234, 231, 234, 55, 41, 232, 234]
66  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 92.7, 90.2, 96.6, 97.4]
67  Bag size:  

123  Bag size:  [234, 232, 230, 234, 231, 234, 21, 53, 232, 234]
123  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 90.5, 90.6, 96.6, 97.4]
124  Bag size:  [234, 232, 230, 234, 231, 234, 19, 48, 232, 234]
124  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 100.0, 89.6, 96.6, 97.4]
125  Bag size:  [234, 232, 230, 234, 231, 234, 24, 43, 232, 234]
125  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 95.8, 88.4, 96.6, 97.4]
126  Bag size:  [234, 232, 230, 234, 231, 234, 29, 38, 232, 234]
126  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 93.1, 86.8, 96.6, 97.4]
127  Bag size:  [234, 232, 230, 234, 231, 234, 27, 33, 232, 234]
127  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 100.0, 84.8, 96.6, 97.4]
128  Bag size:  [234, 232, 230, 234, 231, 234, 32, 28, 232, 234]
128  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 96.9, 82.1, 96.6, 97.4]
129  Bag size:  [234, 232, 230, 234, 231, 234, 37, 23, 232, 234]
129  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 86.5, 78.3, 96.6, 97.4]
130 

185  Bag size:  [234, 232, 230, 234, 231, 234, 52, 52, 232, 234]
185  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 96.2, 94.2, 96.6, 97.4]
186  Bag size:  [234, 232, 230, 234, 231, 234, 57, 49, 232, 234]
186  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 94.7, 100.0, 96.6, 97.4]
187  Bag size:  [234, 232, 230, 234, 231, 234, 54, 54, 232, 234]
187  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 100.0, 92.6, 96.6, 97.4]
188  Bag size:  [234, 232, 230, 234, 231, 234, 59, 50, 232, 234]
188  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 94.9, 100.0, 96.6, 97.4]
189  Bag size:  [234, 232, 230, 234, 231, 234, 56, 55, 232, 234]
189  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 100.0, 96.4, 96.6, 97.4]
190  Bag size:  [234, 232, 230, 234, 231, 234, 61, 60, 232, 234]
190  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 100.0, 93.3, 96.6, 97.4]
191  Bag size:  [234, 232, 230, 234, 231, 234, 66, 56, 232, 234]
191  Accuracy:  [96.2, 96.1, 96.5, 95.3, 99.6, 96.2, 100.0, 92.9, 96.6, 97.4]


  return f(*args, **kwargs)


In [82]:
run_experiment('iris', n_iter=200, acc_threshold=0.8, add_size=2, initial_size=10, n_biomes=5)

Train:  (120, 4)  | Test:  (30, 4)
Train labels:  (array([0, 1, 2]), array([39, 37, 44]))
Test labels:  (array([0, 1, 2]), array([11, 13,  6]))
Decision tree accuracy:  96.67
Decision tree depth:  5
Bagging accuracy:  96.67
0  Bag size:  [10, 10, 10, 10, 10]
0  Accuracy:  [50.0, 40.0, 50.0, 40.0, 50.0]
1  Bag size:  [5, 4, 5, 4, 5]
1  Accuracy:  [100.0, 100.0, 100.0, 100.0, 100.0]
2  Bag size:  [7, 6, 7, 6, 7]
2  Accuracy:  [100.0, 66.7, 85.7, 83.3, 71.4]
3  Bag size:  [9, 4, 9, 8, 5]
3  Accuracy:  [100.0, 100.0, 77.8, 75.0, 100.0]
4  Bag size:  [11, 6, 7, 6, 7]
4  Accuracy:  [81.8, 66.7, 100.0, 100.0, 71.4]
5  Bag size:  [13, 4, 9, 8, 5]
5  Accuracy:  [69.2, 100.0, 100.0, 87.5, 100.0]
6  Bag size:  [9, 6, 11, 10, 7]
6  Accuracy:  [100.0, 66.7, 90.9, 90.0, 85.7]
7  Bag size:  [11, 4, 13, 12, 9]
7  Accuracy:  [90.9, 100.0, 76.9, 91.7, 66.7]
8  Bag size:  [13, 6, 10, 14, 6]
8  Accuracy:  [76.9, 83.3, 100.0, 78.6, 100.0]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


9  Bag size:  [10, 8, 12, 11, 8]
9  Accuracy:  [100.0, 75.0, 83.3, 100.0, 100.0]
10  Bag size:  [12, 6, 14, 13, 10]
10  Accuracy:  [83.3, 100.0, 78.6, 84.6, 90.0]
11  Bag size:  [14, 8, 11, 15, 12]
11  Accuracy:  [71.4, 87.5, 100.0, 80.0, 75.0]
12  Bag size:  [10, 10, 13, 12, 9]
12  Accuracy:  [100.0, 70.0, 84.6, 100.0, 100.0]
13  Bag size:  [12, 7, 15, 14, 11]
13  Accuracy:  [83.3, 100.0, 73.3, 92.9, 81.8]
14  Bag size:  [14, 9, 11, 16, 13]
14  Accuracy:  [78.6, 88.9, 100.0, 87.5, 69.2]
15  Bag size:  [11, 11, 13, 18, 9]
15  Accuracy:  [100.0, 81.8, 84.6, 83.3, 100.0]
16  Bag size:  [13, 13, 15, 20, 11]
16  Accuracy:  [84.6, 69.2, 73.3, 80.0, 81.8]
17  Bag size:  [15, 9, 11, 16, 13]
17  Accuracy:  [73.3, 100.0, 100.0, 100.0, 69.2]
18  Bag size:  [11, 11, 13, 18, 9]
18  Accuracy:  [100.0, 90.9, 84.6, 94.4, 100.0]
19  Bag size:  [13, 13, 15, 20, 11]
19  Accuracy:  [84.6, 84.6, 73.3, 90.0, 90.9]
20  Bag size:  [15, 15, 11, 22, 13]
20  Accuracy:  [73.3, 73.3, 100.0, 81.8, 76.9]
21  Bag si

123  Bag size:  [20, 61, 18, 61, 55]
123  Accuracy:  [90.0, 98.4, 100.0, 95.1, 98.2]
124  Bag size:  [22, 61, 20, 61, 57]
124  Accuracy:  [86.4, 98.4, 90.0, 95.1, 98.2]
125  Bag size:  [24, 61, 22, 61, 59]
125  Accuracy:  [83.3, 98.4, 81.8, 95.1, 98.3]
126  Bag size:  [26, 61, 24, 61, 61]
126  Accuracy:  [80.8, 98.4, 75.0, 95.1, 98.4]
127  Bag size:  [28, 61, 18, 61, 61]
127  Accuracy:  [75.0, 98.4, 77.8, 95.1, 98.4]
128  Bag size:  [21, 61, 14, 61, 61]
128  Accuracy:  [100.0, 98.4, 100.0, 95.1, 98.4]
129  Bag size:  [23, 61, 16, 61, 61]
129  Accuracy:  [91.3, 98.4, 87.5, 95.1, 98.4]
130  Bag size:  [25, 61, 18, 61, 61]
130  Accuracy:  [84.0, 98.4, 83.3, 95.1, 98.4]
131  Bag size:  [27, 61, 20, 61, 61]
131  Accuracy:  [70.4, 98.4, 80.0, 95.1, 98.4]
132  Bag size:  [19, 61, 16, 61, 61]
132  Accuracy:  [84.2, 98.4, 100.0, 95.1, 98.4]
133  Bag size:  [21, 61, 18, 61, 61]
133  Accuracy:  [61.9, 98.4, 88.9, 95.1, 98.4]
134  Bag size:  [13, 61, 20, 61, 61]
134  Accuracy:  [76.9, 98.4, 85.0, 

  return f(*args, **kwargs)


In [83]:
run_experiment('wine', n_iter=100, acc_threshold=0.7, initial_size=20, n_biomes=5, add_size=5)

Train:  (142, 13)  | Test:  (36, 13)
Train labels:  (array([0, 1, 2]), array([45, 58, 39]))
Test labels:  (array([0, 1, 2]), array([14, 13,  9]))
Decision tree accuracy:  89.17
Decision tree depth:  5
Bagging accuracy:  94.44
0  Bag size:  [20, 20, 20, 20, 20]
0  Accuracy:  [60.0, 70.0, 65.0, 70.0, 75.0]
1  Bag size:  [12, 14, 13, 14, 25]
1  Accuracy:  [75.0, 64.3, 61.5, 64.3, 72.0]
2  Bag size:  [17, 9, 8, 9, 30]
2  Accuracy:  [76.5, 100.0, 100.0, 100.0, 73.3]
3  Bag size:  [22, 14, 13, 14, 35]
3  Accuracy:  [77.3, 78.6, 69.2, 85.7, 88.6]
4  Bag size:  [27, 19, 9, 19, 40]
4  Accuracy:  [77.8, 68.4, 100.0, 68.4, 95.0]
5  Bag size:  [32, 13, 14, 13, 45]
5  Accuracy:  [75.0, 100.0, 78.6, 100.0, 95.6]
6  Bag size:  [37, 18, 19, 18, 50]
6  Accuracy:  [83.8, 88.9, 63.2, 72.2, 96.0]


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


7  Bag size:  [42, 23, 12, 23, 55]
7  Accuracy:  [88.1, 73.9, 100.0, 73.9, 92.7]
8  Bag size:  [47, 28, 17, 28, 60]
8  Accuracy:  [89.4, 75.0, 76.5, 85.7, 91.7]
9  Bag size:  [52, 33, 22, 33, 65]
9  Accuracy:  [88.5, 81.8, 77.3, 84.8, 93.8]
10  Bag size:  [57, 38, 27, 38, 70]
10  Accuracy:  [89.5, 76.3, 77.8, 86.8, 92.9]
11  Bag size:  [62, 43, 32, 43, 75]
11  Accuracy:  [90.3, 93.0, 68.8, 90.7, 93.3]
12  Bag size:  [67, 48, 22, 48, 75]
12  Accuracy:  [89.6, 95.8, 81.8, 95.8, 93.3]
13  Bag size:  [72, 53, 27, 53, 75]
13  Accuracy:  [90.3, 90.6, 92.6, 94.3, 93.3]
14  Bag size:  [77, 58, 32, 58, 75]
14  Accuracy:  [90.9, 96.6, 87.5, 94.8, 93.3]
15  Bag size:  [77, 63, 37, 63, 75]
15  Accuracy:  [90.9, 92.1, 83.8, 95.2, 93.3]
16  Bag size:  [77, 68, 42, 68, 75]
16  Accuracy:  [90.9, 91.2, 83.3, 94.1, 93.3]
17  Bag size:  [77, 73, 47, 73, 75]
17  Accuracy:  [90.9, 91.8, 91.5, 94.5, 93.3]
18  Bag size:  [77, 78, 52, 78, 75]
18  Accuracy:  [90.9, 92.3, 94.2, 94.9, 93.3]
19  Bag size:  [77, 7

  return f(*args, **kwargs)


In [89]:
run_experiment('abalone', n_iter=200, acc_threshold=0.75, initial_size=100, n_biomes=10, add_size=10)

Train:  (3341, 8)  | Test:  (836, 8)
Train labels:  (array([1, 2, 3, 4]), array([ 672, 1509,  948,  212]))
Test labels:  (array([1, 2, 3, 4]), array([167, 382, 238,  49]))
Decision tree accuracy:  49.94
Decision tree depth:  22


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


Bagging accuracy:  59.81
0  Bag size:  [100, 100, 100, 100, 100, 100, 100, 100, 100, 100]
0  Accuracy:  [70.0, 73.0, 64.0, 66.0, 66.0, 75.0, 72.0, 70.0, 75.0, 68.0]
1  Bag size:  [70, 73, 64, 66, 66, 75, 72, 70, 75, 68]
1  Accuracy:  [92.9, 80.8, 93.8, 89.4, 86.4, 77.3, 83.3, 82.9, 92.0, 83.8]
2  Bag size:  [80, 83, 74, 76, 76, 85, 82, 80, 85, 78]
2  Accuracy:  [88.8, 80.7, 91.9, 84.2, 78.9, 75.3, 81.7, 78.8, 88.2, 84.6]
3  Bag size:  [90, 93, 84, 86, 86, 95, 92, 90, 95, 88]
3  Accuracy:  [90.0, 88.2, 86.9, 91.9, 86.0, 80.0, 76.1, 76.7, 88.4, 84.1]
4  Bag size:  [100, 103, 94, 96, 96, 105, 102, 100, 105, 98]
4  Accuracy:  [87.0, 77.7, 85.1, 87.5, 85.4, 77.1, 81.4, 79.0, 85.7, 82.7]
5  Bag size:  [110, 113, 104, 106, 106, 115, 112, 110, 115, 108]
5  Accuracy:  [85.5, 83.2, 84.6, 85.8, 82.1, 78.3, 81.2, 75.5, 83.5, 81.5]
6  Bag size:  [120, 123, 114, 116, 116, 125, 122, 120, 125, 118]
6  Accuracy:  [84.2, 81.3, 82.5, 84.5, 82.8, 76.8, 83.6, 75.8, 80.0, 81.4]
7  Bag size:  [130, 133, 124,

59  Bag size:  [570, 456, 570, 465, 523, 501, 477, 595, 491, 556]
59  Accuracy:  [78.2, 89.7, 77.2, 84.3, 78.0, 82.8, 84.3, 76.5, 83.9, 77.7]
60  Bag size:  [580, 466, 580, 475, 533, 511, 487, 605, 501, 566]
60  Accuracy:  [78.3, 90.1, 76.4, 83.6, 78.0, 81.8, 83.6, 77.2, 84.2, 76.7]
61  Bag size:  [590, 476, 590, 485, 543, 521, 497, 615, 511, 576]
61  Accuracy:  [77.1, 89.9, 76.4, 84.3, 77.9, 81.2, 82.7, 78.2, 83.6, 76.6]
62  Bag size:  [600, 486, 600, 495, 553, 531, 507, 625, 521, 586]
62  Accuracy:  [77.2, 87.4, 76.2, 83.6, 78.7, 81.0, 83.6, 78.4, 82.7, 77.5]
63  Bag size:  [610, 496, 610, 505, 563, 541, 517, 635, 531, 596]
63  Accuracy:  [77.7, 88.1, 76.7, 82.2, 77.8, 81.0, 83.0, 77.8, 81.4, 76.8]
64  Bag size:  [620, 506, 620, 515, 573, 551, 527, 645, 541, 606]
64  Accuracy:  [77.3, 87.9, 77.3, 81.0, 78.0, 81.1, 82.5, 76.4, 82.4, 77.7]
65  Bag size:  [630, 516, 630, 525, 583, 561, 537, 655, 551, 616]
65  Accuracy:  [77.6, 87.4, 74.8, 82.3, 77.5, 80.9, 82.1, 77.4, 81.1, 77.9]
66  Ba

117  Bag size:  [904, 1036, 981, 777, 889, 828, 1057, 908, 825, 934]
117  Accuracy:  [84.6, 76.7, 78.4, 94.9, 85.8, 89.3, 77.0, 87.0, 89.3, 81.7]
118  Bag size:  [914, 1046, 991, 787, 899, 838, 1067, 918, 835, 944]
118  Accuracy:  [84.9, 76.0, 78.8, 95.0, 85.5, 88.3, 77.0, 86.6, 89.2, 81.9]
119  Bag size:  [924, 1056, 1001, 797, 909, 848, 1077, 928, 845, 954]
119  Accuracy:  [83.2, 76.8, 78.6, 94.6, 85.1, 88.2, 77.2, 86.5, 88.9, 81.2]
120  Bag size:  [934, 1066, 1011, 807, 919, 858, 1087, 938, 855, 964]
120  Accuracy:  [84.6, 76.5, 78.3, 93.8, 85.1, 88.0, 77.2, 86.1, 88.7, 81.0]
121  Bag size:  [944, 1076, 1021, 817, 929, 868, 1097, 948, 865, 974]
121  Accuracy:  [84.1, 76.4, 78.5, 93.4, 84.8, 87.7, 77.1, 84.7, 88.4, 81.1]
122  Bag size:  [954, 1086, 1031, 827, 939, 878, 1107, 958, 875, 984]
122  Accuracy:  [83.9, 76.6, 78.1, 92.9, 83.7, 87.5, 77.1, 84.6, 88.7, 80.8]
123  Bag size:  [964, 1096, 1041, 837, 949, 888, 1117, 968, 885, 994]
123  Accuracy:  [83.4, 75.9, 78.1, 92.2, 83.9, 87.

172  Bag size:  [1454, 1279, 1159, 1327, 1439, 1378, 1607, 1458, 1375, 1484]
172  Accuracy:  [77.1, 81.6, 89.5, 77.8, 77.4, 77.4, 75.7, 77.5, 79.3, 78.2]
173  Bag size:  [1464, 1289, 1169, 1337, 1449, 1388, 1617, 1468, 1385, 1494]
173  Accuracy:  [76.6, 81.8, 89.2, 78.5, 77.3, 77.4, 75.6, 77.5, 78.9, 77.7]
174  Bag size:  [1474, 1299, 1179, 1347, 1459, 1398, 1627, 1478, 1395, 1504]
174  Accuracy:  [76.9, 81.4, 88.9, 78.8, 77.2, 78.2, 75.3, 77.7, 79.4, 77.7]
175  Bag size:  [1484, 1309, 1189, 1357, 1469, 1408, 1637, 1488, 1405, 1514]
175  Accuracy:  [77.1, 81.5, 87.8, 78.9, 77.3, 78.2, 75.3, 77.8, 78.9, 77.6]
176  Bag size:  [1494, 1319, 1199, 1367, 1479, 1418, 1647, 1498, 1415, 1524]
176  Accuracy:  [76.9, 81.7, 87.6, 79.2, 77.1, 78.6, 75.2, 77.9, 79.4, 77.2]
177  Bag size:  [1504, 1329, 1209, 1377, 1489, 1428, 1657, 1508, 1425, 1534]
177  Accuracy:  [76.5, 81.3, 87.8, 78.5, 77.4, 78.0, 74.8, 77.4, 79.4, 77.6]
178  Bag size:  [1514, 1339, 1219, 1387, 1499, 1438, 1240, 1518, 1435, 1544]

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
