In [32]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, matthews_corrcoef\
, precision_score, recall_score, f1_score
import time
import math

In [33]:
import os
import sys

# Directory that holds the project-local AdaBoost implementations imported below.
# Set the ADA_BOOST_CODE_DIR environment variable to point at another checkout;
# the fallback is the location this notebook was originally run from.
new_path = os.environ.get(
    "ADA_BOOST_CODE_DIR",
    "d:/Projects-my/ml/Ml-hse-final/ada-boost-standard/code-python/",
)
# Re-running this cell must not keep appending the same entry to sys.path.
if new_path not in sys.path:
    sys.path.append(new_path)

from ada_boost_standard_v1 import AdaBoostStandardClassifier_v1
from ada_boost_standard_v2 import AdaBoostStandardClassifier_v2
from ada_boost_linear_v1 import AdaBoostLinear_v1

In [34]:
def correct_Y(y_data):
    """Return a new list in which every label equal to 0 is replaced by -1.

    All other labels are passed through unchanged, so {0, 1} labels become {-1, 1}.
    """
    remapped = []
    for label in y_data:
        remapped.append(-1 if label == 0 else label)
    return remapped

def uncorrect_Y(y_data):
    """Return a new list in which every label equal to -1 is replaced by 0.

    All other labels are passed through unchanged, so {-1, 1} labels become {0, 1}.
    """
    def to_zero_one(label):
        if label == -1:
            return 0
        return label

    return list(map(to_zero_one, y_data))

def compute_metrics(y_test, y_pred):
    """Print six classification scores for the true labels `y_test` and predictions `y_pred`.

    The scores are printed one per line with five decimals, in this order: accuracy,
    F-score, precision, recall, balanced accuracy and Matthews correlation coefficient.
    Nothing is returned.
    """
    report = (
        ('Accuracy: {:.5f}', accuracy_score),
        ('F-score: {:.5f}', f1_score),
        ('Precision: {:.5f}', precision_score),
        ('Recall: {:.5f}', recall_score),
        ('Accuracy (balanced): {:.5f}', balanced_accuracy_score),
        ('MCC: {:.5f}', matthews_corrcoef),
    )
    for template, scorer in report:
        print(template.format(scorer(y_test, y_pred)))

def compute_confusion_matrix(y_test, y_pred, labels=(1, -1)):
    """Print the summary metrics and return the confusion matrix as a captioned DataFrame.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    labels : sequence, default (1, -1)
        Class labels, in the order they should appear in the table. The captions are
        built from however many labels are passed (the metrics printed by
        compute_metrics are still the binary ones).

    Returns
    -------
    pandas.DataFrame
        Rows are predictions ('a(x) = <label>'), columns are true labels ('y = <label>').
    """
    # Immutable default + local copy: the shared default object can never be mutated.
    labels = list(labels)
    compute_metrics(y_test, y_pred)
    counts = confusion_matrix(y_test, y_pred, labels=labels)
    table = pd.DataFrame(
        counts,
        columns=['a(x) = ' + str(label) for label in labels],
        index=['y = ' + str(label) for label in labels],
    )
    # confusion_matrix puts true labels on rows; transpose so that predictions are the rows.
    return table.T

In [52]:
# Normal vector of the separating hyperplane through the origin, one weight per feature
# (a 4x1 column). Built with a comprehension so the four rows are independent lists.
normal = [[1/2] for _ in range(4)]

def get_separable_y(x_data):
    """Label each row of `x_data` by the side of the hyperplane `normal` it lies on.

    Parameters
    ----------
    x_data : array-like with 4 features per row
        Feature matrix; each row is projected onto `normal`.

    Returns
    -------
    numpy.ndarray
        +1.0 for rows with a non-negative projection and -1.0 for a negative one.
    """
    side = np.sign(np.squeeze(np.matmul(x_data, normal)))
    # np.sign yields 0 for a point lying exactly on the hyperplane, which would add a
    # third class; assign such points to +1 so the labels stay binary.
    return np.where(side == 0, 1.0, side)

#### Experiment with separable data: feature 1 (counting from 0) alone yields an error-free decision-stump classifier

In [36]:
# Toy data set: six samples with four features each.
# Column 1 alone separates the classes with a single threshold: every label-1 row
# (rows 2-4) has a value >= 0.99958558, every label-0 row a value <= 0.82768797.
X_separable1 = np.array([[ 0.6476239 , -0.81753611, -1.61389785, -0.21274028],
       [-2.3748206 ,  0.82768797, -0.38732682, -0.30230275],
       [ 1.51783379,  1.22140561, -0.51080514, -1.18063218],
       [-0.98740462,  0.99958558, -1.70627019,  1.9507754 ],
       [-1.43411205,  1.50037656, -1.04855297, -1.42001794],
       [ 0.29484027, -0.79249401, -1.25279536,  0.77749036]])

# Labels given as 0/1 and converted to -1/+1 by correct_Y.
y_separable1 = correct_Y(np.array([0, 0, 1, 1, 1, 0]))


In [37]:
# Fit the v2 booster on the toy set; with trace=True, fit returns a (result, history) pair.
clf = AdaBoostStandardClassifier_v2(n_estimators=10)
result, history = clf.fit(X_separable1, y_separable1, trace=True)
# Show the fit outcome and the length of the returned history.
print(result, len(history))


error_free_classifier_found 3


In [38]:
# Evaluate on the same six points the model was fitted on (the toy set has no test split).
y_pred_separable1 = clf.predict(X_separable1)
compute_confusion_matrix(y_separable1, y_pred_separable1, labels=[1, -1])

Accuracy: 1.00000
F-score: 1.00000
Precision: 1.00000
Recall: 1.00000
Accuracy (balanced): 1.00000
MCC: 1.00000


Unnamed: 0,y = 1,y = -1
a(x) = 1,3,0
a(x) = -1,0,3


#### Experiment with 1000 samples data set

In [53]:
# Synthetic data set: 1000 samples, 4 features, fixed random_state for reproducibility.
X1000, y1000 = make_classification(n_samples=1000, n_features=4,
                            n_informative=2, n_redundant=0,
                            random_state=0, shuffle=False)

# Hold out 33% of the samples for testing.
X_train1000, X_test1000, y_train1000, y_test1000 = train_test_split(X1000, y1000, test_size=0.33, random_state=42)
# make_classification labels with 0 mapped to -1.
y_train1000pm, y_test1000pm = correct_Y(y_train1000), correct_Y(y_test1000)
# Labels recomputed from the features by get_separable_y (side of the `normal` hyperplane);
# these are the targets used by the 1000-sample experiments below.
y_train1000_sep, y_test1000_sep = get_separable_y(X_train1000), get_separable_y(X_test1000)
# The same recomputed labels with -1 mapped back to 0, for the sklearn classifier.
y_train1000cr_sep, y_test1000cr_sep = uncorrect_Y(y_train1000_sep), uncorrect_Y(y_test1000_sep)

##### Classifier AdaBoostLinear v1

In [54]:
# Time the fit of the linear variant on the 1000-sample training split.
start = time.time()
clf1000_linear = AdaBoostLinear_v1()
# fit returns a (result, message) pair; the meaning of allow_nonseparable is defined in
# ada_boost_linear_v1, which is not visible in this notebook.
result, message = clf1000_linear.fit(X_train1000, y_train1000_sep, allow_nonseparable=True)
end = time.time()
print("Elapsed time: {}".format(end - start))
print(result, message)


Elapsed time: 5.625959396362305
True Optimization terminated successfully.


In [55]:
# Score the linear variant on the held-out split against the recomputed (-1/+1) labels.
y_pred1000_linear = clf1000_linear.predict(X_test1000)
compute_confusion_matrix(y_test1000_sep, y_pred1000_linear, labels=[-1, 1])

Accuracy: 0.95758
F-score: 0.95652
Precision: 0.95062
Recall: 0.96250
Accuracy (balanced): 0.95772
MCC: 0.91517


Unnamed: 0,y = -1,y = 1
a(x) = -1,162,6
a(x) = 1,8,154


##### Classifier AdaBoostStandard v1

In [56]:
# Time the fit of the v1 booster (10 estimators) on the 1000-sample training split.
start = time.time()

clf1000v1 = AdaBoostStandardClassifier_v1(n_estimators=10)
# With trace=True, fit returns a (result, history) pair; history is indexed by 'error' below.
result, history = clf1000v1.fit(X_train1000, y_train1000_sep, trace=True)
end = time.time()

print("Elapsed time: {}".format(end - start))
print(result, len(history['error']))
# Last expression of the cell: display the recorded 'error' trace
# (presumably one value per boosting round; confirm in ada_boost_standard_v1).
history['error']

Elapsed time: 41.538973331451416
iterations_exceeded 10


[0.2701492537313426,
 0.25732411391171034,
 0.2809114206886934,
 0.2730613971489954,
 0.29492723603788906,
 0.3004946216510261,
 0.3329447638014405,
 0.3157330718374596,
 0.373847452482741,
 0.3347125743490683]

In [57]:
# Score the v1 booster on the held-out split against the recomputed (-1/+1) labels.
y_pred1000v1 = clf1000v1.predict(X_test1000)
compute_confusion_matrix(y_test1000_sep, y_pred1000v1, labels=[-1, 1])

Accuracy: 0.88182
F-score: 0.87540
Precision: 0.89542
Recall: 0.85625
Accuracy (balanced): 0.88107
MCC: 0.76381


Unnamed: 0,y = -1,y = 1
a(x) = -1,154,23
a(x) = 1,16,137


##### Classifier AdaBoostStandard v2

In [60]:
# Time the fit of the v2 booster (10 estimators) on the 1000-sample training split,
# using the same data and settings as the v1 run so the two can be compared.
start = time.time()

clf1000v2 = AdaBoostStandardClassifier_v2(n_estimators=10)
# With trace=True, fit returns a (result, history) pair; history is indexed by 'error' below.
result, history = clf1000v2.fit(X_train1000, y_train1000_sep, trace=True)
end = time.time()

print("Elapsed time: {}".format(end - start))
print(result, len(history['error']))
# Last expression of the cell: display the recorded 'error' trace
# (presumably one value per boosting round; confirm in ada_boost_standard_v2).
history['error']

Elapsed time: 25.20303463935852
iterations_exceeded 10


[0.2701492537313426,
 0.2573241139117104,
 0.2809114206886942,
 0.2738758980725727,
 0.2947892754929494,
 0.3000387927741812,
 0.33236305572071617,
 0.31551586906089574,
 0.3735755658462361,
 0.3191962277803047]

In [63]:
# Score the v2 booster on the held-out split against the recomputed (-1/+1) labels.
y_pred1000v2 = clf1000v2.predict(X_test1000)
compute_confusion_matrix(y_test1000_sep, y_pred1000v2, labels=[-1, 1])

Accuracy: 0.89394
F-score: 0.89362
Precision: 0.86982
Recall: 0.91875
Accuracy (balanced): 0.89467
MCC: 0.78921


Unnamed: 0,y = -1,y = 1
a(x) = -1,148,13
a(x) = 1,22,147


##### Classifier sklearn.ensemble.AdaBoostClassifier

In [69]:
# Reference run with scikit-learn's AdaBoostClassifier on the 0/1 version of the recomputed labels.
# NOTE(review): this uses n_estimators=1000, whereas the custom classifiers above and the
# 10000-sample sklearn run use 10 - confirm the comparison is intended.
clf1000std = AdaBoostClassifier(n_estimators=1000, random_state=0)
clf1000std.fit(X_train1000, y_train1000cr_sep)
y_pred1000std = clf1000std.predict(X_test1000)
compute_confusion_matrix(y_test1000cr_sep, y_pred1000std, labels=[0, 1])

Accuracy: 0.97576
F-score: 0.97516
Precision: 0.96914
Recall: 0.98125
Accuracy (balanced): 0.97592
MCC: 0.95156


Unnamed: 0,y = 0,y = 1
a(x) = 0,165,3
a(x) = 1,5,157


#### Experiment with 10000 samples data set

In [71]:
# Synthetic data set: 10000 samples, 4 features, fixed random_state for reproducibility.
X104, y104 = make_classification(n_samples=10000, n_features=4,
                            n_informative=2, n_redundant=0,
                            random_state=0, shuffle=False)

# Hold out 33% of the samples for testing.
X_train104, X_test104, y_train104, y_test104 = train_test_split(X104, y104, test_size=0.33, random_state=42)
# make_classification labels with 0 mapped to -1.
y_train104pm, y_test104pm = correct_Y(y_train104), correct_Y(y_test104)
# Labels recomputed from the features by get_separable_y (side of the `normal` hyperplane).
y_train104_sep, y_test104_sep = get_separable_y(X_train104), get_separable_y(X_test104)
# The same recomputed labels with -1 mapped back to 0, for the sklearn classifier.
y_train104cr_sep, y_test104cr_sep = uncorrect_Y(y_train104_sep), uncorrect_Y(y_test104_sep)

##### Classifier sklearn.ensemble.AdaBoostClassifier

In [72]:
# Reference run with scikit-learn's AdaBoostClassifier (10 estimators) on the 0/1 version
# of the recomputed labels for the 10000-sample set.
clf104std = AdaBoostClassifier(n_estimators=10, random_state=0)
clf104std.fit(X_train104, y_train104cr_sep)
y_pred104std = clf104std.predict(X_test104)
compute_confusion_matrix(y_test104cr_sep, y_pred104std, labels=[1, 0])

Accuracy: 0.88485
F-score: 0.88110
Precision: 0.87781
Recall: 0.88442
Accuracy (balanced): 0.88483
MCC: 0.76949


Unnamed: 0,y = 1,y = 0
a(x) = 1,1408,196
a(x) = 0,184,1512


##### Classifier AdaBoostLinear v1

In [73]:
# Time the fit of the linear variant on the 10000-sample training split.
start = time.time()
clf104_linear = AdaBoostLinear_v1()
# fit returns a (result, message) pair; the meaning of allow_nonseparable is defined in
# ada_boost_linear_v1, which is not visible in this notebook.
result, message = clf104_linear.fit(X_train104, y_train104_sep, allow_nonseparable=True)
end = time.time()
print("Elapsed time: {}".format(end - start))
print(result, message)


Elapsed time: 2065.104259490967
True Optimization terminated successfully.


In [74]:
# Score the linear variant on the held-out split against the recomputed (-1/+1) labels.
y_pred104_linear = clf104_linear.predict(X_test104)
compute_confusion_matrix(y_test104_sep, y_pred104_linear, labels=[-1, 1])

Accuracy: 0.98485
F-score: 0.98430
Precision: 0.98430
Recall: 0.98430
Accuracy (balanced): 0.98483
MCC: 0.96966


Unnamed: 0,y = -1,y = 1
a(x) = -1,1683,25
a(x) = 1,25,1567


##### Classifier AdaBoostStandard v2

In [51]:
start = time.time()
clf1000v2 = AdaBoostStandardClassifier_v2(n_estimators=10)
result, history = clf1000v2.fit(X_train104, y_train104pm, trace=True)
end = time.time()

print("Elapsed time: {}".format(end - start))
print(result, len(history['error']))

Elapsed time: 2495.8133506774902
iterations_exceeded 10


In [53]:
y_pred104v2 = clf1000v2.predict(X_test104)
compute_confusion_matrix(y_test104pm, y_pred104v2, labels=[1, -1])


Accuracy: 0.82576
F-score: 0.82634
Precision: 0.82559
Recall: 0.82709
Accuracy (balanced): 0.82575
MCC: 0.65151


Unnamed: 0,y = 1,y = -1
a(x) = 1,1368,289
a(x) = -1,286,1357
