# Import the Database

In [1]:
# set memory growth (necessary for training)
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        print(e)

In [4]:
import pandas as pd

df = pd.read_csv('./cicy3o_tidy.csv')

# remove the outliers completely
df = df.loc[(df['h11'] >= 1) &
            (df['h11'] <= 16) &
            (df['h21'] >= 15) &
            (df['h21'] <= 86)
           ].reset_index(drop=True)

# select the matrix to compute the PCA
mat = df.filter(regex='^matrix_')

h11    = df['h11'].values.reshape(-1,)
h21    = df['h21'].values.reshape(-1,)

# Train Test Split

In [9]:
RAND = 42

In [10]:
from sklearn.model_selection import train_test_split

mat_train_80, mat_test_80, \
h11_train_80, h11_test_80, \
h21_train_80, h21_test_80 = train_test_split(mat, h11, h21,
                                             train_size=0.8, shuffle=True, random_state=RAND)

mat_train_30, mat_test_30, \
h11_train_30, h11_test_30, \
h21_train_30, h21_test_30 = train_test_split(mat, h11, h21,
                                             train_size=0.3, shuffle=True, random_state=RAND)

# Linear Regression

In [12]:
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score
import numpy as np

est_h11_80 = Lasso(alpha=2.0e-6, fit_intercept=False, max_iter=1e6, random_state=RAND)
est_h21_80 = Lasso(alpha=1.0e-6, fit_intercept=True, normalize=False, max_iter=1e6, random_state=RAND)
est_h11_30 = Lasso(alpha=2.0e-6, fit_intercept=False, max_iter=1e6, random_state=RAND)
est_h21_30 = Lasso(alpha=1.0e-6, fit_intercept=True, normalize=False, max_iter=1e6, random_state=RAND)

est_h11_80.fit(mat_train_80, h11_train_80)
est_h21_80.fit(mat_train_80, h21_train_80)
est_h11_30.fit(mat_train_30, h11_train_30)
est_h21_30.fit(mat_train_30, h21_train_30)

h11_pred_80 = np.floor(est_h11_80.predict(mat_test_80)).astype(int)
h21_pred_80 = np.floor(est_h21_80.predict(mat_test_80)).astype(int)
h11_pred_30 = np.floor(est_h11_30.predict(mat_test_30)).astype(int)
h21_pred_30 = np.floor(est_h21_30.predict(mat_test_30)).astype(int)

print('\nLINEAR REGRESSION')
print('80% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_80, h11_pred_80)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_80, h21_pred_80)))
print('')
print('30% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_30, h11_pred_30)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_30, h21_pred_30)))


LINEAR REGRESSION
80% training data:
    h11 accuracy: 0.511
    h21 accuracy: 0.117

30% training data:
    h11 accuracy: 0.490
    h21 accuracy: 0.103


# Linear SVM

In [14]:
from sklearn.svm import LinearSVR
from sklearn.metrics import accuracy_score
import numpy as np

est_h11_80 = LinearSVR(C=0.13, epsilon=0.7, fit_intercept=True, intercept_scaling=0.13, loss='epsilon_insensitive', max_iter=1e6, random_state=RAND)
est_h21_80 = LinearSVR(C=0.30, epsilon=0.0, fit_intercept=True, intercept_scaling=100, loss='epsilon_insensitive', max_iter=1e6, random_state=RAND)
est_h11_30 = LinearSVR(C=0.13, epsilon=0.7, fit_intercept=True, intercept_scaling=0.13, loss='epsilon_insensitive', max_iter=1e6, random_state=RAND)
est_h21_30 = LinearSVR(C=0.30, epsilon=0.0, fit_intercept=True, intercept_scaling=100, loss='epsilon_insensitive', max_iter=1e6, random_state=RAND)

est_h11_80.fit(mat_train_80, h11_train_80)
est_h21_80.fit(mat_train_80, h21_train_80)
est_h11_30.fit(mat_train_30, h11_train_30)
est_h21_30.fit(mat_train_30, h21_train_30)

h11_pred_80 = np.floor(est_h11_80.predict(mat_test_80)).astype(int)
h21_pred_80 = np.floor(est_h21_80.predict(mat_test_80)).astype(int)
h11_pred_30 = np.floor(est_h11_30.predict(mat_test_30)).astype(int)
h21_pred_30 = np.floor(est_h21_30.predict(mat_test_30)).astype(int)

print('\nLINEAR SVM')
print('80% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_80, h11_pred_80)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_80, h21_pred_80)))
print('')
print('30% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_30, h11_pred_30)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_30, h21_pred_30)))


LINEAR SVM
80% training data:
    h11 accuracy: 0.612
    h21 accuracy: 0.108

30% training data:
    h11 accuracy: 0.607
    h21 accuracy: 0.103


# Gaussian SVM

In [15]:
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
import numpy as np

est_h11_80 = SVR(C=14, epsilon=0.01, gamma=0.03)
est_h21_80 = SVR(C=40, epsilon=0.01, gamma=0.06)
est_h11_30 = SVR(C=14, epsilon=0.01, gamma=0.03)
est_h21_30 = SVR(C=40, epsilon=0.01, gamma=0.06)

est_h11_80.fit(mat_train_80, h11_train_80)
est_h21_80.fit(mat_train_80, h21_train_80)
est_h11_30.fit(mat_train_30, h11_train_30)
est_h21_30.fit(mat_train_30, h21_train_30)

h11_pred_80 = np.rint(est_h11_80.predict(mat_test_80)).astype(int)
h21_pred_80 = np.rint(est_h21_80.predict(mat_test_80)).astype(int)
h11_pred_30 = np.rint(est_h11_30.predict(mat_test_30)).astype(int)
h21_pred_30 = np.rint(est_h21_30.predict(mat_test_30)).astype(int)

print('\nGAUSSIAN SVM')
print('80% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_80, h11_pred_80)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_80, h21_pred_80)))
print('')
print('30% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_30, h11_pred_30)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_30, h21_pred_30)))


GAUSSIAN SVM
80% training data:
    h11 accuracy: 0.700
    h21 accuracy: 0.222

30% training data:
    h11 accuracy: 0.576
    h21 accuracy: 0.161


# Random Forest

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
import numpy as np

est_h11_80 = RandomForestRegressor(criterion='mse',
                                   max_depth=100,
                                   max_leaf_nodes=100,
                                   min_samples_leaf=1,
                                   min_samples_split=2,
                                   min_weight_fraction_leaf=0.0,
                                   n_estimators=10,
                                   random_state=RAND,
                                   n_jobs=-1
                                  )
est_h21_80 = RandomForestRegressor(criterion='mae',
                                   max_depth=90,
                                   max_leaf_nodes=90,
                                   min_samples_leaf=30,
                                   min_samples_split=30,
                                   min_weight_fraction_leaf=3.0e-4,
                                   n_estimators=190,
                                   random_state=RAND,
                                   n_jobs=-1
                                  )
est_h11_30 = RandomForestRegressor(criterion='mse',
                                   max_depth=100,
                                   max_leaf_nodes=100,
                                   min_samples_leaf=1,
                                   min_samples_split=2,
                                   min_weight_fraction_leaf=0.0,
                                   n_estimators=10,
                                   random_state=RAND,
                                   n_jobs=-1
                                  )
est_h21_30 = RandomForestRegressor(criterion='mae',
                                   max_depth=90,
                                   max_leaf_nodes=90,
                                   min_samples_leaf=3,
                                   min_samples_split=30,
                                   min_weight_fraction_leaf=3.0e-4,
                                   n_estimators=190,
                                   random_state=RAND,
                                   n_jobs=-1
                                  )

est_h11_80.fit(mat_train_80, h11_train_80)
est_h21_80.fit(mat_train_80, h21_train_80)
est_h11_30.fit(mat_train_30, h11_train_30)
est_h21_30.fit(mat_train_30, h21_train_30)

h11_pred_80 = np.floor(est_h11_80.predict(mat_test_80)).astype(int)
h21_pred_80 = np.floor(est_h21_80.predict(mat_test_80)).astype(int)
h11_pred_30 = np.floor(est_h11_30.predict(mat_test_30)).astype(int)
h21_pred_30 = np.floor(est_h21_30.predict(mat_test_30)).astype(int)

print('\nRANDOM FOREST')
print('80% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_80, h11_pred_80)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_80, h21_pred_80)))
print('')
print('30% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_30, h11_pred_30)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_30, h21_pred_30)))


RANDOM FOREST
80% training data:
    h11 accuracy: 0.560
    h21 accuracy: 0.104

30% training data:
    h11 accuracy: 0.540
    h21 accuracy: 0.114


# Gradient Boosting (eng. feat.)

In [17]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score
import numpy as np

est_h11_80 = GradientBoostingRegressor(criterion='mae',
                                       learning_rate=0.3,
                                       loss='huber',
                                       alpha=0.4,
                                       max_depth=100,
                                       min_samples_split=2,
                                       min_weight_fraction_leaf=0.03,
                                       n_estimators=90,
                                       subsample=0.8,
                                       random_state=RAND
                                      )
est_h21_80 = GradientBoostingRegressor(criterion='mae',
                                       learning_rate=0.6,
                                       loss='ls',
                                       max_depth=85,
                                       min_samples_split=30,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100,
                                       subsample=0.7,
                                       random_state=RAND
                                      )
est_h11_30 = GradientBoostingRegressor(criterion='mae',
                                       learning_rate=0.3,
                                       loss='huber',
                                       alpha=0.4,
                                       max_depth=100,
                                       min_samples_split=2,
                                       min_weight_fraction_leaf=0.03,
                                       n_estimators=90,
                                       subsample=0.8,
                                       random_state=RAND
                                      )
est_h21_30 = GradientBoostingRegressor(criterion='mae',
                                       learning_rate=0.6,
                                       loss='ls',
                                       max_depth=85,
                                       min_samples_split=30,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100,
                                       subsample=0.7,
                                       random_state=RAND
                                      )

est_h11_80.fit(mat_train_80, h11_train_80)
est_h21_80.fit(mat_train_80, h21_train_80)
est_h11_30.fit(mat_train_30, h11_train_30)
est_h21_30.fit(mat_train_30, h21_train_30)

h11_pred_80 = np.floor(est_h11_80.predict(mat_test_80)).astype(int)
h21_pred_80 = np.floor(est_h21_80.predict(mat_test_80)).astype(int)
h11_pred_30 = np.floor(est_h11_30.predict(mat_test_30)).astype(int)
h21_pred_30 = np.floor(est_h21_30.predict(mat_test_30)).astype(int)

print('\nGRADIENT BOOSTING')
print('80% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_80, h11_pred_80)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_80, h21_pred_80)))
print('')
print('30% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_30, h11_pred_30)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_30, h21_pred_30)))


GRADIENT BOOSTING
80% training data:
    h11 accuracy: 0.495
    h21 accuracy: 0.115

30% training data:
    h11 accuracy: 0.461
    h21 accuracy: 0.103
