# Import the Database

In [1]:
# Enable memory growth on every visible GPU so that TensorFlow allocates
# device memory on demand (necessary for training).
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        # Memory growth must be configured before the GPUs are initialised,
        # so every physical device is configured first.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Query the logical devices only AFTER the loop: listing them
        # initialises the runtime, and doing it inside the loop made
        # set_memory_growth() raise RuntimeError for every GPU but the first.
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    except RuntimeError as e:
        # Raised when the devices have already been initialised (for example
        # when this cell is re-run in a live kernel); report it and continue.
        print(e)

In [2]:
import pandas as pd

# Read the tidy dataset from the working directory.
df = pd.read_csv('./cicy3o_tidy.csv')

# Remove the outliers completely: keep only rows with 1 <= h11 <= 16 and
# 15 <= h21 <= 86 (both bounds inclusive), then renumber the rows from zero.
in_h11_range = df['h11'].between(1, 16)
in_h21_range = df['h21'].between(15, 86)
df = df[in_h11_range & in_h21_range].reset_index(drop=True)

# Columns whose name starts with 'matrix_' are the input of the PCA.
mat = df.filter(regex='^matrix_')

# Compute the PCA

In [3]:
from sklearn.decomposition import PCA

# Project the matrix columns onto the principal components that together
# explain 99% of the variance.
# NOTE(review): the projection is fitted on every row, i.e. before the
# train/test split performed later in this notebook, so held-out rows
# contribute to the components -- confirm this is acceptable.
pca_values = PCA(n_components=0.99).fit_transform(mat)
col = ['pca_{}'.format(n) for n in range(pca_values.shape[1])]

pca = pd.DataFrame(pca_values, columns=col)

# Drop PCA columns left over from a previous run of this cell before joining;
# without this a re-run fails because the joined column names overlap.
df = df.drop(columns=col, errors='ignore').join(pca)

In [10]:
# Rescale the matrix table by its single largest entry (the maximum over all
# columns); the divisor stays available as `max_entry`.
max_entry = mat.max().max()
mat = mat.div(max_entry)

# Select Input Features

In [3]:
# Column-name patterns of the input features.  The h21 sets additionally
# include the columns starting with 'dim_h0'; the "nopca" sets leave out the
# principal components.
pattern_h11 = '^num_cp$|^dim_cp|^pca'
pattern_h21 = '^num_cp$|^dim_cp|^dim_h0|^pca'
pattern_nopca_h11 = '^num_cp$|^dim_cp'
pattern_nopca_h21 = '^num_cp$|^dim_cp|^dim_h0'

# Feature matrices as plain NumPy arrays.
df_h11 = df.filter(regex=pattern_h11).values
df_h21 = df.filter(regex=pattern_h21).values

df_nopca_h11 = df.filter(regex=pattern_nopca_h11).values
df_nopca_h21 = df.filter(regex=pattern_nopca_h21).values

# Regression targets as flat one-dimensional arrays.
h11 = df['h11'].values.reshape(-1)
h21 = df['h21'].values.reshape(-1)

# Train Test Split

In [4]:
# Seed passed as random_state to the splits and estimators below so that
# repeated runs give the same results.
RAND = 42

In [52]:
from sklearn.model_selection import train_test_split

# All feature sets, the matrices and both targets go through the same call so
# that their rows stay aligned after shuffling.
_split_inputs = (df_h11, df_h21, df_nopca_h11, df_nopca_h21, mat, h11, h21)

# 80% training / 20% hold-out.
(df_h11_train_80, df_h11_test_80,
 df_h21_train_80, df_h21_test_80,
 df_nopca_h11_train_80, df_nopca_h11_test_80,
 df_nopca_h21_train_80, df_nopca_h21_test_80,
 mat_train_80, mat_test_80,
 h11_train_80, h11_test_80,
 h21_train_80, h21_test_80) = train_test_split(*_split_inputs,
                                               train_size=0.8,
                                               shuffle=True,
                                               random_state=RAND)

# 30% training / 70% hold-out.
(df_h11_train_30, df_h11_test_30,
 df_h21_train_30, df_h21_test_30,
 df_nopca_h11_train_30, df_nopca_h11_test_30,
 df_nopca_h21_train_30, df_nopca_h21_test_30,
 mat_train_30, mat_test_30,
 h11_train_30, h11_test_30,
 h21_train_30, h21_test_30) = train_test_split(*_split_inputs,
                                               train_size=0.3,
                                               shuffle=True,
                                               random_state=RAND)

# Carve a validation set for the neural networks out of each hold-out:
# 1/2 of the 20% hold-out and 1/7 of the 70% hold-out, i.e. 10% of all rows
# in both cases.  mat_test_* is overwritten with the remaining part, so it
# pairs with h11_cnn_test_* / h21_cnn_test_*, not with h11_test_* / h21_test_*.
(mat_val_80, mat_test_80,
 h11_cnn_val_80, h11_cnn_test_80,
 h21_cnn_val_80, h21_cnn_test_80) = train_test_split(mat_test_80, h11_test_80, h21_test_80,
                                                     train_size=1/2,
                                                     shuffle=True,
                                                     random_state=RAND)

(mat_val_30, mat_test_30,
 h11_cnn_val_30, h11_cnn_test_30,
 h21_cnn_val_30, h21_cnn_test_30) = train_test_split(mat_test_30, h11_test_30, h21_test_30,
                                                     train_size=1/7,
                                                     shuffle=True,
                                                     random_state=RAND)


def _as_images(frame):
    """Reshape a flat matrix table into a (samples, 12, 15, 1) array."""
    return frame.values.reshape(-1, 12, 15, 1)


mat_train_80, mat_val_80, mat_test_80 = (_as_images(part)
                                         for part in (mat_train_80, mat_val_80, mat_test_80))
mat_train_30, mat_val_30, mat_test_30 = (_as_images(part)
                                         for part in (mat_train_30, mat_val_30, mat_test_30))

# Linear Regression (PCA + eng. feat.)

In [6]:
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score
import numpy as np

# Lasso regressors on the PCA + engineered features: one per target (h11, h21)
# and per training fraction (80%, 30%).
# max_iter is given as an integer: scikit-learn documents it as an int, and a
# float such as 1e6 is not guaranteed to be accepted.
# NOTE(review): normalize=True is only understood by older scikit-learn
# releases (the option was deprecated and later removed) -- confirm the pinned
# version before upgrading.
est_h11_80 = Lasso(alpha=0.07, fit_intercept=False, max_iter=1000000, random_state=RAND)
est_h21_80 = Lasso(alpha=2.0e-6, fit_intercept=True, normalize=True, max_iter=1000000, random_state=RAND)
est_h11_30 = Lasso(alpha=0.07, fit_intercept=False, max_iter=1000000, random_state=RAND)
est_h21_30 = Lasso(alpha=2.0e-6, fit_intercept=True, normalize=True, max_iter=1000000, random_state=RAND)

est_h11_80.fit(df_h11_train_80, h11_train_80)
est_h21_80.fit(df_h21_train_80, h21_train_80)
est_h11_30.fit(df_h11_train_30, h11_train_30)
est_h21_30.fit(df_h21_train_30, h21_train_30)

# The real-valued predictions are rounded down to integers so they can be
# compared with the integer targets through accuracy_score.
h11_pred_80 = np.floor(est_h11_80.predict(df_h11_test_80)).astype(int)
h21_pred_80 = np.floor(est_h21_80.predict(df_h21_test_80)).astype(int)
h11_pred_30 = np.floor(est_h11_30.predict(df_h11_test_30)).astype(int)
h21_pred_30 = np.floor(est_h21_30.predict(df_h21_test_30)).astype(int)

print('\nLINEAR REGRESSION')
print('80% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_80, h11_pred_80)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_80, h21_pred_80)))
print('')
print('30% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_30, h11_pred_30)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_30, h21_pred_30)))

80% training data:
    h11 accuracy: 0.637
    h21 accuracy: 0.202

30% training data:
    h11 accuracy: 0.625
    h21 accuracy: 0.193


# Linear SVM (eng. feat.)

In [17]:
from sklearn.svm import LinearSVR
from sklearn.metrics import accuracy_score
import numpy as np

# Linear support-vector regressors on the engineered features only (no PCA):
# one per target (h11, h21) and per training fraction (80%, 30%).
# max_iter is given as an integer: scikit-learn documents it as an int, and a
# float such as 1e6 is not guaranteed to be accepted.
est_h11_80 = LinearSVR(C=0.13, epsilon=0.9, fit_intercept=True, intercept_scaling=0.01, loss='epsilon_insensitive', max_iter=1000000, random_state=RAND)
est_h21_80 = LinearSVR(C=0.51, epsilon=0.0, fit_intercept=True, intercept_scaling=100, loss='epsilon_insensitive', max_iter=1000000, random_state=RAND)
est_h11_30 = LinearSVR(C=0.13, epsilon=0.9, fit_intercept=True, intercept_scaling=0.01, loss='epsilon_insensitive', max_iter=1000000, random_state=RAND)
est_h21_30 = LinearSVR(C=0.51, epsilon=0.0, fit_intercept=True, intercept_scaling=100, loss='epsilon_insensitive', max_iter=1000000, random_state=RAND)

est_h11_80.fit(df_nopca_h11_train_80, h11_train_80)
est_h21_80.fit(df_nopca_h21_train_80, h21_train_80)
est_h11_30.fit(df_nopca_h11_train_30, h11_train_30)
est_h21_30.fit(df_nopca_h21_train_30, h21_train_30)

# The real-valued predictions are rounded down to integers so they can be
# compared with the integer targets through accuracy_score.
h11_pred_80 = np.floor(est_h11_80.predict(df_nopca_h11_test_80)).astype(int)
h21_pred_80 = np.floor(est_h21_80.predict(df_nopca_h21_test_80)).astype(int)
h11_pred_30 = np.floor(est_h11_30.predict(df_nopca_h11_test_30)).astype(int)
h21_pred_30 = np.floor(est_h21_30.predict(df_nopca_h21_test_30)).astype(int)

print('\nLINEAR SVM')
print('80% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_80, h11_pred_80)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_80, h21_pred_80)))
print('')
print('30% training data:')
print('    h11 accuracy: {:.3f}'.format(accuracy_score(h11_test_30, h11_pred_30)))
print('    h21 accuracy: {:.3f}'.format(accuracy_score(h21_test_30, h21_pred_30)))



80% training data:
    h11 accuracy: 0.630
    h21 accuracy: 0.183

30% training data:
    h11 accuracy: 0.626
    h21 accuracy: 0.185




# Gaussian SVM (PCA + eng. feat.)

In [18]:
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
import numpy as np

# Hyperparameters per target; both training fractions use the same values.
svr_h11_params = dict(C=1.0, epsilon=0.02, gamma=0.02)
svr_h21_params = dict(C=45, epsilon=0.2, gamma=0.013)

est_h11_80 = SVR(**svr_h11_params)
est_h21_80 = SVR(**svr_h21_params)
est_h11_30 = SVR(**svr_h11_params)
est_h21_30 = SVR(**svr_h21_params)

# Train on the PCA + engineered features.
for model, features, target in ((est_h11_80, df_h11_train_80, h11_train_80),
                                (est_h21_80, df_h21_train_80, h21_train_80),
                                (est_h11_30, df_h11_train_30, h11_train_30),
                                (est_h21_30, df_h21_train_30, h21_train_30)):
    model.fit(features, target)

# Predictions are rounded to the nearest integer before scoring.
h11_pred_80, h21_pred_80, h11_pred_30, h21_pred_30 = (
    np.rint(model.predict(features)).astype(int)
    for model, features in ((est_h11_80, df_h11_test_80),
                            (est_h21_80, df_h21_test_80),
                            (est_h11_30, df_h11_test_30),
                            (est_h21_30, df_h21_test_30))
)

report = (('80% training data:', (h11_test_80, h11_pred_80), (h21_test_80, h21_pred_80)),
          ('30% training data:', (h11_test_30, h11_pred_30), (h21_test_30, h21_pred_30)))

print('\nGAUSSIAN SVM')
for position, (title, h11_pair, h21_pair) in enumerate(report):
    if position:
        print('')  # blank line between the two summaries
    print(title)
    print('    h11 accuracy: {:.3f}'.format(accuracy_score(*h11_pair)))
    print('    h21 accuracy: {:.3f}'.format(accuracy_score(*h21_pair)))

80% training data:
    h11 accuracy: 0.716
    h21 accuracy: 0.337

30% training data:
    h11 accuracy: 0.693
    h21 accuracy: 0.272


# Random Forest (PCA + eng. feat.)

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
import numpy as np

# Hyperparameters per target; both training fractions use the same values.
# NOTE(review): criterion='mae' is the spelling used by older scikit-learn
# releases (newer ones call it 'absolute_error') -- confirm against the
# installed version.
rf_h11_params = dict(criterion='mae',
                     max_depth=30,
                     max_leaf_nodes=90,
                     min_samples_leaf=1,
                     min_samples_split=100,
                     min_weight_fraction_leaf=0.0,
                     n_estimators=10,
                     random_state=RAND,
                     n_jobs=-1)
rf_h21_params = dict(criterion='mae',
                     max_depth=100,
                     max_leaf_nodes=100,
                     min_samples_leaf=30,
                     min_samples_split=2,
                     min_weight_fraction_leaf=0.0,
                     n_estimators=300,
                     random_state=RAND,
                     n_jobs=-1)

est_h11_80 = RandomForestRegressor(**rf_h11_params)
est_h21_80 = RandomForestRegressor(**rf_h21_params)
est_h11_30 = RandomForestRegressor(**rf_h11_params)
est_h21_30 = RandomForestRegressor(**rf_h21_params)

# Train on the PCA + engineered features.
for model, features, target in ((est_h11_80, df_h11_train_80, h11_train_80),
                                (est_h21_80, df_h21_train_80, h21_train_80),
                                (est_h11_30, df_h11_train_30, h11_train_30),
                                (est_h21_30, df_h21_train_30, h21_train_30)):
    model.fit(features, target)

# Predictions are rounded down to integers before scoring.
h11_pred_80, h21_pred_80, h11_pred_30, h21_pred_30 = (
    np.floor(model.predict(features)).astype(int)
    for model, features in ((est_h11_80, df_h11_test_80),
                            (est_h21_80, df_h21_test_80),
                            (est_h11_30, df_h11_test_30),
                            (est_h21_30, df_h21_test_30))
)

report = (('80% training data:', (h11_test_80, h11_pred_80), (h21_test_80, h21_pred_80)),
          ('30% training data:', (h11_test_30, h11_pred_30), (h21_test_30, h21_pred_30)))

print('\nRANDOM FOREST')
for position, (title, h11_pair, h21_pair) in enumerate(report):
    if position:
        print('')  # blank line between the two summaries
    print(title)
    print('    h11 accuracy: {:.3f}'.format(accuracy_score(*h11_pair)))
    print('    h21 accuracy: {:.3f}'.format(accuracy_score(*h21_pair)))

80% training data:
    h11 accuracy: 0.627
    h21 accuracy: 0.164

30% training data:
    h11 accuracy: 0.584
    h21 accuracy: 0.144


# Gradient Boosting (eng. feat.)

In [26]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score
import numpy as np

# Hyperparameters per target; both training fractions use the same values.
# NOTE(review): loss='ls' and criterion='mae' are spellings used by older
# scikit-learn releases -- confirm they are still accepted by the installed
# version.
gb_h11_params = dict(criterion='friedman_mse',
                     learning_rate=0.15,
                     loss='ls',
                     max_depth=2,
                     min_samples_split=10,
                     min_weight_fraction_leaf=0.2,
                     n_estimators=100,
                     subsample=0.1,
                     random_state=RAND)
gb_h21_params = dict(criterion='mae',
                     learning_rate=0.04,
                     loss='huber',
                     alpha=0.99,
                     max_depth=35,
                     min_samples_split=2,
                     min_weight_fraction_leaf=0.0,
                     n_estimators=200,
                     subsample=0.1,
                     random_state=RAND)

est_h11_80 = GradientBoostingRegressor(**gb_h11_params)
est_h21_80 = GradientBoostingRegressor(**gb_h21_params)
est_h11_30 = GradientBoostingRegressor(**gb_h11_params)
est_h21_30 = GradientBoostingRegressor(**gb_h21_params)

# Train on the engineered features only (no PCA).
for model, features, target in ((est_h11_80, df_nopca_h11_train_80, h11_train_80),
                                (est_h21_80, df_nopca_h21_train_80, h21_train_80),
                                (est_h11_30, df_nopca_h11_train_30, h11_train_30),
                                (est_h21_30, df_nopca_h21_train_30, h21_train_30)):
    model.fit(features, target)

# Predictions are rounded down to integers before scoring.
h11_pred_80, h21_pred_80, h11_pred_30, h21_pred_30 = (
    np.floor(model.predict(features)).astype(int)
    for model, features in ((est_h11_80, df_nopca_h11_test_80),
                            (est_h21_80, df_nopca_h21_test_80),
                            (est_h11_30, df_nopca_h11_test_30),
                            (est_h21_30, df_nopca_h21_test_30))
)

report = (('80% training data:', (h11_test_80, h11_pred_80), (h21_test_80, h21_pred_80)),
          ('30% training data:', (h11_test_30, h11_pred_30), (h21_test_30, h21_pred_30)))

print('\nGRADIENT BOOSTING')
for position, (title, h11_pair, h21_pair) in enumerate(report):
    if position:
        print('')  # blank line between the two summaries
    print(title)
    print('    h11 accuracy: {:.3f}'.format(accuracy_score(*h11_pair)))
    print('    h21 accuracy: {:.3f}'.format(accuracy_score(*h21_pair)))

80% training data:
    h11 accuracy: 0.569
    h21 accuracy: 0.232

30% training data:
    h11 accuracy: 0.554
    h21 accuracy: 0.201


# ConvNet (matrix)

In [65]:
# Load the saved network for its architecture only: each model below is
# rebuilt from the JSON description, which discards the previous weights.
convnet = tf.keras.models.load_model('./cnn_sequential.h5')


def _fresh_convnet():
    """Rebuild the ConvNet architecture and compile it for MSE regression."""
    model = tf.keras.models.model_from_json(convnet.to_json())
    # A separate Adam optimiser instance is created for every model.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='mse',
                  metrics=['mse', 'mae'])
    return model


# One independent network per target (h11, h21) and training fraction.
convnet_h11_80 = _fresh_convnet()
convnet_h21_80 = _fresh_convnet()
convnet_h11_30 = _fresh_convnet()
convnet_h21_30 = _fresh_convnet()

In [85]:
# Multiply the learning rate by 0.3 when the validation loss has not improved
# for 80 epochs, never going below 1e-6.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                 factor=0.3,
                                                 patience=80,
                                                 verbose=0,
                                                 min_lr=1.0e-6)
# Stop once the validation loss has not improved for 200 epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=200,
                                              verbose=0,
                                              restore_best_weights=True)
# The same two callback objects are handed to all four fits.
callbaks = [reduce_lr, early_stop]

# Options common to every training run.
# NOTE(review): epochs=10 is well below both patience values (80 and 200) --
# confirm the epoch count is intentional.
fit_options = dict(batch_size=32,
                   epochs=10,
                   verbose=0,
                   callbacks=callbaks)

# 80% training data.
_ = convnet_h11_80.fit(x=mat_train_80,
                       y=h11_train_80,
                       validation_data=(mat_val_80, h11_cnn_val_80),
                       **fit_options)
_ = convnet_h21_80.fit(x=mat_train_80,
                       y=h21_train_80,
                       validation_data=(mat_val_80, h21_cnn_val_80),
                       **fit_options)

# 30% training data.
_ = convnet_h11_30.fit(x=mat_train_30,
                       y=h11_train_30,
                       validation_data=(mat_val_30, h11_cnn_val_30),
                       **fit_options)
_ = convnet_h21_30.fit(x=mat_train_30,
                       y=h21_train_30,
                       validation_data=(mat_val_30, h21_cnn_val_30),
                       **fit_options)

In [91]:
def _predict_int(model, images):
    """Predict, flatten to one dimension and round to the nearest integer."""
    return np.rint(model.predict(images).reshape(-1,)).astype(int)


# Score on the part of each hold-out that was not used for validation.
h11_cnn_pred_80 = _predict_int(convnet_h11_80, mat_test_80)
h21_cnn_pred_80 = _predict_int(convnet_h21_80, mat_test_80)
h11_cnn_pred_30 = _predict_int(convnet_h11_30, mat_test_30)
h21_cnn_pred_30 = _predict_int(convnet_h21_30, mat_test_30)

report = (('80% training data:', (h11_cnn_test_80, h11_cnn_pred_80), (h21_cnn_test_80, h21_cnn_pred_80)),
          ('30% training data:', (h11_cnn_test_30, h11_cnn_pred_30), (h21_cnn_test_30, h21_cnn_pred_30)))

print('\nCONVNET')
for position, (title, h11_pair, h21_pair) in enumerate(report):
    if position:
        print('')  # blank line between the two summaries
    print(title)
    print('    h11 accuracy: {:.3f}'.format(accuracy_score(*h11_pair)))
    print('    h21 accuracy: {:.3f}'.format(accuracy_score(*h21_pair)))

80% training data:
    h11 accuracy: 0.814
    h21 accuracy: 0.248

30% training data:
    h11 accuracy: 0.621
    h21 accuracy: 0.214


# Inception (matrix)

In [86]:
# Load the saved network for its architecture only: each model below is
# rebuilt from the JSON description, which discards the previous weights.
inception = tf.keras.models.load_model('./cnn_inception.h5')


def _fresh_inception():
    """Rebuild the Inception architecture and compile it for MSE regression."""
    model = tf.keras.models.model_from_json(inception.to_json())
    # A separate Adam optimiser instance is created for every model.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='mse',
                  metrics=['mse', 'mae'])
    return model


# One independent network per target (h11, h21) and training fraction.
inception_h11_80 = _fresh_inception()
inception_h21_80 = _fresh_inception()
inception_h11_30 = _fresh_inception()
inception_h21_30 = _fresh_inception()

In [90]:
# Multiply the learning rate by 0.3 when the validation loss has not improved
# for 75 epochs, never going below 1e-6.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                 factor=0.3,
                                                 patience=75,
                                                 verbose=0,
                                                 min_lr=1.0e-6)
# Stop once the validation loss has not improved for 200 epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                              patience=200,
                                              verbose=0,
                                              restore_best_weights=True)
# The same two callback objects are handed to all four fits.
callbaks = [reduce_lr, early_stop]

# Options common to every training run.
# NOTE(review): epochs=10 is well below both patience values (75 and 200) --
# confirm the epoch count is intentional.
fit_options = dict(batch_size=32,
                   epochs=10,
                   verbose=0,
                   callbacks=callbaks)

# 80% training data.
_ = inception_h11_80.fit(x=mat_train_80,
                         y=h11_train_80,
                         validation_data=(mat_val_80, h11_cnn_val_80),
                         **fit_options)
_ = inception_h21_80.fit(x=mat_train_80,
                         y=h21_train_80,
                         validation_data=(mat_val_80, h21_cnn_val_80),
                         **fit_options)

# 30% training data.
_ = inception_h11_30.fit(x=mat_train_30,
                         y=h11_train_30,
                         validation_data=(mat_val_30, h11_cnn_val_30),
                         **fit_options)
_ = inception_h21_30.fit(x=mat_train_30,
                         y=h21_train_30,
                         validation_data=(mat_val_30, h21_cnn_val_30),
                         **fit_options)

In [92]:
def _predict_int(model, images):
    """Predict, flatten to one dimension and round to the nearest integer."""
    return np.rint(model.predict(images).reshape(-1,)).astype(int)


# Score on the part of each hold-out that was not used for validation.
h11_cnn_pred_80 = _predict_int(inception_h11_80, mat_test_80)
h21_cnn_pred_80 = _predict_int(inception_h21_80, mat_test_80)
h11_cnn_pred_30 = _predict_int(inception_h11_30, mat_test_30)
h21_cnn_pred_30 = _predict_int(inception_h21_30, mat_test_30)

report = (('80% training data:', (h11_cnn_test_80, h11_cnn_pred_80), (h21_cnn_test_80, h21_cnn_pred_80)),
          ('30% training data:', (h11_cnn_test_30, h11_cnn_pred_30), (h21_cnn_test_30, h21_cnn_pred_30)))

print('\nINCEPTION')
for position, (title, h11_pair, h21_pair) in enumerate(report):
    if position:
        print('')  # blank line between the two summaries
    print(title)
    print('    h11 accuracy: {:.3f}'.format(accuracy_score(*h11_pair)))
    print('    h21 accuracy: {:.3f}'.format(accuracy_score(*h21_pair)))

80% training data:
    h11 accuracy: 0.644
    h21 accuracy: 0.190

30% training data:
    h11 accuracy: 0.538
    h21 accuracy: 0.098
