In [None]:
import h5py
import numpy as np
import tensorflow as tf
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_score, recall_score, fbeta_score
from tensorflow.keras.models import clone_model
from tqdm import tqdm
import pathlib
import sys
import os

In [None]:
# Resolve the project root.
# On Google Colab the project lives on the mounted Drive; anywhere else the
# parent of the current working directory is used.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: fall back to the parent of the working directory.
    # Only ImportError is caught so that genuine failures are not hidden.
    root_path = pathlib.Path('../').resolve()
else:
    # A failure here (e.g. Drive authorisation refused) now propagates instead
    # of silently switching to a local path that does not hold the data.
    drive.mount('/content/drive')
    root_path = '/content/drive/MyDrive/Colab Notebooks/concept-drift-online'

In [None]:

# Experiment selectors: together they choose the dataset sub-directory that
# every later cell reads from and writes to.
attack_vector='network' # alternative noted by the author: 'cache'
machine_setting='static_browser_version' # alternative noted by the author: 'dynamic_browser_version'

# Always a plain string (f-string formatting), whether root_path is a str or a pathlib.Path.
data_path=f'{root_path}/data/{machine_setting}/{attack_vector}'


In [None]:
# Show the kernel's working directory; outside Colab, root_path is resolved relative to it ('../').
os.getcwd()

In [None]:
# Make the project root importable so the local `utils` module can be found.
# NOTE(review): re-running this cell appends the same entry to sys.path again.
sys.path.append(str(root_path)) # Add the directory to the path
from utils import prepare_hdf5_data,predict_on_hdf5,get_data_paths_ordered,load_hdf5_data,create_subset_encoder#,OpenWorldLabelEncoder

In [None]:

# Sanity check: report how many GPUs TensorFlow can see.
# NOTE(review): tensorflow is already imported in the first cell; this re-import is redundant.
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [None]:
# Load the base model and the label encoder used by the closed-world experiment.
model_path=f'{data_path}/models/cnn_lstm.keras'
# NOTE: despite the name, `le_dir` is the path of a pickle file, not a directory.
le_dir=f'{data_path}/models/label_encoder_100.pkl'
# model0 is never trained in place below: each date fine-tunes a fresh clone of it.
model0 = tf.keras.models.load_model(model_path)
with open(le_dir, 'rb') as f:
    # SECURITY: pickle.load can execute arbitrary code; only load encoder files you produced yourself.
    le = pickle.load(f)

In [None]:
# Per-date data directories under data_path. The loops below use `.stem` and the `/`
# operator on each element, so these are expected to be pathlib.Path objects.
# presumably sorted chronologically by get_data_paths_ordered — TODO confirm in utils.
ordered_paths=get_data_paths_ordered(data_path)


In [None]:
# Hyper-parameters shared by the fine-tuning loops below.
batch=256  # batch_size passed to prepare_hdf5_data for both training and test data
max_epochs=30  # number of epochs for every per-date fine-tuning run (no early stopping is configured)
num_of_classes=100  # total label count; the open-world cell subtracts n_open_world_labels from it
n_incremental_samples_per_website=10  # passed as n_samples when loading each date's train.h5

In [None]:
# Closed-world experiment.
# For every dated directory: fine-tune a fresh copy of the base model on a
# small sample of that date's training data, then measure top-1 accuracy on
# that date's test data. `results` maps date -> {'accuracy': value}.
results = {}
for path in tqdm(ordered_paths):
    curr_date_i = path.stem
    train_path_i = (path / 'train.h5').as_posix()
    test_path_i = (path / 'test.h5').as_posix()

    # Training data for this date, limited via n_samples.
    X_train_i, y_train_i = load_hdf5_data(
        train_path_i, le, n_samples=n_incremental_samples_per_website
    )
    train_i = prepare_hdf5_data(
        X_train_i,
        y_train_i,
        batch_size=batch,
        sample_validation=-1,
        shuffle=True,
    )['data']

    # Start from the base model every time: clone the architecture and copy
    # the weights, so fine-tuning on one date never leaks into the next.
    model_i = clone_model(model0)
    model_i.set_weights(model0.get_weights())
    model_i.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model_i.fit(train_i, epochs=max_epochs, verbose=0)

    # Full test set for this date, kept in order.
    X_test_i, y_test_i = load_hdf5_data(test_path_i, le)
    test_datasets_i = prepare_hdf5_data(
        X_test_i,
        y_test_i,
        batch_size=batch,
        sample_validation=-1,
        shuffle=False,
    )['data']

    # Collect predictions together with the labels yielded by the dataset so
    # that both arrays are guaranteed to be row-aligned.
    all_predictions, all_true_labels = [], []
    for features, labels in test_datasets_i:
        all_predictions.append(model_i.predict_on_batch(features))
        all_true_labels.append(labels.numpy())

    pred_i = np.concatenate(all_predictions, axis=0)
    y_test_aligned = np.concatenate(all_true_labels, axis=0)

    # Top-1 accuracy: predicted class index vs. label class index.
    hits = pred_i.argmax(axis=1) == y_test_aligned.argmax(axis=1)
    acci = hits.mean()
    results[curr_date_i] = {'accuracy': acci}
    print(f'Date: {curr_date_i}, Accuracy: {acci}')

In [None]:
# Print the architecture of `model_i`, i.e. the clone left over from the last loop iteration executed.
model_i.summary()

In [None]:
# Open-world experiment.
# The last `n_open_world_labels` label columns are treated as unmonitored
# ("open-world") sites. For every dated directory a fresh copy of the
# open-world base model is fine-tuned on the monitored samples only, then
# evaluated on the full test set. A prediction whose top score is below
# `threshold` is mapped to the extra class index `ow_label`.
# `results` maps date -> accuracy / precision / recall / F2.
# NOTE: this replaces the `results` dict built by the closed-world cell.
results={}
# NOTE(review): batch_size is assigned here but never used; the calls below
# pass `batch` from the hyper-parameter cell. Kept as-is so training
# behaviour does not change — confirm which value was intended.
batch_size=512
n_open_world_labels=30
threshold=0.6
n_incremental_samples_per_website=10

hdf5_train_path = (pathlib.Path(data_path)/'2021_08_27/train.h5').as_posix()

# Despite its name, this is the number of label columns the model keeps,
# i.e. num_of_classes minus the open-world columns.
num_of_ow_classes=num_of_classes-n_open_world_labels
# NOTE(review): X_data, y_data, cw_mask and owle are not read anywhere in the
# visible cells; they are kept in case other cells depend on them.
X_data, y_data = load_hdf5_data(hdf5_train_path, le, n_samples=-1)
cw_mask=y_data[:,-n_open_world_labels:].any(axis=1)==0
owle=create_subset_encoder(le,num_of_ow_classes)
# Class index assigned to "unmonitored / rejected": one past the last kept column.
ow_label=num_of_ow_classes

ow_model_path=f'{data_path}/models/cnn_lstm_ow.keras'
# presumably this model has num_of_ow_classes outputs — TODO confirm.
ow_model= tf.keras.models.load_model(ow_model_path)

# Metrics are computed over the kept classes only (ow_label is excluded).
# Built once here instead of three times per iteration.
monitored_labels = list(range(num_of_ow_classes))

for path in tqdm(ordered_paths):
    curr_date_i=path.stem
    train_path_i = (path/'train.h5').as_posix()
    X_train_i, y_train_i = load_hdf5_data(train_path_i,le,n_samples=n_incremental_samples_per_website)
    # Keep only rows with no open-world column set, and drop those columns.
    trn_cw_mask_i=~y_train_i[:,-n_open_world_labels:].any(axis=1)
    train_datasets_i = prepare_hdf5_data(X_train_i[trn_cw_mask_i], y_train_i[trn_cw_mask_i,:num_of_ow_classes], batch_size=batch, sample_validation=-1,shuffle=True)['data']

    # Fresh copy of the base model for every date.
    model_i=clone_model(ow_model)
    model_i.set_weights(ow_model.get_weights())
    model_i.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),loss='categorical_crossentropy',  metrics=['accuracy'])
    model_i.fit(train_datasets_i,epochs=max_epochs,verbose=0)

    test_path_i = (path/'test.h5').as_posix()
    X_test_i, y_test_i = load_hdf5_data(test_path_i,le)
    # Test data is no longer shuffled: the metrics do not depend on order, and
    # a fixed order matches the closed-world loop and keeps the leftover
    # `labels` / `pred_i` inspected by later cells reproducible.
    # Open-world rows become all-zero once the label columns are truncated.
    test_datasets_i = prepare_hdf5_data(X_test_i, y_test_i[:,:num_of_ow_classes], batch_size=batch, sample_validation=-1,shuffle=False)['data']
    all_predictions = []
    all_true_labels = []
    for features, labels in test_datasets_i:
        batch_predictions = model_i.predict_on_batch(features)
        all_predictions.append(batch_predictions)
        all_true_labels.append(labels.numpy())

    # Predicted class, or ow_label when the top score is below the threshold.
    pred_i = np.concatenate(all_predictions, axis=0)
    pred_cw_mask_i=pred_i.max(axis=1)>=threshold
    pred_cat_i=np.where(pred_cw_mask_i, pred_i.argmax(axis=1), ow_label)

    # True class, or ow_label for rows with no kept column set.
    y_test_aligned = np.concatenate(all_true_labels, axis=0)
    y_test_cw_mask=y_test_aligned.any(axis=1)
    y_test_aligned_cat=np.where(y_test_cw_mask, y_test_aligned.argmax(axis=1), ow_label)

    acci=(pred_cat_i==y_test_aligned_cat).mean()
    y_true_indices = y_test_aligned_cat
    y_pred_indices = pred_cat_i
    precision = precision_score(y_true_indices, y_pred_indices, labels=monitored_labels, average='weighted')
    recall = recall_score(y_true_indices, y_pred_indices, labels=monitored_labels, average='weighted')
    f2score = fbeta_score(y_true_indices, y_pred_indices, beta=2, labels=monitored_labels, average='weighted')
    results[curr_date_i] = {'accuracy': acci, 'precision': precision, 'recall': recall, 'f2score': f2score}
    print(f'Date: {curr_date_i}, Accuracy: {acci}, Precision (sensitive): {precision}, Recall (sensitive): {recall}, F2-score (sensitive): {f2score}')

In [None]:
# Persist the metrics currently held in `results` (the open-world cell resets and
# refills that dict, so the closed-world accuracies are not what gets saved here).
# The file name embeds n_incremental_samples_per_website as it is at save time.
import json
import pandas as pd
with open(data_path+f'/transfer-ow-{n_incremental_samples_per_website}-sm.json','w') as f:
   # NOTE(review): DataFrame.to_json() with no path returns a JSON *string*, and json.dump
   # then encodes that string again, so the file holds a quoted, escaped JSON string.
   # Readers must decode twice (e.g. pd.read_json(json.load(f))). Left unchanged because
   # existing consumers may rely on this format — confirm before simplifying.
   json.dump(pd.DataFrame(results).to_json(), f)

In [None]:
# Display the path of the results file written by the save cell.
data_path+f'/transfer-ow-{n_incremental_samples_per_website}-sm.json'

In [None]:
# Debug view: class-index counts for `labels`, which is the loop variable left over from
# the final test batch of whichever evaluation loop ran last (hidden kernel state).
# Rows with no label column set also report index 0 from argmax.
import numpy as np
import pandas as pd
pd.Series(labels.numpy().argmax(axis=1)).value_counts()

In [None]:
# Histogram of the highest predicted score per test sample, using `pred_i` from the last
# date processed; useful for judging the `threshold` used for open-world rejection.
pd.Series(pred_i.max(axis=1)).hist()