<a href="https://colab.research.google.com/github/w-dan/MATM/blob/main/MATM-code/data-analysis/bert_OPTUNA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Boilerplate optuna for BERT-based models

# Initial setup

In [5]:
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable


In [42]:
# core
import os, logging, time, torch, gc, re
from dotenv import load_dotenv
import numpy as np

import optuna
import pickle
import json

# dataset
from datasets import Dataset
from pymongo import MongoClient
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# training
import torch
from transformers import(
    EarlyStoppingCallback,
    AdamW,
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
import optuna

# evaluation
import matplotlib.pyplot as plt
from typing import List, Dict, Any
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
import seaborn as sns
import pickle, json

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import torch

In [12]:
from bert_utils import *

In [13]:
load_dotenv(".env")
CONNECTION_STRING = os.getenv("CONNECTION_STRING")
TOKEN = os.getenv("HUGGINGFACE_TOKEN")
os.environ["TOKENIZERS_PARALLELISM"] = "true"

os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"

In [14]:
collection_name = "dataset"
DATABASE_NAME = "APTs"

# Dataset preparation

In [15]:
df = fetch_and_preprocess_data(DATABASE_NAME, collection_name, CONNECTION_STRING, preprocess=True, field_to_get="tactics", include_tactics=True)
df

[nltk_data] Downloading package punkt to /home/dalgora/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dalgora/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[+] Shape: (156, 2)
                                              corpus  \
0  decoding network data gh0st rat variant nccgro...   
1  5www .prevailion.com /what-wicked-webs-we-unwe...   
2  7www .mandiant.com /resources/blog/lightshift-...   
3  2015/11/24 russian financial cybercrime works ...   
4  9/28/21 4:23 pm foggyw eb argeted nobelium mal...   

                                             tactics  
0  [Execution, Persistence, Credential Access, Co...  
1               [Credential Access, Defense Evasion]  
2  [Defense Evasion, Privilege Escalation, Execut...  
3     [Credential Access, Exfiltration, Persistence]  
4  [Persistence, Credential Access, Command and C...  


Unnamed: 0,corpus,tactics
0,decoding network data gh0st rat variant nccgro...,"[Execution, Persistence, Credential Access, Co..."
1,5www .prevailion.com /what-wicked-webs-we-unwe...,"[Credential Access, Defense Evasion]"
2,7www .mandiant.com /resources/blog/lightshift-...,"[Defense Evasion, Privilege Escalation, Execut..."
3,2015/11/24 russian financial cybercrime works ...,"[Credential Access, Exfiltration, Persistence]"
4,9/28/21 4:23 pm foggyw eb argeted nobelium mal...,"[Persistence, Credential Access, Command and C..."
...,...,...
151,first release time 14:32 may 27 2015 updated t...,"[Execution, Command and Control, Defense Evasi..."
152,10/1/2019 helo innti attack scan lastline http...,"[Initial Access, Execution, Persistence, Defen..."
153,9/9/2020 teamtnt activity tar gets eave scope ...,"[Initial Access, Execution, Persistence, Privi..."
154,2stisc-gov-md.translate.goog /ro/stisc-atentie...,"[Initial Access, Credential Access, Collection]"


In [16]:
df_one_hot_encoded = process_tactics(df)
df_one_hot_encoded

tactics_length
5     53
6     32
7     13
4     12
3     12
8     11
9     10
11     6
10     4
12     2
2      1
Name: count, dtype: int64


Unnamed: 0,corpus,tactics_length,Execution,Persistence,Credential Access,Collection,Defense Evasion,Privilege Escalation,Exfiltration,Command and Control,Discovery,Initial Access,Resource Development,Lateral Movement,Impact,Reconnaissance
0,decoding network data gh0st rat variant nccgro...,4,1,1,1,1,0,0,0,0,0,0,0,0,0,0
1,5www .prevailion.com /what-wicked-webs-we-unwe...,2,0,0,1,0,1,0,0,0,0,0,0,0,0,0
2,7www .mandiant.com /resources/blog/lightshift-...,3,1,0,0,0,1,1,0,0,0,0,0,0,0,0
3,2015/11/24 russian financial cybercrime works ...,3,0,1,1,0,0,0,1,0,0,0,0,0,0,0
4,9/28/21 4:23 pm foggyw eb argeted nobelium mal...,3,0,1,1,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,first release time 14:32 may 27 2015 updated t...,4,1,0,0,0,1,0,0,1,0,0,0,1,0,0
152,10/1/2019 helo innti attack scan lastline http...,6,1,1,0,0,1,0,0,1,0,1,0,0,0,1
153,9/9/2020 teamtnt activity tar gets eave scope ...,6,1,1,0,0,0,1,0,0,1,1,0,0,0,1
154,2stisc-gov-md.translate.goog /ro/stisc-atentie...,3,0,0,1,1,0,0,0,0,0,1,0,0,0,0


In [17]:
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")

texts = df_one_hot_encoded['corpus'].tolist()
labels = df_one_hot_encoded.drop(columns=['corpus', 'tactics_length']).values

In [18]:
def convert_to_float32(x):
    if isinstance(x, list) or isinstance(x, np.ndarray):
        return np.array(x, dtype=np.float32)
    else:
        return np.float32(x)

In [19]:
labels_list = list(labels)

In [20]:
# making sure they are all the same length
max_length = max(len(label) for label in labels)
for i in range(len(labels)):
    if len(labels[i]) < max_length:
        labels[i] = np.pad(labels[i], (0, max_length - len(labels[i])), 'constant')

labels = np.vstack(labels)  # stacking the labels into a 2D array

In [21]:
print(len(texts))
print(len(labels))

156
156


In [22]:
train_dataset, val_dataset, test_dataset = prepare_datasets(texts, labels, tokenizer)
print(type(train_dataset))

<class 'bert_utils.CustomDataset'>


# Model and training

In [45]:
num_labels = labels.shape[1]
model = RobertaForSequenceClassification.from_pretrained(
    'FacebookAI/roberta-base',
    num_labels=num_labels,
    problem_type="multi_label_classification"
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
data_collator = DataCollatorWithPadding(tokenizer)

In [47]:
def objective(trial):
    # hyperparameters to optimize
    num_train_epochs = trial.suggest_int('num_train_epochs', 50, 100)
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 5e-5, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-1, log=True)

    training_args = TrainingArguments(
        output_dir=f'./results/{trial.number}',  # output directory
        num_train_epochs=num_train_epochs,       # total number of training epochs
        per_device_train_batch_size=8,  # batch size per device during training
        per_device_eval_batch_size=8,            # batch size for evaluation
        warmup_steps=500,                        # number of warmup steps for learning rate scheduler
        weight_decay=weight_decay,               # strength of weight decay (formerly 0.01)
        logging_dir=f'./logs/{trial.number}',    # directory for storing logs
        logging_steps=10,
        eval_steps=10,
        evaluation_strategy='epoch',
        learning_rate=learning_rate,
    )

    # model and trainer
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=max_length, problem_type="multi_label_classification")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )


    # saving the training logs along with hyperparameters
    training_logs = trainer.train()
    log_data = {
        'trial_number': trial.number,
        'num_train_epochs': num_train_epochs,
        'learning_rate': learning_rate,
        'weight_decay': weight_decay,
        'training_logs': training_logs
    }

    with open(f'./results_roberta/{trial.number}/training_logs.pkl', 'wb') as f:
        pickle.dump(log_data, f)

    eval_result = trainer.evaluate()

    # saving the evaluation results
    with open(f'./results_roberta/{trial.number}/eval_results.json', 'w') as f:
        json.dump(eval_result, f)

    return eval_result['eval_loss']

Optuna search:

In [48]:
print(train_dataset.labels.shape)
print(test_dataset.labels.shape)
print(val_dataset.labels.shape)

(109, 14)
(24, 14)
(23, 14)


In [49]:
os.environ["NCCL_IB_DISABLE"] = "1"
os.environ["NCCL_SHM_DISABLE"] = "1"

In [50]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# saving best hyperparameters
best_hyperparams = study.best_params
with open('./results/best_hyperparams.json', 'w') as f:
    json.dump(best_hyperparams, f)

print(f"Best Hyperparameters: {best_hyperparams}")

[I 2024-07-05 17:59:38,673] A new study created in memory with name: no-name-9c4aea86-accc-4cb7-93b7-95753b450fad
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.713934
2,0.714400,0.712997
3,0.712300,0.711383
4,0.712300,0.708901
5,0.708400,0.705116
6,0.702000,0.699926
7,0.702000,0.691746
8,0.693300,0.677146
9,0.676500,0.652308
10,0.643700,0.618836




[I 2024-07-05 18:04:42,497] Trial 0 finished with value: 0.50232994556427 and parameters: {'num_train_epochs': 72, 'learning_rate': 1.393647809788165e-05, 'weight_decay': 0.0014047417205432543}. Best is trial 0 with value: 0.50232994556427.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.715176
2,0.717500,0.713456
3,0.714900,0.710592
4,0.714900,0.706416
5,0.708100,0.700106
6,0.699900,0.689821
7,0.699900,0.66895
8,0.682400,0.62819
9,0.639800,0.565439
10,0.575000,0.523031




[I 2024-07-05 18:09:23,039] Trial 1 finished with value: 0.48317718505859375 and parameters: {'num_train_epochs': 67, 'learning_rate': 2.3849496915134395e-05, 'weight_decay': 0.00015919365532488254}. Best is trial 1 with value: 0.48317718505859375.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.680063
2,0.679200,0.677659
3,0.678300,0.673496
4,0.678300,0.666174
5,0.671200,0.649691
6,0.651600,0.609595
7,0.651600,0.545633
8,0.597800,0.517483
9,0.553800,0.500177
10,0.523800,0.487549




[I 2024-07-05 18:13:27,477] Trial 2 finished with value: 0.5227958559989929 and parameters: {'num_train_epochs': 58, 'learning_rate': 4.010013202109573e-05, 'weight_decay': 0.0060864737305380825}. Best is trial 1 with value: 0.48317718505859375.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.67913
2,0.683000,0.677699
3,0.680100,0.675253
4,0.680100,0.671545
5,0.676800,0.66583
6,0.668900,0.655542
7,0.668900,0.633235
8,0.649900,0.583009
9,0.609300,0.541682
10,0.555900,0.51448


[W 2024-07-05 18:16:59,853] Trial 3 failed with parameters: {'num_train_epochs': 64, 'learning_rate': 2.2159923706794467e-05, 'weight_decay': 0.009718092906696701} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/dalgora/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_2460/78686486.py", line 34, in objective
    training_logs = trainer.train()
  File "/home/dalgora/.local/lib/python3.10/site-packages/transformers/trainer.py", line 1932, in train
    return inner_training_loop(
  File "/home/dalgora/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2268, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/dalgora/.local/lib/python3.10/site-packages/transformers/trainer.py", line 3307, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/dalgora/.local/lib/python3

KeyboardInterrupt: 