# Hyperparameter tuning notebook
- Use this notebook to find which hyperparameters work best for the transformer model.
- Part of the training set is used as the validation set. The dataset's test set is NOT used.

In [None]:
import importlib
import subprocess
import sys
from utils.environment_specific import is_local_development

def install_if_missing(package_name, pip_name=None):
    """Import ``package_name``; pip-install it into the running interpreter if that fails.

    Args:
        package_name: Importable module name, e.g. ``"dotenv"``.
        pip_name: Name to pass to ``pip install`` when it differs from the
            module name, e.g. ``"python-dotenv"``. Falls back to
            ``package_name`` when omitted.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    try:
        importlib.import_module(package_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or package_name])
        # The import system caches directory listings; refresh them so the
        # freshly installed package can be imported in this same process.
        importlib.invalidate_caches()

# Outside local development the runtime may lack these packages, so install them on demand.
# Each entry is (importable module name, pip name or None when both are the same).
if not is_local_development():
    for module_name, pip_name in (("dotenv", "python-dotenv"), ("onnxruntime", None)):
        install_if_missing(module_name, pip_name)

In [None]:
import argparse
import copy
import random
import os

from dotenv import load_dotenv
import numpy as np
import pandas as pd
import torch

from utils.dataset import get_validation_dataset_by_args
from utils.experiments import setup_mlflow_client, get_run_name
from utils.environment_specific import is_local_development
from utils.transformers import TransformerModelManager

# Parameters

In [None]:
# fmt: off
# Every hyperparameter is declared through argparse. Only the defaults are used in this
# notebook: parse_args([]) below is called with an empty argument list.
parser = argparse.ArgumentParser()
parser.add_argument("--seed", default=42, type=int, help="Random seed")
# Model args
parser.add_argument("--model_type", default="bert_tiny", type=str, help="Type of BERT model to use. see 'get_model_checkpoint' method")
parser.add_argument("--dropout", default=0, type=float, help="Dropout rate on final classification layer")
parser.add_argument("--bert_learning_rate", default=3e-5, type=float, help="AdamW learning rate for everything in model except final classifaction layer")
parser.add_argument("--classifier_learning_rate", default=2e-3, type=float, help="AdamW learning rate for classification layer of model")
parser.add_argument("--weight_decay", default=0.01, type=float, help="AdamW weight decay for both parts of the model")
parser.add_argument("--decision_threshold", default=0.5, type=float, help="If probability of a class 1 is higher than this, then the sample is classified as class 1")
# type=int so a value given on the command line is not left as a string.
parser.add_argument("--max_sequence_length", default=256, type=int)

# Training params
parser.add_argument("--batch_size", default=128, type=int, help="Batch size")
parser.add_argument("--epochs_max", default=5, type=int, help="Maximum number of epochs. Can stop early, however")
parser.add_argument("--patience", default=3, type=int, help="Number of epochs to wait for validation accuracy increase before stopping. If it is set to None, then early stopping is not used")
parser.add_argument("--freeze_epochs", default=1, type=int, help="Number of epochs to freeze BERT non-final layers initially.")
parser.add_argument("--loss", default="focal", choices=["cross_entropy", "focal"], type=str, help="Loss function used")
# type=float so command-line values are numeric; non-string defaults are left untouched by argparse.
parser.add_argument("--focal_loss_gamma", default=2, type=float, help="Pass gama parameter if focal loss is being used. Otherwise has no effect")
parser.add_argument("--focal_loss_alpha", default=0.5, type=float, help="Pass alpha parameter if focal loss is being used. Otherwise has no effect")

# Dataset args
# NOTE(review): the default "joined" is not one of the listed choices. argparse does not
# validate defaults against `choices`, so this passes silently - confirm the intended choices.
parser.add_argument("--dataset_name", default="joined", choices=["private_data", "any_public_dataset_name"])
# NOTE(review): the fold defaults are lists of ints, but type=str would turn a command-line
# value into a single string - confirm how downstream code expects folds to be passed.
parser.add_argument("--train_folds", default=[0,1,2], type=str, help="Which folds of the dataset should be used for training")
parser.add_argument("--eval_folds", default=[3], type=str, help="Which folds of the dataset should be used for evaluation")
# argparse %-formats help strings, so a literal percent sign has to be written as "%%".
parser.add_argument("--shorten_to_train", default=None, help="How much should train dataset be shortened (400u - 400 records), (10%% - 10 percent of all records)")
parser.add_argument("--shorten_to_eval", default=None, help="How much should test or validation set be shortened")

default_args = parser.parse_args([])
# fmt: on

In [None]:
# Try the .env file in the working directory first; fall back to the one two levels up.
# The second lookup only runs when the first one reports that nothing was loaded.
loaded = load_dotenv(".env") or load_dotenv("../../.env")
assert loaded is True

In [None]:
# Seed every random number generator in play with the same configured seed.
torch.manual_seed(default_args.seed)
np.random.seed(default_args.seed)
# The standard-library generator is seeded too, in case any library relies on it.
random.seed(default_args.seed)

- Runtime configuration: threads, parallelism-related environment variables, ...

In [None]:
# Environment variables read by the tokenizer and NCCL libraries.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "true",
        "NCCL_SHM_DISABLE": "1",
    }
)
if is_local_development():
    print("Local development is running")
else:
    print("Databricks is running")
    # Non-local runs are required to have a GPU; fail immediately otherwise.
    assert torch.cuda.is_available(), "GPU is not available!"

In [None]:
# Raise pandas display limits so wide text columns and long frames are shown in cell output.
pd.options.display.max_colwidth = 120
pd.options.display.max_rows = 800
# If version-related and other noisy warnings get annoying, uncomment the lines below.
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=UserWarning, module="_distutils_hack")

# Dataset preparation

In [None]:
if is_local_development():
    # This notebook defines `spark` only here, for local runs.
    spark = None
    # Local runs always train and evaluate on 1000-record subsets ("u" = number of records),
    # regardless of what the parser defaults say.
    # Larger alternative that was used before: "4000u" for train and "2000u" for eval.
    default_args.shorten_to_train = "1000u"
    default_args.shorten_to_eval = "1000u"

In [None]:
# Build the default train / evaluation dataset pair from the default arguments.
# NOTE(review): `spark` is assigned in this notebook only for local development (as None);
# otherwise it is presumably supplied by the Databricks runtime - confirm.
default_train_dataset, default_eval_dataset = get_validation_dataset_by_args(default_args, spark)

# Model

- load the model to all available GPUs

In [None]:
# Run once before any training run below.
# NOTE(review): presumably configures MLflow tracking for the runs - see utils.experiments.
setup_mlflow_client()

In [None]:
def do_transformer_parameter_tuning_run(
    args,
    num_workers_train=0,
    num_workers_eval=0,
    persistent_workers=False,
    prefetch_factor=None,
    train_dataset=default_train_dataset,
    eval_dataset=default_eval_dataset,
):
    """Train one model configuration and evaluate it on the validation data.

    Args:
        args: Namespace of hyperparameters; ``model_type`` and ``dataset_name``
            are read here for the run name, and the whole namespace is passed
            on to the model manager.
        num_workers_train: Forwarded unchanged to ``do_training_run``.
        num_workers_eval: Forwarded unchanged to ``do_training_run``.
        persistent_workers: Forwarded unchanged to ``do_training_run``.
        prefetch_factor: Forwarded unchanged to ``do_training_run``.
        train_dataset: Training data. Defaults to ``default_train_dataset`` as
            it existed when this function was defined.
        eval_dataset: Evaluation data. Defaults to ``default_eval_dataset`` as
            it existed when this function was defined.

    Note:
        Early stopping must not be enabled when the testing set is used.
    """
    run_name = get_run_name("VAL", args.model_type, args.dataset_name)
    # NOTE(review): presumably data-loader settings - confirm in TransformerModelManager.
    loader_options = {
        "num_workers_train": num_workers_train,
        "num_workers_eval": num_workers_eval,
        "persistent_workers": persistent_workers,
        "prefetch_factor": prefetch_factor,
    }
    manager = TransformerModelManager.from_args(args)
    manager.do_training_run(
        args,
        run_name=run_name,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        **loader_options,
    )

In [None]:
# Experiment: one small learning rate for both parts of the model, no weight decay,
# no initial freezing, focal loss with alpha set to -1.
args = copy.deepcopy(default_args)
args.model_type = "bert_tiny"
# Optimisation settings
args.bert_learning_rate = 1e-5
args.classifier_learning_rate = 1e-5
args.weight_decay = 0
args.freeze_epochs = 0
# Regularisation and loss
# NOTE(review): what dropout=None and focal_loss_alpha=-1 mean is decided by the model and
# loss code, which is not visible here - confirm both are handled as intended.
args.dropout = None
args.loss = "focal"
args.focal_loss_alpha = -1
do_transformer_parameter_tuning_run(args)

In [None]:
# Experiment: defaults, but with no initial freezing of the BERT layers
# (freeze_epochs 0 instead of the default 1).
args = copy.deepcopy(default_args)
args.model_type = "bert_tiny"
args.freeze_epochs = 0
do_transformer_parameter_tuning_run(args)

In [0]:
# Experiment: focal loss with gamma = 2. This equals the parser default, so the run
# uses the default configuration.
args = copy.deepcopy(default_args)
args.model_type = "bert_tiny"
args.focal_loss_gamma = 2
do_transformer_parameter_tuning_run(args)

In [0]:
# Experiment: focal loss with gamma = 1 (default is 2).
args = copy.deepcopy(default_args)
args.model_type = "bert_tiny"
args.focal_loss_gamma = 1
do_transformer_parameter_tuning_run(args)

In [0]:
# Experiment: cross-entropy loss instead of the default focal loss.
# The focal-loss parameters are cleared; per their help text they have no effect
# unless focal loss is used.
args = copy.deepcopy(default_args)
args.model_type = "bert_tiny"
args.loss = "cross_entropy"
args.focal_loss_gamma = None
args.focal_loss_alpha = None
do_transformer_parameter_tuning_run(args)

In [0]:
# Experiment: lower learning rate for the BERT part of the model (1e-5, default is 3e-5).
args = copy.deepcopy(default_args)
args.model_type = "bert_tiny"
args.bert_learning_rate = 1e-5
do_transformer_parameter_tuning_run(args)

In [0]:
# Experiment: higher learning rate for the classification layer (5e-3, default is 2e-3).
args = copy.deepcopy(default_args)
args.model_type = "bert_tiny"
args.classifier_learning_rate = 5e-3
do_transformer_parameter_tuning_run(args)

In [0]:
# Experiment: dropout of 0.15 on the final classification layer (default is 0).
args = copy.deepcopy(default_args)
args.model_type = "bert_tiny"
args.dropout = 0.15
do_transformer_parameter_tuning_run(args)

In [0]:
# Experiment: AdamW weight decay switched off (0.0, default is 0.01).
args = copy.deepcopy(default_args)
args.model_type = "bert_tiny"
args.weight_decay = 0.0
do_transformer_parameter_tuning_run(args)

In [0]:
# Experiment: max_sequence_length halved to 128 (default is 256).
args = copy.deepcopy(default_args)
args.model_type = "bert_tiny"
args.max_sequence_length = 128
do_transformer_parameter_tuning_run(args)