In [1]:
# import sys
# sys.path.append("../../")

In [2]:
import json
from types import SimpleNamespace
import os
import ast
import random

from transformers import BertForSequenceClassification, AutoTokenizer
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader

from utils.transformers import TransformerModelManager
from utils.dataset import get_df_by_folds_from_args
from utils.dataset import URLDatasetPart

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def parse_value(v: str):
    """Convert a stringified parameter value back into a Python object.

    Conversion is attempted in this order and the first success wins:

    1. the literal text ``"None"``            -> ``None``
    2. the literal text ``"True"``/``"False"`` -> ``bool``
    3. anything ``int()`` accepts             -> ``int``
    4. anything ``float()`` accepts           -> ``float``
    5. a Python literal that evaluates to a list, tuple or dict -> that object
    6. everything else                        -> the input, unchanged

    Args:
        v: The raw value. Normally a string, but values that are already
            typed (numbers, booleans, ``None``, lists, dicts) are accepted
            and returned untouched.

    Returns:
        The converted object, or ``v`` itself when no conversion applies.
    """
    # Already-typed values must not go through the string parsers: int(0.5)
    # would silently truncate to 0, int(True) would turn a flag into 1, and
    # None / lists would raise TypeError.
    if not isinstance(v, str):
        return v

    if v == "None":
        return None
    if v in {"True", "False"}:
        return v == "True"

    # int first so "128" stays an integer rather than becoming 128.0.
    for numeric_type in (int, float):
        try:
            return numeric_type(v)
        except ValueError:
            continue

    try:
        obj = ast.literal_eval(v)
    except (ValueError, TypeError, SyntaxError):
        # literal_eval raises TypeError for inputs such as "{[1]: 2}"
        # (unhashable key); treat that like any other non-literal text.
        return v

    # Only containers are accepted; other literals (quoted strings, sets, ...)
    # keep their original textual form.
    if isinstance(obj, (list, tuple, dict)):
        return obj
    return v

def convert_params(raw_params: dict) -> dict:
    """Return a new dict with every value of ``raw_params`` run through ``parse_value``.

    Keys are kept as they are and the input mapping is not modified.
    """
    converted = {}
    for name, raw_value in raw_params.items():
        converted[name] = parse_value(raw_value)
    return converted

In [4]:
def load_model_from_folder(folder_path, device=None):
    """Rebuild a ``TransformerModelManager`` from a saved model folder.

    ``folder_path`` is handed to ``from_pretrained`` for both the tokenizer
    and the model, and must also contain a ``parameters.json`` file holding
    the run parameters. Those parameters are converted with
    ``convert_params`` and exposed to the manager as attributes of a
    ``SimpleNamespace``.

    Args:
        folder_path: Directory containing the saved tokenizer, model and
            ``parameters.json``.
        device: Forwarded unchanged to ``TransformerModelManager``.

    Returns:
        The ``TransformerModelManager`` built from the loaded pieces.

    Raises:
        FileNotFoundError: If ``parameters.json`` is missing from the folder.
    """
    tokenizer = AutoTokenizer.from_pretrained(folder_path)
    model = BertForSequenceClassification.from_pretrained(folder_path)

    params_path = os.path.join(folder_path, "parameters.json")
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's locale encoding.
    with open(params_path, "r", encoding="utf-8") as f:
        raw_params = json.load(f)

    params = convert_params(raw_params)
    args = SimpleNamespace(**params)

    # Echo the resolved parameters so the notebook output records the exact
    # configuration that was loaded.
    print(params)
    return TransformerModelManager(args, model, tokenizer, device)

In [5]:
# Restore the saved model together with the parameters stored in its folder.
model_manager = load_model_from_folder("./results/models/bert_tiny-joined_best_params_group")
# Alias, not a copy: changes made through default_args are visible through
# model_manager.args as well.
default_args = model_manager.args

{'batch_size': 128, 'bert_learning_rate': 3e-05, 'classifier_learning_rate': 0.002, 'dataset_name': 'joined', 'decision_threshold': 0.5, 'dropout': 0, 'epochs_max': 4, 'eval_folds': [4], 'focal_loss_alpha': -1, 'focal_loss_gamma': 2, 'freeze_epochs': 1, 'label_count': 2, 'loss': 'focal', 'max_sequence_length': 256, 'model_checkpoint': 'google/bert_uncased_L-2_H-128_A-2', 'model_type': 'bert_tiny', 'patience': 3, 'seed': 42, 'shorten_to_eval': None, 'shorten_to_train': None, 'token_replacement': False, 'train_folds': None, 'weight_decay': 0}
Setting focal loss -1 2 mean


In [6]:
# Look for the environment file in the working directory first, then two
# directory levels up; one of the two has to load.
loaded = load_dotenv(".env") or load_dotenv("../../.env")
assert loaded is True

# Seed every random number generator in use with the seed taken from the
# loaded model's arguments.
random.seed(default_args.seed)  # in case any standard library uses some random function
np.random.seed(default_args.seed)
torch.manual_seed(default_args.seed)

os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "true",
        "NCCL_SHM_DISABLE": "1",
    }
)

# Wider and longer DataFrame rendering for inspection in the notebook.
pd.set_option("display.max_colwidth", 120)
pd.set_option("display.max_rows", 800)

# if you find those version and useless warnings annoying, uncomment
# import warnings
# warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=UserWarning, module="_distutils_hack")

In [7]:
# Cap the evaluation split before loading; the run log below reports the eval
# frame being shortened from 572349 rows to 600.
# NOTE(review): the meaning of the "u" suffix is defined in utils.dataset — confirm there.
# This writes to the shared args object, so model_manager.args.shorten_to_eval
# changes too.
default_args.shorten_to_eval = "600u"
df_train, df_test = get_df_by_folds_from_args(default_args, None)
# Wrap both frames in the project's dataset type.
train_dataset = URLDatasetPart.from_pandas(df_train)
eval_dataset = URLDatasetPart.from_pandas(df_test)

[dataset] Using public dataset joined with path: ./data/processed/joined
[dataset]: All folds [0, 1, 2, 3, 4]
[0, 1, 2, 3] [0, 1, 2, 3, 4]
[4] [0, 1, 2, 3, 4]
[dataset] Shortening applied to df of length: 572349 by (600u)
[dataset] New length is 600
[dataset]: train length: 2289711, eval_length: 600


In [8]:
# Batch the evaluation set in its stored order (shuffle=False) using the batch
# size from the loaded parameters; each batch is assembled by the model
# manager's prepare_batch.
eval_data_loader = DataLoader(
    eval_dataset,
    batch_size=default_args.batch_size,
    shuffle=False,
    collate_fn=model_manager.prepare_batch,
)


In [9]:
# Run the evaluation pass; as the last expression, the return value is what the
# cell displays.
# NOTE(review): the displayed value includes full label arrays next to the
# metrics — consider binding the result to a name and showing only the metrics.
model_manager.evaluate(eval_data_loader)

100%|████████████████████| 5/5 [00:02<00:00,  1.89it/s]


({'0': {'precision': 0.9681528662420382,
   'recall': 0.9848812095032398,
   'f1-score': 0.9764453961456103,
   'support': 463.0},
  '1': {'precision': 0.9457364341085271,
   'recall': 0.8905109489051095,
   'f1-score': 0.9172932330827067,
   'support': 137.0},
  'accuracy': 0.9633333333333334,
  'macro avg': {'precision': 0.9569446501752827,
   'recall': 0.9376960792041746,
   'f1-score': 0.9468693146141585,
   'support': 600.0},
  'weighted avg': {'precision': 0.9630344475715532,
   'recall': 0.9633333333333334,
   'f1-score': 0.9629389855795807,
   'support': 600.0},
  'confusion_matrix': [[456, 7], [15, 122]],
  'tp': 122,
  'tn': 456,
  'fp': 7,
  'fn': 15,
  'fpr': 0.01511879049676026,
  'roc_auc_score': 0.9816966467500118,
  'best_decision_threshold': 0.4545454545454546,
  'average_loss': 0.03051038011908531},
 {'true_labels': array([1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
      