In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import optuna

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

###############################
# 1) Device Setup
###############################
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Using device:", device)

###############################
# 2) Category Columns
###############################
# Names of the label columns, one per incident category. The list is used to
# select the label matrix from each DataFrame and its length sets the model's
# `num_labels`, so the order here fixes the index of each class in the logits.
# NOTE(review): the column values are presumably 0/1 indicators -- confirm
# against the CSV files.
category_columns = [
    "Unlawful detention",
    "Human trafficking",
    "Enslavement",
    "Willful killing of civilians",
    "Mass execution",
    "Kidnapping",
    "Extrajudicial killing",
    "Forced disappearance",
    "Damage or destruction of civilian critical infrastructure",
    "Damage or destruction, looting, or theft of cultural heritage",
    "Military operations (battle, shelling)",
    "Gender-based or other conflict-related sexual violence",
    "Violent crackdowns on protesters/opponents/civil rights abuse",
    "Indiscriminate use of weapons",
    "Torture or indications of torture",
    "Persecution based on political, racial, ethnic, gender, or sexual orientation",
    "Movement of military, paramilitary, or other troops and equipment"
]

###############################
# 3) Load CSVs
###############################
# Read the three pre-split data files (train / validation / test) from the
# current working directory.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "val.csv", "test.csv")
)

###############################
# 4) Custom Dataset class
###############################
class ArticleDataset(Dataset):
    """Map-style dataset pairing tokenizer output with per-example label rows.

    Args:
        encodings: Mapping from feature name to an indexable collection holding
            one entry per example (the script passes the tokenizer's output).
        labels: Indexable collection holding one label row per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every encoding feature is converted with ``torch.tensor``; the label
        row is stored under ``'labels'`` as a float tensor.
        """
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

###############################
# 5) Tokenization
###############################
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def _encode_narratives(frame):
    """Tokenize the "Incident Narrative" column of ``frame``.

    The texts are passed to the module-level ``tokenizer`` as a plain list,
    with truncation and padding both enabled.
    """
    narratives = list(frame["Incident Narrative"].values)
    return tokenizer(narratives, truncation=True, padding=True)


# One encoding per data split, all produced with identical tokenizer settings.
train_encodings = _encode_narratives(train_df)
val_encodings = _encode_narratives(val_df)
test_encodings = _encode_narratives(test_df)

###############################
# 6) Prepare Labels & Dataset
###############################
# Label matrices: one row per article, one column per entry of
# `category_columns` (in that order).
train_labels, val_labels, test_labels = (
    split_df[category_columns].values
    for split_df in (train_df, val_df, test_df)
)

# Pair each split's encodings with its label matrix.
train_dataset, val_dataset, test_dataset = (
    ArticleDataset(split_encodings, split_labels)
    for split_encodings, split_labels in zip(
        (train_encodings, val_encodings, test_encodings),
        (train_labels, val_labels, test_labels),
    )
)

###############################
# 7) compute_metrics function
###############################
def compute_metrics(p, threshold=0.5):
    """Compute multi-label classification metrics for one evaluation pass.

    Args:
        p: Evaluation result exposing ``predictions`` (raw logits) and
            ``label_ids`` (ground-truth label rows).
        threshold: Probability cut-off above which a label counts as
            predicted. Defaults to 0.5, the value previously hard-coded.

    Returns:
        Dict with ``accuracy``, ``f1``, ``precision`` and ``recall``. The last
        three are support-weighted averages over the labels; ``accuracy`` is
        computed by ``accuracy_score`` on the full label rows.
    """
    # Logits -> independent per-label probabilities -> hard 0/1 decisions.
    probabilities = torch.sigmoid(torch.as_tensor(p.predictions))
    preds = (probabilities > threshold).int().cpu().numpy()
    # The labels need no tensor round trip; a NumPy view is enough.
    labels = np.asarray(p.label_ids)

    # zero_division=0 yields the same scores as scikit-learn's default
    # ("warn") but without an UndefinedMetricWarning for every label that has
    # no predicted or no true samples, which otherwise floods the training log
    # on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

###############################
# 8) Optuna Objective
###############################
def objective(trial):
    """Optuna objective: fine-tune BERT once with sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample the learning rate, batch size,
            weight decay and number of epochs.

    Returns:
        The ``eval_f1`` value reported by a final ``trainer.evaluate()`` on
        the validation set.
    """
    # Hyperparameter search space.
    # `suggest_float` replaces the deprecated `suggest_loguniform` /
    # `suggest_uniform`; the sampled distributions are unchanged.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])
    weight_decay = trial.suggest_float('weight_decay', 0.01, 0.1)
    num_train_epochs = trial.suggest_int('num_train_epochs', 5, 20)

    # Model setup: a fresh pretrained model per trial, one output per category.
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(category_columns)
    )
    model.to(device)

    # Trainer settings: evaluate and checkpoint every epoch so the checkpoint
    # with the best validation F1 can be restored at the end of training.
    # NOTE(review): every trial writes its per-epoch checkpoints to the same
    # './results' directory and nothing limits how many are kept -- consider a
    # per-trial output_dir and/or `save_total_limit` if disk usage matters.
    training_args = TrainingArguments(
        output_dir='./results',
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir='./logs',
        logging_steps=10,
        disable_tqdm=True
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train, then score the model the Trainer ends up with on the validation set.
    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results["eval_f1"]

###############################
# 9) Run Optuna Study
###############################
# Search for the hyperparameters that maximise the value returned by
# `objective` (validation F1) over a fixed budget of trials.
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=10)

# Parameters of the best-scoring trial.
best_params = study.best_trial.params
print("Best Hyperparameters:", best_params)

###############################
# 10) Train Final Model
###############################
# Start again from the pretrained weights and retrain with the tuned values.
final_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(category_columns)
)
final_model.to(device)

# The tuned batch size is shared by training and evaluation.
_tuned_batch_size = best_params['batch_size']

final_training_args = TrainingArguments(
    output_dir='./final_results',
    logging_dir='./final_logs',
    # Evaluate and checkpoint once per epoch; restore the best-F1 checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Values selected by the Optuna study.
    learning_rate=best_params['learning_rate'],
    weight_decay=best_params['weight_decay'],
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=_tuned_batch_size,
    per_device_eval_batch_size=_tuned_batch_size,
)

final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

final_trainer.train()
print("Final model trained with best hyperparameters.")

# Evaluate on validation set (optional)
# Multi-label metrics of the final model on the validation split.
val_results = final_trainer.evaluate(eval_dataset=val_dataset)
print("Validation Results (Final Model):", val_results)

###############################
# 11) Evaluate on Test Set
###############################
# Same metrics, this time on the held-out test split.
test_results = final_trainer.evaluate(eval_dataset=test_dataset)
print("Multi-label Test Results:", test_results)

###############################
# 12) Single-Label Inference
###############################
# Single-category pass over the test set: for every article pick exactly one
# category (the argmax of the logits) and count it as a hit when that category
# is among the article's true labels. This is a top-1 hit rate, reported below
# under the script's original "exact-match" wording.

test_loader = DataLoader(
    test_dataset,
    batch_size=best_params['batch_size'],  # from the best hyperparams
    shuffle=False
)

final_model.eval()
label_batches = []
logit_batches = []

with torch.no_grad():
    for batch in test_loader:
        # Only the model inputs have to live on the model's device; the labels
        # are consumed on the CPU, so they are never moved.
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}

        outputs = final_model(**inputs)
        logits = outputs.logits  # shape: (batch_size, 17)

        label_batches.append(batch['labels'].numpy())
        logit_batches.append(logits.detach().cpu().numpy())

y_true = np.concatenate(label_batches)  # shape: (num_samples, 17)
# Raw logits, not probabilities: sigmoid is monotonic, so the argmax is the
# same either way.
y_pred = np.concatenate(logit_batches)  # shape: (num_samples, 17)

# Single-label predictions by argmax
y_pred_single = np.argmax(y_pred, axis=1)  # shape: (num_samples,)

# Hit if the predicted category is one of the 1's in the ground truth; the
# fancy index picks, for each row, the label at that row's predicted column.
accuracy = np.mean(y_true[np.arange(len(y_true)), y_pred_single] == 1)

print("Single-category exact-match accuracy on the test set:", accuracy)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


[I 2025-01-06 21:39:49,438] A new study created in memory with name: no-name-7ea5d9b1-5a21-449d-887e-ab58ebd586f1
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.6389, 'grad_norm': 1.9691593647003174, 'learning_rate': 1.452098429224022e-05, 'epoch': 0.24390243902439024}
{'loss': 0.5562, 'grad_norm': 1.868700385093689, 'learning_rate': 1.399676103259256e-05, 'epoch': 0.4878048780487805}
{'loss': 0.4931, 'grad_norm': 2.98490571975708, 'learning_rate': 1.3472537772944898e-05, 'epoch': 0.7317073170731707}
{'loss': 0.4466, 'grad_norm': 0.9924401044845581, 'learning_rate': 1.2948314513297237e-05, 'epoch': 0.975609756097561}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.421138197183609, 'eval_accuracy': 0.0, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 0.943, 'eval_samples_per_second': 60.443, 'eval_steps_per_second': 8.483, 'epoch': 1.0}
{'loss': 0.4124, 'grad_norm': 1.0963168144226074, 'learning_rate': 1.2424091253649577e-05, 'epoch': 1.2195121951219512}
{'loss': 0.3903, 'grad_norm': 0.9411932826042175, 'learning_rate': 1.1899867994001916e-05, 'epoch': 1.4634146341463414}
{'loss': 0.3533, 'grad_norm': 1.0151002407073975, 'learning_rate': 1.1375644734354254e-05, 'epoch': 1.7073170731707317}
{'loss': 0.3413, 'grad_norm': 0.8625487685203552, 'learning_rate': 1.0851421474706591e-05, 'epoch': 1.951219512195122}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.32678723335266113, 'eval_accuracy': 0.0, 'eval_f1': 0.13478260869565217, 'eval_precision': 0.16687370600414078, 'eval_recall': 0.11304347826086956, 'eval_runtime': 0.8999, 'eval_samples_per_second': 63.34, 'eval_steps_per_second': 8.89, 'epoch': 2.0}
{'loss': 0.341, 'grad_norm': 0.6370793581008911, 'learning_rate': 1.032719821505893e-05, 'epoch': 2.1951219512195124}
{'loss': 0.3137, 'grad_norm': 0.6235954761505127, 'learning_rate': 9.802974955411268e-06, 'epoch': 2.4390243902439024}
{'loss': 0.3009, 'grad_norm': 0.6424309611320496, 'learning_rate': 9.278751695763608e-06, 'epoch': 2.682926829268293}
{'loss': 0.3075, 'grad_norm': 0.7002003192901611, 'learning_rate': 8.754528436115945e-06, 'epoch': 2.926829268292683}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2955630421638489, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.20304912478825524, 'eval_precision': 0.16994328922495275, 'eval_recall': 0.25217391304347825, 'eval_runtime': 0.8554, 'eval_samples_per_second': 66.638, 'eval_steps_per_second': 9.353, 'epoch': 3.0}
{'loss': 0.284, 'grad_norm': 0.6160511374473572, 'learning_rate': 8.230305176468283e-06, 'epoch': 3.1707317073170733}
{'loss': 0.2876, 'grad_norm': 0.5780725479125977, 'learning_rate': 7.706081916820624e-06, 'epoch': 3.4146341463414633}
{'loss': 0.299, 'grad_norm': 0.6767719984054565, 'learning_rate': 7.181858657172962e-06, 'epoch': 3.658536585365854}
{'loss': 0.2781, 'grad_norm': 0.6668236255645752, 'learning_rate': 6.6576353975253e-06, 'epoch': 3.902439024390244}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.28163138031959534, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.21666666666666667, 'eval_precision': 0.4, 'eval_recall': 0.20869565217391303, 'eval_runtime': 0.8465, 'eval_samples_per_second': 67.338, 'eval_steps_per_second': 9.451, 'epoch': 4.0}
{'loss': 0.2808, 'grad_norm': 0.4961200952529907, 'learning_rate': 6.133412137877639e-06, 'epoch': 4.146341463414634}
{'loss': 0.2868, 'grad_norm': 0.7467353940010071, 'learning_rate': 5.609188878229977e-06, 'epoch': 4.390243902439025}
{'loss': 0.2603, 'grad_norm': 0.7978845238685608, 'learning_rate': 5.084965618582316e-06, 'epoch': 4.634146341463414}
{'loss': 0.2719, 'grad_norm': 0.5177339315414429, 'learning_rate': 4.560742358934654e-06, 'epoch': 4.878048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.274424284696579, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.2891737229976561, 'eval_precision': 0.3621014492753623, 'eval_recall': 0.2956521739130435, 'eval_runtime': 0.8049, 'eval_samples_per_second': 70.819, 'eval_steps_per_second': 9.94, 'epoch': 5.0}
{'loss': 0.2791, 'grad_norm': 0.5054015517234802, 'learning_rate': 4.0365190992869936e-06, 'epoch': 5.121951219512195}
{'loss': 0.2642, 'grad_norm': 0.5433247089385986, 'learning_rate': 3.5122958396393316e-06, 'epoch': 5.365853658536586}
{'loss': 0.2564, 'grad_norm': 0.5769331455230713, 'learning_rate': 2.98807257999167e-06, 'epoch': 5.609756097560975}
{'loss': 0.277, 'grad_norm': 0.5189330577850342, 'learning_rate': 2.4638493203440087e-06, 'epoch': 5.853658536585366}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2708148658275604, 'eval_accuracy': 0.07017543859649122, 'eval_f1': 0.3576902989946468, 'eval_precision': 0.37638236630813515, 'eval_recall': 0.3652173913043478, 'eval_runtime': 0.8156, 'eval_samples_per_second': 69.886, 'eval_steps_per_second': 9.809, 'epoch': 6.0}
{'loss': 0.2649, 'grad_norm': 0.5248175263404846, 'learning_rate': 1.939626060696347e-06, 'epoch': 6.097560975609756}
{'loss': 0.2577, 'grad_norm': 0.47111445665359497, 'learning_rate': 1.415402801048686e-06, 'epoch': 6.341463414634147}
{'loss': 0.276, 'grad_norm': 0.7317503094673157, 'learning_rate': 8.911795414010244e-07, 'epoch': 6.585365853658536}
{'loss': 0.2584, 'grad_norm': 0.42553848028182983, 'learning_rate': 3.66956281753363e-07, 'epoch': 6.829268292682927}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2690815329551697, 'eval_accuracy': 0.07017543859649122, 'eval_f1': 0.3489855072463768, 'eval_precision': 0.378149386845039, 'eval_recall': 0.34782608695652173, 'eval_runtime': 1.02, 'eval_samples_per_second': 55.882, 'eval_steps_per_second': 7.843, 'epoch': 7.0}
{'train_runtime': 163.8384, 'train_samples_per_second': 13.8, 'train_steps_per_second': 1.752, 'train_loss': 0.3297698638995765, 'epoch': 7.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-06 21:42:36,836] Trial 0 finished with value: 0.3576902989946468 and parameters: {'learning_rate': 1.5045207551887883e-05, 'batch_size': 8, 'weight_decay': 0.024924814291503415, 'num_train_epochs': 7}. Best is trial 0 with value: 0.3576902989946468.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.2708148658275604, 'eval_accuracy': 0.07017543859649122, 'eval_f1': 0.3576902989946468, 'eval_precision': 0.37638236630813515, 'eval_recall': 0.3652173913043478, 'eval_runtime': 0.9165, 'eval_samples_per_second': 62.193, 'eval_steps_per_second': 8.729, 'epoch': 7.0}
{'loss': 0.6249, 'grad_norm': 1.8521456718444824, 'learning_rate': 1.2715284720255813e-05, 'epoch': 0.24390243902439024}
{'loss': 0.5452, 'grad_norm': 1.4289623498916626, 'learning_rate': 1.2558305896548952e-05, 'epoch': 0.4878048780487805}
{'loss': 0.4854, 'grad_norm': 1.1625077724456787, 'learning_rate': 1.2401327072842089e-05, 'epoch': 0.7317073170731707}
{'loss': 0.4362, 'grad_norm': 0.9356006979942322, 'learning_rate': 1.2244348249135228e-05, 'epoch': 0.975609756097561}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.40661126375198364, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.1899209486166008, 'eval_precision': 0.14660564454614797, 'eval_recall': 0.26956521739130435, 'eval_runtime': 0.832, 'eval_samples_per_second': 68.514, 'eval_steps_per_second': 9.616, 'epoch': 1.0}
{'loss': 0.4032, 'grad_norm': 0.8332029581069946, 'learning_rate': 1.2087369425428365e-05, 'epoch': 1.2195121951219512}
{'loss': 0.3761, 'grad_norm': 0.8375731110572815, 'learning_rate': 1.1930390601721505e-05, 'epoch': 1.4634146341463414}
{'loss': 0.345, 'grad_norm': 0.8064860701560974, 'learning_rate': 1.1773411778014643e-05, 'epoch': 1.7073170731707317}
{'loss': 0.3328, 'grad_norm': 0.7588661313056946, 'learning_rate': 1.1616432954307781e-05, 'epoch': 1.951219512195122}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.32170170545578003, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.21790704647676162, 'eval_precision': 0.3444099378881988, 'eval_recall': 0.2782608695652174, 'eval_runtime': 0.832, 'eval_samples_per_second': 68.506, 'eval_steps_per_second': 9.615, 'epoch': 2.0}
{'loss': 0.336, 'grad_norm': 0.5661579966545105, 'learning_rate': 1.1459454130600919e-05, 'epoch': 2.1951219512195124}
{'loss': 0.3061, 'grad_norm': 0.702339768409729, 'learning_rate': 1.1302475306894057e-05, 'epoch': 2.4390243902439024}
{'loss': 0.2927, 'grad_norm': 0.5678374171257019, 'learning_rate': 1.1145496483187195e-05, 'epoch': 2.682926829268293}
{'loss': 0.2998, 'grad_norm': 0.6521770358085632, 'learning_rate': 1.0988517659480333e-05, 'epoch': 2.926829268292683}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.28991949558258057, 'eval_accuracy': 0.07017543859649122, 'eval_f1': 0.2935071363748865, 'eval_precision': 0.3748531139835488, 'eval_recall': 0.2782608695652174, 'eval_runtime': 0.9007, 'eval_samples_per_second': 63.281, 'eval_steps_per_second': 8.882, 'epoch': 3.0}
{'loss': 0.2762, 'grad_norm': 0.6327891945838928, 'learning_rate': 1.083153883577347e-05, 'epoch': 3.1707317073170733}
{'loss': 0.2789, 'grad_norm': 0.5283322930335999, 'learning_rate': 1.067456001206661e-05, 'epoch': 3.4146341463414633}
{'loss': 0.2925, 'grad_norm': 0.7035195231437683, 'learning_rate': 1.0517581188359747e-05, 'epoch': 3.658536585365854}
{'loss': 0.2716, 'grad_norm': 0.5875888466835022, 'learning_rate': 1.0360602364652885e-05, 'epoch': 3.902439024390244}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2743663191795349, 'eval_accuracy': 0.12280701754385964, 'eval_f1': 0.36289855072463767, 'eval_precision': 0.3697151424287856, 'eval_recall': 0.3565217391304348, 'eval_runtime': 0.8135, 'eval_samples_per_second': 70.07, 'eval_steps_per_second': 9.834, 'epoch': 4.0}
{'loss': 0.2699, 'grad_norm': 0.5448892116546631, 'learning_rate': 1.0203623540946024e-05, 'epoch': 4.146341463414634}
{'loss': 0.2755, 'grad_norm': 0.7305415272712708, 'learning_rate': 1.0046644717239161e-05, 'epoch': 4.390243902439025}
{'loss': 0.2469, 'grad_norm': 0.6956807374954224, 'learning_rate': 9.8896658935323e-06, 'epoch': 4.634146341463414}
{'loss': 0.2578, 'grad_norm': 0.6369668245315552, 'learning_rate': 9.732687069825437e-06, 'epoch': 4.878048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2622288167476654, 'eval_accuracy': 0.21052631578947367, 'eval_f1': 0.4571551402940586, 'eval_precision': 0.555735310639254, 'eval_recall': 0.4434782608695652, 'eval_runtime': 0.8476, 'eval_samples_per_second': 67.25, 'eval_steps_per_second': 9.439, 'epoch': 5.0}
{'loss': 0.2645, 'grad_norm': 0.5458227396011353, 'learning_rate': 9.575708246118576e-06, 'epoch': 5.121951219512195}
{'loss': 0.2465, 'grad_norm': 0.6724509596824646, 'learning_rate': 9.418729422411713e-06, 'epoch': 5.365853658536586}
{'loss': 0.234, 'grad_norm': 0.6101060509681702, 'learning_rate': 9.261750598704852e-06, 'epoch': 5.609756097560975}
{'loss': 0.2535, 'grad_norm': 0.5306345224380493, 'learning_rate': 9.10477177499799e-06, 'epoch': 5.853658536585366}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24837373197078705, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5365121834687052, 'eval_precision': 0.5313791068266771, 'eval_recall': 0.5565217391304348, 'eval_runtime': 0.8468, 'eval_samples_per_second': 67.316, 'eval_steps_per_second': 9.448, 'epoch': 6.0}
{'loss': 0.2361, 'grad_norm': 0.5654795169830322, 'learning_rate': 8.947792951291128e-06, 'epoch': 6.097560975609756}
{'loss': 0.2256, 'grad_norm': 0.4803772270679474, 'learning_rate': 8.790814127584267e-06, 'epoch': 6.341463414634147}
{'loss': 0.2419, 'grad_norm': 0.6914535760879517, 'learning_rate': 8.633835303877404e-06, 'epoch': 6.585365853658536}
{'loss': 0.2217, 'grad_norm': 0.4377674460411072, 'learning_rate': 8.476856480170543e-06, 'epoch': 6.829268292682927}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2372838854789734, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.544594280399907, 'eval_precision': 0.5432712215320912, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.8473, 'eval_samples_per_second': 67.271, 'eval_steps_per_second': 9.442, 'epoch': 7.0}
{'loss': 0.2257, 'grad_norm': 0.7184051275253296, 'learning_rate': 8.31987765646368e-06, 'epoch': 7.073170731707317}
{'loss': 0.2188, 'grad_norm': 0.5485565066337585, 'learning_rate': 8.162898832756819e-06, 'epoch': 7.317073170731708}
{'loss': 0.2133, 'grad_norm': 0.5978813171386719, 'learning_rate': 8.005920009049956e-06, 'epoch': 7.560975609756097}
{'loss': 0.2196, 'grad_norm': 0.6786966919898987, 'learning_rate': 7.848941185343095e-06, 'epoch': 7.804878048780488}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22999761998653412, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5365091912918, 'eval_precision': 0.5352374980092371, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.8276, 'eval_samples_per_second': 68.87, 'eval_steps_per_second': 9.666, 'epoch': 8.0}
{'loss': 0.2034, 'grad_norm': 0.4760999083518982, 'learning_rate': 7.691962361636232e-06, 'epoch': 8.048780487804878}
{'loss': 0.2081, 'grad_norm': 0.5475614666938782, 'learning_rate': 7.534983537929371e-06, 'epoch': 8.292682926829269}
{'loss': 0.1933, 'grad_norm': 0.5573504567146301, 'learning_rate': 7.378004714222509e-06, 'epoch': 8.536585365853659}
{'loss': 0.2186, 'grad_norm': 0.572019100189209, 'learning_rate': 7.221025890515648e-06, 'epoch': 8.78048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22534623742103577, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5337675168118008, 'eval_precision': 0.5313951105255453, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.8424, 'eval_samples_per_second': 67.661, 'eval_steps_per_second': 9.496, 'epoch': 9.0}
{'loss': 0.191, 'grad_norm': 0.6812612414360046, 'learning_rate': 7.064047066808786e-06, 'epoch': 9.024390243902438}
{'loss': 0.1872, 'grad_norm': 0.6727004647254944, 'learning_rate': 6.907068243101924e-06, 'epoch': 9.268292682926829}
{'loss': 0.2063, 'grad_norm': 0.5973880887031555, 'learning_rate': 6.750089419395062e-06, 'epoch': 9.512195121951219}
{'loss': 0.1871, 'grad_norm': 0.4998486340045929, 'learning_rate': 6.5931105956882e-06, 'epoch': 9.75609756097561}
{'loss': 0.1868, 'grad_norm': 0.7320358753204346, 'learning_rate': 6.436131771981338e-06, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21838264167308807, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5384422420405155, 'eval_precision': 0.5319397993311037, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.8136, 'eval_samples_per_second': 70.06, 'eval_steps_per_second': 9.833, 'epoch': 10.0}
{'loss': 0.1865, 'grad_norm': 0.55742347240448, 'learning_rate': 6.279152948274476e-06, 'epoch': 10.24390243902439}
{'loss': 0.1811, 'grad_norm': 0.6102827787399292, 'learning_rate': 6.122174124567614e-06, 'epoch': 10.487804878048781}
{'loss': 0.1834, 'grad_norm': 0.4803445637226105, 'learning_rate': 5.965195300860753e-06, 'epoch': 10.731707317073171}
{'loss': 0.1816, 'grad_norm': 0.6883160471916199, 'learning_rate': 5.808216477153891e-06, 'epoch': 10.975609756097562}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21617087721824646, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5350531215577585, 'eval_precision': 0.5351165326427695, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.8397, 'eval_samples_per_second': 67.883, 'eval_steps_per_second': 9.527, 'epoch': 11.0}
{'loss': 0.1842, 'grad_norm': 0.4903252124786377, 'learning_rate': 5.651237653447029e-06, 'epoch': 11.21951219512195}
{'loss': 0.1731, 'grad_norm': 0.4589659571647644, 'learning_rate': 5.494258829740167e-06, 'epoch': 11.463414634146341}
{'loss': 0.17, 'grad_norm': 0.4281817376613617, 'learning_rate': 5.337280006033305e-06, 'epoch': 11.707317073170731}
{'loss': 0.1778, 'grad_norm': 0.5347049236297607, 'learning_rate': 5.180301182326443e-06, 'epoch': 11.951219512195122}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21234111487865448, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5376086675060467, 'eval_precision': 0.5392512077294686, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.8585, 'eval_samples_per_second': 66.392, 'eval_steps_per_second': 9.318, 'epoch': 12.0}
{'loss': 0.177, 'grad_norm': 0.5298570990562439, 'learning_rate': 5.023322358619581e-06, 'epoch': 12.195121951219512}
{'loss': 0.1731, 'grad_norm': 0.6081816554069519, 'learning_rate': 4.866343534912719e-06, 'epoch': 12.439024390243903}
{'loss': 0.166, 'grad_norm': 0.672978401184082, 'learning_rate': 4.709364711205857e-06, 'epoch': 12.682926829268293}
{'loss': 0.1728, 'grad_norm': 0.5292860865592957, 'learning_rate': 4.552385887498995e-06, 'epoch': 12.926829268292684}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21052488684654236, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5396828644501279, 'eval_precision': 0.5175523349436393, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.8425, 'eval_samples_per_second': 67.654, 'eval_steps_per_second': 9.495, 'epoch': 13.0}
{'loss': 0.1547, 'grad_norm': 0.396822452545166, 'learning_rate': 4.3954070637921335e-06, 'epoch': 13.170731707317072}
{'loss': 0.1705, 'grad_norm': 0.4511222839355469, 'learning_rate': 4.2384282400852715e-06, 'epoch': 13.414634146341463}
{'loss': 0.1617, 'grad_norm': 0.5696330666542053, 'learning_rate': 4.0814494163784095e-06, 'epoch': 13.658536585365853}
{'loss': 0.1584, 'grad_norm': 0.849224328994751, 'learning_rate': 3.9244705926715475e-06, 'epoch': 13.902439024390244}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20844051241874695, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.527122423143057, 'eval_precision': 0.5186956521739131, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.8723, 'eval_samples_per_second': 65.347, 'eval_steps_per_second': 9.171, 'epoch': 14.0}
{'loss': 0.1602, 'grad_norm': 0.5549571514129639, 'learning_rate': 3.7674917689646855e-06, 'epoch': 14.146341463414634}
{'loss': 0.1626, 'grad_norm': 0.42075976729393005, 'learning_rate': 3.610512945257824e-06, 'epoch': 14.390243902439025}
{'loss': 0.1546, 'grad_norm': 0.505168616771698, 'learning_rate': 3.453534121550962e-06, 'epoch': 14.634146341463415}
{'loss': 0.1654, 'grad_norm': 0.48735931515693665, 'learning_rate': 3.2965552978441e-06, 'epoch': 14.878048780487806}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2070668637752533, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5348852632507276, 'eval_precision': 0.5272141706924316, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.8246, 'eval_samples_per_second': 69.122, 'eval_steps_per_second': 9.701, 'epoch': 15.0}
{'loss': 0.1683, 'grad_norm': 0.4550390839576721, 'learning_rate': 3.139576474137238e-06, 'epoch': 15.121951219512194}
{'loss': 0.1528, 'grad_norm': 0.4837946891784668, 'learning_rate': 2.9825976504303764e-06, 'epoch': 15.365853658536585}
{'loss': 0.154, 'grad_norm': 0.38217392563819885, 'learning_rate': 2.8256188267235144e-06, 'epoch': 15.609756097560975}
{'loss': 0.1542, 'grad_norm': 0.40785932540893555, 'learning_rate': 2.6686400030166524e-06, 'epoch': 15.853658536585366}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2048388570547104, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5424749163879599, 'eval_precision': 0.5304347826086957, 'eval_recall': 0.5565217391304348, 'eval_runtime': 0.8585, 'eval_samples_per_second': 66.397, 'eval_steps_per_second': 9.319, 'epoch': 16.0}
{'loss': 0.1601, 'grad_norm': 0.5516440272331238, 'learning_rate': 2.5116611793097904e-06, 'epoch': 16.097560975609756}
{'loss': 0.1397, 'grad_norm': 0.4382997155189514, 'learning_rate': 2.3546823556029284e-06, 'epoch': 16.341463414634145}
{'loss': 0.1622, 'grad_norm': 0.7479804754257202, 'learning_rate': 2.1977035318960668e-06, 'epoch': 16.585365853658537}
{'loss': 0.1583, 'grad_norm': 0.44527921080589294, 'learning_rate': 2.0407247081892048e-06, 'epoch': 16.829268292682926}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20395483076572418, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5441806155745657, 'eval_precision': 0.5371980676328503, 'eval_recall': 0.5565217391304348, 'eval_runtime': 0.8287, 'eval_samples_per_second': 68.786, 'eval_steps_per_second': 9.654, 'epoch': 17.0}
{'loss': 0.1498, 'grad_norm': 0.4236690402030945, 'learning_rate': 1.8837458844823428e-06, 'epoch': 17.073170731707318}
{'loss': 0.1561, 'grad_norm': 0.5570951700210571, 'learning_rate': 1.726767060775481e-06, 'epoch': 17.317073170731707}
{'loss': 0.1471, 'grad_norm': 0.4658142626285553, 'learning_rate': 1.569788237068619e-06, 'epoch': 17.5609756097561}
{'loss': 0.1597, 'grad_norm': 0.5247901678085327, 'learning_rate': 1.4128094133617572e-06, 'epoch': 17.804878048780488}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20255666971206665, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5714785963661526, 'eval_precision': 0.6457326892109502, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.8282, 'eval_samples_per_second': 68.826, 'eval_steps_per_second': 9.66, 'epoch': 18.0}
{'loss': 0.1511, 'grad_norm': 0.3777806758880615, 'learning_rate': 1.2558305896548952e-06, 'epoch': 18.048780487804876}
{'loss': 0.1512, 'grad_norm': 0.5799666047096252, 'learning_rate': 1.0988517659480334e-06, 'epoch': 18.29268292682927}
{'loss': 0.1561, 'grad_norm': 0.6469480991363525, 'learning_rate': 9.418729422411714e-07, 'epoch': 18.536585365853657}
{'loss': 0.1439, 'grad_norm': 0.4024732708930969, 'learning_rate': 7.848941185343095e-07, 'epoch': 18.78048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2021360844373703, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5641272413909175, 'eval_precision': 0.6302795031055901, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.8339, 'eval_samples_per_second': 68.354, 'eval_steps_per_second': 9.594, 'epoch': 19.0}
{'loss': 0.156, 'grad_norm': 0.5140177607536316, 'learning_rate': 6.279152948274476e-07, 'epoch': 19.024390243902438}
{'loss': 0.1454, 'grad_norm': 0.4390653371810913, 'learning_rate': 4.709364711205857e-07, 'epoch': 19.26829268292683}
{'loss': 0.1557, 'grad_norm': 0.5110278129577637, 'learning_rate': 3.139576474137238e-07, 'epoch': 19.51219512195122}
{'loss': 0.1436, 'grad_norm': 0.4746552109718323, 'learning_rate': 1.569788237068619e-07, 'epoch': 19.75609756097561}
{'loss': 0.1553, 'grad_norm': 0.7272594571113586, 'learning_rate': 0.0, 'epoch': 20.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20206023752689362, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5641272413909175, 'eval_precision': 0.6302795031055901, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.8084, 'eval_samples_per_second': 70.513, 'eval_steps_per_second': 9.897, 'epoch': 20.0}
{'train_runtime': 440.8648, 'train_samples_per_second': 14.653, 'train_steps_per_second': 1.86, 'train_loss': 0.22160104033423633, 'epoch': 20.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-06 21:49:59,314] Trial 1 finished with value: 0.5714785963661526 and parameters: {'learning_rate': 1.2872263543962676e-05, 'batch_size': 8, 'weight_decay': 0.043983792603565455, 'num_train_epochs': 20}. Best is trial 1 with value: 0.5714785963661526.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)


{'eval_loss': 0.20255666971206665, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5714785963661526, 'eval_precision': 0.6457326892109502, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.8718, 'eval_samples_per_second': 65.384, 'eval_steps_per_second': 9.177, 'epoch': 20.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.6169, 'grad_norm': 1.6392138004302979, 'learning_rate': 2.9863955811325e-05, 'epoch': 0.47619047619047616}
{'loss': 0.4799, 'grad_norm': 1.1033445596694946, 'learning_rate': 2.888480971915041e-05, 'epoch': 0.9523809523809523}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.41197454929351807, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.30829402014535257, 'eval_precision': 0.3908603029312412, 'eval_recall': 0.34782608695652173, 'eval_runtime': 0.8416, 'eval_samples_per_second': 67.725, 'eval_steps_per_second': 4.753, 'epoch': 1.0}
{'loss': 0.3969, 'grad_norm': 1.1554583311080933, 'learning_rate': 2.7905663626975822e-05, 'epoch': 1.4285714285714286}
{'loss': 0.3362, 'grad_norm': 0.6224244236946106, 'learning_rate': 2.692651753480123e-05, 'epoch': 1.9047619047619047}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3125767707824707, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.2065876152832675, 'eval_precision': 0.34660564454614795, 'eval_recall': 0.2782608695652174, 'eval_runtime': 0.7839, 'eval_samples_per_second': 72.71, 'eval_steps_per_second': 5.102, 'epoch': 2.0}
{'loss': 0.3163, 'grad_norm': 0.4711335003376007, 'learning_rate': 2.594737144262664e-05, 'epoch': 2.380952380952381}
{'loss': 0.2886, 'grad_norm': 0.44019976258277893, 'learning_rate': 2.496822535045205e-05, 'epoch': 2.857142857142857}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2805408835411072, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.2551474262868566, 'eval_precision': 0.5579192546583851, 'eval_recall': 0.30434782608695654, 'eval_runtime': 0.8212, 'eval_samples_per_second': 69.407, 'eval_steps_per_second': 4.871, 'epoch': 3.0}
{'loss': 0.2591, 'grad_norm': 0.5234754085540771, 'learning_rate': 2.398907925827746e-05, 'epoch': 3.3333333333333335}
{'loss': 0.2761, 'grad_norm': 0.38741379976272583, 'learning_rate': 2.300993316610287e-05, 'epoch': 3.8095238095238093}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.26586681604385376, 'eval_accuracy': 0.10526315789473684, 'eval_f1': 0.3951030105556121, 'eval_precision': 0.5684321475625824, 'eval_recall': 0.33043478260869563, 'eval_runtime': 0.8234, 'eval_samples_per_second': 69.223, 'eval_steps_per_second': 4.858, 'epoch': 4.0}
{'loss': 0.2563, 'grad_norm': 0.4892849922180176, 'learning_rate': 2.203078707392828e-05, 'epoch': 4.285714285714286}
{'loss': 0.2469, 'grad_norm': 0.5733186602592468, 'learning_rate': 2.105164098175369e-05, 'epoch': 4.761904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2572014629840851, 'eval_accuracy': 0.21052631578947367, 'eval_f1': 0.532615848938276, 'eval_precision': 0.49604240371503794, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.8189, 'eval_samples_per_second': 69.607, 'eval_steps_per_second': 4.885, 'epoch': 5.0}
{'loss': 0.2527, 'grad_norm': 0.6137820482254028, 'learning_rate': 2.00724948895791e-05, 'epoch': 5.238095238095238}
{'loss': 0.2328, 'grad_norm': 0.45994192361831665, 'learning_rate': 1.9093348797404508e-05, 'epoch': 5.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24090589582920074, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5504358757787547, 'eval_precision': 0.5159610983981693, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.8143, 'eval_samples_per_second': 70.003, 'eval_steps_per_second': 4.912, 'epoch': 6.0}
{'loss': 0.2269, 'grad_norm': 0.40041056275367737, 'learning_rate': 1.811420270522992e-05, 'epoch': 6.190476190476191}
{'loss': 0.2223, 'grad_norm': 0.4929158687591553, 'learning_rate': 1.713505661305533e-05, 'epoch': 6.666666666666667}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.23126615583896637, 'eval_accuracy': 0.21052631578947367, 'eval_f1': 0.5357185742649594, 'eval_precision': 0.49646997553854655, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.8714, 'eval_samples_per_second': 65.413, 'eval_steps_per_second': 4.59, 'epoch': 7.0}
{'loss': 0.2104, 'grad_norm': 0.42931902408599854, 'learning_rate': 1.615591052088074e-05, 'epoch': 7.142857142857143}
{'loss': 0.2045, 'grad_norm': 0.4096581041812897, 'learning_rate': 1.5176764428706148e-05, 'epoch': 7.619047619047619}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2239779829978943, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5379472269944056, 'eval_precision': 0.5386585971228434, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.8135, 'eval_samples_per_second': 70.064, 'eval_steps_per_second': 4.917, 'epoch': 8.0}
{'loss': 0.1996, 'grad_norm': 0.4497872292995453, 'learning_rate': 1.4197618336531557e-05, 'epoch': 8.095238095238095}
{'loss': 0.1866, 'grad_norm': 0.45397230982780457, 'learning_rate': 1.3218472244356966e-05, 'epoch': 8.571428571428571}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21924719214439392, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5369718192618113, 'eval_precision': 0.5398362507058159, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.8265, 'eval_samples_per_second': 68.969, 'eval_steps_per_second': 4.84, 'epoch': 9.0}
{'loss': 0.1945, 'grad_norm': 0.42688244581222534, 'learning_rate': 1.2239326152182377e-05, 'epoch': 9.047619047619047}
{'loss': 0.1836, 'grad_norm': 0.42130613327026367, 'learning_rate': 1.1260180060007786e-05, 'epoch': 9.523809523809524}
{'loss': 0.1745, 'grad_norm': 0.671442449092865, 'learning_rate': 1.0281033967833197e-05, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21641580760478973, 'eval_accuracy': 0.21052631578947367, 'eval_f1': 0.5050899824812868, 'eval_precision': 0.5162418790604698, 'eval_recall': 0.5043478260869565, 'eval_runtime': 0.7874, 'eval_samples_per_second': 72.392, 'eval_steps_per_second': 5.08, 'epoch': 10.0}
{'loss': 0.1763, 'grad_norm': 1.1448967456817627, 'learning_rate': 9.301887875658606e-06, 'epoch': 10.476190476190476}
{'loss': 0.1749, 'grad_norm': 0.44198134541511536, 'learning_rate': 8.322741783484015e-06, 'epoch': 10.952380952380953}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2113368660211563, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5314688046047898, 'eval_precision': 0.5195902048975514, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.8703, 'eval_samples_per_second': 65.491, 'eval_steps_per_second': 4.596, 'epoch': 11.0}
{'loss': 0.172, 'grad_norm': 0.3741505444049835, 'learning_rate': 7.343595691309426e-06, 'epoch': 11.428571428571429}
{'loss': 0.1709, 'grad_norm': 0.42318010330200195, 'learning_rate': 6.364449599134836e-06, 'epoch': 11.904761904761905}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2092835009098053, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5363818090954523, 'eval_precision': 0.5448631239935589, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.879, 'eval_samples_per_second': 64.844, 'eval_steps_per_second': 4.55, 'epoch': 12.0}
{'loss': 0.1696, 'grad_norm': 0.4077048897743225, 'learning_rate': 5.385303506960245e-06, 'epoch': 12.380952380952381}
{'loss': 0.1632, 'grad_norm': 0.3547784090042114, 'learning_rate': 4.406157414785655e-06, 'epoch': 12.857142857142858}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20751170814037323, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5422641620366287, 'eval_precision': 0.5420059811364158, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.8123, 'eval_samples_per_second': 70.172, 'eval_steps_per_second': 4.924, 'epoch': 13.0}
{'loss': 0.1615, 'grad_norm': 0.3917281925678253, 'learning_rate': 3.4270113226110654e-06, 'epoch': 13.333333333333334}
{'loss': 0.1549, 'grad_norm': 0.4099784195423126, 'learning_rate': 2.4478652304364754e-06, 'epoch': 13.80952380952381}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20690369606018066, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5422641620366287, 'eval_precision': 0.5420059811364158, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.8194, 'eval_samples_per_second': 69.562, 'eval_steps_per_second': 4.882, 'epoch': 14.0}
{'loss': 0.1663, 'grad_norm': 0.2903214395046234, 'learning_rate': 1.4687191382618852e-06, 'epoch': 14.285714285714286}
{'loss': 0.1564, 'grad_norm': 0.3929115831851959, 'learning_rate': 4.895730460872951e-07, 'epoch': 14.761904761904763}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2067767232656479, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5422641620366287, 'eval_precision': 0.5420059811364158, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.7947, 'eval_samples_per_second': 71.724, 'eval_steps_per_second': 5.033, 'epoch': 15.0}
{'train_runtime': 328.2781, 'train_samples_per_second': 14.759, 'train_steps_per_second': 0.96, 'train_loss': 0.2385990129576789, 'epoch': 15.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-06 21:55:29,359] Trial 2 finished with value: 0.5504358757787547 and parameters: {'learning_rate': 3.084310190349959e-05, 'batch_size': 16, 'weight_decay': 0.05259304236981035, 'num_train_epochs': 15}. Best is trial 1 with value: 0.5714785963661526.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)


{'eval_loss': 0.24090589582920074, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5504358757787547, 'eval_precision': 0.5159610983981693, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.8561, 'eval_samples_per_second': 66.583, 'eval_steps_per_second': 4.672, 'epoch': 15.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.5947, 'grad_norm': 3.22655987739563, 'learning_rate': 1.803274295428823e-05, 'epoch': 0.47619047619047616}
{'loss': 0.4906, 'grad_norm': 1.2869023084640503, 'learning_rate': 1.7513067364827475e-05, 'epoch': 0.9523809523809523}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.435224324464798, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.4309257795344752, 'eval_precision': 0.3190928827084433, 'eval_recall': 0.6695652173913044, 'eval_runtime': 0.8716, 'eval_samples_per_second': 65.395, 'eval_steps_per_second': 4.589, 'epoch': 1.0}
{'loss': 0.4237, 'grad_norm': 1.0196051597595215, 'learning_rate': 1.6993391775366717e-05, 'epoch': 1.4285714285714286}
{'loss': 0.3714, 'grad_norm': 0.8093101978302002, 'learning_rate': 1.6473716185905963e-05, 'epoch': 1.9047619047619047}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.34689512848854065, 'eval_accuracy': 0.07017543859649122, 'eval_f1': 0.3305420666290232, 'eval_precision': 0.43941196865681986, 'eval_recall': 0.3652173913043478, 'eval_runtime': 0.8916, 'eval_samples_per_second': 63.928, 'eval_steps_per_second': 4.486, 'epoch': 2.0}
{'loss': 0.3499, 'grad_norm': 0.6352674961090088, 'learning_rate': 1.595404059644521e-05, 'epoch': 2.380952380952381}
{'loss': 0.3208, 'grad_norm': 0.5667080879211426, 'learning_rate': 1.543436500698445e-05, 'epoch': 2.857142857142857}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.30697914958000183, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.2969391814356751, 'eval_precision': 0.530301296720061, 'eval_recall': 0.3391304347826087, 'eval_runtime': 0.8188, 'eval_samples_per_second': 69.615, 'eval_steps_per_second': 4.885, 'epoch': 3.0}
{'loss': 0.2931, 'grad_norm': 0.6072192192077637, 'learning_rate': 1.4914689417523696e-05, 'epoch': 3.3333333333333335}
{'loss': 0.3013, 'grad_norm': 0.5053983330726624, 'learning_rate': 1.439501382806294e-05, 'epoch': 3.8095238095238093}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.28617921471595764, 'eval_accuracy': 0.12280701754385964, 'eval_f1': 0.39836662749706225, 'eval_precision': 0.4898846495119788, 'eval_recall': 0.40869565217391307, 'eval_runtime': 0.8472, 'eval_samples_per_second': 67.282, 'eval_steps_per_second': 4.722, 'epoch': 4.0}
{'loss': 0.2805, 'grad_norm': 0.43526777625083923, 'learning_rate': 1.3875338238602183e-05, 'epoch': 4.285714285714286}
{'loss': 0.2709, 'grad_norm': 0.5714895725250244, 'learning_rate': 1.3355662649141427e-05, 'epoch': 4.761904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.27376288175582886, 'eval_accuracy': 0.17543859649122806, 'eval_f1': 0.4263141402271837, 'eval_precision': 0.47466127401415575, 'eval_recall': 0.45217391304347826, 'eval_runtime': 0.7954, 'eval_samples_per_second': 71.663, 'eval_steps_per_second': 5.029, 'epoch': 5.0}
{'loss': 0.2727, 'grad_norm': 0.6962912082672119, 'learning_rate': 1.283598705968067e-05, 'epoch': 5.238095238095238}
{'loss': 0.2561, 'grad_norm': 0.4252919554710388, 'learning_rate': 1.2316311470219915e-05, 'epoch': 5.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.26098084449768066, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.507991718426501, 'eval_precision': 0.49317661743291036, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.8346, 'eval_samples_per_second': 68.3, 'eval_steps_per_second': 4.793, 'epoch': 6.0}
{'loss': 0.2564, 'grad_norm': 0.5151264071464539, 'learning_rate': 1.1796635880759159e-05, 'epoch': 6.190476190476191}
{'loss': 0.2508, 'grad_norm': 0.4914688467979431, 'learning_rate': 1.1276960291298403e-05, 'epoch': 6.666666666666667}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.25046175718307495, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.516701060271828, 'eval_precision': 0.4962773515203183, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.8599, 'eval_samples_per_second': 66.288, 'eval_steps_per_second': 4.652, 'epoch': 7.0}
{'loss': 0.2389, 'grad_norm': 0.4292461574077606, 'learning_rate': 1.0757284701837648e-05, 'epoch': 7.142857142857143}
{'loss': 0.2334, 'grad_norm': 0.40603625774383545, 'learning_rate': 1.0237609112376892e-05, 'epoch': 7.619047619047619}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24437500536441803, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.5220807453416149, 'eval_precision': 0.48602303976217015, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.8182, 'eval_samples_per_second': 69.669, 'eval_steps_per_second': 4.889, 'epoch': 8.0}
{'loss': 0.2326, 'grad_norm': 0.5165567398071289, 'learning_rate': 9.717933522916136e-06, 'epoch': 8.095238095238095}
{'loss': 0.2163, 'grad_norm': 0.43474626541137695, 'learning_rate': 9.19825793345538e-06, 'epoch': 8.571428571428571}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.23892885446548462, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5264084321475626, 'eval_precision': 0.5100437083045779, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.8743, 'eval_samples_per_second': 65.194, 'eval_steps_per_second': 4.575, 'epoch': 9.0}
{'loss': 0.2271, 'grad_norm': 0.537979781627655, 'learning_rate': 8.678582343994624e-06, 'epoch': 9.047619047619047}
{'loss': 0.2177, 'grad_norm': 0.5174124836921692, 'learning_rate': 8.158906754533868e-06, 'epoch': 9.523809523809524}
{'loss': 0.2077, 'grad_norm': 0.6444068551063538, 'learning_rate': 7.639231165073112e-06, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2307492345571518, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5420636564326072, 'eval_precision': 0.5079710144927536, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.825, 'eval_samples_per_second': 69.091, 'eval_steps_per_second': 4.849, 'epoch': 10.0}
{'loss': 0.2088, 'grad_norm': 0.44591087102890015, 'learning_rate': 7.119555575612356e-06, 'epoch': 10.476190476190476}
{'loss': 0.2036, 'grad_norm': 0.48867470026016235, 'learning_rate': 6.5998799861516e-06, 'epoch': 10.952380952380953}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22638612985610962, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5379667519181585, 'eval_precision': 0.5132685421994885, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.876, 'eval_samples_per_second': 65.069, 'eval_steps_per_second': 4.566, 'epoch': 11.0}
{'loss': 0.2017, 'grad_norm': 0.4495941996574402, 'learning_rate': 6.080204396690844e-06, 'epoch': 11.428571428571429}
{'loss': 0.201, 'grad_norm': 0.4635363817214966, 'learning_rate': 5.560528807230088e-06, 'epoch': 11.904761904761905}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22339047491550446, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5383167446284887, 'eval_precision': 0.5148107764299669, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.8667, 'eval_samples_per_second': 65.77, 'eval_steps_per_second': 4.615, 'epoch': 12.0}
{'loss': 0.1963, 'grad_norm': 0.3941754996776581, 'learning_rate': 5.040853217769332e-06, 'epoch': 12.380952380952381}
{'loss': 0.1909, 'grad_norm': 0.5072242021560669, 'learning_rate': 4.521177628308576e-06, 'epoch': 12.857142857142858}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22213126718997955, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.549703557312253, 'eval_precision': 0.514335403726708, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.8405, 'eval_samples_per_second': 67.819, 'eval_steps_per_second': 4.759, 'epoch': 13.0}
{'loss': 0.1908, 'grad_norm': 0.5034809708595276, 'learning_rate': 4.001502038847821e-06, 'epoch': 13.333333333333334}
{'loss': 0.1828, 'grad_norm': 0.4736959934234619, 'learning_rate': 3.481826449387065e-06, 'epoch': 13.80952380952381}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22013501822948456, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5481837804502003, 'eval_precision': 0.5188917306052856, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.8087, 'eval_samples_per_second': 70.481, 'eval_steps_per_second': 4.946, 'epoch': 14.0}
{'loss': 0.1913, 'grad_norm': 0.3745667338371277, 'learning_rate': 2.962150859926309e-06, 'epoch': 14.285714285714286}
{'loss': 0.1811, 'grad_norm': 0.4271334707736969, 'learning_rate': 2.442475270465553e-06, 'epoch': 14.761904761904763}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21869820356369019, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5431294326241134, 'eval_precision': 0.516969696969697, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8174, 'eval_samples_per_second': 69.73, 'eval_steps_per_second': 4.893, 'epoch': 15.0}
{'loss': 0.1922, 'grad_norm': 0.34628161787986755, 'learning_rate': 1.9227996810047972e-06, 'epoch': 15.238095238095237}
{'loss': 0.1836, 'grad_norm': 0.3464714586734772, 'learning_rate': 1.403124091544041e-06, 'epoch': 15.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21873806416988373, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5465336361669911, 'eval_precision': 0.5087729468599034, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.8107, 'eval_samples_per_second': 70.313, 'eval_steps_per_second': 4.934, 'epoch': 16.0}
{'loss': 0.1774, 'grad_norm': 0.3755621016025543, 'learning_rate': 8.83448502083285e-07, 'epoch': 16.19047619047619}
{'loss': 0.1834, 'grad_norm': 0.4409436285495758, 'learning_rate': 3.6377291262252915e-07, 'epoch': 16.666666666666668}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2181013822555542, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5529710144927537, 'eval_precision': 0.5202250639386189, 'eval_recall': 0.591304347826087, 'eval_runtime': 1.0136, 'eval_samples_per_second': 56.233, 'eval_steps_per_second': 3.946, 'epoch': 17.0}
{'train_runtime': 404.168, 'train_samples_per_second': 13.586, 'train_steps_per_second': 0.883, 'train_loss': 0.25551212201265394, 'epoch': 17.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-06 22:02:15,397] Trial 3 finished with value: 0.5529710144927537 and parameters: {'learning_rate': 1.8552418543748987e-05, 'batch_size': 16, 'weight_decay': 0.02582584046080346, 'num_train_epochs': 17}. Best is trial 1 with value: 0.5714785963661526.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)


{'eval_loss': 0.2181013822555542, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5529710144927537, 'eval_precision': 0.5202250639386189, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.9478, 'eval_samples_per_second': 60.14, 'eval_steps_per_second': 4.22, 'epoch': 17.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.6328, 'grad_norm': 1.5490176677703857, 'learning_rate': 2.81644341133394e-05, 'epoch': 0.47619047619047616}
{'loss': 0.4866, 'grad_norm': 1.1130201816558838, 'learning_rate': 2.7172728686813367e-05, 'epoch': 0.9523809523809523}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.41988325119018555, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.1899209486166008, 'eval_precision': 0.14660564454614797, 'eval_recall': 0.26956521739130435, 'eval_runtime': 0.8019, 'eval_samples_per_second': 71.085, 'eval_steps_per_second': 4.988, 'epoch': 1.0}
{'loss': 0.4028, 'grad_norm': 0.8565735816955566, 'learning_rate': 2.6181023260287328e-05, 'epoch': 1.4285714285714286}
{'loss': 0.3472, 'grad_norm': 0.7453985214233398, 'learning_rate': 2.5189317833761292e-05, 'epoch': 1.9047619047619047}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.32385018467903137, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.2073347472605161, 'eval_precision': 0.35328218243819265, 'eval_recall': 0.2608695652173913, 'eval_runtime': 0.807, 'eval_samples_per_second': 70.629, 'eval_steps_per_second': 4.956, 'epoch': 2.0}
{'loss': 0.3283, 'grad_norm': 0.5661817193031311, 'learning_rate': 2.419761240723526e-05, 'epoch': 2.380952380952381}
{'loss': 0.2983, 'grad_norm': 0.5099236965179443, 'learning_rate': 2.3205906980709224e-05, 'epoch': 2.857142857142857}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.28776803612709045, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.2251304347826087, 'eval_precision': 0.3776679841897233, 'eval_recall': 0.2608695652173913, 'eval_runtime': 0.8258, 'eval_samples_per_second': 69.02, 'eval_steps_per_second': 4.844, 'epoch': 3.0}
{'loss': 0.269, 'grad_norm': 0.5585386157035828, 'learning_rate': 2.2214201554183185e-05, 'epoch': 3.3333333333333335}
{'loss': 0.2789, 'grad_norm': 0.4393097460269928, 'learning_rate': 2.1222496127657153e-05, 'epoch': 3.8095238095238093}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.26576918363571167, 'eval_accuracy': 0.21052631578947367, 'eval_f1': 0.516397310304055, 'eval_precision': 0.5969565217391304, 'eval_recall': 0.46956521739130436, 'eval_runtime': 0.8258, 'eval_samples_per_second': 69.028, 'eval_steps_per_second': 4.844, 'epoch': 4.0}
{'loss': 0.2602, 'grad_norm': 0.40937912464141846, 'learning_rate': 2.0230790701131118e-05, 'epoch': 4.285714285714286}
{'loss': 0.2464, 'grad_norm': 0.5727545022964478, 'learning_rate': 1.9239085274605082e-05, 'epoch': 4.761904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2514083981513977, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.544545471888891, 'eval_precision': 0.5352151478440346, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.8154, 'eval_samples_per_second': 69.905, 'eval_steps_per_second': 4.906, 'epoch': 5.0}
{'loss': 0.2468, 'grad_norm': 0.5472311973571777, 'learning_rate': 1.824737984807905e-05, 'epoch': 5.238095238095238}
{'loss': 0.2274, 'grad_norm': 0.41908252239227295, 'learning_rate': 1.725567442155301e-05, 'epoch': 5.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.23824846744537354, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5589889579020013, 'eval_precision': 0.5316413043478261, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.8185, 'eval_samples_per_second': 69.641, 'eval_steps_per_second': 4.887, 'epoch': 6.0}
{'loss': 0.2222, 'grad_norm': 0.49700066447257996, 'learning_rate': 1.6263968995026975e-05, 'epoch': 6.190476190476191}
{'loss': 0.2215, 'grad_norm': 0.5045523643493652, 'learning_rate': 1.5272263568500943e-05, 'epoch': 6.666666666666667}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2296820878982544, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.550177358873011, 'eval_precision': 0.5084943242664381, 'eval_recall': 0.6, 'eval_runtime': 0.8431, 'eval_samples_per_second': 67.607, 'eval_steps_per_second': 4.744, 'epoch': 7.0}
{'loss': 0.2053, 'grad_norm': 0.4362693130970001, 'learning_rate': 1.4280558141974906e-05, 'epoch': 7.142857142857143}
{'loss': 0.2016, 'grad_norm': 0.4751265347003937, 'learning_rate': 1.3288852715448872e-05, 'epoch': 7.619047619047619}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22332744300365448, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5411052715400542, 'eval_precision': 0.5058351367302775, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.8163, 'eval_samples_per_second': 69.827, 'eval_steps_per_second': 4.9, 'epoch': 8.0}
{'loss': 0.1968, 'grad_norm': 0.38002756237983704, 'learning_rate': 1.2297147288922836e-05, 'epoch': 8.095238095238095}
{'loss': 0.185, 'grad_norm': 0.38527607917785645, 'learning_rate': 1.1305441862396801e-05, 'epoch': 8.571428571428571}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22198061645030975, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5414778822490062, 'eval_precision': 0.5006113749847766, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.8277, 'eval_samples_per_second': 68.863, 'eval_steps_per_second': 4.832, 'epoch': 9.0}
{'loss': 0.1946, 'grad_norm': 0.4569956958293915, 'learning_rate': 1.0313736435870765e-05, 'epoch': 9.047619047619047}
{'loss': 0.1867, 'grad_norm': 0.3783991038799286, 'learning_rate': 9.322031009344731e-06, 'epoch': 9.523809523809524}
{'loss': 0.1757, 'grad_norm': 0.5359641313552856, 'learning_rate': 8.330325582818696e-06, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21546678245067596, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5569155482198961, 'eval_precision': 0.5927916584694078, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.7925, 'eval_samples_per_second': 71.928, 'eval_steps_per_second': 5.048, 'epoch': 10.0}
{'loss': 0.1764, 'grad_norm': 0.3670463562011719, 'learning_rate': 7.33862015629266e-06, 'epoch': 10.476190476190476}
{'loss': 0.1744, 'grad_norm': 0.39282217621803284, 'learning_rate': 6.346914729766625e-06, 'epoch': 10.952380952380953}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21290139853954315, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5578397895702737, 'eval_precision': 0.6033897337045763, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.8341, 'eval_samples_per_second': 68.341, 'eval_steps_per_second': 4.796, 'epoch': 11.0}
{'loss': 0.1754, 'grad_norm': 0.3930921256542206, 'learning_rate': 5.35520930324059e-06, 'epoch': 11.428571428571429}
{'loss': 0.1732, 'grad_norm': 0.43673086166381836, 'learning_rate': 4.363503876714555e-06, 'epoch': 11.904761904761905}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2106935977935791, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5669489930359495, 'eval_precision': 0.6119732441471571, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.8025, 'eval_samples_per_second': 71.028, 'eval_steps_per_second': 4.984, 'epoch': 12.0}
{'loss': 0.1724, 'grad_norm': 0.41668710112571716, 'learning_rate': 3.3717984501885196e-06, 'epoch': 12.380952380952381}
{'loss': 0.1663, 'grad_norm': 0.4696139097213745, 'learning_rate': 2.3800930236624845e-06, 'epoch': 12.857142857142858}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21070252358913422, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.567681405224232, 'eval_precision': 0.6058216427500536, 'eval_recall': 0.6, 'eval_runtime': 0.8555, 'eval_samples_per_second': 66.624, 'eval_steps_per_second': 4.675, 'epoch': 13.0}
{'loss': 0.1649, 'grad_norm': 0.4025823473930359, 'learning_rate': 1.388387597136449e-06, 'epoch': 13.333333333333334}
{'loss': 0.1585, 'grad_norm': 0.3973471224308014, 'learning_rate': 3.9668217061041404e-07, 'epoch': 13.80952380952381}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2097598910331726, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5575900655184544, 'eval_precision': 0.601863354037267, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.7745, 'eval_samples_per_second': 73.597, 'eval_steps_per_second': 5.165, 'epoch': 14.0}
{'train_runtime': 308.0317, 'train_samples_per_second': 14.68, 'train_steps_per_second': 0.954, 'train_loss': 0.24662450341140332, 'epoch': 14.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-06 22:07:25,111] Trial 4 finished with value: 0.567681405224232 and parameters: {'learning_rate': 2.9156139539865435e-05, 'batch_size': 16, 'weight_decay': 0.06244378658367392, 'num_train_epochs': 14}. Best is trial 1 with value: 0.5714785963661526.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)


{'eval_loss': 0.21070252358913422, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.567681405224232, 'eval_precision': 0.6058216427500536, 'eval_recall': 0.6, 'eval_runtime': 0.839, 'eval_samples_per_second': 67.942, 'eval_steps_per_second': 4.768, 'epoch': 14.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.5734, 'grad_norm': 1.2222657203674316, 'learning_rate': 4.716483745291329e-05, 'epoch': 0.47619047619047616}
{'loss': 0.3999, 'grad_norm': 0.7061593532562256, 'learning_rate': 4.503068191205749e-05, 'epoch': 0.9523809523809523}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.34264567494392395, 'eval_accuracy': 0.12280701754385964, 'eval_f1': 0.3263553408480944, 'eval_precision': 0.2586734074823054, 'eval_recall': 0.4434782608695652, 'eval_runtime': 0.8252, 'eval_samples_per_second': 69.071, 'eval_steps_per_second': 4.847, 'epoch': 1.0}
{'loss': 0.3303, 'grad_norm': 0.5282391309738159, 'learning_rate': 4.2896526371201675e-05, 'epoch': 1.4285714285714286}
{'loss': 0.2875, 'grad_norm': 0.4599764943122864, 'learning_rate': 4.0762370830345876e-05, 'epoch': 1.9047619047619047}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.27971166372299194, 'eval_accuracy': 0.0, 'eval_f1': 0.2483049961310831, 'eval_precision': 0.3582230623818526, 'eval_recall': 0.26956521739130435, 'eval_runtime': 0.8138, 'eval_samples_per_second': 70.045, 'eval_steps_per_second': 4.915, 'epoch': 2.0}
{'loss': 0.2815, 'grad_norm': 0.4188167154788971, 'learning_rate': 3.862821528949007e-05, 'epoch': 2.380952380952381}
{'loss': 0.2554, 'grad_norm': 0.3707391023635864, 'learning_rate': 3.649405974863426e-05, 'epoch': 2.857142857142857}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.25696861743927, 'eval_accuracy': 0.21052631578947367, 'eval_f1': 0.5154687777975449, 'eval_precision': 0.4796423865989084, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8167, 'eval_samples_per_second': 69.791, 'eval_steps_per_second': 4.898, 'epoch': 3.0}
{'loss': 0.224, 'grad_norm': 0.41233029961586, 'learning_rate': 3.435990420777846e-05, 'epoch': 3.3333333333333335}
{'loss': 0.2403, 'grad_norm': 0.41115254163742065, 'learning_rate': 3.222574866692265e-05, 'epoch': 3.8095238095238093}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2339620143175125, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5431684096191104, 'eval_precision': 0.5164057677043831, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8502, 'eval_samples_per_second': 67.045, 'eval_steps_per_second': 4.705, 'epoch': 4.0}
{'loss': 0.2203, 'grad_norm': 0.5095391869544983, 'learning_rate': 3.009159312606685e-05, 'epoch': 4.285714285714286}
{'loss': 0.2051, 'grad_norm': 0.4743952453136444, 'learning_rate': 2.7957437585211044e-05, 'epoch': 4.761904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22434726357460022, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.5147264484136558, 'eval_precision': 0.4666727076022428, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8164, 'eval_samples_per_second': 69.821, 'eval_steps_per_second': 4.9, 'epoch': 5.0}
{'loss': 0.206, 'grad_norm': 0.5439714789390564, 'learning_rate': 2.582328204435524e-05, 'epoch': 5.238095238095238}
{'loss': 0.1862, 'grad_norm': 0.37870994210243225, 'learning_rate': 2.3689126503499434e-05, 'epoch': 5.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2136373370885849, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5355762594893029, 'eval_precision': 0.504804347826087, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8229, 'eval_samples_per_second': 69.268, 'eval_steps_per_second': 4.861, 'epoch': 6.0}
{'loss': 0.1808, 'grad_norm': 0.4047497510910034, 'learning_rate': 2.155497096264363e-05, 'epoch': 6.190476190476191}
{'loss': 0.1783, 'grad_norm': 0.4758144021034241, 'learning_rate': 1.9420815421787825e-05, 'epoch': 6.666666666666667}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2077130377292633, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5707064612406229, 'eval_precision': 0.6247491638795986, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.8528, 'eval_samples_per_second': 66.84, 'eval_steps_per_second': 4.691, 'epoch': 7.0}
{'loss': 0.1662, 'grad_norm': 0.37943968176841736, 'learning_rate': 1.7286659880932022e-05, 'epoch': 7.142857142857143}
{'loss': 0.1625, 'grad_norm': 0.366454541683197, 'learning_rate': 1.5152504340076214e-05, 'epoch': 7.619047619047619}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20199893414974213, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.563650838114131, 'eval_precision': 0.6131078904991949, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.8721, 'eval_samples_per_second': 65.357, 'eval_steps_per_second': 4.586, 'epoch': 8.0}
{'loss': 0.1604, 'grad_norm': 0.364711195230484, 'learning_rate': 1.301834879922041e-05, 'epoch': 8.095238095238095}
{'loss': 0.149, 'grad_norm': 0.33648669719696045, 'learning_rate': 1.0884193258364604e-05, 'epoch': 8.571428571428571}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20074966549873352, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5483320158102767, 'eval_precision': 0.6064412238325281, 'eval_recall': 0.5565217391304348, 'eval_runtime': 0.8122, 'eval_samples_per_second': 70.176, 'eval_steps_per_second': 4.925, 'epoch': 9.0}
{'loss': 0.16, 'grad_norm': 0.3103155195713043, 'learning_rate': 8.750037717508801e-06, 'epoch': 9.047619047619047}
{'loss': 0.1516, 'grad_norm': 0.33189257979393005, 'learning_rate': 6.615882176652996e-06, 'epoch': 9.523809523809524}
{'loss': 0.1443, 'grad_norm': 0.5270025134086609, 'learning_rate': 4.481726635797191e-06, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19734176993370056, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5636943346508564, 'eval_precision': 0.6289299794547171, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.8125, 'eval_samples_per_second': 70.15, 'eval_steps_per_second': 4.923, 'epoch': 10.0}
{'loss': 0.147, 'grad_norm': 0.3642146587371826, 'learning_rate': 2.3475710949413854e-06, 'epoch': 10.476190476190476}
{'loss': 0.1472, 'grad_norm': 0.4005231559276581, 'learning_rate': 2.1341555408558048e-07, 'epoch': 10.952380952380953}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19700433313846588, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5638981932434737, 'eval_precision': 0.6280713489409142, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.7975, 'eval_samples_per_second': 71.478, 'eval_steps_per_second': 5.016, 'epoch': 11.0}
{'train_runtime': 253.4286, 'train_samples_per_second': 14.02, 'train_steps_per_second': 0.911, 'train_loss': 0.2240632131244197, 'epoch': 11.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-06 22:11:40,323] Trial 5 finished with value: 0.5707064612406229 and parameters: {'learning_rate': 4.9298992993769094e-05, 'batch_size': 16, 'weight_decay': 0.010959592625894096, 'num_train_epochs': 11}. Best is trial 1 with value: 0.5714785963661526.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)


{'eval_loss': 0.2077130377292633, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5707064612406229, 'eval_precision': 0.6247491638795986, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.7854, 'eval_samples_per_second': 72.573, 'eval_steps_per_second': 5.093, 'epoch': 11.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.5854, 'grad_norm': 1.7651044130325317, 'learning_rate': 2.757218693230837e-05, 'epoch': 0.24390243902439024}
{'loss': 0.477, 'grad_norm': 1.2029367685317993, 'learning_rate': 2.680415944115772e-05, 'epoch': 0.4878048780487805}
{'loss': 0.396, 'grad_norm': 0.8982163071632385, 'learning_rate': 2.603613195000707e-05, 'epoch': 0.7317073170731707}
{'loss': 0.343, 'grad_norm': 0.816440999507904, 'learning_rate': 2.5268104458856417e-05, 'epoch': 0.975609756097561}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3206062316894531, 'eval_accuracy': 0.10526315789473684, 'eval_f1': 0.3486956521739131, 'eval_precision': 0.317673630717109, 'eval_recall': 0.4, 'eval_runtime': 0.8989, 'eval_samples_per_second': 63.408, 'eval_steps_per_second': 8.899, 'epoch': 1.0}
{'loss': 0.3166, 'grad_norm': 0.5671581029891968, 'learning_rate': 2.4500076967705768e-05, 'epoch': 1.2195121951219512}
{'loss': 0.3031, 'grad_norm': 0.7312307953834534, 'learning_rate': 2.3732049476555115e-05, 'epoch': 1.4634146341463414}
{'loss': 0.2726, 'grad_norm': 0.7340152859687805, 'learning_rate': 2.2964021985404466e-05, 'epoch': 1.7073170731707317}
{'loss': 0.2726, 'grad_norm': 0.5935947895050049, 'learning_rate': 2.2195994494253817e-05, 'epoch': 1.951219512195122}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2715437412261963, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.36065217391304344, 'eval_precision': 0.32475598935226263, 'eval_recall': 0.41739130434782606, 'eval_runtime': 0.8417, 'eval_samples_per_second': 67.717, 'eval_steps_per_second': 9.504, 'epoch': 2.0}
{'loss': 0.2845, 'grad_norm': 0.5034423470497131, 'learning_rate': 2.1427967003103165e-05, 'epoch': 2.1951219512195124}
{'loss': 0.2558, 'grad_norm': 0.4650406837463379, 'learning_rate': 2.0659939511952512e-05, 'epoch': 2.4390243902439024}
{'loss': 0.2455, 'grad_norm': 0.5275526642799377, 'learning_rate': 1.9891912020801863e-05, 'epoch': 2.682926829268293}
{'loss': 0.2541, 'grad_norm': 0.5431119799613953, 'learning_rate': 1.912388452965121e-05, 'epoch': 2.926829268292683}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.25050294399261475, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.4932173913043479, 'eval_precision': 0.4343159859464207, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8181, 'eval_samples_per_second': 69.671, 'eval_steps_per_second': 9.778, 'epoch': 3.0}
{'loss': 0.2219, 'grad_norm': 0.5803123116493225, 'learning_rate': 1.835585703850056e-05, 'epoch': 3.1707317073170733}
{'loss': 0.2308, 'grad_norm': 0.6108737587928772, 'learning_rate': 1.758782954734991e-05, 'epoch': 3.4146341463414633}
{'loss': 0.241, 'grad_norm': 0.7340725660324097, 'learning_rate': 1.6819802056199257e-05, 'epoch': 3.658536585365854}
{'loss': 0.2175, 'grad_norm': 0.5942587852478027, 'learning_rate': 1.6051774565048605e-05, 'epoch': 3.902439024390244}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22828388214111328, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5420843925863156, 'eval_precision': 0.5569105548235983, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.8276, 'eval_samples_per_second': 68.875, 'eval_steps_per_second': 9.667, 'epoch': 4.0}
{'loss': 0.2205, 'grad_norm': 0.5583654642105103, 'learning_rate': 1.5283747073897956e-05, 'epoch': 4.146341463414634}
{'loss': 0.2233, 'grad_norm': 0.6050778031349182, 'learning_rate': 1.4515719582747305e-05, 'epoch': 4.390243902439025}
{'loss': 0.1912, 'grad_norm': 0.6268308758735657, 'learning_rate': 1.3747692091596652e-05, 'epoch': 4.634146341463414}
{'loss': 0.1963, 'grad_norm': 0.6221045851707458, 'learning_rate': 1.2979664600446002e-05, 'epoch': 4.878048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21902357041835785, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5420289855072463, 'eval_precision': 0.50900395256917, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.8374, 'eval_samples_per_second': 68.068, 'eval_steps_per_second': 9.553, 'epoch': 5.0}
{'loss': 0.209, 'grad_norm': 0.4788699150085449, 'learning_rate': 1.2211637109295351e-05, 'epoch': 5.121951219512195}
{'loss': 0.1892, 'grad_norm': 0.5884864926338196, 'learning_rate': 1.14436096181447e-05, 'epoch': 5.365853658536586}
{'loss': 0.1774, 'grad_norm': 0.5860753059387207, 'learning_rate': 1.067558212699405e-05, 'epoch': 5.609756097560975}
{'loss': 0.1999, 'grad_norm': 0.6322353482246399, 'learning_rate': 9.907554635843399e-06, 'epoch': 5.853658536585366}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21436797082424164, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5455548821334013, 'eval_precision': 0.5183716965046888, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.8389, 'eval_samples_per_second': 67.948, 'eval_steps_per_second': 9.537, 'epoch': 6.0}
{'loss': 0.1809, 'grad_norm': 0.6453925371170044, 'learning_rate': 9.139527144692746e-06, 'epoch': 6.097560975609756}
{'loss': 0.1727, 'grad_norm': 0.3970801830291748, 'learning_rate': 8.371499653542096e-06, 'epoch': 6.341463414634147}
{'loss': 0.1917, 'grad_norm': 0.4274722933769226, 'learning_rate': 7.603472162391446e-06, 'epoch': 6.585365853658536}
{'loss': 0.1719, 'grad_norm': 0.4552880823612213, 'learning_rate': 6.835444671240794e-06, 'epoch': 6.829268292682927}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21056166291236877, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5446601487378797, 'eval_precision': 0.5207540674107392, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8558, 'eval_samples_per_second': 66.603, 'eval_steps_per_second': 9.348, 'epoch': 7.0}
{'loss': 0.1753, 'grad_norm': 0.44793906807899475, 'learning_rate': 6.067417180090143e-06, 'epoch': 7.073170731707317}
{'loss': 0.1712, 'grad_norm': 0.48537901043891907, 'learning_rate': 5.299389688939492e-06, 'epoch': 7.317073170731708}
{'loss': 0.1681, 'grad_norm': 0.5738776326179504, 'learning_rate': 4.531362197788841e-06, 'epoch': 7.560975609756097}
{'loss': 0.1759, 'grad_norm': 0.6045613288879395, 'learning_rate': 3.76333470663819e-06, 'epoch': 7.804878048780488}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20829695463180542, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5523185379553666, 'eval_precision': 0.5982588566827697, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8919, 'eval_samples_per_second': 63.907, 'eval_steps_per_second': 8.969, 'epoch': 8.0}
{'loss': 0.1598, 'grad_norm': 0.3721363842487335, 'learning_rate': 2.995307215487539e-06, 'epoch': 8.048780487804878}
{'loss': 0.1663, 'grad_norm': 0.4290274977684021, 'learning_rate': 2.227279724336888e-06, 'epoch': 8.292682926829269}
{'loss': 0.1555, 'grad_norm': 0.42076820135116577, 'learning_rate': 1.4592522331862371e-06, 'epoch': 8.536585365853659}
{'loss': 0.1851, 'grad_norm': 0.40477436780929565, 'learning_rate': 6.91224742035586e-07, 'epoch': 8.78048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20666572451591492, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5523453937366981, 'eval_precision': 0.5989878076834598, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8319, 'eval_samples_per_second': 68.519, 'eval_steps_per_second': 9.617, 'epoch': 9.0}
{'train_runtime': 217.3311, 'train_samples_per_second': 13.376, 'train_steps_per_second': 1.698, 'train_loss': 0.23684537539960246, 'epoch': 9.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-06 22:15:19,502] Trial 6 finished with value: 0.5523453937366981 and parameters: {'learning_rate': 2.8340214423459023e-05, 'batch_size': 8, 'weight_decay': 0.04708365307070549, 'num_train_epochs': 9}. Best is trial 1 with value: 0.5714785963661526.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.20666572451591492, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5523453937366981, 'eval_precision': 0.5989878076834598, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.9582, 'eval_samples_per_second': 59.485, 'eval_steps_per_second': 8.349, 'epoch': 9.0}
{'loss': 0.6383, 'grad_norm': 2.1204118728637695, 'learning_rate': 1.31787944676877e-05, 'epoch': 0.47619047619047616}
{'loss': 0.555, 'grad_norm': 1.2334527969360352, 'learning_rate': 1.2442548966699561e-05, 'epoch': 0.9523809523809523}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.5007266998291016, 'eval_accuracy': 0.10526315789473684, 'eval_f1': 0.41040432164869944, 'eval_precision': 0.3507833442301405, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.8336, 'eval_samples_per_second': 68.381, 'eval_steps_per_second': 4.799, 'epoch': 1.0}
{'loss': 0.4965, 'grad_norm': 1.8779569864273071, 'learning_rate': 1.1706303465711422e-05, 'epoch': 1.4285714285714286}
{'loss': 0.4505, 'grad_norm': 1.3459255695343018, 'learning_rate': 1.0970057964723282e-05, 'epoch': 1.9047619047619047}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.41584041714668274, 'eval_accuracy': 0.12280701754385964, 'eval_f1': 0.39973707257483065, 'eval_precision': 0.38655072463768114, 'eval_recall': 0.41739130434782606, 'eval_runtime': 0.833, 'eval_samples_per_second': 68.425, 'eval_steps_per_second': 4.802, 'epoch': 2.0}
{'loss': 0.4213, 'grad_norm': 0.9806808233261108, 'learning_rate': 1.0233812463735141e-05, 'epoch': 2.380952380952381}
{'loss': 0.3897, 'grad_norm': 1.0569047927856445, 'learning_rate': 9.497566962747001e-06, 'epoch': 2.857142857142857}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.36635449528694153, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.3697481140519115, 'eval_precision': 0.4465579710144928, 'eval_recall': 0.40869565217391307, 'eval_runtime': 0.8177, 'eval_samples_per_second': 69.709, 'eval_steps_per_second': 4.892, 'epoch': 3.0}
{'loss': 0.3568, 'grad_norm': 0.8101860284805298, 'learning_rate': 8.761321461758862e-06, 'epoch': 3.3333333333333335}
{'loss': 0.3566, 'grad_norm': 0.7103469371795654, 'learning_rate': 8.025075960770723e-06, 'epoch': 3.8095238095238093}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.33610862493515015, 'eval_accuracy': 0.07017543859649122, 'eval_f1': 0.3433841520798043, 'eval_precision': 0.32317182369913444, 'eval_recall': 0.3826086956521739, 'eval_runtime': 0.8227, 'eval_samples_per_second': 69.28, 'eval_steps_per_second': 4.862, 'epoch': 4.0}
{'loss': 0.3366, 'grad_norm': 0.7930057048797607, 'learning_rate': 7.288830459782583e-06, 'epoch': 4.285714285714286}
{'loss': 0.3252, 'grad_norm': 0.8388504981994629, 'learning_rate': 6.552584958794443e-06, 'epoch': 4.761904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3209420144557953, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.35253008735643676, 'eval_precision': 0.321371237458194, 'eval_recall': 0.391304347826087, 'eval_runtime': 0.8008, 'eval_samples_per_second': 71.176, 'eval_steps_per_second': 4.995, 'epoch': 5.0}
{'loss': 0.3245, 'grad_norm': 0.7586922645568848, 'learning_rate': 5.816339457806303e-06, 'epoch': 5.238095238095238}
{'loss': 0.3105, 'grad_norm': 0.5798925161361694, 'learning_rate': 5.080093956818163e-06, 'epoch': 5.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3093792200088501, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.33446280622945457, 'eval_precision': 0.3399068322981366, 'eval_recall': 0.3652173913043478, 'eval_runtime': 0.8261, 'eval_samples_per_second': 68.999, 'eval_steps_per_second': 4.842, 'epoch': 6.0}
{'loss': 0.3105, 'grad_norm': 0.6039041876792908, 'learning_rate': 4.343848455830024e-06, 'epoch': 6.190476190476191}
{'loss': 0.3067, 'grad_norm': 0.5893656015396118, 'learning_rate': 3.607602954841884e-06, 'epoch': 6.666666666666667}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.30285927653312683, 'eval_accuracy': 0.07017543859649122, 'eval_f1': 0.34186353820865695, 'eval_precision': 0.3552898550724638, 'eval_recall': 0.34782608695652173, 'eval_runtime': 0.8275, 'eval_samples_per_second': 68.882, 'eval_steps_per_second': 4.834, 'epoch': 7.0}
{'loss': 0.3013, 'grad_norm': 0.5513238906860352, 'learning_rate': 2.8713574538537447e-06, 'epoch': 7.142857142857143}
{'loss': 0.2977, 'grad_norm': 0.5338227152824402, 'learning_rate': 2.135111952865605e-06, 'epoch': 7.619047619047619}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.29964664578437805, 'eval_accuracy': 0.07017543859649122, 'eval_f1': 0.3286704473850031, 'eval_precision': 0.33984553775743703, 'eval_recall': 0.33043478260869563, 'eval_runtime': 0.8189, 'eval_samples_per_second': 69.608, 'eval_steps_per_second': 4.885, 'epoch': 8.0}
{'loss': 0.2992, 'grad_norm': 0.528145432472229, 'learning_rate': 1.3988664518774654e-06, 'epoch': 8.095238095238095}
{'loss': 0.2912, 'grad_norm': 0.5677812099456787, 'learning_rate': 6.626209508893257e-07, 'epoch': 8.571428571428571}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2982504963874817, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.339391565478522, 'eval_precision': 0.3555218906226329, 'eval_recall': 0.34782608695652173, 'eval_runtime': 0.9741, 'eval_samples_per_second': 58.513, 'eval_steps_per_second': 4.106, 'epoch': 9.0}
{'train_runtime': 198.5392, 'train_samples_per_second': 14.642, 'train_steps_per_second': 0.952, 'train_loss': 0.37256562142145067, 'epoch': 9.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-06 22:18:39,359] Trial 7 finished with value: 0.41040432164869944 and parameters: {'learning_rate': 1.391503996867584e-05, 'batch_size': 16, 'weight_decay': 0.025411723335780526, 'num_train_epochs': 9}. Best is trial 1 with value: 0.5714785963661526.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.5007266998291016, 'eval_accuracy': 0.10526315789473684, 'eval_f1': 0.41040432164869944, 'eval_precision': 0.3507833442301405, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.7984, 'eval_samples_per_second': 71.394, 'eval_steps_per_second': 5.01, 'epoch': 9.0}
{'loss': 0.6402, 'grad_norm': 2.8304035663604736, 'learning_rate': 1.760316899091995e-05, 'epoch': 0.24390243902439024}
{'loss': 0.5551, 'grad_norm': 2.5082173347473145, 'learning_rate': 1.7361367219066653e-05, 'epoch': 0.4878048780487805}
{'loss': 0.482, 'grad_norm': 1.349485993385315, 'learning_rate': 1.7119565447213358e-05, 'epoch': 0.7317073170731707}
{'loss': 0.4225, 'grad_norm': 0.936850368976593, 'learning_rate': 1.6877763675360062e-05, 'epoch': 0.975609756097561}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3848215639591217, 'eval_accuracy': 0.15789473684210525, 'eval_f1': 0.3226395197923427, 'eval_precision': 0.2842555994729908, 'eval_recall': 0.391304347826087, 'eval_runtime': 0.8957, 'eval_samples_per_second': 63.638, 'eval_steps_per_second': 8.932, 'epoch': 1.0}
{'loss': 0.3788, 'grad_norm': 0.7570070028305054, 'learning_rate': 1.6635961903506763e-05, 'epoch': 1.2195121951219512}
{'loss': 0.3504, 'grad_norm': 0.7776986360549927, 'learning_rate': 1.6394160131653467e-05, 'epoch': 1.4634146341463414}
{'loss': 0.317, 'grad_norm': 1.0428872108459473, 'learning_rate': 1.615235835980017e-05, 'epoch': 1.7073170731707317}
{'loss': 0.311, 'grad_norm': 0.6477923393249512, 'learning_rate': 1.5910556587946876e-05, 'epoch': 1.951219512195122}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2996155023574829, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.2946931899177088, 'eval_precision': 0.36869565217391304, 'eval_recall': 0.2608695652173913, 'eval_runtime': 0.899, 'eval_samples_per_second': 63.406, 'eval_steps_per_second': 8.899, 'epoch': 2.0}
{'loss': 0.3119, 'grad_norm': 0.5474733710289001, 'learning_rate': 1.566875481609358e-05, 'epoch': 2.1951219512195124}
{'loss': 0.2862, 'grad_norm': 0.5392085909843445, 'learning_rate': 1.5426953044240285e-05, 'epoch': 2.4390243902439024}
{'loss': 0.272, 'grad_norm': 0.540806770324707, 'learning_rate': 1.5185151272386987e-05, 'epoch': 2.682926829268293}
{'loss': 0.2808, 'grad_norm': 0.6649135947227478, 'learning_rate': 1.4943349500533692e-05, 'epoch': 2.926829268292683}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.27286794781684875, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.3702726652043173, 'eval_precision': 0.3639061421670117, 'eval_recall': 0.391304347826087, 'eval_runtime': 0.8199, 'eval_samples_per_second': 69.517, 'eval_steps_per_second': 9.757, 'epoch': 3.0}
{'loss': 0.253, 'grad_norm': 0.5725865364074707, 'learning_rate': 1.4701547728680398e-05, 'epoch': 3.1707317073170733}
{'loss': 0.2582, 'grad_norm': 0.5599718689918518, 'learning_rate': 1.44597459568271e-05, 'epoch': 3.4146341463414633}
{'loss': 0.2722, 'grad_norm': 0.6351391077041626, 'learning_rate': 1.4217944184973805e-05, 'epoch': 3.658536585365854}
{'loss': 0.2474, 'grad_norm': 0.5797377228736877, 'learning_rate': 1.397614241312051e-05, 'epoch': 3.902439024390244}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.25493189692497253, 'eval_accuracy': 0.12280701754385964, 'eval_f1': 0.3605465838509317, 'eval_precision': 0.37120772946859903, 'eval_recall': 0.3565217391304348, 'eval_runtime': 0.8406, 'eval_samples_per_second': 67.809, 'eval_steps_per_second': 9.517, 'epoch': 4.0}
{'loss': 0.2475, 'grad_norm': 0.6102083325386047, 'learning_rate': 1.3734340641267212e-05, 'epoch': 4.146341463414634}
{'loss': 0.2508, 'grad_norm': 0.7924773693084717, 'learning_rate': 1.3492538869413916e-05, 'epoch': 4.390243902439025}
{'loss': 0.2207, 'grad_norm': 0.7654867768287659, 'learning_rate': 1.3250737097560621e-05, 'epoch': 4.634146341463414}
{'loss': 0.2297, 'grad_norm': 0.8740519285202026, 'learning_rate': 1.3008935325707324e-05, 'epoch': 4.878048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.23994912207126617, 'eval_accuracy': 0.21052631578947367, 'eval_f1': 0.4732148211608481, 'eval_precision': 0.5440491875274485, 'eval_recall': 0.43478260869565216, 'eval_runtime': 0.8728, 'eval_samples_per_second': 65.308, 'eval_steps_per_second': 9.166, 'epoch': 5.0}
{'loss': 0.2384, 'grad_norm': 0.7252579927444458, 'learning_rate': 1.2767133553854028e-05, 'epoch': 5.121951219512195}
{'loss': 0.2211, 'grad_norm': 0.6861586570739746, 'learning_rate': 1.2525331782000734e-05, 'epoch': 5.365853658536586}
{'loss': 0.2032, 'grad_norm': 0.6651512384414673, 'learning_rate': 1.2283530010147437e-05, 'epoch': 5.609756097560975}
{'loss': 0.2307, 'grad_norm': 0.5432027578353882, 'learning_rate': 1.2041728238294141e-05, 'epoch': 5.853658536585366}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2294912338256836, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5385099933693285, 'eval_precision': 0.5572644402436463, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.8487, 'eval_samples_per_second': 67.161, 'eval_steps_per_second': 9.426, 'epoch': 6.0}
{'loss': 0.2102, 'grad_norm': 0.7476306557655334, 'learning_rate': 1.1799926466440845e-05, 'epoch': 6.097560975609756}
{'loss': 0.1964, 'grad_norm': 0.46954187750816345, 'learning_rate': 1.1558124694587548e-05, 'epoch': 6.341463414634147}
{'loss': 0.2155, 'grad_norm': 0.47234073281288147, 'learning_rate': 1.1316322922734253e-05, 'epoch': 6.585365853658536}
{'loss': 0.194, 'grad_norm': 0.6352190375328064, 'learning_rate': 1.1074521150880957e-05, 'epoch': 6.829268292682927}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2214280366897583, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5470161977834612, 'eval_precision': 0.5305893719806763, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.863, 'eval_samples_per_second': 66.051, 'eval_steps_per_second': 9.27, 'epoch': 7.0}
{'loss': 0.2017, 'grad_norm': 0.7847915291786194, 'learning_rate': 1.083271937902766e-05, 'epoch': 7.073170731707317}
{'loss': 0.1917, 'grad_norm': 0.49962472915649414, 'learning_rate': 1.0590917607174364e-05, 'epoch': 7.317073170731708}
{'loss': 0.1842, 'grad_norm': 0.6039749979972839, 'learning_rate': 1.034911583532107e-05, 'epoch': 7.560975609756097}
{'loss': 0.1896, 'grad_norm': 0.5583171844482422, 'learning_rate': 1.0107314063467773e-05, 'epoch': 7.804878048780488}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2137438803911209, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.551993482071213, 'eval_precision': 0.5337911044477761, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.857, 'eval_samples_per_second': 66.509, 'eval_steps_per_second': 9.335, 'epoch': 8.0}
{'loss': 0.1751, 'grad_norm': 0.4646868109703064, 'learning_rate': 9.865512291614477e-06, 'epoch': 8.048780487804878}
{'loss': 0.1778, 'grad_norm': 0.43526437878608704, 'learning_rate': 9.623710519761182e-06, 'epoch': 8.292682926829269}
{'loss': 0.1652, 'grad_norm': 0.48356887698173523, 'learning_rate': 9.381908747907884e-06, 'epoch': 8.536585365853659}
{'loss': 0.1945, 'grad_norm': 0.581371545791626, 'learning_rate': 9.140106976054589e-06, 'epoch': 8.78048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2139083743095398, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5383115928666683, 'eval_precision': 0.5541798941798941, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.8424, 'eval_samples_per_second': 67.662, 'eval_steps_per_second': 9.496, 'epoch': 9.0}
{'loss': 0.1647, 'grad_norm': 0.8566161394119263, 'learning_rate': 8.898305204201293e-06, 'epoch': 9.024390243902438}
{'loss': 0.16, 'grad_norm': 0.6281580328941345, 'learning_rate': 8.656503432347996e-06, 'epoch': 9.268292682926829}
{'loss': 0.1777, 'grad_norm': 0.5491612553596497, 'learning_rate': 8.4147016604947e-06, 'epoch': 9.512195121951219}
{'loss': 0.1594, 'grad_norm': 0.4544758200645447, 'learning_rate': 8.172899888641404e-06, 'epoch': 9.75609756097561}
{'loss': 0.1603, 'grad_norm': 0.7548204660415649, 'learning_rate': 7.931098116788109e-06, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20622894167900085, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5557453304295628, 'eval_precision': 0.631982076594553, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.8106, 'eval_samples_per_second': 70.321, 'eval_steps_per_second': 9.87, 'epoch': 10.0}
{'loss': 0.1597, 'grad_norm': 0.4765481948852539, 'learning_rate': 7.689296344934813e-06, 'epoch': 10.24390243902439}
{'loss': 0.1542, 'grad_norm': 0.743651807308197, 'learning_rate': 7.447494573081517e-06, 'epoch': 10.487804878048781}
{'loss': 0.1591, 'grad_norm': 0.6266592144966125, 'learning_rate': 7.20569280122822e-06, 'epoch': 10.731707317073171}
{'loss': 0.1575, 'grad_norm': 0.5296348929405212, 'learning_rate': 6.963891029374925e-06, 'epoch': 10.975609756097562}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2025391310453415, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.569067801763454, 'eval_precision': 0.6151368760064412, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.8412, 'eval_samples_per_second': 67.764, 'eval_steps_per_second': 9.511, 'epoch': 11.0}
{'loss': 0.1565, 'grad_norm': 0.4679372310638428, 'learning_rate': 6.722089257521628e-06, 'epoch': 11.21951219512195}
{'loss': 0.1463, 'grad_norm': 0.5101193785667419, 'learning_rate': 6.4802874856683326e-06, 'epoch': 11.463414634146341}
{'loss': 0.147, 'grad_norm': 0.40378519892692566, 'learning_rate': 6.238485713815037e-06, 'epoch': 11.707317073170731}
{'loss': 0.1542, 'grad_norm': 0.5168342590332031, 'learning_rate': 5.9966839419617405e-06, 'epoch': 11.951219512195122}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20014069974422455, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5620454869435625, 'eval_precision': 0.6171212684256162, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8288, 'eval_samples_per_second': 68.778, 'eval_steps_per_second': 9.653, 'epoch': 12.0}
{'loss': 0.1543, 'grad_norm': 0.6011658310890198, 'learning_rate': 5.754882170108444e-06, 'epoch': 12.195121951219512}
{'loss': 0.1492, 'grad_norm': 0.6057782769203186, 'learning_rate': 5.513080398255149e-06, 'epoch': 12.439024390243903}
{'loss': 0.1394, 'grad_norm': 0.4958629012107849, 'learning_rate': 5.271278626401853e-06, 'epoch': 12.682926829268293}
{'loss': 0.1473, 'grad_norm': 0.5136553645133972, 'learning_rate': 5.029476854548556e-06, 'epoch': 12.926829268292684}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19760851562023163, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5995164879922033, 'eval_precision': 0.6233890746934225, 'eval_recall': 0.6, 'eval_runtime': 0.8715, 'eval_samples_per_second': 65.403, 'eval_steps_per_second': 9.179, 'epoch': 13.0}
{'loss': 0.1309, 'grad_norm': 0.39552533626556396, 'learning_rate': 4.787675082695261e-06, 'epoch': 13.170731707317072}
{'loss': 0.1462, 'grad_norm': 0.4379895031452179, 'learning_rate': 4.545873310841965e-06, 'epoch': 13.414634146341463}
{'loss': 0.1399, 'grad_norm': 0.429123193025589, 'learning_rate': 4.304071538988669e-06, 'epoch': 13.658536585365853}
{'loss': 0.1309, 'grad_norm': 0.5479715466499329, 'learning_rate': 4.062269767135372e-06, 'epoch': 13.902439024390244}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19763115048408508, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5719470602210949, 'eval_precision': 0.6111755233494364, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.8418, 'eval_samples_per_second': 67.709, 'eval_steps_per_second': 9.503, 'epoch': 14.0}
{'loss': 0.1333, 'grad_norm': 0.4016105830669403, 'learning_rate': 3.8204679952820765e-06, 'epoch': 14.146341463414634}
{'loss': 0.1371, 'grad_norm': 0.4305097162723541, 'learning_rate': 3.578666223428781e-06, 'epoch': 14.390243902439025}
{'loss': 0.1309, 'grad_norm': 0.4578751027584076, 'learning_rate': 3.3368644515754844e-06, 'epoch': 14.634146341463415}
{'loss': 0.145, 'grad_norm': 0.4845619797706604, 'learning_rate': 3.095062679722189e-06, 'epoch': 14.878048780487806}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.195785254240036, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5846460537392214, 'eval_precision': 0.6223212838025433, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.9222, 'eval_samples_per_second': 61.809, 'eval_steps_per_second': 8.675, 'epoch': 15.0}
{'loss': 0.1444, 'grad_norm': 0.4434439539909363, 'learning_rate': 2.8532609078688928e-06, 'epoch': 15.121951219512194}
{'loss': 0.1287, 'grad_norm': 0.511618971824646, 'learning_rate': 2.6114591360155967e-06, 'epoch': 15.365853658536585}
{'loss': 0.1287, 'grad_norm': 0.39962515234947205, 'learning_rate': 2.3696573641623007e-06, 'epoch': 15.609756097560975}
{'loss': 0.1311, 'grad_norm': 0.45439258217811584, 'learning_rate': 2.1278555923090047e-06, 'epoch': 15.853658536585366}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19468188285827637, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.6075188541560601, 'eval_precision': 0.6345125727734423, 'eval_recall': 0.6086956521739131, 'eval_runtime': 0.87, 'eval_samples_per_second': 65.515, 'eval_steps_per_second': 9.195, 'epoch': 16.0}
{'loss': 0.137, 'grad_norm': 0.5144452452659607, 'learning_rate': 1.8860538204557088e-06, 'epoch': 16.097560975609756}
{'loss': 0.1157, 'grad_norm': 0.4068792164325714, 'learning_rate': 1.6442520486024128e-06, 'epoch': 16.341463414634145}
{'loss': 0.1409, 'grad_norm': 0.5618058443069458, 'learning_rate': 1.4024502767491167e-06, 'epoch': 16.585365853658537}
{'loss': 0.1359, 'grad_norm': 0.5008223652839661, 'learning_rate': 1.160648504895821e-06, 'epoch': 16.829268292682926}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19409558176994324, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6049430073032558, 'eval_precision': 0.6479068158228578, 'eval_recall': 0.6, 'eval_runtime': 0.838, 'eval_samples_per_second': 68.023, 'eval_steps_per_second': 9.547, 'epoch': 17.0}
{'loss': 0.129, 'grad_norm': 0.4465824067592621, 'learning_rate': 9.188467330425249e-07, 'epoch': 17.073170731707318}
{'loss': 0.1311, 'grad_norm': 0.465814471244812, 'learning_rate': 6.770449611892287e-07, 'epoch': 17.317073170731707}
{'loss': 0.1304, 'grad_norm': 0.3985486626625061, 'learning_rate': 4.352431893359328e-07, 'epoch': 17.5609756097561}
{'loss': 0.1383, 'grad_norm': 0.587925136089325, 'learning_rate': 1.934414174826368e-07, 'epoch': 17.804878048780488}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.1939162164926529, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6144986690328307, 'eval_precision': 0.6479068158228578, 'eval_recall': 0.6086956521739131, 'eval_runtime': 0.8158, 'eval_samples_per_second': 69.871, 'eval_steps_per_second': 9.806, 'epoch': 18.0}
{'train_runtime': 421.8703, 'train_samples_per_second': 13.781, 'train_steps_per_second': 1.749, 'train_loss': 0.20615096289290968, 'epoch': 18.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-06 22:25:42,752] Trial 8 finished with value: 0.6144986690328307 and parameters: {'learning_rate': 1.7844970762773244e-05, 'batch_size': 8, 'weight_decay': 0.06397417370345938, 'num_train_epochs': 18}. Best is trial 8 with value: 0.6144986690328307.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)


{'eval_loss': 0.1939162164926529, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6144986690328307, 'eval_precision': 0.6479068158228578, 'eval_recall': 0.6086956521739131, 'eval_runtime': 0.9937, 'eval_samples_per_second': 57.364, 'eval_steps_per_second': 8.051, 'epoch': 18.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.6145, 'grad_norm': 1.3440207242965698, 'learning_rate': 2.936543212123087e-05, 'epoch': 0.9090909090909091}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.5036166906356812, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.20504573687182384, 'eval_precision': 0.25822306238185255, 'eval_recall': 0.24347826086956523, 'eval_runtime': 9.4093, 'eval_samples_per_second': 6.058, 'eval_steps_per_second': 0.213, 'epoch': 1.0}
{'loss': 0.4653, 'grad_norm': 0.9059202671051025, 'learning_rate': 2.7967078210696066e-05, 'epoch': 1.8181818181818183}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3885294795036316, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.10152761457109284, 'eval_precision': 0.14448160535117055, 'eval_recall': 0.0782608695652174, 'eval_runtime': 9.5989, 'eval_samples_per_second': 5.938, 'eval_steps_per_second': 0.208, 'epoch': 2.0}
{'loss': 0.3808, 'grad_norm': 0.5918225049972534, 'learning_rate': 2.6568724300161264e-05, 'epoch': 2.7272727272727275}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.33473455905914307, 'eval_accuracy': 0.10526315789473684, 'eval_f1': 0.3069521630162363, 'eval_precision': 0.31147310441814136, 'eval_recall': 0.34782608695652173, 'eval_runtime': 8.8601, 'eval_samples_per_second': 6.433, 'eval_steps_per_second': 0.226, 'epoch': 3.0}
{'loss': 0.3273, 'grad_norm': 0.6141409277915955, 'learning_rate': 2.5170370389626462e-05, 'epoch': 3.6363636363636362}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.30387216806411743, 'eval_accuracy': 0.07017543859649122, 'eval_f1': 0.282295409786396, 'eval_precision': 0.5619778346121057, 'eval_recall': 0.30434782608695654, 'eval_runtime': 9.0835, 'eval_samples_per_second': 6.275, 'eval_steps_per_second': 0.22, 'epoch': 4.0}
{'loss': 0.2999, 'grad_norm': 0.43004944920539856, 'learning_rate': 2.3772016479091656e-05, 'epoch': 4.545454545454545}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2877822816371918, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.24769608551206124, 'eval_precision': 0.3557312252964427, 'eval_recall': 0.2956521739130435, 'eval_runtime': 8.9305, 'eval_samples_per_second': 6.383, 'eval_steps_per_second': 0.224, 'epoch': 5.0}
{'loss': 0.287, 'grad_norm': 0.4050551652908325, 'learning_rate': 2.2373662568556854e-05, 'epoch': 5.454545454545454}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2785607576370239, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.3783374299875085, 'eval_precision': 0.5340301003344482, 'eval_recall': 0.4, 'eval_runtime': 9.0781, 'eval_samples_per_second': 6.279, 'eval_steps_per_second': 0.22, 'epoch': 6.0}
{'loss': 0.2693, 'grad_norm': 0.390159010887146, 'learning_rate': 2.097530865802205e-05, 'epoch': 6.363636363636363}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2683936059474945, 'eval_accuracy': 0.19298245614035087, 'eval_f1': 0.46541177369154685, 'eval_precision': 0.5355800568615214, 'eval_recall': 0.45217391304347826, 'eval_runtime': 8.8201, 'eval_samples_per_second': 6.463, 'eval_steps_per_second': 0.227, 'epoch': 7.0}
{'loss': 0.2636, 'grad_norm': 0.39758163690567017, 'learning_rate': 1.9576954747487247e-05, 'epoch': 7.2727272727272725}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.26049402356147766, 'eval_accuracy': 0.19298245614035087, 'eval_f1': 0.4809918352572095, 'eval_precision': 0.549141876430206, 'eval_recall': 0.4782608695652174, 'eval_runtime': 9.3679, 'eval_samples_per_second': 6.085, 'eval_steps_per_second': 0.213, 'epoch': 8.0}
{'loss': 0.2524, 'grad_norm': 0.38471129536628723, 'learning_rate': 1.8178600836952444e-05, 'epoch': 8.181818181818182}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.25334417819976807, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5028749567982305, 'eval_precision': 0.5202078332513115, 'eval_recall': 0.5130434782608696, 'eval_runtime': 8.6619, 'eval_samples_per_second': 6.581, 'eval_steps_per_second': 0.231, 'epoch': 9.0}
{'loss': 0.2434, 'grad_norm': 0.36892959475517273, 'learning_rate': 1.678024692641764e-05, 'epoch': 9.090909090909092}
{'loss': 0.2323, 'grad_norm': 0.6713087558746338, 'learning_rate': 1.5381893015882837e-05, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24447523057460785, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.521247264276528, 'eval_precision': 0.5468670076726343, 'eval_recall': 0.5043478260869565, 'eval_runtime': 8.568, 'eval_samples_per_second': 6.653, 'eval_steps_per_second': 0.233, 'epoch': 10.0}
{'loss': 0.228, 'grad_norm': 0.42428290843963623, 'learning_rate': 1.3983539105348033e-05, 'epoch': 10.909090909090908}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.23858638107776642, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5533548137579245, 'eval_precision': 0.5607556035646774, 'eval_recall': 0.5478260869565217, 'eval_runtime': 8.745, 'eval_samples_per_second': 6.518, 'eval_steps_per_second': 0.229, 'epoch': 11.0}
{'loss': 0.229, 'grad_norm': 0.33285433053970337, 'learning_rate': 1.2585185194813231e-05, 'epoch': 11.818181818181818}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.23224829137325287, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5372023954428161, 'eval_precision': 0.5560527846444671, 'eval_recall': 0.5217391304347826, 'eval_runtime': 8.767, 'eval_samples_per_second': 6.502, 'eval_steps_per_second': 0.228, 'epoch': 12.0}
{'loss': 0.2143, 'grad_norm': 0.3587421476840973, 'learning_rate': 1.1186831284278427e-05, 'epoch': 12.727272727272727}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2302723079919815, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5535367324187199, 'eval_precision': 0.5279264214046823, 'eval_recall': 0.5826086956521739, 'eval_runtime': 8.7987, 'eval_samples_per_second': 6.478, 'eval_steps_per_second': 0.227, 'epoch': 13.0}
{'loss': 0.2062, 'grad_norm': 0.3958362936973572, 'learning_rate': 9.788477373743623e-06, 'epoch': 13.636363636363637}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22630448639392853, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5406113269895664, 'eval_precision': 0.5453982410504149, 'eval_recall': 0.5391304347826087, 'eval_runtime': 8.6155, 'eval_samples_per_second': 6.616, 'eval_steps_per_second': 0.232, 'epoch': 14.0}
{'loss': 0.208, 'grad_norm': 0.42338991165161133, 'learning_rate': 8.39012346320882e-06, 'epoch': 14.545454545454545}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22687995433807373, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5442938553597771, 'eval_precision': 0.556357318416815, 'eval_recall': 0.5391304347826087, 'eval_runtime': 8.8058, 'eval_samples_per_second': 6.473, 'eval_steps_per_second': 0.227, 'epoch': 15.0}
{'loss': 0.2024, 'grad_norm': 0.35116371512413025, 'learning_rate': 6.9917695526740165e-06, 'epoch': 15.454545454545455}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22216922044754028, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5536689491218965, 'eval_precision': 0.5290508591857918, 'eval_recall': 0.5826086956521739, 'eval_runtime': 8.68, 'eval_samples_per_second': 6.567, 'eval_steps_per_second': 0.23, 'epoch': 16.0}
{'loss': 0.1894, 'grad_norm': 0.3428152799606323, 'learning_rate': 5.5934156421392136e-06, 'epoch': 16.363636363636363}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2192031741142273, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5404755131799188, 'eval_precision': 0.5443590436029, 'eval_recall': 0.5391304347826087, 'eval_runtime': 8.7391, 'eval_samples_per_second': 6.522, 'eval_steps_per_second': 0.229, 'epoch': 17.0}
{'loss': 0.1947, 'grad_norm': 0.38962897658348083, 'learning_rate': 4.19506173160441e-06, 'epoch': 17.272727272727273}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22068464756011963, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5302533081285444, 'eval_precision': 0.5242277850973504, 'eval_recall': 0.5391304347826087, 'eval_runtime': 9.0177, 'eval_samples_per_second': 6.321, 'eval_steps_per_second': 0.222, 'epoch': 18.0}
{'loss': 0.1941, 'grad_norm': 0.3438587188720703, 'learning_rate': 2.7967078210696068e-06, 'epoch': 18.181818181818183}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21863435208797455, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5481041112097013, 'eval_precision': 0.5416325683312191, 'eval_recall': 0.5565217391304348, 'eval_runtime': 8.5551, 'eval_samples_per_second': 6.663, 'eval_steps_per_second': 0.234, 'epoch': 19.0}
{'loss': 0.1888, 'grad_norm': 0.3359123170375824, 'learning_rate': 1.3983539105348034e-06, 'epoch': 19.09090909090909}
{'loss': 0.1897, 'grad_norm': 0.7856618762016296, 'learning_rate': 0.0, 'epoch': 20.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21853137016296387, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5395859213250518, 'eval_precision': 0.5332847422442625, 'eval_recall': 0.5478260869565217, 'eval_runtime': 8.3859, 'eval_samples_per_second': 6.797, 'eval_steps_per_second': 0.238, 'epoch': 20.0}
{'train_runtime': 5463.8892, 'train_samples_per_second': 1.182, 'train_steps_per_second': 0.04, 'train_loss': 0.2672846062616868, 'epoch': 20.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-06 23:56:56,518] Trial 9 finished with value: 0.5536689491218965 and parameters: {'learning_rate': 3.0763786031765674e-05, 'batch_size': 32, 'weight_decay': 0.09516132044515489, 'num_train_epochs': 20}. Best is trial 8 with value: 0.6144986690328307.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.22216922044754028, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5536689491218965, 'eval_precision': 0.5290508591857918, 'eval_recall': 0.5826086956521739, 'eval_runtime': 8.9291, 'eval_samples_per_second': 6.384, 'eval_steps_per_second': 0.224, 'epoch': 20.0}
Best Hyperparameters: {'learning_rate': 1.7844970762773244e-05, 'batch_size': 8, 'weight_decay': 0.06397417370345938, 'num_train_epochs': 18}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                
  6%|▌         | 41/738 [00:19<03:37,  3.20it/s]

{'eval_loss': 0.38994768261909485, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.26134952004517226, 'eval_precision': 0.34660564454614795, 'eval_recall': 0.3130434782608696, 'eval_runtime': 0.7655, 'eval_samples_per_second': 74.461, 'eval_steps_per_second': 10.451, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                
 11%|█         | 82/738 [00:38<03:22,  3.23it/s]

{'eval_loss': 0.3041329085826874, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.2219209486166008, 'eval_precision': 0.34660564454614795, 'eval_recall': 0.28695652173913044, 'eval_runtime': 0.7628, 'eval_samples_per_second': 74.722, 'eval_steps_per_second': 10.487, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 17%|█▋        | 123/738 [01:13<03:09,  3.24it/s]

{'eval_loss': 0.27550244331359863, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.33591995211185616, 'eval_precision': 0.5692876965772432, 'eval_recall': 0.33043478260869563, 'eval_runtime': 0.7731, 'eval_samples_per_second': 73.729, 'eval_steps_per_second': 10.348, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 22%|██▏       | 164/738 [01:31<02:59,  3.19it/s]

{'eval_loss': 0.26017624139785767, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.508866189891091, 'eval_precision': 0.5623577710534232, 'eval_recall': 0.4782608695652174, 'eval_runtime': 0.773, 'eval_samples_per_second': 73.735, 'eval_steps_per_second': 10.349, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 28%|██▊       | 205/738 [01:48<02:45,  3.23it/s]

{'eval_loss': 0.24243062734603882, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.5068840579710145, 'eval_precision': 0.5383148425787107, 'eval_recall': 0.48695652173913045, 'eval_runtime': 0.7713, 'eval_samples_per_second': 73.899, 'eval_steps_per_second': 10.372, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 33%|███▎      | 246/738 [02:05<02:33,  3.20it/s]

{'eval_loss': 0.22822630405426025, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5507460885653381, 'eval_precision': 0.5544466403162056, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.7676, 'eval_samples_per_second': 74.258, 'eval_steps_per_second': 10.422, 'epoch': 6.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 39%|███▉      | 287/738 [02:38<02:21,  3.18it/s]

{'eval_loss': 0.22251224517822266, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.5140535169767362, 'eval_precision': 0.5789697542533081, 'eval_recall': 0.46956521739130436, 'eval_runtime': 0.7698, 'eval_samples_per_second': 74.044, 'eval_steps_per_second': 10.392, 'epoch': 7.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 44%|████▍     | 328/738 [03:08<02:08,  3.18it/s]

{'eval_loss': 0.2182871699333191, 'eval_accuracy': 0.21052631578947367, 'eval_f1': 0.5025641025641026, 'eval_precision': 0.5257871064467766, 'eval_recall': 0.4956521739130435, 'eval_runtime': 0.7809, 'eval_samples_per_second': 72.994, 'eval_steps_per_second': 10.245, 'epoch': 8.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 50%|█████     | 369/738 [03:25<01:54,  3.22it/s]

{'eval_loss': 0.21652628481388092, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5059146306898702, 'eval_precision': 0.5330905975583637, 'eval_recall': 0.4956521739130435, 'eval_runtime': 0.7901, 'eval_samples_per_second': 72.141, 'eval_steps_per_second': 10.125, 'epoch': 9.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 56%|█████▌    | 410/738 [03:42<01:41,  3.22it/s]

{'eval_loss': 0.20738543570041656, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5353744629661454, 'eval_precision': 0.5583516322646757, 'eval_recall': 0.5217391304347826, 'eval_runtime': 0.7889, 'eval_samples_per_second': 72.25, 'eval_steps_per_second': 10.14, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 61%|██████    | 451/738 [03:59<01:29,  3.21it/s]

{'eval_loss': 0.20501576364040375, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5246998432044208, 'eval_precision': 0.5256231787487898, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.7646, 'eval_samples_per_second': 74.553, 'eval_steps_per_second': 10.464, 'epoch': 11.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 67%|██████▋   | 492/738 [04:30<01:17,  3.16it/s]

{'eval_loss': 0.2040586769580841, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.5333590982286635, 'eval_precision': 0.5663115992038487, 'eval_recall': 0.5130434782608696, 'eval_runtime': 0.7758, 'eval_samples_per_second': 73.477, 'eval_steps_per_second': 10.313, 'epoch': 12.0}


 68%|██████▊   | 500/738 [04:45<02:48,  1.41it/s]

{'loss': 0.2406, 'grad_norm': 0.6044687032699585, 'learning_rate': 5.754882170108444e-06, 'epoch': 12.2}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 72%|███████▏  | 533/738 [04:58<01:04,  3.18it/s]

{'eval_loss': 0.2012057602405548, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.5474624468841646, 'eval_precision': 0.5732243280069367, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.7866, 'eval_samples_per_second': 72.463, 'eval_steps_per_second': 10.17, 'epoch': 13.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 78%|███████▊  | 574/738 [05:18<00:51,  3.21it/s]

{'eval_loss': 0.20087265968322754, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5302555994729907, 'eval_precision': 0.5578387393604785, 'eval_recall': 0.5130434782608696, 'eval_runtime': 0.776, 'eval_samples_per_second': 73.452, 'eval_steps_per_second': 10.309, 'epoch': 14.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 83%|████████▎ | 615/738 [05:35<00:38,  3.22it/s]

{'eval_loss': 0.1987634003162384, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5515748507470656, 'eval_precision': 0.5621159838551143, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.7797, 'eval_samples_per_second': 73.103, 'eval_steps_per_second': 10.26, 'epoch': 15.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 89%|████████▉ | 656/738 [06:08<00:25,  3.22it/s]

{'eval_loss': 0.19810059666633606, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5650036799781927, 'eval_precision': 0.6546583850931676, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.7754, 'eval_samples_per_second': 73.515, 'eval_steps_per_second': 10.318, 'epoch': 16.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 94%|█████████▍| 697/738 [06:25<00:12,  3.23it/s]

{'eval_loss': 0.19781076908111572, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.5685273529243283, 'eval_precision': 0.6637060459669155, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.7719, 'eval_samples_per_second': 73.843, 'eval_steps_per_second': 10.364, 'epoch': 17.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
100%|██████████| 738/738 [07:03<00:00,  3.26it/s]

{'eval_loss': 0.19724583625793457, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5652453676449977, 'eval_precision': 0.6566945319610726, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.985, 'eval_samples_per_second': 57.867, 'eval_steps_per_second': 8.122, 'epoch': 18.0}


100%|██████████| 738/738 [07:20<00:00,  1.68it/s]


{'train_runtime': 440.1007, 'train_samples_per_second': 13.211, 'train_steps_per_second': 1.677, 'train_loss': 0.20742459284257758, 'epoch': 18.0}
Final model trained with best hyperparameters.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 8/8 [00:00<00:00, 12.38it/s]


Validation Results (Final Model): {'eval_loss': 0.19781076908111572, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.5685273529243283, 'eval_precision': 0.6637060459669155, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.847, 'eval_samples_per_second': 67.295, 'eval_steps_per_second': 9.445, 'epoch': 18.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 6/6 [00:00<00:00, 26.03it/s]


Multi-label Test Results: {'eval_loss': 0.20181123912334442, 'eval_accuracy': 0.27906976744186046, 'eval_f1': 0.515648470770422, 'eval_precision': 0.5976733143399809, 'eval_recall': 0.45555555555555555, 'eval_runtime': 0.2811, 'eval_samples_per_second': 152.997, 'eval_steps_per_second': 21.348, 'epoch': 18.0}
Single-category exact-match accuracy on the test set: 0.7441860465116279
