In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import optuna
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Names of the label columns selected from each CSV split to build the
# multi-label targets; the length of this list is also passed to the model
# as `num_labels`. The strings are used verbatim as DataFrame column keys,
# so they have to match the CSV headers character for character.
category_columns = [
    "Unlawful detention",
    "Human trafficking",
    "Enslavement",
    "Willful killing of civilians",
    "Mass execution",
    "Kidnapping",
    "Extrajudicial killing",
    "Forced disappearance",
    "Damage or destruction of civilian critical infrastructure",
    "Damage or destruction, looting, or theft of cultural heritage",
    "Military operations (battle, shelling)",
    "Gender-based or other conflict-related sexual violence",
    "Violent crackdowns on protesters/opponents/civil rights abuse",
    "Indiscriminate use of weapons",
    "Torture or indications of torture",
    "Persecution based on political, racial, ethnic, gender, or sexual orientation",
    "Movement of military, paramilitary, or other troops and equipment"
]

# 2) Custom Dataset class for articles
class ArticleDataset(Dataset):
    """Map-style dataset pairing tokenizer output with per-example label rows.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example.
        labels: Indexable collection holding one label row per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label collection defines how many examples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example *idx* as a dict of tensors, with a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are converted to floating point regardless of stored dtype.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Read the train / validation / test splits from CSV files in the working directory
train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "val.csv", "test.csv")
)

# Tokenizer matching the pretrained checkpoint used for the model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def _encode_narratives(frame):
    """Tokenize the "Incident Narrative" column of *frame* with truncation and padding on."""
    narratives = list(frame["Incident Narrative"].values)
    return tokenizer(narratives, truncation=True, padding=True)


# Tokenize the text column of every split
train_encodings = _encode_narratives(train_df)
val_encodings = _encode_narratives(val_df)
test_encodings = _encode_narratives(test_df)

# Multi-label targets: one column per entry in category_columns
train_labels = train_df[category_columns].values
val_labels = val_df[category_columns].values
test_labels = test_df[category_columns].values

# Wrap encodings and targets so the Trainer can index them
train_dataset = ArticleDataset(train_encodings, train_labels)
val_dataset = ArticleDataset(val_encodings, val_labels)
test_dataset = ArticleDataset(test_encodings, test_labels)

# Define compute_metrics for multi-label classification
def compute_metrics(p):
    """Compute multi-label evaluation metrics from raw model outputs.

    Args:
        p: Evaluation result exposing ``predictions`` (raw logits) and
            ``label_ids`` (ground-truth label rows), as handed to
            ``compute_metrics`` by the ``Trainer``.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 are support-weighted averages over labels.
    """
    # Each label is an independent binary decision: sigmoid, then threshold at 0.5.
    probabilities = torch.sigmoid(torch.tensor(p.predictions))
    preds = (probabilities > 0.5).int().cpu().numpy()
    # No tensor round-trip needed for the labels; a NumPy view is enough.
    labels = np.asarray(p.label_ids)

    # zero_division=0 yields the same 0.0 score as the default for labels that
    # have no predicted (or no true) positives, but without emitting an
    # UndefinedMetricWarning on every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        preds,
        average='weighted',
        zero_division=0
    )
    # NOTE(review): for 2-D indicator input, sklearn documents accuracy_score
    # as exact-match (subset) accuracy, so expect it to sit well below F1.
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Optuna hyperparameter optimization
def objective(trial):
    """Optuna objective: fine-tune BERT once and return its validation F1.

    Samples learning rate, batch size, weight decay and epoch count from
    *trial*, trains a fresh ``bert-base-uncased`` classifier on the
    module-level ``train_dataset``, evaluates on ``val_dataset`` after every
    epoch, reloads the best epoch (by F1) and returns that F1.

    Args:
        trial: The ``optuna`` trial supplying hyperparameter suggestions.

    Returns:
        The ``eval_f1`` value reported by the final ``trainer.evaluate()`` call.
    """
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform;
    # parameter names and ranges are unchanged, so study.best_params keeps its keys.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32])
    weight_decay = trial.suggest_float('weight_decay', 0.01, 0.1)
    num_train_epochs = trial.suggest_int('num_train_epochs', 5, 20)

    # Every trial starts from the same pretrained weights with a new classifier head.
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=len(category_columns)
    )

    # Each trial gets its own output/log directory so checkpoints from
    # different trials never mix, and save_total_limit bounds disk usage
    # (the best checkpoint is still retained for load_best_model_at_end).
    training_args = TrainingArguments(
        output_dir=f'./results/trial_{trial.number}',
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir=f'./logs/trial_{trial.number}',
        logging_steps=10,
        disable_tqdm=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train, then score the reloaded best checkpoint on the validation split.
    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results["eval_f1"]

# Search the hyperparameter space; a higher objective value (validation F1) is better
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Report the winning configuration
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Start again from the pretrained checkpoint for the final run
final_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(category_columns),
)

# Same evaluation/checkpoint policy as the search, driven by the best parameters
final_training_args = TrainingArguments(
    output_dir='./final_results',
    logging_dir='./final_logs',
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    learning_rate=best_params['learning_rate'],
    weight_decay=best_params['weight_decay'],
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=best_params['batch_size'],
    per_device_eval_batch_size=best_params['batch_size'],
)

trainer = Trainer(
    model=final_model,
    args=final_training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

# Score the trained model on the held-out test split
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Final Test Results:", test_results)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-01-03 10:55:30,218] A new study created in memory with name: no-name-d6c61be0-db6f-4ff7-bdcf-7540abe5c331
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.6259, 'grad_norm': 1.746819019317627, 'learning_rate': 1.89132482280482e-05, 'epoch': 0.24390243902439024}
{'loss': 0.5156, 'grad_norm': 1.785358190536499, 'learning_rate': 1.860063255485732e-05, 'epoch': 0.4878048780487805}
{'loss': 0.4391, 'grad_norm': 0.9706712961196899, 'learning_rate': 1.828801688166644e-05, 'epoch': 0.7317073170731707}
{'loss': 0.3827, 'grad_norm': 0.8255770802497864, 'learning_rate': 1.797540120847556e-05, 'epoch': 0.975609756097561}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.35463032126426697, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.046153846153846156, 'eval_precision': 0.2, 'eval_recall': 0.02608695652173913, 'eval_runtime': 0.7783, 'eval_samples_per_second': 73.232, 'eval_steps_per_second': 10.278, 'epoch': 1.0}
{'loss': 0.3484, 'grad_norm': 0.6654301285743713, 'learning_rate': 1.7662785535284682e-05, 'epoch': 1.2195121951219512}
{'loss': 0.3284, 'grad_norm': 0.7192702889442444, 'learning_rate': 1.7350169862093803e-05, 'epoch': 1.4634146341463414}
{'loss': 0.2982, 'grad_norm': 0.7436495423316956, 'learning_rate': 1.7037554188902925e-05, 'epoch': 1.7073170731707317}
{'loss': 0.2945, 'grad_norm': 0.5854768753051758, 'learning_rate': 1.6724938515712043e-05, 'epoch': 1.951219512195122}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.28804823756217957, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.2869466215658774, 'eval_precision': 0.3297504025764895, 'eval_recall': 0.33043478260869563, 'eval_runtime': 0.7214, 'eval_samples_per_second': 79.012, 'eval_steps_per_second': 11.089, 'epoch': 2.0}
{'loss': 0.3018, 'grad_norm': 0.4946359395980835, 'learning_rate': 1.6412322842521164e-05, 'epoch': 2.1951219512195124}
{'loss': 0.2745, 'grad_norm': 0.5020418167114258, 'learning_rate': 1.6099707169330286e-05, 'epoch': 2.4390243902439024}
{'loss': 0.2637, 'grad_norm': 0.5410714149475098, 'learning_rate': 1.5787091496139407e-05, 'epoch': 2.682926829268293}
{'loss': 0.2744, 'grad_norm': 0.6675539612770081, 'learning_rate': 1.5474475822948528e-05, 'epoch': 2.926829268292683}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.26915597915649414, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.3755430787424225, 'eval_precision': 0.32166351606805293, 'eval_recall': 0.45217391304347826, 'eval_runtime': 0.7349, 'eval_samples_per_second': 77.565, 'eval_steps_per_second': 10.886, 'epoch': 3.0}
{'loss': 0.2452, 'grad_norm': 0.5659387707710266, 'learning_rate': 1.5161860149757646e-05, 'epoch': 3.1707317073170733}
{'loss': 0.2496, 'grad_norm': 0.5445433259010315, 'learning_rate': 1.4849244476566768e-05, 'epoch': 3.4146341463414633}
{'loss': 0.2672, 'grad_norm': 0.6808682680130005, 'learning_rate': 1.4536628803375889e-05, 'epoch': 3.658536585365854}
{'loss': 0.2426, 'grad_norm': 0.5887702107429504, 'learning_rate': 1.422401313018501e-05, 'epoch': 3.902439024390244}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24899336695671082, 'eval_accuracy': 0.15789473684210525, 'eval_f1': 0.41847826086956524, 'eval_precision': 0.4636521739130435, 'eval_recall': 0.40869565217391307, 'eval_runtime': 0.7153, 'eval_samples_per_second': 79.685, 'eval_steps_per_second': 11.184, 'epoch': 4.0}
{'loss': 0.2436, 'grad_norm': 0.7527567744255066, 'learning_rate': 1.3911397456994132e-05, 'epoch': 4.146341463414634}
{'loss': 0.2481, 'grad_norm': 0.7142419219017029, 'learning_rate': 1.359878178380325e-05, 'epoch': 4.390243902439025}
{'loss': 0.2143, 'grad_norm': 0.700531542301178, 'learning_rate': 1.3286166110612371e-05, 'epoch': 4.634146341463414}
{'loss': 0.225, 'grad_norm': 0.9164631962776184, 'learning_rate': 1.2973550437421492e-05, 'epoch': 4.878048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2355944812297821, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.4897005742411813, 'eval_precision': 0.47834482758620683, 'eval_recall': 0.5043478260869565, 'eval_runtime': 0.7161, 'eval_samples_per_second': 79.6, 'eval_steps_per_second': 11.172, 'epoch': 5.0}
{'loss': 0.2343, 'grad_norm': 0.6274406313896179, 'learning_rate': 1.2660934764230614e-05, 'epoch': 5.121951219512195}
{'loss': 0.2159, 'grad_norm': 0.579987108707428, 'learning_rate': 1.2348319091039735e-05, 'epoch': 5.365853658536586}
{'loss': 0.2021, 'grad_norm': 0.7900787591934204, 'learning_rate': 1.2035703417848853e-05, 'epoch': 5.609756097560975}
{'loss': 0.2246, 'grad_norm': 0.6370137333869934, 'learning_rate': 1.1723087744657974e-05, 'epoch': 5.853658536585366}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22576206922531128, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5144366967459667, 'eval_precision': 0.5020329835082459, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.7201, 'eval_samples_per_second': 79.152, 'eval_steps_per_second': 11.109, 'epoch': 6.0}
{'loss': 0.2047, 'grad_norm': 0.7535466551780701, 'learning_rate': 1.1410472071467096e-05, 'epoch': 6.097560975609756}
{'loss': 0.192, 'grad_norm': 0.4425143897533417, 'learning_rate': 1.1097856398276217e-05, 'epoch': 6.341463414634147}
{'loss': 0.2138, 'grad_norm': 0.7058950066566467, 'learning_rate': 1.0785240725085338e-05, 'epoch': 6.585365853658536}
{'loss': 0.1897, 'grad_norm': 0.5337439179420471, 'learning_rate': 1.0472625051894456e-05, 'epoch': 6.829268292682927}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21992094814777374, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5182532684706598, 'eval_precision': 0.4861915683524462, 'eval_recall': 0.5565217391304348, 'eval_runtime': 0.7354, 'eval_samples_per_second': 77.514, 'eval_steps_per_second': 10.879, 'epoch': 7.0}
{'loss': 0.1964, 'grad_norm': 0.576740026473999, 'learning_rate': 1.0160009378703578e-05, 'epoch': 7.073170731707317}
{'loss': 0.1894, 'grad_norm': 0.5051959156990051, 'learning_rate': 9.8473937055127e-06, 'epoch': 7.317073170731708}
{'loss': 0.1843, 'grad_norm': 0.5865015983581543, 'learning_rate': 9.534778032321819e-06, 'epoch': 7.560975609756097}
{'loss': 0.1897, 'grad_norm': 0.5944710373878479, 'learning_rate': 9.22216235913094e-06, 'epoch': 7.804878048780488}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21423928439617157, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5141642512077295, 'eval_precision': 0.48644122383252814, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.7083, 'eval_samples_per_second': 80.479, 'eval_steps_per_second': 11.295, 'epoch': 8.0}
{'loss': 0.1719, 'grad_norm': 0.5169413089752197, 'learning_rate': 8.909546685940062e-06, 'epoch': 8.048780487804878}
{'loss': 0.1781, 'grad_norm': 0.449830025434494, 'learning_rate': 8.596931012749181e-06, 'epoch': 8.292682926829269}
{'loss': 0.1655, 'grad_norm': 0.4815899431705475, 'learning_rate': 8.284315339558303e-06, 'epoch': 8.536585365853659}
{'loss': 0.195, 'grad_norm': 0.6125132441520691, 'learning_rate': 7.971699666367422e-06, 'epoch': 8.78048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21314489841461182, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.5184903874593316, 'eval_precision': 0.5093737575656616, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.709, 'eval_samples_per_second': 80.398, 'eval_steps_per_second': 11.284, 'epoch': 9.0}
{'loss': 0.1631, 'grad_norm': 0.571971595287323, 'learning_rate': 7.659083993176544e-06, 'epoch': 9.024390243902438}
{'loss': 0.1636, 'grad_norm': 0.669140100479126, 'learning_rate': 7.346468319985665e-06, 'epoch': 9.268292682926829}
{'loss': 0.1735, 'grad_norm': 0.469880074262619, 'learning_rate': 7.033852646794785e-06, 'epoch': 9.512195121951219}
{'loss': 0.1643, 'grad_norm': 0.4118466377258301, 'learning_rate': 6.721236973603906e-06, 'epoch': 9.75609756097561}
{'loss': 0.1628, 'grad_norm': 0.6181808114051819, 'learning_rate': 6.408621300413026e-06, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20897310972213745, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5161621223286661, 'eval_precision': 0.5138946399815965, 'eval_recall': 0.5217391304347826, 'eval_runtime': 0.6794, 'eval_samples_per_second': 83.892, 'eval_steps_per_second': 11.774, 'epoch': 10.0}
{'loss': 0.1608, 'grad_norm': 0.4604625105857849, 'learning_rate': 6.096005627222147e-06, 'epoch': 10.24390243902439}
{'loss': 0.1551, 'grad_norm': 0.6338843703269958, 'learning_rate': 5.783389954031268e-06, 'epoch': 10.487804878048781}
{'loss': 0.1621, 'grad_norm': 0.36722350120544434, 'learning_rate': 5.470774280840388e-06, 'epoch': 10.731707317073171}
{'loss': 0.1578, 'grad_norm': 0.42980438470840454, 'learning_rate': 5.158158607649509e-06, 'epoch': 10.975609756097562}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20682674646377563, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5196281859070465, 'eval_precision': 0.5217391304347826, 'eval_recall': 0.5217391304347826, 'eval_runtime': 0.7215, 'eval_samples_per_second': 79.001, 'eval_steps_per_second': 11.088, 'epoch': 11.0}
{'loss': 0.1594, 'grad_norm': 0.4135947823524475, 'learning_rate': 4.845542934458629e-06, 'epoch': 11.21951219512195}
{'loss': 0.1522, 'grad_norm': 0.4409359097480774, 'learning_rate': 4.5329272612677504e-06, 'epoch': 11.463414634146341}
{'loss': 0.1467, 'grad_norm': 0.46185609698295593, 'learning_rate': 4.220311588076871e-06, 'epoch': 11.707317073170731}
{'loss': 0.1595, 'grad_norm': 0.5562427043914795, 'learning_rate': 3.9076959148859915e-06, 'epoch': 11.951219512195122}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2065439224243164, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5103528235882059, 'eval_precision': 0.5217391304347826, 'eval_recall': 0.5043478260869565, 'eval_runtime': 0.7088, 'eval_samples_per_second': 80.421, 'eval_steps_per_second': 11.287, 'epoch': 12.0}
{'loss': 0.1613, 'grad_norm': 0.498420387506485, 'learning_rate': 3.595080241695112e-06, 'epoch': 12.195121951219512}
{'loss': 0.1514, 'grad_norm': 0.5329745411872864, 'learning_rate': 3.2824645685042333e-06, 'epoch': 12.439024390243903}
{'loss': 0.1472, 'grad_norm': 0.5840879678726196, 'learning_rate': 2.969848895313354e-06, 'epoch': 12.682926829268293}
{'loss': 0.155, 'grad_norm': 0.539671778678894, 'learning_rate': 2.6572332221224744e-06, 'epoch': 12.926829268292684}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20651990175247192, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5103528235882059, 'eval_precision': 0.5217391304347826, 'eval_recall': 0.5043478260869565, 'eval_runtime': 0.7539, 'eval_samples_per_second': 75.608, 'eval_steps_per_second': 10.612, 'epoch': 13.0}
{'loss': 0.1405, 'grad_norm': 0.42212337255477905, 'learning_rate': 2.344617548931595e-06, 'epoch': 13.170731707317072}
{'loss': 0.1505, 'grad_norm': 0.4222840964794159, 'learning_rate': 2.032001875740716e-06, 'epoch': 13.414634146341463}
{'loss': 0.1482, 'grad_norm': 0.3899706304073334, 'learning_rate': 1.7193862025498363e-06, 'epoch': 13.658536585365853}
{'loss': 0.1396, 'grad_norm': 0.6041058897972107, 'learning_rate': 1.4067705293589568e-06, 'epoch': 13.902439024390244}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20483526587486267, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5166036112378594, 'eval_precision': 0.5249011857707511, 'eval_recall': 0.5130434782608696, 'eval_runtime': 0.7131, 'eval_samples_per_second': 79.937, 'eval_steps_per_second': 11.219, 'epoch': 14.0}
{'loss': 0.1451, 'grad_norm': 0.46920716762542725, 'learning_rate': 1.0941548561680776e-06, 'epoch': 14.146341463414634}
{'loss': 0.1478, 'grad_norm': 0.3768139183521271, 'learning_rate': 7.815391829771983e-07, 'epoch': 14.390243902439025}
{'loss': 0.141, 'grad_norm': 0.4092276096343994, 'learning_rate': 4.68923509786319e-07, 'epoch': 14.634146341463415}
{'loss': 0.1513, 'grad_norm': 0.46697014570236206, 'learning_rate': 1.563078365954397e-07, 'epoch': 14.878048780487806}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20563353598117828, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.5038179092272046, 'eval_precision': 0.5182608695652174, 'eval_recall': 0.4956521739130435, 'eval_runtime': 0.8123, 'eval_samples_per_second': 70.168, 'eval_steps_per_second': 9.848, 'epoch': 15.0}
{'train_runtime': 265.7763, 'train_samples_per_second': 18.23, 'train_steps_per_second': 2.314, 'train_loss': 0.21741780042648315, 'epoch': 15.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-03 10:59:58,451] Trial 0 finished with value: 0.5196281859070465 and parameters: {'learning_rate': 1.922586390123908e-05, 'batch_size': 8, 'weight_decay': 0.04948356370335308, 'num_train_epochs': 15}. Best is trial 0 with value: 0.5196281859070465.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.20682674646377563, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5196281859070465, 'eval_precision': 0.5217391304347826, 'eval_recall': 0.5217391304347826, 'eval_runtime': 0.8622, 'eval_samples_per_second': 66.109, 'eval_steps_per_second': 9.278, 'epoch': 15.0}
{'loss': 0.576, 'grad_norm': 1.5148143768310547, 'learning_rate': 2.6861739408570185e-05, 'epoch': 0.47619047619047616}
{'loss': 0.4424, 'grad_norm': 0.9031828045845032, 'learning_rate': 2.617120626182031e-05, 'epoch': 0.9523809523809523}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.38503947854042053, 'eval_accuracy': 0.12280701754385964, 'eval_f1': 0.44700106656628397, 'eval_precision': 0.34110933609659916, 'eval_recall': 0.6521739130434783, 'eval_runtime': 0.7107, 'eval_samples_per_second': 80.208, 'eval_steps_per_second': 5.629, 'epoch': 1.0}
{'loss': 0.3736, 'grad_norm': 0.7540454864501953, 'learning_rate': 2.5480673115070433e-05, 'epoch': 1.4285714285714286}
{'loss': 0.326, 'grad_norm': 0.6109713912010193, 'learning_rate': 2.479013996832056e-05, 'epoch': 1.9047619047619047}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3095219135284424, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.26664144551101077, 'eval_precision': 0.5031273836765827, 'eval_recall': 0.3130434782608696, 'eval_runtime': 0.7235, 'eval_samples_per_second': 78.781, 'eval_steps_per_second': 5.528, 'epoch': 2.0}
{'loss': 0.3124, 'grad_norm': 0.48144543170928955, 'learning_rate': 2.409960682157068e-05, 'epoch': 2.380952380952381}
{'loss': 0.286, 'grad_norm': 0.43050169944763184, 'learning_rate': 2.3409073674820804e-05, 'epoch': 2.857142857142857}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.278836190700531, 'eval_accuracy': 0.10526315789473684, 'eval_f1': 0.3753228648833478, 'eval_precision': 0.5312525879917185, 'eval_recall': 0.40869565217391307, 'eval_runtime': 0.701, 'eval_samples_per_second': 81.315, 'eval_steps_per_second': 5.706, 'epoch': 3.0}
{'loss': 0.2581, 'grad_norm': 0.5244187712669373, 'learning_rate': 2.271854052807093e-05, 'epoch': 3.3333333333333335}
{'loss': 0.2723, 'grad_norm': 0.6025379300117493, 'learning_rate': 2.2028007381321052e-05, 'epoch': 3.8095238095238093}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.25971198081970215, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5001212740009642, 'eval_precision': 0.48297751066629785, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.6995, 'eval_samples_per_second': 81.486, 'eval_steps_per_second': 5.718, 'epoch': 4.0}
{'loss': 0.2499, 'grad_norm': 0.3813193142414093, 'learning_rate': 2.1337474234571178e-05, 'epoch': 4.285714285714286}
{'loss': 0.2387, 'grad_norm': 0.4804120659828186, 'learning_rate': 2.06469410878213e-05, 'epoch': 4.761904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2440524697303772, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.5228936447425566, 'eval_precision': 0.4869594391993193, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.7028, 'eval_samples_per_second': 81.109, 'eval_steps_per_second': 5.692, 'epoch': 5.0}
{'loss': 0.2387, 'grad_norm': 0.5537687540054321, 'learning_rate': 1.9956407941071423e-05, 'epoch': 5.238095238095238}
{'loss': 0.2183, 'grad_norm': 0.4008490741252899, 'learning_rate': 1.926587479432155e-05, 'epoch': 5.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.23327897489070892, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.537885477113891, 'eval_precision': 0.4981620553359683, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.6905, 'eval_samples_per_second': 82.546, 'eval_steps_per_second': 5.793, 'epoch': 6.0}
{'loss': 0.2167, 'grad_norm': 0.4644147753715515, 'learning_rate': 1.857534164757167e-05, 'epoch': 6.190476190476191}
{'loss': 0.2131, 'grad_norm': 0.4858625829219818, 'learning_rate': 1.7884808500821797e-05, 'epoch': 6.666666666666667}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22131314873695374, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5329192546583852, 'eval_precision': 0.5134782608695652, 'eval_recall': 0.5565217391304348, 'eval_runtime': 0.6956, 'eval_samples_per_second': 81.938, 'eval_steps_per_second': 5.75, 'epoch': 7.0}
{'loss': 0.1968, 'grad_norm': 0.4387645423412323, 'learning_rate': 1.719427535407192e-05, 'epoch': 7.142857142857143}
{'loss': 0.1913, 'grad_norm': 0.4802356958389282, 'learning_rate': 1.650374220732204e-05, 'epoch': 7.619047619047619}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21521078050136566, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5556594179173645, 'eval_precision': 0.5240685884164145, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.6968, 'eval_samples_per_second': 81.8, 'eval_steps_per_second': 5.74, 'epoch': 8.0}
{'loss': 0.1879, 'grad_norm': 0.45283621549606323, 'learning_rate': 1.5813209060572167e-05, 'epoch': 8.095238095238095}
{'loss': 0.171, 'grad_norm': 0.4509570300579071, 'learning_rate': 1.5122675913822291e-05, 'epoch': 8.571428571428571}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21377713978290558, 'eval_accuracy': 0.21052631578947367, 'eval_f1': 0.5516645426728684, 'eval_precision': 0.5990720752155891, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.7273, 'eval_samples_per_second': 78.369, 'eval_steps_per_second': 5.5, 'epoch': 9.0}
{'loss': 0.1809, 'grad_norm': 0.40148302912712097, 'learning_rate': 1.4432142767072414e-05, 'epoch': 9.047619047619047}
{'loss': 0.1701, 'grad_norm': 0.37952783703804016, 'learning_rate': 1.3741609620322536e-05, 'epoch': 9.523809523809524}
{'loss': 0.1601, 'grad_norm': 0.5117558836936951, 'learning_rate': 1.305107647357266e-05, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20586508512496948, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.5718439108061749, 'eval_precision': 0.6122190669371197, 'eval_recall': 0.6, 'eval_runtime': 0.6673, 'eval_samples_per_second': 85.417, 'eval_steps_per_second': 5.994, 'epoch': 10.0}
{'loss': 0.16, 'grad_norm': 0.5245856642723083, 'learning_rate': 1.2360543326822784e-05, 'epoch': 10.476190476190476}
{'loss': 0.157, 'grad_norm': 0.4548783302307129, 'learning_rate': 1.1670010180072909e-05, 'epoch': 10.952380952380953}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2016652524471283, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5763936404601365, 'eval_precision': 0.6274492753623189, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.7262, 'eval_samples_per_second': 78.496, 'eval_steps_per_second': 5.509, 'epoch': 11.0}
{'loss': 0.1524, 'grad_norm': 0.35579192638397217, 'learning_rate': 1.0979477033323033e-05, 'epoch': 11.428571428571429}
{'loss': 0.153, 'grad_norm': 0.35252875089645386, 'learning_rate': 1.0288943886573155e-05, 'epoch': 11.904761904761905}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.200339674949646, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5814850722151789, 'eval_precision': 0.6137417005782823, 'eval_recall': 0.6, 'eval_runtime': 0.7106, 'eval_samples_per_second': 80.219, 'eval_steps_per_second': 5.629, 'epoch': 12.0}
{'loss': 0.1506, 'grad_norm': 0.34902217984199524, 'learning_rate': 9.598410739823279e-06, 'epoch': 12.380952380952381}
{'loss': 0.141, 'grad_norm': 0.5176306962966919, 'learning_rate': 8.907877593073403e-06, 'epoch': 12.857142857142858}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19694924354553223, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5846583850931677, 'eval_precision': 0.6256655518394649, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.7066, 'eval_samples_per_second': 80.671, 'eval_steps_per_second': 5.661, 'epoch': 13.0}
{'loss': 0.141, 'grad_norm': 0.39677929878234863, 'learning_rate': 8.217344446323527e-06, 'epoch': 13.333333333333334}
{'loss': 0.134, 'grad_norm': 0.4120349884033203, 'learning_rate': 7.526811299573651e-06, 'epoch': 13.80952380952381}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19674956798553467, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5962391280622921, 'eval_precision': 0.6284164145033709, 'eval_recall': 0.6173913043478261, 'eval_runtime': 0.6914, 'eval_samples_per_second': 82.442, 'eval_steps_per_second': 5.785, 'epoch': 14.0}
{'loss': 0.1411, 'grad_norm': 0.32014089822769165, 'learning_rate': 6.836278152823775e-06, 'epoch': 14.285714285714286}
{'loss': 0.1305, 'grad_norm': 0.3880619406700134, 'learning_rate': 6.145745006073898e-06, 'epoch': 14.761904761904763}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.1955200433731079, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5852386743691091, 'eval_precision': 0.6122732919254659, 'eval_recall': 0.6086956521739131, 'eval_runtime': 0.7237, 'eval_samples_per_second': 78.766, 'eval_steps_per_second': 5.527, 'epoch': 15.0}
{'loss': 0.144, 'grad_norm': 0.2621806263923645, 'learning_rate': 5.455211859324022e-06, 'epoch': 15.238095238095237}
{'loss': 0.132, 'grad_norm': 0.31596317887306213, 'learning_rate': 4.764678712574146e-06, 'epoch': 15.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.1937517672777176, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.6144025539677712, 'eval_precision': 0.6411313903487816, 'eval_recall': 0.6260869565217392, 'eval_runtime': 0.741, 'eval_samples_per_second': 76.926, 'eval_steps_per_second': 5.398, 'epoch': 16.0}
{'loss': 0.1264, 'grad_norm': 0.3415844142436981, 'learning_rate': 4.0741455658242695e-06, 'epoch': 16.19047619047619}
{'loss': 0.1323, 'grad_norm': 0.3488866984844208, 'learning_rate': 3.3836124190743936e-06, 'epoch': 16.666666666666668}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19241264462471008, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.6074008600095556, 'eval_precision': 0.643056856187291, 'eval_recall': 0.6086956521739131, 'eval_runtime': 0.7027, 'eval_samples_per_second': 81.119, 'eval_steps_per_second': 5.693, 'epoch': 17.0}
{'loss': 0.1254, 'grad_norm': 0.32400092482566833, 'learning_rate': 2.6930792723245173e-06, 'epoch': 17.142857142857142}
{'loss': 0.1333, 'grad_norm': 0.33166223764419556, 'learning_rate': 2.002546125574641e-06, 'epoch': 17.61904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19166848063468933, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6093756967670011, 'eval_precision': 0.6390926618138895, 'eval_recall': 0.6173913043478261, 'eval_runtime': 0.7373, 'eval_samples_per_second': 77.31, 'eval_steps_per_second': 5.425, 'epoch': 18.0}
{'loss': 0.1303, 'grad_norm': 0.3037504553794861, 'learning_rate': 1.3120129788247648e-06, 'epoch': 18.095238095238095}
{'loss': 0.1263, 'grad_norm': 0.3391702473163605, 'learning_rate': 6.214798320748886e-07, 'epoch': 18.571428571428573}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19188322126865387, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6093756967670011, 'eval_precision': 0.6390926618138895, 'eval_recall': 0.6173913043478261, 'eval_runtime': 0.6768, 'eval_samples_per_second': 84.219, 'eval_steps_per_second': 5.91, 'epoch': 19.0}
{'train_runtime': 387.9395, 'train_samples_per_second': 15.819, 'train_steps_per_second': 1.029, 'train_loss': 0.20319910425888865, 'epoch': 19.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-03 11:06:27,626] Trial 1 finished with value: 0.6144025539677712 and parameters: {'learning_rate': 2.7552272555320063e-05, 'batch_size': 16, 'weight_decay': 0.03065642840179999, 'num_train_epochs': 19}. Best is trial 1 with value: 0.6144025539677712.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)


{'eval_loss': 0.1937517672777176, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.6144025539677712, 'eval_precision': 0.6411313903487816, 'eval_recall': 0.6260869565217392, 'eval_runtime': 0.6678, 'eval_samples_per_second': 85.353, 'eval_steps_per_second': 5.99, 'epoch': 19.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.5665, 'grad_norm': 1.3264544010162354, 'learning_rate': 3.2064768215689084e-05, 'epoch': 0.47619047619047616}
{'loss': 0.4272, 'grad_norm': 0.8359748125076294, 'learning_rate': 2.9724274185346816e-05, 'epoch': 0.9523809523809523}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3706642687320709, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.3427414992632384, 'eval_precision': 0.2842618806875632, 'eval_recall': 0.43478260869565216, 'eval_runtime': 0.7177, 'eval_samples_per_second': 79.426, 'eval_steps_per_second': 5.574, 'epoch': 1.0}
{'loss': 0.3605, 'grad_norm': 0.6639840006828308, 'learning_rate': 2.7383780155004548e-05, 'epoch': 1.4285714285714286}
{'loss': 0.316, 'grad_norm': 0.5366557240486145, 'learning_rate': 2.504328612466228e-05, 'epoch': 1.9047619047619047}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3018186688423157, 'eval_accuracy': 0.0, 'eval_f1': 0.2714542728635682, 'eval_precision': 0.3952023988005997, 'eval_recall': 0.23478260869565218, 'eval_runtime': 0.7095, 'eval_samples_per_second': 80.34, 'eval_steps_per_second': 5.638, 'epoch': 2.0}
{'loss': 0.3079, 'grad_norm': 0.4537799060344696, 'learning_rate': 2.270279209432001e-05, 'epoch': 2.380952380952381}
{'loss': 0.2836, 'grad_norm': 0.39714524149894714, 'learning_rate': 2.036229806397774e-05, 'epoch': 2.857142857142857}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2779707610607147, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.3602387812118037, 'eval_precision': 0.5693979933110368, 'eval_recall': 0.3739130434782609, 'eval_runtime': 0.7024, 'eval_samples_per_second': 81.154, 'eval_steps_per_second': 5.695, 'epoch': 3.0}
{'loss': 0.2589, 'grad_norm': 0.5387125015258789, 'learning_rate': 1.8021804033635473e-05, 'epoch': 3.3333333333333335}
{'loss': 0.2763, 'grad_norm': 0.42977023124694824, 'learning_rate': 1.5681310003293202e-05, 'epoch': 3.8095238095238093}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.26473191380500793, 'eval_accuracy': 0.21052631578947367, 'eval_f1': 0.46785098388551977, 'eval_precision': 0.5164057971014493, 'eval_recall': 0.5043478260869565, 'eval_runtime': 0.6897, 'eval_samples_per_second': 82.648, 'eval_steps_per_second': 5.8, 'epoch': 4.0}
{'loss': 0.2582, 'grad_norm': 0.3938952386379242, 'learning_rate': 1.3340815972950933e-05, 'epoch': 4.285714285714286}
{'loss': 0.2475, 'grad_norm': 0.6105465888977051, 'learning_rate': 1.1000321942608665e-05, 'epoch': 4.761904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2547229528427124, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5010096541554343, 'eval_precision': 0.5014724406028754, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.6991, 'eval_samples_per_second': 81.535, 'eval_steps_per_second': 5.722, 'epoch': 5.0}
{'loss': 0.2521, 'grad_norm': 0.6694476008415222, 'learning_rate': 8.659827912266395e-06, 'epoch': 5.238095238095238}
{'loss': 0.2394, 'grad_norm': 0.41163933277130127, 'learning_rate': 6.319333881924127e-06, 'epoch': 5.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24704661965370178, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5175583798926893, 'eval_precision': 0.5172397461400019, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.6901, 'eval_samples_per_second': 82.594, 'eval_steps_per_second': 5.796, 'epoch': 6.0}
{'loss': 0.2369, 'grad_norm': 0.43042564392089844, 'learning_rate': 3.978839851581857e-06, 'epoch': 6.190476190476191}
{'loss': 0.2386, 'grad_norm': 0.46187540888786316, 'learning_rate': 1.6383458212395883e-06, 'epoch': 6.666666666666667}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2446691244840622, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5490191795781858, 'eval_precision': 0.5264686208931734, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.6793, 'eval_samples_per_second': 83.907, 'eval_steps_per_second': 5.888, 'epoch': 7.0}
{'train_runtime': 141.5435, 'train_samples_per_second': 15.974, 'train_steps_per_second': 1.039, 'train_loss': 0.3014401894848363, 'epoch': 7.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-03 11:08:50,565] Trial 2 finished with value: 0.5490191795781858 and parameters: {'learning_rate': 3.4405262246031355e-05, 'batch_size': 16, 'weight_decay': 0.057606789645825, 'num_train_epochs': 7}. Best is trial 1 with value: 0.6144025539677712.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.2446691244840622, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5490191795781858, 'eval_precision': 0.5264686208931734, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.6761, 'eval_samples_per_second': 84.313, 'eval_steps_per_second': 5.917, 'epoch': 7.0}
{'loss': 0.5637, 'grad_norm': 1.338547945022583, 'learning_rate': 4.2607719074550684e-05, 'epoch': 0.47619047619047616}
{'loss': 0.4158, 'grad_norm': 0.7354313135147095, 'learning_rate': 4.084706952601553e-05, 'epoch': 0.9523809523809523}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.35482820868492126, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.1899209486166008, 'eval_precision': 0.14660564454614797, 'eval_recall': 0.26956521739130435, 'eval_runtime': 0.7167, 'eval_samples_per_second': 79.529, 'eval_steps_per_second': 5.581, 'epoch': 1.0}
{'loss': 0.3424, 'grad_norm': 0.5981231927871704, 'learning_rate': 3.908641997748038e-05, 'epoch': 1.4285714285714286}
{'loss': 0.2934, 'grad_norm': 0.4648340344429016, 'learning_rate': 3.732577042894523e-05, 'epoch': 1.9047619047619047}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.28628355264663696, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.1899209486166008, 'eval_precision': 0.14660564454614797, 'eval_recall': 0.26956521739130435, 'eval_runtime': 0.7469, 'eval_samples_per_second': 76.313, 'eval_steps_per_second': 5.355, 'epoch': 2.0}
{'loss': 0.2893, 'grad_norm': 0.3936472237110138, 'learning_rate': 3.556512088041008e-05, 'epoch': 2.380952380952381}
{'loss': 0.2668, 'grad_norm': 0.3381548225879669, 'learning_rate': 3.380447133187492e-05, 'epoch': 2.857142857142857}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2654601037502289, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.22502200751476112, 'eval_precision': 0.35634782608695653, 'eval_recall': 0.26956521739130435, 'eval_runtime': 0.6903, 'eval_samples_per_second': 82.575, 'eval_steps_per_second': 5.795, 'epoch': 3.0}
{'loss': 0.2411, 'grad_norm': 0.47626230120658875, 'learning_rate': 3.204382178333977e-05, 'epoch': 3.3333333333333335}
{'loss': 0.2632, 'grad_norm': 0.4437352120876312, 'learning_rate': 3.028317223480462e-05, 'epoch': 3.8095238095238093}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24984556436538696, 'eval_accuracy': 0.15789473684210525, 'eval_f1': 0.4304457644885939, 'eval_precision': 0.5759057971014492, 'eval_recall': 0.391304347826087, 'eval_runtime': 0.7123, 'eval_samples_per_second': 80.02, 'eval_steps_per_second': 5.615, 'epoch': 4.0}
{'loss': 0.2411, 'grad_norm': 0.35460561513900757, 'learning_rate': 2.852252268626947e-05, 'epoch': 4.285714285714286}
{'loss': 0.2266, 'grad_norm': 0.5158507227897644, 'learning_rate': 2.6761873137734315e-05, 'epoch': 4.761904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.23464487493038177, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.557604399381337, 'eval_precision': 0.5463986052086739, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.6962, 'eval_samples_per_second': 81.87, 'eval_steps_per_second': 5.745, 'epoch': 5.0}
{'loss': 0.2269, 'grad_norm': 0.5401535630226135, 'learning_rate': 2.5001223589199164e-05, 'epoch': 5.238095238095238}
{'loss': 0.2041, 'grad_norm': 0.39667442440986633, 'learning_rate': 2.324057404066401e-05, 'epoch': 5.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22186945378780365, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5518869717239282, 'eval_precision': 0.5416901938641069, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.7123, 'eval_samples_per_second': 80.018, 'eval_steps_per_second': 5.615, 'epoch': 6.0}
{'loss': 0.2007, 'grad_norm': 0.42893317341804504, 'learning_rate': 2.147992449212886e-05, 'epoch': 6.190476190476191}
{'loss': 0.1959, 'grad_norm': 0.457680881023407, 'learning_rate': 1.9719274943593705e-05, 'epoch': 6.666666666666667}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2152342051267624, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.547857939162287, 'eval_precision': 0.532608695652174, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.6943, 'eval_samples_per_second': 82.092, 'eval_steps_per_second': 5.761, 'epoch': 7.0}
{'loss': 0.179, 'grad_norm': 0.3901680111885071, 'learning_rate': 1.7958625395058555e-05, 'epoch': 7.142857142857143}
{'loss': 0.1755, 'grad_norm': 0.3622368574142456, 'learning_rate': 1.61979758465234e-05, 'epoch': 7.619047619047619}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20893490314483643, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5555626598465473, 'eval_precision': 0.5390144927536231, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.6893, 'eval_samples_per_second': 82.689, 'eval_steps_per_second': 5.803, 'epoch': 8.0}
{'loss': 0.1723, 'grad_norm': 0.37899407744407654, 'learning_rate': 1.443732629798825e-05, 'epoch': 8.095238095238095}
{'loss': 0.1584, 'grad_norm': 0.32267361879348755, 'learning_rate': 1.2676676749453096e-05, 'epoch': 8.571428571428571}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2052723467350006, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5409558606051116, 'eval_precision': 0.5283178923358833, 'eval_recall': 0.5565217391304348, 'eval_runtime': 0.6983, 'eval_samples_per_second': 81.625, 'eval_steps_per_second': 5.728, 'epoch': 9.0}
{'loss': 0.1682, 'grad_norm': 0.37843582034111023, 'learning_rate': 1.0916027200917944e-05, 'epoch': 9.047619047619047}
{'loss': 0.1622, 'grad_norm': 0.35765406489372253, 'learning_rate': 9.155377652382791e-06, 'epoch': 9.523809523809524}
{'loss': 0.1526, 'grad_norm': 0.47333264350891113, 'learning_rate': 7.394728103847639e-06, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20173762738704681, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5535367324187199, 'eval_precision': 0.5279264214046823, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.6662, 'eval_samples_per_second': 85.561, 'eval_steps_per_second': 6.004, 'epoch': 10.0}
{'loss': 0.1529, 'grad_norm': 0.3927086889743805, 'learning_rate': 5.634078555312487e-06, 'epoch': 10.476190476190476}
{'loss': 0.1534, 'grad_norm': 0.37652266025543213, 'learning_rate': 3.8734290067773344e-06, 'epoch': 10.952380952380953}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2003195434808731, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.553215025140491, 'eval_precision': 0.5435589897359013, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.7017, 'eval_samples_per_second': 81.236, 'eval_steps_per_second': 5.701, 'epoch': 11.0}
{'loss': 0.1533, 'grad_norm': 0.37668710947036743, 'learning_rate': 2.1127794582421825e-06, 'epoch': 11.428571428571429}
{'loss': 0.1537, 'grad_norm': 0.34911051392555237, 'learning_rate': 3.521299097070304e-07, 'epoch': 11.904761904761905}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.1994895339012146, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5548203763110596, 'eval_precision': 0.5395456118094798, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.6745, 'eval_samples_per_second': 84.502, 'eval_steps_per_second': 5.93, 'epoch': 12.0}
{'train_runtime': 243.7938, 'train_samples_per_second': 15.899, 'train_steps_per_second': 1.034, 'train_loss': 0.2295285134561478, 'epoch': 12.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-03 11:12:55,527] Trial 3 finished with value: 0.557604399381337 and parameters: {'learning_rate': 4.436836862308584e-05, 'batch_size': 16, 'weight_decay': 0.08764598934339318, 'num_train_epochs': 12}. Best is trial 1 with value: 0.6144025539677712.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)


{'eval_loss': 0.23464487493038177, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.557604399381337, 'eval_precision': 0.5463986052086739, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.6619, 'eval_samples_per_second': 86.118, 'eval_steps_per_second': 6.043, 'epoch': 12.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.5851, 'grad_norm': 1.747338891029358, 'learning_rate': 1.2575786507786571e-05, 'epoch': 0.9090909090909091}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.52567458152771, 'eval_accuracy': 0.0, 'eval_f1': 0.32353033439989964, 'eval_precision': 0.23996948893974065, 'eval_recall': 0.5304347826086957, 'eval_runtime': 4.1205, 'eval_samples_per_second': 13.833, 'eval_steps_per_second': 0.485, 'epoch': 1.0}
{'loss': 0.5102, 'grad_norm': 1.3432468175888062, 'learning_rate': 1.1865290094917274e-05, 'epoch': 1.8181818181818183}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.4586581885814667, 'eval_accuracy': 0.12280701754385964, 'eval_f1': 0.4413702239789196, 'eval_precision': 0.3283550491134669, 'eval_recall': 0.6782608695652174, 'eval_runtime': 4.1864, 'eval_samples_per_second': 13.615, 'eval_steps_per_second': 0.478, 'epoch': 2.0}
{'loss': 0.4541, 'grad_norm': 1.092833399772644, 'learning_rate': 1.1154793682047976e-05, 'epoch': 2.7272727272727275}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.4140785336494446, 'eval_accuracy': 0.17543859649122806, 'eval_f1': 0.37506206227030026, 'eval_precision': 0.35721242906597594, 'eval_recall': 0.5130434782608696, 'eval_runtime': 4.2605, 'eval_samples_per_second': 13.379, 'eval_steps_per_second': 0.469, 'epoch': 3.0}
{'loss': 0.4095, 'grad_norm': 1.0369218587875366, 'learning_rate': 1.044429726917868e-05, 'epoch': 3.6363636363636362}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3822064697742462, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.3265715422387087, 'eval_precision': 0.4095933814469284, 'eval_recall': 0.3652173913043478, 'eval_runtime': 4.112, 'eval_samples_per_second': 13.862, 'eval_steps_per_second': 0.486, 'epoch': 4.0}
{'loss': 0.3809, 'grad_norm': 0.8649172782897949, 'learning_rate': 9.733800856309381e-06, 'epoch': 4.545454545454545}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.35915911197662354, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.23797446032228645, 'eval_precision': 0.4509534706331045, 'eval_recall': 0.2956521739130435, 'eval_runtime': 4.2623, 'eval_samples_per_second': 13.373, 'eval_steps_per_second': 0.469, 'epoch': 5.0}
{'loss': 0.3623, 'grad_norm': 0.7321634888648987, 'learning_rate': 9.023304443440083e-06, 'epoch': 5.454545454545454}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3429918587207794, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.26134952004517226, 'eval_precision': 0.34660564454614795, 'eval_recall': 0.3130434782608696, 'eval_runtime': 4.2581, 'eval_samples_per_second': 13.386, 'eval_steps_per_second': 0.47, 'epoch': 6.0}
{'loss': 0.3422, 'grad_norm': 0.6841766238212585, 'learning_rate': 8.312808030570784e-06, 'epoch': 6.363636363636363}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3301236629486084, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.26134952004517226, 'eval_precision': 0.34660564454614795, 'eval_recall': 0.3130434782608696, 'eval_runtime': 4.1338, 'eval_samples_per_second': 13.789, 'eval_steps_per_second': 0.484, 'epoch': 7.0}
{'loss': 0.3331, 'grad_norm': 0.600090503692627, 'learning_rate': 7.6023116177014866e-06, 'epoch': 7.2727272727272725}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.32034415006637573, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.2065876152832675, 'eval_precision': 0.34660564454614795, 'eval_recall': 0.2782608695652174, 'eval_runtime': 4.8188, 'eval_samples_per_second': 11.829, 'eval_steps_per_second': 0.415, 'epoch': 8.0}
{'loss': 0.3219, 'grad_norm': 0.5453931093215942, 'learning_rate': 6.891815204832188e-06, 'epoch': 8.181818181818182}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.31325623393058777, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.2219209486166008, 'eval_precision': 0.34660564454614795, 'eval_recall': 0.28695652173913044, 'eval_runtime': 5.6496, 'eval_samples_per_second': 10.089, 'eval_steps_per_second': 0.354, 'epoch': 9.0}
{'loss': 0.316, 'grad_norm': 0.5186102986335754, 'learning_rate': 6.1813187919628915e-06, 'epoch': 9.090909090909092}
{'loss': 0.3043, 'grad_norm': 0.7694738507270813, 'learning_rate': 5.470822379093593e-06, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.30714714527130127, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.2699209486166008, 'eval_precision': 0.31803421597471937, 'eval_recall': 0.3217391304347826, 'eval_runtime': 7.4907, 'eval_samples_per_second': 7.609, 'eval_steps_per_second': 0.267, 'epoch': 10.0}
{'loss': 0.3029, 'grad_norm': 0.48323899507522583, 'learning_rate': 4.7603259662242955e-06, 'epoch': 10.909090909090908}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3024417459964752, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.29912218084506137, 'eval_precision': 0.5329192546583852, 'eval_recall': 0.3391304347826087, 'eval_runtime': 5.119, 'eval_samples_per_second': 11.135, 'eval_steps_per_second': 0.391, 'epoch': 11.0}
{'loss': 0.305, 'grad_norm': 0.4677722454071045, 'learning_rate': 4.049829553354998e-06, 'epoch': 11.818181818181818}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2988278865814209, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.3280382853927135, 'eval_precision': 0.4381027667984189, 'eval_recall': 0.3652173913043478, 'eval_runtime': 4.155, 'eval_samples_per_second': 13.718, 'eval_steps_per_second': 0.481, 'epoch': 12.0}
{'loss': 0.2971, 'grad_norm': 0.4488127529621124, 'learning_rate': 3.3393331404857e-06, 'epoch': 12.727272727272727}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2959870994091034, 'eval_accuracy': 0.12280701754385964, 'eval_f1': 0.36179216156332866, 'eval_precision': 0.4139021055509981, 'eval_recall': 0.4, 'eval_runtime': 4.2315, 'eval_samples_per_second': 13.47, 'eval_steps_per_second': 0.473, 'epoch': 13.0}
{'loss': 0.2896, 'grad_norm': 0.5362158417701721, 'learning_rate': 2.628836727616402e-06, 'epoch': 13.636363636363637}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.29368481040000916, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.37997163164245856, 'eval_precision': 0.43399829497016196, 'eval_recall': 0.40869565217391307, 'eval_runtime': 4.3724, 'eval_samples_per_second': 13.036, 'eval_steps_per_second': 0.457, 'epoch': 14.0}
{'loss': 0.2939, 'grad_norm': 0.4289470314979553, 'learning_rate': 1.918340314747104e-06, 'epoch': 14.545454545454545}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2921857237815857, 'eval_accuracy': 0.15789473684210525, 'eval_f1': 0.38938359933957073, 'eval_precision': 0.42013000852514915, 'eval_recall': 0.41739130434782606, 'eval_runtime': 4.2144, 'eval_samples_per_second': 13.525, 'eval_steps_per_second': 0.475, 'epoch': 15.0}
{'loss': 0.2916, 'grad_norm': 0.400691956281662, 'learning_rate': 1.2078439018778063e-06, 'epoch': 15.454545454545455}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2913338840007782, 'eval_accuracy': 0.17543859649122806, 'eval_f1': 0.40160181752362406, 'eval_precision': 0.4166956521739131, 'eval_recall': 0.4434782608695652, 'eval_runtime': 4.2478, 'eval_samples_per_second': 13.419, 'eval_steps_per_second': 0.471, 'epoch': 16.0}
{'loss': 0.2828, 'grad_norm': 0.4156832993030548, 'learning_rate': 4.973474890085084e-07, 'epoch': 16.363636363636363}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2909115254878998, 'eval_accuracy': 0.17543859649122806, 'eval_f1': 0.40160181752362406, 'eval_precision': 0.4166956521739131, 'eval_recall': 0.4434782608695652, 'eval_runtime': 4.1477, 'eval_samples_per_second': 13.743, 'eval_steps_per_second': 0.482, 'epoch': 17.0}
{'train_runtime': 2414.2179, 'train_samples_per_second': 2.274, 'train_steps_per_second': 0.077, 'train_loss': 0.35211445813510506, 'epoch': 17.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-03 11:53:14,445] Trial 4 finished with value: 0.4413702239789196 and parameters: {'learning_rate': 1.328628292065587e-05, 'batch_size': 32, 'weight_decay': 0.046787034706532324, 'num_train_epochs': 17}. Best is trial 1 with value: 0.6144025539677712.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)


{'eval_loss': 0.4586581885814667, 'eval_accuracy': 0.12280701754385964, 'eval_f1': 0.4413702239789196, 'eval_precision': 0.3283550491134669, 'eval_recall': 0.6782608695652174, 'eval_runtime': 4.1039, 'eval_samples_per_second': 13.889, 'eval_steps_per_second': 0.487, 'epoch': 17.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.6232, 'grad_norm': 1.4807416200637817, 'learning_rate': 2.4578934749187868e-05, 'epoch': 0.9090909090909091}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.5216335654258728, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.1899209486166008, 'eval_precision': 0.14660564454614797, 'eval_recall': 0.26956521739130435, 'eval_runtime': 0.8242, 'eval_samples_per_second': 69.158, 'eval_steps_per_second': 2.427, 'epoch': 1.0}
{'loss': 0.497, 'grad_norm': 1.170830249786377, 'learning_rate': 2.327154460295447e-05, 'epoch': 1.8181818181818183}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.4251682758331299, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.19210394802598701, 'eval_precision': 0.14922360248447208, 'eval_recall': 0.26956521739130435, 'eval_runtime': 0.8197, 'eval_samples_per_second': 69.539, 'eval_steps_per_second': 2.44, 'epoch': 2.0}
{'loss': 0.4194, 'grad_norm': 0.8648954629898071, 'learning_rate': 2.1964154456721073e-05, 'epoch': 2.7272727272727275}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3683992922306061, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.1899209486166008, 'eval_precision': 0.14660564454614797, 'eval_recall': 0.26956521739130435, 'eval_runtime': 0.813, 'eval_samples_per_second': 70.11, 'eval_steps_per_second': 2.46, 'epoch': 3.0}
{'loss': 0.363, 'grad_norm': 0.8081860542297363, 'learning_rate': 2.0656764310487675e-05, 'epoch': 3.6363636363636362}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3326292932033539, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.2073347472605161, 'eval_precision': 0.35328218243819265, 'eval_recall': 0.2608695652173913, 'eval_runtime': 0.8287, 'eval_samples_per_second': 68.787, 'eval_steps_per_second': 2.414, 'epoch': 4.0}
{'loss': 0.3311, 'grad_norm': 0.6207064390182495, 'learning_rate': 1.934937416425428e-05, 'epoch': 4.545454545454545}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3111199736595154, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.1906680805938494, 'eval_precision': 0.15328218243819264, 'eval_recall': 0.25217391304347825, 'eval_runtime': 0.8134, 'eval_samples_per_second': 70.074, 'eval_steps_per_second': 2.459, 'epoch': 5.0}
{'loss': 0.3125, 'grad_norm': 0.5005659461021423, 'learning_rate': 1.8041984018020884e-05, 'epoch': 5.454545454545454}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2972506284713745, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.3399150049035632, 'eval_precision': 0.3392507519824994, 'eval_recall': 0.3826086956521739, 'eval_runtime': 0.8167, 'eval_samples_per_second': 69.792, 'eval_steps_per_second': 2.449, 'epoch': 6.0}
{'loss': 0.2934, 'grad_norm': 0.4668835401535034, 'learning_rate': 1.6734593871787487e-05, 'epoch': 6.363636363636363}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2866140604019165, 'eval_accuracy': 0.10526315789473684, 'eval_f1': 0.35583218916857356, 'eval_precision': 0.36433465085639, 'eval_recall': 0.3739130434782609, 'eval_runtime': 0.8332, 'eval_samples_per_second': 68.41, 'eval_steps_per_second': 2.4, 'epoch': 7.0}
{'loss': 0.2844, 'grad_norm': 0.4014277458190918, 'learning_rate': 1.5427203725554086e-05, 'epoch': 7.2727272727272725}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.27960607409477234, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.3104050538833148, 'eval_precision': 0.5448568398727466, 'eval_recall': 0.2956521739130435, 'eval_runtime': 0.821, 'eval_samples_per_second': 69.424, 'eval_steps_per_second': 2.436, 'epoch': 8.0}
{'loss': 0.2738, 'grad_norm': 0.5050816535949707, 'learning_rate': 1.4119813579320689e-05, 'epoch': 8.181818181818182}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2717342972755432, 'eval_accuracy': 0.15789473684210525, 'eval_f1': 0.4897472432071329, 'eval_precision': 0.527888198757764, 'eval_recall': 0.4782608695652174, 'eval_runtime': 0.8307, 'eval_samples_per_second': 68.617, 'eval_steps_per_second': 2.408, 'epoch': 9.0}
{'loss': 0.266, 'grad_norm': 0.3637658953666687, 'learning_rate': 1.2812423433087293e-05, 'epoch': 9.090909090909092}
{'loss': 0.2549, 'grad_norm': 0.7998218536376953, 'learning_rate': 1.1505033286853895e-05, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2648329734802246, 'eval_accuracy': 0.17543859649122806, 'eval_f1': 0.523869081637139, 'eval_precision': 0.537312252964427, 'eval_recall': 0.5130434782608696, 'eval_runtime': 0.7413, 'eval_samples_per_second': 76.895, 'eval_steps_per_second': 2.698, 'epoch': 10.0}
{'loss': 0.2522, 'grad_norm': 0.40818846225738525, 'learning_rate': 1.0197643140620498e-05, 'epoch': 10.909090909090908}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2588217258453369, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.5271924502864846, 'eval_precision': 0.5547542018464452, 'eval_recall': 0.5043478260869565, 'eval_runtime': 0.8286, 'eval_samples_per_second': 68.789, 'eval_steps_per_second': 2.414, 'epoch': 11.0}
{'loss': 0.2542, 'grad_norm': 0.40052443742752075, 'learning_rate': 8.8902529943871e-06, 'epoch': 11.818181818181818}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2531041204929352, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5452352102637206, 'eval_precision': 0.5535486542443064, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.8253, 'eval_samples_per_second': 69.067, 'eval_steps_per_second': 2.423, 'epoch': 12.0}
{'loss': 0.2405, 'grad_norm': 0.34848859906196594, 'learning_rate': 7.582862848153704e-06, 'epoch': 12.727272727272727}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.25086966156959534, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5606414845750367, 'eval_precision': 0.5265692867851788, 'eval_recall': 0.6, 'eval_runtime': 0.8247, 'eval_samples_per_second': 69.112, 'eval_steps_per_second': 2.425, 'epoch': 13.0}
{'loss': 0.2314, 'grad_norm': 0.4111770987510681, 'learning_rate': 6.275472701920307e-06, 'epoch': 13.636363636363637}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24719396233558655, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5369750589821368, 'eval_precision': 0.5575368105420974, 'eval_recall': 0.5217391304347826, 'eval_runtime': 0.8232, 'eval_samples_per_second': 69.243, 'eval_steps_per_second': 2.43, 'epoch': 14.0}
{'loss': 0.2367, 'grad_norm': 0.3285432755947113, 'learning_rate': 4.968082555686909e-06, 'epoch': 14.545454545454545}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24485209584236145, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5519912312751187, 'eval_precision': 0.5325529542920847, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.8309, 'eval_samples_per_second': 68.599, 'eval_steps_per_second': 2.407, 'epoch': 15.0}
{'loss': 0.2316, 'grad_norm': 0.37480178475379944, 'learning_rate': 3.6606924094535123e-06, 'epoch': 15.454545454545455}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24354353547096252, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5607004830917874, 'eval_precision': 0.5271815856777493, 'eval_recall': 0.6, 'eval_runtime': 0.8141, 'eval_samples_per_second': 70.012, 'eval_steps_per_second': 2.457, 'epoch': 16.0}
{'loss': 0.2206, 'grad_norm': 0.3411249816417694, 'learning_rate': 2.353302263220115e-06, 'epoch': 16.363636363636363}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24186547100543976, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5467482223245308, 'eval_precision': 0.5316541729135432, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.8312, 'eval_samples_per_second': 68.578, 'eval_steps_per_second': 2.406, 'epoch': 17.0}
{'loss': 0.227, 'grad_norm': 0.3631191849708557, 'learning_rate': 1.045912116986718e-06, 'epoch': 17.272727272727273}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2417202591896057, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5535830363506771, 'eval_precision': 0.5432285395763657, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.7455, 'eval_samples_per_second': 76.464, 'eval_steps_per_second': 2.683, 'epoch': 18.0}
{'train_runtime': 2076.1788, 'train_samples_per_second': 2.8, 'train_steps_per_second': 0.095, 'train_loss': 0.3027455650194727, 'epoch': 18.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-03 12:27:52,093] Trial 5 finished with value: 0.5607004830917874 and parameters: {'learning_rate': 2.5886324895421265e-05, 'batch_size': 32, 'weight_decay': 0.0685841294079647, 'num_train_epochs': 18}. Best is trial 1 with value: 0.6144025539677712.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.24354353547096252, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5607004830917874, 'eval_precision': 0.5271815856777493, 'eval_recall': 0.6, 'eval_runtime': 0.7238, 'eval_samples_per_second': 78.746, 'eval_steps_per_second': 2.763, 'epoch': 18.0}
{'loss': 0.6098, 'grad_norm': 1.3335715532302856, 'learning_rate': 3.881358538147623e-05, 'epoch': 0.9090909090909091}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.4774644076824188, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.16337285902503296, 'eval_precision': 0.15403726708074533, 'eval_recall': 0.17391304347826086, 'eval_runtime': 4.4847, 'eval_samples_per_second': 12.71, 'eval_steps_per_second': 0.446, 'epoch': 1.0}
{'loss': 0.439, 'grad_norm': 0.8298715353012085, 'learning_rate': 3.0188344185592624e-05, 'epoch': 1.8181818181818183}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.36502814292907715, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.0719640179910045, 'eval_precision': 0.20869565217391303, 'eval_recall': 0.043478260869565216, 'eval_runtime': 4.5153, 'eval_samples_per_second': 12.624, 'eval_steps_per_second': 0.443, 'epoch': 2.0}
{'loss': 0.3608, 'grad_norm': 0.5452831983566284, 'learning_rate': 2.1563102989709016e-05, 'epoch': 2.7272727272727275}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.32571572065353394, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.24176598959207654, 'eval_precision': 0.3654196157735086, 'eval_recall': 0.25217391304347825, 'eval_runtime': 4.459, 'eval_samples_per_second': 12.783, 'eval_steps_per_second': 0.449, 'epoch': 3.0}
{'loss': 0.3201, 'grad_norm': 0.5950146913528442, 'learning_rate': 1.2937861793825409e-05, 'epoch': 3.6363636363636362}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.30889931321144104, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.33315401409768575, 'eval_precision': 0.5411082693947143, 'eval_recall': 0.34782608695652173, 'eval_runtime': 4.484, 'eval_samples_per_second': 12.712, 'eval_steps_per_second': 0.446, 'epoch': 4.0}
{'loss': 0.3052, 'grad_norm': 0.47670066356658936, 'learning_rate': 4.312620597941803e-06, 'epoch': 4.545454545454545}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3041578233242035, 'eval_accuracy': 0.10526315789473684, 'eval_f1': 0.31820431247790737, 'eval_precision': 0.3445865302642796, 'eval_recall': 0.34782608695652173, 'eval_runtime': 4.4943, 'eval_samples_per_second': 12.683, 'eval_steps_per_second': 0.445, 'epoch': 5.0}
{'train_runtime': 610.3202, 'train_samples_per_second': 2.646, 'train_steps_per_second': 0.09, 'train_loss': 0.39791345596313477, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-03 12:38:07,850] Trial 6 finished with value: 0.33315401409768575 and parameters: {'learning_rate': 4.743882657735984e-05, 'batch_size': 32, 'weight_decay': 0.08998813461764842, 'num_train_epochs': 5}. Best is trial 1 with value: 0.6144025539677712.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.30889931321144104, 'eval_accuracy': 0.08771929824561403, 'eval_f1': 0.33315401409768575, 'eval_precision': 0.5411082693947143, 'eval_recall': 0.34782608695652173, 'eval_runtime': 4.7096, 'eval_samples_per_second': 12.103, 'eval_steps_per_second': 0.425, 'epoch': 5.0}
{'loss': 0.5971, 'grad_norm': 1.329908847808838, 'learning_rate': 3.8357213368305086e-05, 'epoch': 0.47619047619047616}
{'loss': 0.4363, 'grad_norm': 0.8490806818008423, 'learning_rate': 3.737116675215328e-05, 'epoch': 0.9523809523809523}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3714565336704254, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.16678260869565217, 'eval_precision': 0.39028132992327363, 'eval_recall': 0.12173913043478261, 'eval_runtime': 0.6934, 'eval_samples_per_second': 82.209, 'eval_steps_per_second': 5.769, 'epoch': 1.0}
{'loss': 0.3578, 'grad_norm': 0.728528618812561, 'learning_rate': 3.638512013600148e-05, 'epoch': 1.4285714285714286}
{'loss': 0.3073, 'grad_norm': 0.5152506232261658, 'learning_rate': 3.5399073519849684e-05, 'epoch': 1.9047619047619047}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2922188937664032, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.3111810610531838, 'eval_precision': 0.3424102262215121, 'eval_recall': 0.33043478260869563, 'eval_runtime': 0.6905, 'eval_samples_per_second': 82.548, 'eval_steps_per_second': 5.793, 'epoch': 2.0}
{'loss': 0.296, 'grad_norm': 0.4177704155445099, 'learning_rate': 3.441302690369788e-05, 'epoch': 2.380952380952381}
{'loss': 0.2672, 'grad_norm': 0.38038045167922974, 'learning_rate': 3.342698028754608e-05, 'epoch': 2.857142857142857}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2632644474506378, 'eval_accuracy': 0.19298245614035087, 'eval_f1': 0.43993374741200825, 'eval_precision': 0.46096317869773246, 'eval_recall': 0.45217391304347826, 'eval_runtime': 0.6903, 'eval_samples_per_second': 82.574, 'eval_steps_per_second': 5.795, 'epoch': 3.0}
{'loss': 0.2345, 'grad_norm': 0.5007474422454834, 'learning_rate': 3.2440933671394276e-05, 'epoch': 3.3333333333333335}
{'loss': 0.2502, 'grad_norm': 0.554510772228241, 'learning_rate': 3.145488705524247e-05, 'epoch': 3.8095238095238093}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.24316859245300293, 'eval_accuracy': 0.19298245614035087, 'eval_f1': 0.5094000361507766, 'eval_precision': 0.5343372431431344, 'eval_recall': 0.4956521739130435, 'eval_runtime': 0.6955, 'eval_samples_per_second': 81.958, 'eval_steps_per_second': 5.751, 'epoch': 4.0}
{'loss': 0.2275, 'grad_norm': 0.44623908400535583, 'learning_rate': 3.0468840439090673e-05, 'epoch': 4.285714285714286}
{'loss': 0.2106, 'grad_norm': 0.4168976843357086, 'learning_rate': 2.948279382293887e-05, 'epoch': 4.761904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.22820785641670227, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.514089191634293, 'eval_precision': 0.4988537549407115, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.7143, 'eval_samples_per_second': 79.797, 'eval_steps_per_second': 5.6, 'epoch': 5.0}
{'loss': 0.2115, 'grad_norm': 0.652596116065979, 'learning_rate': 2.849674720678707e-05, 'epoch': 5.238095238095238}
{'loss': 0.1899, 'grad_norm': 0.41259658336639404, 'learning_rate': 2.751070059063527e-05, 'epoch': 5.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2186097800731659, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5353268154045464, 'eval_precision': 0.5177911044477762, 'eval_recall': 0.5565217391304348, 'eval_runtime': 0.6938, 'eval_samples_per_second': 82.158, 'eval_steps_per_second': 5.765, 'epoch': 6.0}
{'loss': 0.1872, 'grad_norm': 0.34705784916877747, 'learning_rate': 2.6524653974483466e-05, 'epoch': 6.190476190476191}
{'loss': 0.1836, 'grad_norm': 0.41777685284614563, 'learning_rate': 2.5538607358331666e-05, 'epoch': 6.666666666666667}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21345779299736023, 'eval_accuracy': 0.2631578947368421, 'eval_f1': 0.5311248546849571, 'eval_precision': 0.5816425120772947, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.6967, 'eval_samples_per_second': 81.814, 'eval_steps_per_second': 5.741, 'epoch': 7.0}
{'loss': 0.1657, 'grad_norm': 0.3576123118400574, 'learning_rate': 2.4552560742179864e-05, 'epoch': 7.142857142857143}
{'loss': 0.1623, 'grad_norm': 0.327747642993927, 'learning_rate': 2.356651412602806e-05, 'epoch': 7.619047619047619}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20641256868839264, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.5659378578633236, 'eval_precision': 0.5990056139512662, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.6907, 'eval_samples_per_second': 82.522, 'eval_steps_per_second': 5.791, 'epoch': 8.0}
{'loss': 0.1607, 'grad_norm': 0.37342092394828796, 'learning_rate': 2.258046750987626e-05, 'epoch': 8.095238095238095}
{'loss': 0.1448, 'grad_norm': 0.5665784478187561, 'learning_rate': 2.159442089372446e-05, 'epoch': 8.571428571428571}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20082375407218933, 'eval_accuracy': 0.3508771929824561, 'eval_f1': 0.5811115205109311, 'eval_precision': 0.636920210224558, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.7098, 'eval_samples_per_second': 80.309, 'eval_steps_per_second': 5.636, 'epoch': 9.0}
{'loss': 0.1556, 'grad_norm': 0.29320216178894043, 'learning_rate': 2.0608374277572656e-05, 'epoch': 9.047619047619047}
{'loss': 0.1453, 'grad_norm': 0.31776338815689087, 'learning_rate': 1.9622327661420853e-05, 'epoch': 9.523809523809524}
{'loss': 0.1339, 'grad_norm': 0.3906534016132355, 'learning_rate': 1.863628104526905e-05, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19604040682315826, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.5858282300803428, 'eval_precision': 0.6398573021181717, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.6591, 'eval_samples_per_second': 86.482, 'eval_steps_per_second': 6.069, 'epoch': 10.0}
{'loss': 0.1328, 'grad_norm': 0.32415810227394104, 'learning_rate': 1.765023442911725e-05, 'epoch': 10.476190476190476}
{'loss': 0.1341, 'grad_norm': 0.39156007766723633, 'learning_rate': 1.6664187812965448e-05, 'epoch': 10.952380952380953}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19576184451580048, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.6001064987868057, 'eval_precision': 0.6305958132045089, 'eval_recall': 0.6086956521739131, 'eval_runtime': 0.7004, 'eval_samples_per_second': 81.379, 'eval_steps_per_second': 5.711, 'epoch': 11.0}
{'loss': 0.1273, 'grad_norm': 0.3117484152317047, 'learning_rate': 1.5678141196813645e-05, 'epoch': 11.428571428571429}
{'loss': 0.1282, 'grad_norm': 0.3609994947910309, 'learning_rate': 1.4692094580661844e-05, 'epoch': 11.904761904761905}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19252942502498627, 'eval_accuracy': 0.3508771929824561, 'eval_f1': 0.6114655870445344, 'eval_precision': 0.6829820742864221, 'eval_recall': 0.5826086956521739, 'eval_runtime': 0.7139, 'eval_samples_per_second': 79.845, 'eval_steps_per_second': 5.603, 'epoch': 12.0}
{'loss': 0.1275, 'grad_norm': 0.41586676239967346, 'learning_rate': 1.3706047964510043e-05, 'epoch': 12.380952380952381}
{'loss': 0.1192, 'grad_norm': 0.3670886158943176, 'learning_rate': 1.2720001348358242e-05, 'epoch': 12.857142857142858}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.18900611996650696, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.626728921253659, 'eval_precision': 0.6661711879103183, 'eval_recall': 0.6173913043478261, 'eval_runtime': 0.7118, 'eval_samples_per_second': 80.082, 'eval_steps_per_second': 5.62, 'epoch': 13.0}
{'loss': 0.1155, 'grad_norm': 0.3600284159183502, 'learning_rate': 1.173395473220644e-05, 'epoch': 13.333333333333334}
{'loss': 0.1102, 'grad_norm': 0.37987664341926575, 'learning_rate': 1.0747908116054638e-05, 'epoch': 13.80952380952381}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.18869231641292572, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6380121416289434, 'eval_precision': 0.7101440427527383, 'eval_recall': 0.6260869565217392, 'eval_runtime': 0.695, 'eval_samples_per_second': 82.011, 'eval_steps_per_second': 5.755, 'epoch': 14.0}
{'loss': 0.1173, 'grad_norm': 0.2701732814311981, 'learning_rate': 9.761861499902837e-06, 'epoch': 14.285714285714286}
{'loss': 0.1056, 'grad_norm': 0.3616527318954468, 'learning_rate': 8.775814883751034e-06, 'epoch': 14.761904761904763}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.18862448632717133, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.6162941386449632, 'eval_precision': 0.662157809983897, 'eval_recall': 0.6, 'eval_runtime': 0.6957, 'eval_samples_per_second': 81.932, 'eval_steps_per_second': 5.75, 'epoch': 15.0}
{'loss': 0.1174, 'grad_norm': 0.24898605048656464, 'learning_rate': 7.789768267599233e-06, 'epoch': 15.238095238095237}
{'loss': 0.107, 'grad_norm': 0.2695745527744293, 'learning_rate': 6.803721651447431e-06, 'epoch': 15.714285714285714}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.18642431497573853, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6462937151279082, 'eval_precision': 0.7101440427527383, 'eval_recall': 0.6347826086956522, 'eval_runtime': 0.6957, 'eval_samples_per_second': 81.933, 'eval_steps_per_second': 5.75, 'epoch': 16.0}
{'loss': 0.1014, 'grad_norm': 0.27634143829345703, 'learning_rate': 5.81767503529563e-06, 'epoch': 16.19047619047619}
{'loss': 0.1069, 'grad_norm': 0.2941097319126129, 'learning_rate': 4.831628419143828e-06, 'epoch': 16.666666666666668}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.18663793802261353, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.6350104947526236, 'eval_precision': 0.6661711879103183, 'eval_recall': 0.6260869565217392, 'eval_runtime': 0.7059, 'eval_samples_per_second': 80.752, 'eval_steps_per_second': 5.667, 'epoch': 17.0}
{'loss': 0.1013, 'grad_norm': 0.21988168358802795, 'learning_rate': 3.845581802992026e-06, 'epoch': 17.142857142857142}
{'loss': 0.1079, 'grad_norm': 0.32234418392181396, 'learning_rate': 2.8595351868402248e-06, 'epoch': 17.61904761904762}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.18535520136356354, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.6499173270507603, 'eval_precision': 0.7183451009537966, 'eval_recall': 0.6347826086956522, 'eval_runtime': 0.6996, 'eval_samples_per_second': 81.48, 'eval_steps_per_second': 5.718, 'epoch': 18.0}
{'loss': 0.1065, 'grad_norm': 0.26746901869773865, 'learning_rate': 1.8734885706884233e-06, 'epoch': 18.095238095238095}
{'loss': 0.1022, 'grad_norm': 0.25301024317741394, 'learning_rate': 8.874419545366215e-07, 'epoch': 18.571428571428573}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.1855354756116867, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6462937151279082, 'eval_precision': 0.7101440427527383, 'eval_recall': 0.6347826086956522, 'eval_runtime': 0.6584, 'eval_samples_per_second': 86.579, 'eval_steps_per_second': 6.076, 'epoch': 19.0}
{'train_runtime': 365.6027, 'train_samples_per_second': 16.786, 'train_steps_per_second': 1.091, 'train_loss': 0.1801112367395769, 'epoch': 19.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-03 12:44:14,861] Trial 7 finished with value: 0.6499173270507603 and parameters: {'learning_rate': 3.934325998445689e-05, 'batch_size': 16, 'weight_decay': 0.09062151503274436, 'num_train_epochs': 19}. Best is trial 7 with value: 0.6499173270507603.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)


{'eval_loss': 0.18535520136356354, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.6499173270507603, 'eval_precision': 0.7183451009537966, 'eval_recall': 0.6347826086956522, 'eval_runtime': 0.6721, 'eval_samples_per_second': 84.803, 'eval_steps_per_second': 5.951, 'epoch': 19.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'loss': 0.5822, 'grad_norm': 1.770201563835144, 'learning_rate': 2.976580626641994e-05, 'epoch': 0.24390243902439024}
{'loss': 0.4562, 'grad_norm': 1.1870136260986328, 'learning_rate': 2.9332533977243525e-05, 'epoch': 0.4878048780487805}
{'loss': 0.3729, 'grad_norm': 0.8025399446487427, 'learning_rate': 2.8899261688067106e-05, 'epoch': 0.7317073170731707}
{'loss': 0.3285, 'grad_norm': 0.7691493630409241, 'learning_rate': 2.8465989398890686e-05, 'epoch': 0.975609756097561}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.31021276116371155, 'eval_accuracy': 0.10526315789473684, 'eval_f1': 0.3572981366459627, 'eval_precision': 0.3096702214930271, 'eval_recall': 0.43478260869565216, 'eval_runtime': 0.7072, 'eval_samples_per_second': 80.603, 'eval_steps_per_second': 11.313, 'epoch': 1.0}
{'loss': 0.3078, 'grad_norm': 0.5070943832397461, 'learning_rate': 2.8032717109714267e-05, 'epoch': 1.2195121951219512}
{'loss': 0.2934, 'grad_norm': 0.6830063462257385, 'learning_rate': 2.759944482053785e-05, 'epoch': 1.4634146341463414}
{'loss': 0.2632, 'grad_norm': 0.6922929883003235, 'learning_rate': 2.716617253136143e-05, 'epoch': 1.7073170731707317}
{'loss': 0.2655, 'grad_norm': 0.5419002175331116, 'learning_rate': 2.6732900242185015e-05, 'epoch': 1.951219512195122}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2652244567871094, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.44617384462764687, 'eval_precision': 0.5129216904152966, 'eval_recall': 0.46956521739130436, 'eval_runtime': 0.7087, 'eval_samples_per_second': 80.43, 'eval_steps_per_second': 11.288, 'epoch': 2.0}
{'loss': 0.2764, 'grad_norm': 0.5635392069816589, 'learning_rate': 2.6299627953008595e-05, 'epoch': 2.1951219512195124}
{'loss': 0.2442, 'grad_norm': 0.49389469623565674, 'learning_rate': 2.586635566383218e-05, 'epoch': 2.4390243902439024}
{'loss': 0.2364, 'grad_norm': 0.7360255718231201, 'learning_rate': 2.543308337465576e-05, 'epoch': 2.682926829268293}
{'loss': 0.2445, 'grad_norm': 0.5870044827461243, 'learning_rate': 2.499981108547934e-05, 'epoch': 2.926829268292683}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2397909015417099, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.4943447676111093, 'eval_precision': 0.4968016113214312, 'eval_recall': 0.5217391304347826, 'eval_runtime': 0.7116, 'eval_samples_per_second': 80.096, 'eval_steps_per_second': 11.241, 'epoch': 3.0}
{'loss': 0.2089, 'grad_norm': 0.6850317120552063, 'learning_rate': 2.456653879630292e-05, 'epoch': 3.1707317073170733}
{'loss': 0.2167, 'grad_norm': 0.569256603717804, 'learning_rate': 2.4133266507126505e-05, 'epoch': 3.4146341463414633}
{'loss': 0.2312, 'grad_norm': 0.597496509552002, 'learning_rate': 2.3699994217950085e-05, 'epoch': 3.658536585365854}
{'loss': 0.2051, 'grad_norm': 0.6728280186653137, 'learning_rate': 2.3266721928773666e-05, 'epoch': 3.902439024390244}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.21800248324871063, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5233583288387819, 'eval_precision': 0.5014167595689335, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.7081, 'eval_samples_per_second': 80.498, 'eval_steps_per_second': 11.298, 'epoch': 4.0}
{'loss': 0.2025, 'grad_norm': 0.45831239223480225, 'learning_rate': 2.283344963959725e-05, 'epoch': 4.146341463414634}
{'loss': 0.2031, 'grad_norm': 0.6218544244766235, 'learning_rate': 2.240017735042083e-05, 'epoch': 4.390243902439025}
{'loss': 0.1705, 'grad_norm': 0.6541077494621277, 'learning_rate': 2.196690506124441e-05, 'epoch': 4.634146341463414}
{'loss': 0.1786, 'grad_norm': 0.6804752349853516, 'learning_rate': 2.153363277206799e-05, 'epoch': 4.878048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20739717781543732, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.5485135637309551, 'eval_precision': 0.5343873517786562, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.7048, 'eval_samples_per_second': 80.871, 'eval_steps_per_second': 11.35, 'epoch': 5.0}
{'loss': 0.1846, 'grad_norm': 0.4805348813533783, 'learning_rate': 2.1100360482891575e-05, 'epoch': 5.121951219512195}
{'loss': 0.1632, 'grad_norm': 0.5543131232261658, 'learning_rate': 2.0667088193715156e-05, 'epoch': 5.365853658536586}
{'loss': 0.1533, 'grad_norm': 0.5011523962020874, 'learning_rate': 2.023381590453874e-05, 'epoch': 5.609756097560975}
{'loss': 0.1706, 'grad_norm': 0.5289484858512878, 'learning_rate': 1.980054361536232e-05, 'epoch': 5.853658536585366}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.20402027666568756, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.5550687979920172, 'eval_precision': 0.6110248447204969, 'eval_recall': 0.5652173913043478, 'eval_runtime': 0.7044, 'eval_samples_per_second': 80.915, 'eval_steps_per_second': 11.357, 'epoch': 6.0}
{'loss': 0.1554, 'grad_norm': 0.5746945738792419, 'learning_rate': 1.9367271326185904e-05, 'epoch': 6.097560975609756}
{'loss': 0.1398, 'grad_norm': 0.39539971947669983, 'learning_rate': 1.893399903700948e-05, 'epoch': 6.341463414634147}
{'loss': 0.1573, 'grad_norm': 0.494503915309906, 'learning_rate': 1.8500726747833065e-05, 'epoch': 6.585365853658536}
{'loss': 0.1381, 'grad_norm': 0.37825334072113037, 'learning_rate': 1.8067454458656645e-05, 'epoch': 6.829268292682927}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19766919314861298, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5687460617517329, 'eval_precision': 0.6506561264822134, 'eval_recall': 0.5391304347826087, 'eval_runtime': 0.7132, 'eval_samples_per_second': 79.918, 'eval_steps_per_second': 11.217, 'epoch': 7.0}
{'loss': 0.1404, 'grad_norm': 0.42842257022857666, 'learning_rate': 1.763418216948023e-05, 'epoch': 7.073170731707317}
{'loss': 0.1371, 'grad_norm': 0.4694189429283142, 'learning_rate': 1.720090988030381e-05, 'epoch': 7.317073170731708}
{'loss': 0.133, 'grad_norm': 0.5492737889289856, 'learning_rate': 1.676763759112739e-05, 'epoch': 7.560975609756097}
{'loss': 0.135, 'grad_norm': 0.49982866644859314, 'learning_rate': 1.6334365301950974e-05, 'epoch': 7.804878048780488}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.19131112098693848, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.5973064121812496, 'eval_precision': 0.6570066889632107, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.71, 'eval_samples_per_second': 80.281, 'eval_steps_per_second': 11.267, 'epoch': 8.0}
{'loss': 0.1229, 'grad_norm': 0.3446386754512787, 'learning_rate': 1.5901093012774555e-05, 'epoch': 8.048780487804878}
{'loss': 0.1251, 'grad_norm': 0.4522034525871277, 'learning_rate': 1.5467820723598135e-05, 'epoch': 8.292682926829269}
{'loss': 0.115, 'grad_norm': 0.5045050382614136, 'learning_rate': 1.5034548434421717e-05, 'epoch': 8.536585365853659}
{'loss': 0.1373, 'grad_norm': 0.34520572423934937, 'learning_rate': 1.46012761452453e-05, 'epoch': 8.78048780487805}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.18733000755310059, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.6339860520867385, 'eval_precision': 0.7382551361681796, 'eval_recall': 0.591304347826087, 'eval_runtime': 0.7097, 'eval_samples_per_second': 80.318, 'eval_steps_per_second': 11.273, 'epoch': 9.0}
{'loss': 0.1056, 'grad_norm': 0.38729554414749146, 'learning_rate': 1.4168003856068882e-05, 'epoch': 9.024390243902438}
{'loss': 0.1103, 'grad_norm': 0.46734410524368286, 'learning_rate': 1.3734731566892464e-05, 'epoch': 9.268292682926829}
{'loss': 0.121, 'grad_norm': 0.4515976309776306, 'learning_rate': 1.3301459277716043e-05, 'epoch': 9.512195121951219}
{'loss': 0.1091, 'grad_norm': 0.3024665415287018, 'learning_rate': 1.2868186988539625e-05, 'epoch': 9.75609756097561}
{'loss': 0.1078, 'grad_norm': 0.3214772343635559, 'learning_rate': 1.2434914699363207e-05, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.18603485822677612, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.6103145235892692, 'eval_precision': 0.6691096408317581, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.6709, 'eval_samples_per_second': 84.961, 'eval_steps_per_second': 11.924, 'epoch': 10.0}
{'loss': 0.1064, 'grad_norm': 0.3576661944389343, 'learning_rate': 1.200164241018679e-05, 'epoch': 10.24390243902439}
{'loss': 0.1009, 'grad_norm': 0.49318015575408936, 'learning_rate': 1.156837012101037e-05, 'epoch': 10.487804878048781}
{'loss': 0.1054, 'grad_norm': 0.34764885902404785, 'learning_rate': 1.1135097831833952e-05, 'epoch': 10.731707317073171}
{'loss': 0.1068, 'grad_norm': 0.3567737340927124, 'learning_rate': 1.0701825542657534e-05, 'epoch': 10.975609756097562}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.18194027245044708, 'eval_accuracy': 0.3508771929824561, 'eval_f1': 0.6526407313060796, 'eval_precision': 0.7303821379908336, 'eval_recall': 0.6260869565217392, 'eval_runtime': 0.7049, 'eval_samples_per_second': 80.863, 'eval_steps_per_second': 11.349, 'epoch': 11.0}
{'loss': 0.1028, 'grad_norm': 0.330251544713974, 'learning_rate': 1.0268553253481115e-05, 'epoch': 11.21951219512195}
{'loss': 0.0945, 'grad_norm': 0.37531524896621704, 'learning_rate': 9.835280964304697e-06, 'epoch': 11.463414634146341}
{'loss': 0.0963, 'grad_norm': 0.3740999698638916, 'learning_rate': 9.402008675128279e-06, 'epoch': 11.707317073170731}
{'loss': 0.1049, 'grad_norm': 0.34253349900245667, 'learning_rate': 8.968736385951861e-06, 'epoch': 11.951219512195122}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.1804260015487671, 'eval_accuracy': 0.3508771929824561, 'eval_f1': 0.6610257220058251, 'eval_precision': 0.7374906832298136, 'eval_recall': 0.6347826086956522, 'eval_runtime': 0.7035, 'eval_samples_per_second': 81.025, 'eval_steps_per_second': 11.372, 'epoch': 12.0}
{'loss': 0.1044, 'grad_norm': 0.45515426993370056, 'learning_rate': 8.535464096775442e-06, 'epoch': 12.195121951219512}
{'loss': 0.0964, 'grad_norm': 0.539563000202179, 'learning_rate': 8.102191807599024e-06, 'epoch': 12.439024390243903}
{'loss': 0.0946, 'grad_norm': 0.639778196811676, 'learning_rate': 7.668919518422606e-06, 'epoch': 12.682926829268293}
{'loss': 0.0974, 'grad_norm': 0.31411728262901306, 'learning_rate': 7.235647229246187e-06, 'epoch': 12.926829268292684}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.18060797452926636, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.652139746453304, 'eval_precision': 0.7294557165861514, 'eval_recall': 0.6260869565217392, 'eval_runtime': 0.705, 'eval_samples_per_second': 80.853, 'eval_steps_per_second': 11.348, 'epoch': 13.0}
{'loss': 0.0812, 'grad_norm': 0.3176894187927246, 'learning_rate': 6.802374940069769e-06, 'epoch': 13.170731707317072}
{'loss': 0.0983, 'grad_norm': 0.25448641180992126, 'learning_rate': 6.36910265089335e-06, 'epoch': 13.414634146341463}
{'loss': 0.0934, 'grad_norm': 0.2852447032928467, 'learning_rate': 5.9358303617169316e-06, 'epoch': 13.658536585365853}
{'loss': 0.0841, 'grad_norm': 0.4273904860019684, 'learning_rate': 5.502558072540514e-06, 'epoch': 13.902439024390244}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.1794224977493286, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.6498038574697612, 'eval_precision': 0.7345284280936454, 'eval_recall': 0.6173913043478261, 'eval_runtime': 0.706, 'eval_samples_per_second': 80.733, 'eval_steps_per_second': 11.331, 'epoch': 14.0}
{'loss': 0.0901, 'grad_norm': 0.30227166414260864, 'learning_rate': 5.069285783364095e-06, 'epoch': 14.146341463414634}
{'loss': 0.091, 'grad_norm': 0.3725146949291229, 'learning_rate': 4.6360134941876764e-06, 'epoch': 14.390243902439025}
{'loss': 0.0862, 'grad_norm': 0.30644264817237854, 'learning_rate': 4.202741205011258e-06, 'epoch': 14.634146341463415}
{'loss': 0.0912, 'grad_norm': 0.3777189254760742, 'learning_rate': 3.7694689158348396e-06, 'epoch': 14.878048780487806}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.17989754676818848, 'eval_accuracy': 0.3508771929824561, 'eval_f1': 0.657653938509818, 'eval_precision': 0.7308819875776398, 'eval_recall': 0.6347826086956522, 'eval_runtime': 0.7114, 'eval_samples_per_second': 80.12, 'eval_steps_per_second': 11.245, 'epoch': 15.0}
{'loss': 0.1001, 'grad_norm': 0.5024279356002808, 'learning_rate': 3.3361966266584213e-06, 'epoch': 15.121951219512194}
{'loss': 0.0843, 'grad_norm': 0.2765589952468872, 'learning_rate': 2.902924337482003e-06, 'epoch': 15.365853658536585}
{'loss': 0.0846, 'grad_norm': 0.26658180356025696, 'learning_rate': 2.4696520483055845e-06, 'epoch': 15.609756097560975}
{'loss': 0.0867, 'grad_norm': 0.3271233141422272, 'learning_rate': 2.0363797591291662e-06, 'epoch': 15.853658536585366}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.17948994040489197, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.6555115299493111, 'eval_precision': 0.7360644122383252, 'eval_recall': 0.6260869565217392, 'eval_runtime': 0.7099, 'eval_samples_per_second': 80.289, 'eval_steps_per_second': 11.269, 'epoch': 16.0}
{'loss': 0.0867, 'grad_norm': 0.35505372285842896, 'learning_rate': 1.603107469952748e-06, 'epoch': 16.097560975609756}
{'loss': 0.0741, 'grad_norm': 0.30304184556007385, 'learning_rate': 1.1698351807763296e-06, 'epoch': 16.341463414634145}
{'loss': 0.0941, 'grad_norm': 0.34282729029655457, 'learning_rate': 7.365628915999112e-07, 'epoch': 16.585365853658537}
{'loss': 0.0911, 'grad_norm': 0.3343210816383362, 'learning_rate': 3.0329060242349283e-07, 'epoch': 16.829268292682926}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.17912696301937103, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.652139746453304, 'eval_precision': 0.7294557165861514, 'eval_recall': 0.6260869565217392, 'eval_runtime': 0.9037, 'eval_samples_per_second': 63.076, 'eval_steps_per_second': 8.853, 'epoch': 17.0}
{'train_runtime': 365.6265, 'train_samples_per_second': 15.018, 'train_steps_per_second': 1.906, 'train_loss': 0.15843397267750722, 'epoch': 17.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-03 12:50:21,807] Trial 8 finished with value: 0.6610257220058251 and parameters: {'learning_rate': 3.019907855559636e-05, 'batch_size': 8, 'weight_decay': 0.014597336715952852, 'num_train_epochs': 17}. Best is trial 8 with value: 0.6610257220058251.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  weight_decay = trial.suggest_uniform('weight_decay', 0.01, 0.1)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 0.1804260015487671, 'eval_accuracy': 0.3508771929824561, 'eval_f1': 0.6610257220058251, 'eval_precision': 0.7374906832298136, 'eval_recall': 0.6347826086956522, 'eval_runtime': 0.6789, 'eval_samples_per_second': 83.965, 'eval_steps_per_second': 11.785, 'epoch': 17.0}
{'loss': 0.6419, 'grad_norm': 1.6372389793395996, 'learning_rate': 1.5430291407609153e-05, 'epoch': 0.9090909090909091}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.5559136271476746, 'eval_accuracy': 0.0, 'eval_f1': 0.2547046692882343, 'eval_precision': 0.2527734908555514, 'eval_recall': 0.3391304347826087, 'eval_runtime': 11.5775, 'eval_samples_per_second': 4.923, 'eval_steps_per_second': 0.173, 'epoch': 1.0}
{'loss': 0.5357, 'grad_norm': 1.30563485622406, 'learning_rate': 1.4558523531473041e-05, 'epoch': 1.8181818181818183}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.47620096802711487, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.1899209486166008, 'eval_precision': 0.14660564454614797, 'eval_recall': 0.26956521739130435, 'eval_runtime': 12.1169, 'eval_samples_per_second': 4.704, 'eval_steps_per_second': 0.165, 'epoch': 2.0}
{'loss': 0.4689, 'grad_norm': 1.029947280883789, 'learning_rate': 1.368675565533693e-05, 'epoch': 2.7272727272727275}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.4198874235153198, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.1899209486166008, 'eval_precision': 0.14660564454614797, 'eval_recall': 0.26956521739130435, 'eval_runtime': 12.0697, 'eval_samples_per_second': 4.723, 'eval_steps_per_second': 0.166, 'epoch': 3.0}
{'loss': 0.4173, 'grad_norm': 0.9522910118103027, 'learning_rate': 1.2814987779200822e-05, 'epoch': 3.6363636363636362}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.38186755776405334, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.19210394802598701, 'eval_precision': 0.14922360248447208, 'eval_recall': 0.26956521739130435, 'eval_runtime': 12.0748, 'eval_samples_per_second': 4.721, 'eval_steps_per_second': 0.166, 'epoch': 4.0}
{'loss': 0.384, 'grad_norm': 0.9191919565200806, 'learning_rate': 1.1943219903064712e-05, 'epoch': 4.545454545454545}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.35753193497657776, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.18612836438923397, 'eval_precision': 0.14749794913863823, 'eval_recall': 0.25217391304347825, 'eval_runtime': 11.6394, 'eval_samples_per_second': 4.897, 'eval_steps_per_second': 0.172, 'epoch': 5.0}
{'loss': 0.3614, 'grad_norm': 0.7111070156097412, 'learning_rate': 1.1071452026928601e-05, 'epoch': 5.454545454545454}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.33935460448265076, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.26888698507888914, 'eval_precision': 0.34749794913863824, 'eval_recall': 0.30434782608695654, 'eval_runtime': 11.0572, 'eval_samples_per_second': 5.155, 'eval_steps_per_second': 0.181, 'epoch': 6.0}
{'loss': 0.341, 'grad_norm': 0.6497620344161987, 'learning_rate': 1.019968415079249e-05, 'epoch': 6.363636363636363}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.32568082213401794, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.2597994462321335, 'eval_precision': 0.3503344481605351, 'eval_recall': 0.2956521739130435, 'eval_runtime': 11.4801, 'eval_samples_per_second': 4.965, 'eval_steps_per_second': 0.174, 'epoch': 7.0}
{'loss': 0.3289, 'grad_norm': 0.5877724885940552, 'learning_rate': 9.327916274656379e-06, 'epoch': 7.2727272727272725}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.31664666533470154, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.2004459308807135, 'eval_precision': 0.16632747456059205, 'eval_recall': 0.25217391304347825, 'eval_runtime': 12.0804, 'eval_samples_per_second': 4.718, 'eval_steps_per_second': 0.166, 'epoch': 8.0}
{'loss': 0.3176, 'grad_norm': 0.5613172054290771, 'learning_rate': 8.456148398520269e-06, 'epoch': 8.181818181818182}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.3084104359149933, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.2314044073853484, 'eval_precision': 0.3732919254658385, 'eval_recall': 0.25217391304347825, 'eval_runtime': 11.0335, 'eval_samples_per_second': 5.166, 'eval_steps_per_second': 0.181, 'epoch': 9.0}
{'loss': 0.3091, 'grad_norm': 0.49606621265411377, 'learning_rate': 7.58438052238416e-06, 'epoch': 9.090909090909092}
{'loss': 0.2995, 'grad_norm': 0.757254421710968, 'learning_rate': 6.712612646248049e-06, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.30205655097961426, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.2708329788139198, 'eval_precision': 0.3732919254658385, 'eval_recall': 0.2782608695652174, 'eval_runtime': 11.489, 'eval_samples_per_second': 4.961, 'eval_steps_per_second': 0.174, 'epoch': 10.0}
{'loss': 0.2977, 'grad_norm': 0.4851831793785095, 'learning_rate': 5.840844770111938e-06, 'epoch': 10.909090909090908}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.29696810245513916, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.296279669688965, 'eval_precision': 0.602445652173913, 'eval_recall': 0.2608695652173913, 'eval_runtime': 12.309, 'eval_samples_per_second': 4.631, 'eval_steps_per_second': 0.162, 'epoch': 11.0}
{'loss': 0.301, 'grad_norm': 0.45817437767982483, 'learning_rate': 4.969076893975828e-06, 'epoch': 11.818181818181818}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2933167517185211, 'eval_accuracy': 0.05263157894736842, 'eval_f1': 0.35794152551515807, 'eval_precision': 0.5294117647058824, 'eval_recall': 0.3130434782608696, 'eval_runtime': 11.4592, 'eval_samples_per_second': 4.974, 'eval_steps_per_second': 0.175, 'epoch': 12.0}
{'loss': 0.2899, 'grad_norm': 0.4175030589103699, 'learning_rate': 4.0973090178397185e-06, 'epoch': 12.727272727272727}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2904907166957855, 'eval_accuracy': 0.10526315789473684, 'eval_f1': 0.42785431959345, 'eval_precision': 0.5583662714097497, 'eval_recall': 0.3739130434782609, 'eval_runtime': 11.4673, 'eval_samples_per_second': 4.971, 'eval_steps_per_second': 0.174, 'epoch': 13.0}
{'loss': 0.2823, 'grad_norm': 0.4937356412410736, 'learning_rate': 3.225541141703608e-06, 'epoch': 13.636363636363637}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.28838157653808594, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.45541346973572033, 'eval_precision': 0.5479526349091567, 'eval_recall': 0.41739130434782606, 'eval_runtime': 11.1342, 'eval_samples_per_second': 5.119, 'eval_steps_per_second': 0.18, 'epoch': 14.0}
{'loss': 0.2893, 'grad_norm': 0.39086058735847473, 'learning_rate': 2.353773265567498e-06, 'epoch': 14.545454545454545}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.2862159013748169, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.4658982124263289, 'eval_precision': 0.5609364548494984, 'eval_recall': 0.40869565217391307, 'eval_runtime': 11.0384, 'eval_samples_per_second': 5.164, 'eval_steps_per_second': 0.181, 'epoch': 15.0}
{'loss': 0.2847, 'grad_norm': 0.4068179130554199, 'learning_rate': 1.4820053894313874e-06, 'epoch': 15.454545454545455}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.28583118319511414, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.48653406090477025, 'eval_precision': 0.5637583729143831, 'eval_recall': 0.4434782608695652, 'eval_runtime': 12.2154, 'eval_samples_per_second': 4.666, 'eval_steps_per_second': 0.164, 'epoch': 16.0}
{'loss': 0.2762, 'grad_norm': 0.4737359881401062, 'learning_rate': 6.102375132952771e-07, 'epoch': 16.363636363636363}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.28526827692985535, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.4935475976833347, 'eval_precision': 0.5453005115089514, 'eval_recall': 0.4608695652173913, 'eval_runtime': 12.0447, 'eval_samples_per_second': 4.732, 'eval_steps_per_second': 0.166, 'epoch': 17.0}
{'train_runtime': 3939.4381, 'train_samples_per_second': 1.394, 'train_steps_per_second': 0.047, 'train_loss': 0.35428380009962274, 'epoch': 17.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-01-03 13:56:13,719] Trial 9 finished with value: 0.4935475976833347 and parameters: {'learning_rate': 1.6302059283745262e-05, 'batch_size': 32, 'weight_decay': 0.06827464801918784, 'num_train_epochs': 17}. Best is trial 8 with value: 0.6610257220058251.


{'eval_loss': 0.28526827692985535, 'eval_accuracy': 0.14035087719298245, 'eval_f1': 0.4935475976833347, 'eval_precision': 0.5453005115089514, 'eval_recall': 0.4608695652173913, 'eval_runtime': 11.9539, 'eval_samples_per_second': 4.768, 'eval_steps_per_second': 0.167, 'epoch': 17.0}
Best Hyperparameters: {'learning_rate': 3.019907855559636e-05, 'batch_size': 8, 'weight_decay': 0.014597336715952852, 'num_train_epochs': 17}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                
  6%|▌         | 41/697 [00:18<03:11,  3.43it/s]

{'eval_loss': 0.3258223533630371, 'eval_accuracy': 0.03508771929824561, 'eval_f1': 0.1999265569917744, 'eval_precision': 0.46956521739130436, 'eval_recall': 0.13043478260869565, 'eval_runtime': 0.7381, 'eval_samples_per_second': 77.225, 'eval_steps_per_second': 10.839, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                
 12%|█▏        | 82/697 [00:37<03:00,  3.40it/s]

{'eval_loss': 0.2733701467514038, 'eval_accuracy': 0.017543859649122806, 'eval_f1': 0.20694799658994034, 'eval_precision': 0.3497584541062802, 'eval_recall': 0.26956521739130435, 'eval_runtime': 0.725, 'eval_samples_per_second': 78.62, 'eval_steps_per_second': 11.034, 'epoch': 2.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 18%|█▊        | 123/697 [01:02<02:47,  3.43it/s]

{'eval_loss': 0.24633438885211945, 'eval_accuracy': 0.22807017543859648, 'eval_f1': 0.549922910884983, 'eval_precision': 0.5038034641215977, 'eval_recall': 0.6086956521739131, 'eval_runtime': 0.723, 'eval_samples_per_second': 78.841, 'eval_steps_per_second': 11.065, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 24%|██▎       | 164/697 [01:30<02:36,  3.42it/s]

{'eval_loss': 0.22645996510982513, 'eval_accuracy': 0.24561403508771928, 'eval_f1': 0.5169780963176948, 'eval_precision': 0.5177891446433646, 'eval_recall': 0.5304347826086957, 'eval_runtime': 0.7408, 'eval_samples_per_second': 76.946, 'eval_steps_per_second': 10.799, 'epoch': 4.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 29%|██▉       | 205/697 [01:46<02:24,  3.41it/s]

{'eval_loss': 0.210386723279953, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5348199360566077, 'eval_precision': 0.531716833890747, 'eval_recall': 0.5478260869565217, 'eval_runtime': 0.727, 'eval_samples_per_second': 78.4, 'eval_steps_per_second': 11.004, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 35%|███▌      | 246/697 [02:02<02:11,  3.42it/s]

{'eval_loss': 0.2044815719127655, 'eval_accuracy': 0.2807017543859649, 'eval_f1': 0.5595859213250518, 'eval_precision': 0.5984189723320158, 'eval_recall': 0.5739130434782609, 'eval_runtime': 0.7321, 'eval_samples_per_second': 77.862, 'eval_steps_per_second': 10.928, 'epoch': 6.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 41%|████      | 287/697 [02:18<02:00,  3.41it/s]

{'eval_loss': 0.1927984654903412, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.5888205446516847, 'eval_precision': 0.6180586594331335, 'eval_recall': 0.6, 'eval_runtime': 0.7225, 'eval_samples_per_second': 78.893, 'eval_steps_per_second': 11.073, 'epoch': 7.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 47%|████▋     | 328/697 [02:34<01:48,  3.41it/s]

{'eval_loss': 0.18876515328884125, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6137996098104793, 'eval_precision': 0.6118995336386641, 'eval_recall': 0.6347826086956522, 'eval_runtime': 0.7304, 'eval_samples_per_second': 78.043, 'eval_steps_per_second': 10.953, 'epoch': 8.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 53%|█████▎    | 369/697 [03:04<01:35,  3.43it/s]

{'eval_loss': 0.18430161476135254, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.6187802425317953, 'eval_precision': 0.631184407796102, 'eval_recall': 0.6173913043478261, 'eval_runtime': 0.7357, 'eval_samples_per_second': 77.482, 'eval_steps_per_second': 10.875, 'epoch': 9.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 59%|█████▉    | 410/697 [03:20<01:23,  3.43it/s]

{'eval_loss': 0.18281258642673492, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6180271813957713, 'eval_precision': 0.6554340924775708, 'eval_recall': 0.6086956521739131, 'eval_runtime': 0.7225, 'eval_samples_per_second': 78.892, 'eval_steps_per_second': 11.073, 'epoch': 10.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 65%|██████▍   | 451/697 [03:36<01:11,  3.43it/s]

{'eval_loss': 0.1797669231891632, 'eval_accuracy': 0.2982456140350877, 'eval_f1': 0.6218672169643492, 'eval_precision': 0.628416149068323, 'eval_recall': 0.6260869565217392, 'eval_runtime': 0.7249, 'eval_samples_per_second': 78.633, 'eval_steps_per_second': 11.036, 'epoch': 11.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 71%|███████   | 492/697 [04:08<00:59,  3.44it/s]

{'eval_loss': 0.17968983948230743, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.6197926799312208, 'eval_precision': 0.640578825361434, 'eval_recall': 0.6173913043478261, 'eval_runtime': 0.7377, 'eval_samples_per_second': 77.271, 'eval_steps_per_second': 10.845, 'epoch': 12.0}


 72%|███████▏  | 500/697 [04:17<01:39,  1.98it/s]

{'loss': 0.1917, 'grad_norm': 0.6103445887565613, 'learning_rate': 8.535464096775442e-06, 'epoch': 12.2}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 76%|███████▋  | 533/697 [04:29<00:47,  3.43it/s]

{'eval_loss': 0.17600755393505096, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.6408988399351188, 'eval_precision': 0.6525157933853586, 'eval_recall': 0.6434782608695652, 'eval_runtime': 0.7252, 'eval_samples_per_second': 78.604, 'eval_steps_per_second': 11.032, 'epoch': 13.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 82%|████████▏ | 574/697 [04:45<00:35,  3.43it/s]

{'eval_loss': 0.17579756677150726, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.6503554465264366, 'eval_precision': 0.7028306359640693, 'eval_recall': 0.6434782608695652, 'eval_runtime': 0.7204, 'eval_samples_per_second': 79.121, 'eval_steps_per_second': 11.105, 'epoch': 14.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 88%|████████▊ | 615/697 [05:00<00:24,  3.41it/s]

{'eval_loss': 0.1765529066324234, 'eval_accuracy': 0.3157894736842105, 'eval_f1': 0.6351304347826087, 'eval_precision': 0.6503346617289646, 'eval_recall': 0.6347826086956522, 'eval_runtime': 0.7313, 'eval_samples_per_second': 77.948, 'eval_steps_per_second': 10.94, 'epoch': 15.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
 94%|█████████▍| 656/697 [05:19<00:11,  3.43it/s]

{'eval_loss': 0.17655648291110992, 'eval_accuracy': 0.3508771929824561, 'eval_f1': 0.6538582713851937, 'eval_precision': 0.7104661039443647, 'eval_recall': 0.6434782608695652, 'eval_runtime': 0.7262, 'eval_samples_per_second': 78.489, 'eval_steps_per_second': 11.016, 'epoch': 16.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
                                                 
100%|██████████| 697/697 [05:52<00:00,  3.41it/s]

{'eval_loss': 0.1758529245853424, 'eval_accuracy': 0.3508771929824561, 'eval_f1': 0.6538582713851937, 'eval_precision': 0.7104661039443647, 'eval_recall': 0.6434782608695652, 'eval_runtime': 0.7001, 'eval_samples_per_second': 81.416, 'eval_steps_per_second': 11.427, 'epoch': 17.0}


100%|██████████| 697/697 [05:53<00:00,  1.97it/s]


{'train_runtime': 353.8693, 'train_samples_per_second': 15.517, 'train_steps_per_second': 1.97, 'train_loss': 0.16374067254524832, 'epoch': 17.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 6/6 [00:00<00:00, 27.88it/s]

Final Test Results: {'eval_loss': 0.1783057302236557, 'eval_accuracy': 0.3488372093023256, 'eval_f1': 0.6300961248928729, 'eval_precision': 0.726111111111111, 'eval_recall': 0.5777777777777777, 'eval_runtime': 0.2622, 'eval_samples_per_second': 163.981, 'eval_steps_per_second': 22.881, 'epoch': 17.0}



