# 0. Imports, libraries and rusable functions

In [2]:
from project_imports import *
import use_gpu
# Clear any cached memory to start fresh for each trial
torch.cuda.empty_cache()
gc.collect()

Project libraries imported!
GPU: NVIDIA GeForce RTX 4070 Ti SUPER is available.
Device:cuda


76

# 1. Global Variables

In [3]:
## Arguments and global vriables
dataset_name="MCQA-Combined-3"
global_run_name="Optuna-1"
pretrained_model_name = "microsoft/deberta-v3-base"
normalized_model_name = pretrained_model_name.replace("/", "-")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
assert isinstance( tokenizer, PreTrainedTokenizerFast )
data_collator = DefaultDataCollator()
max_length = 512 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
pad_on_right = right_padding = tokenizer.padding_side == 'right'
global_counter = 0
traing_answer_mismatches = []
logger = logging.getLogger(__name__)



# 2. Prepare the Dataset 

In [4]:
# Load the combined dataset
combined_dataset = load_from_disk('cleaned_dataset')

combined_dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 1072514
    })
    validation: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 118521
    })
    test: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 200566
    })
})

In [5]:
# Initialize lists to hold datasets
train_datasets = []
val_datasets = []
test_datasets = []

# List of datasets to combine
datasets_to_combine = ['AR-LSAT', 'ReClor', 'LogiQA 2.0']

# Loop through each dataset and filter
for ds_name in datasets_to_combine:
    train_datasets.append(combined_dataset['train'].filter(lambda x: x['Source Dataset'] == ds_name))
    val_datasets.append(combined_dataset['validation'].filter(lambda x: x['Source Dataset'] == ds_name))
    test_datasets.append(combined_dataset['test'].filter(lambda x: x['Source Dataset'] == ds_name))

# Concatenate datasets
combined_train = concatenate_datasets(train_datasets)
combined_val = concatenate_datasets(val_datasets)
combined_test = concatenate_datasets(test_datasets)


# Shuffle the combined dataset
# To ensure that each training batch has a chance to contain a mix of examples from all sources. 
# This helps in reducing variance and improving the generalization of the model.
combined_train = combined_train.shuffle(seed=42)

In [6]:
def mcqa_preprocess_function(examples):
    # Determine the maximum number of choices
    max_num_choices = 5  # Since AR-LSAT has 5 options, we'll pad others to 5
    contexts = examples['Context']
    questions = examples['Question']
    options_list = examples['Options']
    labels = examples['Label']
    
    first_sentences = []
    second_sentences = []
    labels_adjusted = []
    
    for context, question, options, label in zip(contexts, questions, options_list, labels):
        num_choices = len(options)
        # Pad options to have max_num_choices
        if num_choices < max_num_choices:
            options += [''] * (max_num_choices - num_choices)
        first_sentences.append([context] * max_num_choices)
        second_sentences.append([f"{question} {option}" for option in options])
        labels_adjusted.append(label)
    
    # Flatten the lists
    first_sentences = [item for sublist in first_sentences for item in sublist]
    second_sentences = [item for sublist in second_sentences for item in sublist]
    
    # Tokenize the inputs
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        max_length=512,
        padding='max_length',
    )
    
    # Un-flatten to shape (num_examples, max_num_choices, seq_length)
    tokenized_inputs = {
        k: [v[i:i + max_num_choices] for i in range(0, len(v), max_num_choices)]
        for k, v in tokenized_examples.items()
    }
    
    # Labels
    tokenized_inputs["labels"] = labels_adjusted
    
    return tokenized_inputs




In [7]:
# Apply the preprocessing function to the combined datasets
encoded_train = combined_train.map(mcqa_preprocess_function, batched=True)
encoded_val = combined_val.map(mcqa_preprocess_function, batched=True)
encoded_test = combined_test.map(mcqa_preprocess_function, batched=True)


In [8]:
# Set the format of the datasets to PyTorch tensors
encoded_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


def get_train_encoded():
    return encoded_train

def get_val_encoded():
    return encoded_val

def get_test_encoded():
    return encoded_test

print("Number of training examples:", len(encoded_train))
print("Number of validation examples:", len(encoded_val))
print("Number of test examples:", len(encoded_test))

Number of training examples: 18290
Number of validation examples: 2299
Number of test examples: 2301


# 3. Reusable Functions

In [8]:
# Load the accuracy metric
accuracy = evaluate.load('accuracy')

# Define the compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    acc = accuracy.compute(predictions=predictions, references=labels)['accuracy']
    f1 = f1_score(labels, predictions, average='weighted')
    return {'eval_accuracy': acc, 'eval_f1': f1}

In [9]:
from transformers import TrainingArguments

def create_training_args(run_name="Default-Run", num_train_epochs=3, learning_rate=4.92e-05, batch_size=3):
    """
    Generates training arguments for training a machine learning model.

    Parameters:
    - dataset_name (str): The name of the dataset.
    - run_name (str): The name of the run, useful for logging and saving models.
    - model_name (str): The name of the model, typically including its configuration.
    - num_train_epochs (int): The number of epochs to train for.
    - learning_rate (float): The learning rate for training.
    - batch_size (int): The batch size used for training.

    Returns:
    - TrainingArguments: A configured TrainingArguments instance.
    """    
    output_dir = f"./{dataset_name}/{run_name}/{normalized_model_name}"
    
    training_args = TrainingArguments(
        output_dir=output_dir,
        report_to="none",  # Disable all integrations
        overwrite_output_dir=True,
        metric_for_best_model='eval_accuracy',
        greater_is_better=True,
        load_best_model_at_end=True,
        save_total_limit=3,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=4,
        warmup_steps=398,
        weight_decay=0.194,
        adam_beta1=0.837,
        adam_beta2=0.997,
        adam_epsilon=5.87e-07,
        lr_scheduler_type='cosine',
        fp16=True,  # Enable mixed-precision training
    )
    
    return training_args


In [10]:
def model_init(model_name=pretrained_model_name):
    return AutoModelForMultipleChoice.from_pretrained(model_name)

In [11]:
def create_trainer(model_name=pretrained_model_name,run_name="Default-Run", num_train_epochs=3, learning_rate=4.92e-05, batch_size=4):
    trainer = Trainer(
        model=model_init(model_name),
        args=create_training_args(run_name=run_name, num_train_epochs=num_train_epochs, learning_rate=learning_rate, batch_size=batch_size),
        train_dataset=get_train_encoded(),
        eval_dataset=get_val_encoded(),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    
    return trainer


In [12]:
class AdvancedEarlyStoppingCallback(TrainerCallback):
    """
    A callback to stop training when either the performance falls below a certain threshold
    or if there is no improvement over a set number of epochs.
    """
    def __init__(self, metric_name, patience):
        self.metric_name = metric_name
        self.patience = patience        
        self.best_score = None
        self.no_improve_epochs = 0
        self.config_file = "early_stopping_config.json"  # Config file for early stopping values

    def read_early_stopping_config(self):
        """
        Reads the early stopping configuration from the file system.
        Returns the configuration as a dictionary.
        """
        if os.path.exists(self.config_file):
            with open(self.config_file, 'r') as file:
                config = json.load(file)
            return config
        else:
            raise FileNotFoundError(f"Config file not found: {self.config_file}")
    def reset_manual_stop_flag(self):
        """
        Resets the manual stop flag to False in the early stopping config file.
        """
        config = self.read_early_stopping_config()
        config['manual_stop'] = False
        with open(self.config_file, 'w') as file:
            json.dump(config, file, indent=4)

    def on_evaluate(self, args, state, control, **kwargs):
        metric_value = kwargs['metrics'].get(self.metric_name)

        if self.best_score is None or metric_value > self.best_score:
            self.best_score = metric_value
            self.no_improve_epochs = 0
        else:
            self.no_improve_epochs += 1

        # Check if no improvement has been seen over the allowed patience
        if self.no_improve_epochs >= self.patience:
            control.should_training_stop = True
            print(f"Stopping training: No improvement in {self.metric_name} for {self.patience} epochs")


        # Read the early stopping configuration
        config = self.read_early_stopping_config()
        min_accuracy = config.get("min_accuracy", 0.35)                
        num_epochs_min_acc = config.get("num_epochs_min_acc", 2)  
        max_variance = config.get("max_variance", 0.2)  

        # Check if performance is below the threshold
        if metric_value < min_accuracy:
            control.should_training_stop = True
            print(f"Stopping training: {self.metric_name} below manual min_acc of {min_accuracy}")

         # Manual stop from config
        if config.get("manual_stop", False):
            control.should_training_stop = True
            print(f"Manual early stopping triggered!!")
            self.reset_manual_stop_flag()  # Reset the flag for future runs
            


In [13]:
# Optuna objective function for hyperparameter tuning
def objective(trial):
    # Clear any cached memory to start fresh for each trial
    torch.cuda.empty_cache()
    gc.collect()

    
    model_name = trial.suggest_categorical('model_name', [pretrained_model_name, "./squad-trained-model", './MCQA-Combined/Optuna/trial_5/checkpoint-22865', './MCQA-Combined/Optuna/trial_0/checkpoint-30485','./MCQA-Combined/Optuna/trial_6/checkpoint-30485' ]) 
    learning_rate = trial.suggest_float('learning_rate', 1e-7, 1e-4, log=True)
    batch_size = trial.suggest_categorical('batch_size', [3, 4])
    #warmup_steps = trial.suggest_int('warmup_steps', 0, 1000)
    warmup_ratio= trial.suggest_float('warmup_ratio', 0.0, 1.0)
    weight_decay = trial.suggest_float('weight_decay', 0.0, 0.25)
    adam_beta1 = trial.suggest_float('adam_beta1', 0.8, 0.95)
    adam_beta2 = trial.suggest_float('adam_beta2', 0.990, 0.999)
    adam_epsilon = trial.suggest_float('adam_epsilon', 1e-8, 1e-6)
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine', 'cosine_with_restarts']) #,'constant_with_warmup'   
    

    output_dir = f"./{dataset_name}/{global_run_name}/trial_{trial.number}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        report_to="none",  # Disable all integrations
        overwrite_output_dir=True,
        metric_for_best_model='eval_accuracy',
        greater_is_better=True,
        load_best_model_at_end=True,
        save_total_limit=3,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=30,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_ratio=warmup_ratio,
        weight_decay=weight_decay,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        lr_scheduler_type=lr_scheduler_type,
        fp16=True,  # Enable mixed-precision training
    ) 
    
    # Print trial parameters
    print(f"Current Trial {trial.number} parameters: {trial.params}")
    
    trainer = Trainer(
        model=model_init(model_name),
        args=training_args,
        train_dataset=get_train_encoded(),
        eval_dataset=get_val_encoded(),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[AdvancedEarlyStoppingCallback(metric_name='eval_accuracy', patience=1)]
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
        
    torch.cuda.empty_cache()  # Clear cache after evaluation
    gc.collect()  # Collect garbage

    return eval_results['eval_accuracy']


# 4. Fine-tuning DeBERTa on the Dataset

## 4.1 Evaluate Vanilla DeBERTa (Acc=19.30%)

In [None]:
# Create the Trainer
trainer = create_trainer()

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/576 [00:00<?, ?it/s]

## 4.2 Evaluate Trained Vanilla DeBERTa (Acc=27.92%)

In [27]:
# Create the Trainer
trainer = create_trainer()
# Train the model
trainer.train()
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.4646,1.45654,0.256198,0.226952
2,1.4684,1.446175,0.253589,0.234731
3,1.4188,1.430362,0.300565,0.301469


Test Results:
Accuracy: 0.2786
F1 Score: 0.2792


In [13]:
# Create the Trainer
trainer = create_trainer(run_name="Default-Run2", num_train_epochs=10, batch_size=3)
# Train the model
trainer.train()
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

  0%|          | 0/60970 [00:00<?, ?it/s]

{'loss': 1.4817, 'grad_norm': 10.460925102233887, 'learning_rate': 4.919968223047422e-05, 'epoch': 0.08}
{'loss': 1.5081, 'grad_norm': 3.530348062515259, 'learning_rate': 4.918816880581236e-05, 'epoch': 0.16}
{'loss': 1.4864, 'grad_norm': 6.0712995529174805, 'learning_rate': 4.916019321314551e-05, 'epoch': 0.25}
{'loss': 1.5478, 'grad_norm': 4.789742469787598, 'learning_rate': 4.911576767182646e-05, 'epoch': 0.33}
{'loss': 1.4785, 'grad_norm': 26.84292984008789, 'learning_rate': 4.905478354129259e-05, 'epoch': 0.41}
{'loss': 1.5623, 'grad_norm': 3.22577166557312, 'learning_rate': 4.897752559048373e-05, 'epoch': 0.49}
{'loss': 1.6182, 'grad_norm': 2.662576198577881, 'learning_rate': 4.8883736139178334e-05, 'epoch': 0.57}
{'loss': 1.6135, 'grad_norm': 2.9794094562530518, 'learning_rate': 4.877361665371831e-05, 'epoch': 0.66}
{'loss': 1.6136, 'grad_norm': 1.978592872619629, 'learning_rate': 4.864724118592782e-05, 'epoch': 0.74}
{'loss': 1.6108, 'grad_norm': 2.1126723289489746, 'learning_r

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.2122662026968247, 'eval_f1': 0.2045627686513225, 'eval_loss': 1.609375, 'eval_runtime': 82.643, 'eval_samples_per_second': 27.818, 'eval_steps_per_second': 6.958, 'epoch': 1.0}


KeyboardInterrupt: 

## 4.3 Optuna Hyperparameters Tuning 1

In [14]:
# Create a study object and optimize the objective
study = optuna.create_study(direction='maximize')
study.enqueue_trial({'model_name':"microsoft/deberta-v3-base", 'learning_rate': 9.891138752479374e-06, 'batch_size': 3, 'warmup_ratio': 0.5982282303832456, 'weight_decay': 0.17633588993115804, 'adam_beta1': 0.8747290421857349, 'adam_beta2': 0.9927786970263835, 'adam_epsilon': 9.90768817706196e-07, 'lr_scheduler_type': 'linear'})
study.optimize(objective, n_trials=20)


[I 2024-10-16 15:03:56,781] A new study created in memory with name: no-name-0ded4bce-c54e-4ab6-8334-5b865ddd797e


Current Trial 0 parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 9.891138752479374e-06, 'batch_size': 3, 'warmup_ratio': 0.5982282303832456, 'weight_decay': 0.17633588993115804, 'adam_beta1': 0.8747290421857349, 'adam_beta2': 0.9927786970263835, 'adam_epsilon': 9.90768817706196e-07, 'lr_scheduler_type': 'linear'}


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


  0%|          | 0/30485 [00:00<?, ?it/s]

{'loss': 1.6108, 'grad_norm': 1.8310006856918335, 'learning_rate': 2.711832744552112e-07, 'epoch': 0.08}
{'loss': 1.6049, 'grad_norm': 2.0155208110809326, 'learning_rate': 5.423665489104224e-07, 'epoch': 0.16}
{'loss': 1.5479, 'grad_norm': 6.674936771392822, 'learning_rate': 8.124650902678128e-07, 'epoch': 0.25}
{'loss': 1.4779, 'grad_norm': 11.677717208862305, 'learning_rate': 1.0825636316252032e-06, 'epoch': 0.33}
{'loss': 1.4543, 'grad_norm': 4.926329612731934, 'learning_rate': 1.3537469060804143e-06, 'epoch': 0.41}


wandb: Network error resolved after 0:02:16.440282, resuming normal operation.


{'loss': 1.4366, 'grad_norm': 13.124654769897461, 'learning_rate': 1.6249301805356256e-06, 'epoch': 0.49}
{'loss': 1.4318, 'grad_norm': 12.793344497680664, 'learning_rate': 1.896113454990837e-06, 'epoch': 0.57}
{'loss': 1.411, 'grad_norm': 15.144495964050293, 'learning_rate': 2.1667543628971375e-06, 'epoch': 0.66}
{'loss': 1.3965, 'grad_norm': 22.932546615600586, 'learning_rate': 2.437937637352349e-06, 'epoch': 0.74}
{'loss': 1.3741, 'grad_norm': 10.780708312988281, 'learning_rate': 2.7091209118075597e-06, 'epoch': 0.82}
{'loss': 1.3568, 'grad_norm': 19.797386169433594, 'learning_rate': 2.980304186262771e-06, 'epoch': 0.9}
{'loss': 1.3453, 'grad_norm': inf, 'learning_rate': 3.250945094169072e-06, 'epoch': 0.98}


  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.41365811222270554, 'eval_f1': 0.41452016409540837, 'eval_loss': 1.2694002389907837, 'eval_runtime': 85.7551, 'eval_samples_per_second': 26.809, 'eval_steps_per_second': 8.944, 'epoch': 1.0}
{'loss': 1.31, 'grad_norm': 9.396753311157227, 'learning_rate': 3.5221283686242833e-06, 'epoch': 1.07}
{'loss': 1.3161, 'grad_norm': 28.072525024414062, 'learning_rate': 3.7933116430794946e-06, 'epoch': 1.15}
{'loss': 1.3015, 'grad_norm': 12.580765724182129, 'learning_rate': 4.0644949175347055e-06, 'epoch': 1.23}
{'loss': 1.2701, 'grad_norm': 19.165576934814453, 'learning_rate': 4.335678191989917e-06, 'epoch': 1.31}
{'loss': 1.29, 'grad_norm': 48.18876266479492, 'learning_rate': 4.6057767333473074e-06, 'epoch': 1.39}
{'loss': 1.2402, 'grad_norm': 15.27053451538086, 'learning_rate': 4.876960007802518e-06, 'epoch': 1.48}
{'loss': 1.2358, 'grad_norm': 27.17981719970703, 'learning_rate': 5.148143282257729e-06, 'epoch': 1.56}
{'loss': 1.2313, 'grad_norm': 37.995967864990234, 'learning

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.512396694214876, 'eval_f1': 0.5141582818892122, 'eval_loss': 1.1319924592971802, 'eval_runtime': 85.532, 'eval_samples_per_second': 26.879, 'eval_steps_per_second': 8.967, 'epoch': 2.0}
{'loss': 1.0905, 'grad_norm': 37.540035247802734, 'learning_rate': 6.7741581958911764e-06, 'epoch': 2.05}
{'loss': 1.0627, 'grad_norm': 57.054805755615234, 'learning_rate': 7.045341470346387e-06, 'epoch': 2.13}
{'loss': 1.078, 'grad_norm': 88.00515747070312, 'learning_rate': 7.316524744801599e-06, 'epoch': 2.21}
{'loss': 1.0316, 'grad_norm': 37.43239974975586, 'learning_rate': 7.58770801925681e-06, 'epoch': 2.3}
{'loss': 1.0776, 'grad_norm': 36.924102783203125, 'learning_rate': 7.858348927163111e-06, 'epoch': 2.38}
{'loss': 1.0004, 'grad_norm': 58.424015045166016, 'learning_rate': 8.129532201618322e-06, 'epoch': 2.46}
{'loss': 1.0437, 'grad_norm': 14.681171417236328, 'learning_rate': 8.400715476073533e-06, 'epoch': 2.54}
{'loss': 0.9755, 'grad_norm': 12.852766036987305, 'learning_rat

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.5967812092214007, 'eval_f1': 0.59856429277332, 'eval_loss': 1.042001724243164, 'eval_runtime': 82.9414, 'eval_samples_per_second': 27.718, 'eval_steps_per_second': 9.247, 'epoch': 3.0}
{'loss': 0.9127, 'grad_norm': 10.677167892456055, 'learning_rate': 9.688438244080261e-06, 'epoch': 3.03}
{'loss': 0.7734, 'grad_norm': 21.07221794128418, 'learning_rate': 9.285459942521869e-06, 'epoch': 3.12}
{'loss': 0.8189, 'grad_norm': 71.03575897216797, 'learning_rate': 8.881674069216865e-06, 'epoch': 3.2}
{'loss': 0.8349, 'grad_norm': 178.13543701171875, 'learning_rate': 8.477888195911861e-06, 'epoch': 3.28}
{'loss': 0.7607, 'grad_norm': 44.89866638183594, 'learning_rate': 8.074102322606857e-06, 'epoch': 3.36}
{'loss': 0.7832, 'grad_norm': 0.015755489468574524, 'learning_rate': 7.670316449301853e-06, 'epoch': 3.44}
{'loss': 0.8392, 'grad_norm': 32.59613037109375, 'learning_rate': 7.267338147743459e-06, 'epoch': 3.53}
{'loss': 0.779, 'grad_norm': 106.7799301147461, 'learning_rate'

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6489778164419313, 'eval_f1': 0.6515209391720787, 'eval_loss': 1.2690231800079346, 'eval_runtime': 82.9168, 'eval_samples_per_second': 27.727, 'eval_steps_per_second': 9.25, 'epoch': 4.0}
{'loss': 0.6748, 'grad_norm': 16.509668350219727, 'learning_rate': 4.846238051406657e-06, 'epoch': 4.02}
{'loss': 0.5074, 'grad_norm': 190.19386291503906, 'learning_rate': 4.443259749848262e-06, 'epoch': 4.1}
{'loss': 0.5464, 'grad_norm': 106.08119201660156, 'learning_rate': 4.039473876543259e-06, 'epoch': 4.18}
{'loss': 0.5656, 'grad_norm': 64.0899429321289, 'learning_rate': 3.6356880032382542e-06, 'epoch': 4.26}
{'loss': 0.5538, 'grad_norm': 98.51527404785156, 'learning_rate': 3.2319021299332507e-06, 'epoch': 4.35}
{'loss': 0.4943, 'grad_norm': 273.8886413574219, 'learning_rate': 2.8281162566282466e-06, 'epoch': 4.43}
{'loss': 0.5168, 'grad_norm': 76.48027801513672, 'learning_rate': 2.424330383323243e-06, 'epoch': 4.51}
{'loss': 0.5182, 'grad_norm': 14.625099182128906, 'learning_r

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6755110917790343, 'eval_f1': 0.6790337949238184, 'eval_loss': 1.6976536512374878, 'eval_runtime': 82.9849, 'eval_samples_per_second': 27.704, 'eval_steps_per_second': 9.243, 'epoch': 5.0}
{'train_runtime': 11908.2565, 'train_samples_per_second': 7.68, 'train_steps_per_second': 2.56, 'train_loss': 1.010124164748352, 'epoch': 5.0}


  0%|          | 0/767 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-16 18:23:49,618] Trial 0 finished with value: 0.6755110917790343 and parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 9.891138752479374e-06, 'batch_size': 3, 'warmup_ratio': 0.5982282303832456, 'weight_decay': 0.17633588993115804, 'adam_beta1': 0.8747290421857349, 'adam_beta2': 0.9927786970263835, 'adam_epsilon': 9.90768817706196e-07, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.6755110917790343.


Current Trial 1 parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.3042227136021964e-06, 'batch_size': 2, 'warmup_ratio': 0.09918174110702827, 'weight_decay': 0.014766619671606473, 'adam_beta1': 0.925261304375699, 'adam_beta2': 0.9972694011663767, 'adam_epsilon': 6.348691828190262e-07, 'lr_scheduler_type': 'linear'}


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/45725 [00:00<?, ?it/s]

{'loss': 1.6082, 'grad_norm': 2.585113048553467, 'learning_rate': 1.437635266316354e-07, 'epoch': 0.05}
{'loss': 1.6092, 'grad_norm': 3.525179147720337, 'learning_rate': 2.875270532632708e-07, 'epoch': 0.11}
{'loss': 1.6047, 'grad_norm': 2.733987808227539, 'learning_rate': 4.312905798949062e-07, 'epoch': 0.16}
{'loss': 1.5669, 'grad_norm': 2.842104196548462, 'learning_rate': 5.747665794732784e-07, 'epoch': 0.22}
{'loss': 1.4959, 'grad_norm': 19.781494140625, 'learning_rate': 7.179550519983873e-07, 'epoch': 0.27}
{'loss': 1.4939, 'grad_norm': 34.2506217956543, 'learning_rate': 8.614310515767595e-07, 'epoch': 0.33}
{'loss': 1.4744, 'grad_norm': 14.1617431640625, 'learning_rate': 1.0051945782083948e-06, 'epoch': 0.38}
{'loss': 1.4594, 'grad_norm': 13.139301300048828, 'learning_rate': 1.14895810484003e-06, 'epoch': 0.44}
{'loss': 1.4255, 'grad_norm': 16.428150177001953, 'learning_rate': 1.2927216314716656e-06, 'epoch': 0.49}
{'loss': 1.4518, 'grad_norm': 23.827686309814453, 'learning_rate'

  0%|          | 0/1150 [00:00<?, ?it/s]

{'eval_accuracy': 0.39712918660287083, 'eval_f1': 0.39737049706515465, 'eval_loss': 1.3019558191299438, 'eval_runtime': 85.0011, 'eval_samples_per_second': 27.047, 'eval_steps_per_second': 13.529, 'epoch': 1.0}
{'loss': 1.3763, 'grad_norm': 16.46857452392578, 'learning_rate': 1.1472942188921408e-06, 'epoch': 1.04}
{'loss': 1.336, 'grad_norm': 32.43342208862305, 'learning_rate': 1.1314620463023449e-06, 'epoch': 1.09}
{'loss': 1.3594, 'grad_norm': 51.57057571411133, 'learning_rate': 1.1156298737125491e-06, 'epoch': 1.15}
{'loss': 1.3668, 'grad_norm': 35.937095642089844, 'learning_rate': 1.0997977011227534e-06, 'epoch': 1.2}
{'loss': 1.3233, 'grad_norm': 55.94281005859375, 'learning_rate': 1.0839971928781372e-06, 'epoch': 1.26}
{'loss': 1.3477, 'grad_norm': 29.807373046875, 'learning_rate': 1.0681650202883414e-06, 'epoch': 1.31}
{'loss': 1.3324, 'grad_norm': 39.78839874267578, 'learning_rate': 1.0523328476985456e-06, 'epoch': 1.37}
{'loss': 1.314, 'grad_norm': 75.51228332519531, 'learning

  0%|          | 0/1150 [00:00<?, ?it/s]

{'eval_accuracy': 0.4480208786428882, 'eval_f1': 0.44956409030811867, 'eval_loss': 1.2185862064361572, 'eval_runtime': 84.858, 'eval_samples_per_second': 27.092, 'eval_steps_per_second': 13.552, 'epoch': 2.0}
{'loss': 1.2556, 'grad_norm': 67.97032928466797, 'learning_rate': 8.624101053113556e-07, 'epoch': 2.02}
{'loss': 1.2354, 'grad_norm': 49.313636779785156, 'learning_rate': 8.466095970667394e-07, 'epoch': 2.08}
{'loss': 1.2561, 'grad_norm': 26.08816909790039, 'learning_rate': 8.307774244769435e-07, 'epoch': 2.13}
{'loss': 1.2543, 'grad_norm': 26.237464904785156, 'learning_rate': 8.149452518871479e-07, 'epoch': 2.19}
{'loss': 1.2442, 'grad_norm': 61.43034744262695, 'learning_rate': 7.99113079297352e-07, 'epoch': 2.24}
{'loss': 1.2517, 'grad_norm': 17.540170669555664, 'learning_rate': 7.833125710527358e-07, 'epoch': 2.3}
{'loss': 1.2646, 'grad_norm': 14.766806602478027, 'learning_rate': 7.6748039846294e-07, 'epoch': 2.35}
{'loss': 1.2502, 'grad_norm': 31.206735610961914, 'learning_rat

  0%|          | 0/1150 [00:00<?, ?it/s]

{'eval_accuracy': 0.4745541539799913, 'eval_f1': 0.47618466348176836, 'eval_loss': 1.1888489723205566, 'eval_runtime': 84.8967, 'eval_samples_per_second': 27.08, 'eval_steps_per_second': 13.546, 'epoch': 3.0}
{'loss': 1.237, 'grad_norm': 28.527843475341797, 'learning_rate': 5.7755765607575e-07, 'epoch': 3.01}
{'loss': 1.1674, 'grad_norm': 38.5905876159668, 'learning_rate': 5.617254834859541e-07, 'epoch': 3.06}
{'loss': 1.1991, 'grad_norm': 6.073561668395996, 'learning_rate': 5.458933108961583e-07, 'epoch': 3.12}
{'loss': 1.2135, 'grad_norm': 9.602526664733887, 'learning_rate': 5.301244669967217e-07, 'epoch': 3.17}
{'loss': 1.2226, 'grad_norm': 66.79460144042969, 'learning_rate': 5.14292294406926e-07, 'epoch': 3.23}
{'loss': 1.2166, 'grad_norm': 53.70254898071289, 'learning_rate': 4.984601218171302e-07, 'epoch': 3.28}
{'loss': 1.1783, 'grad_norm': 51.066951751708984, 'learning_rate': 4.826279492273344e-07, 'epoch': 3.34}
{'loss': 1.1833, 'grad_norm': 23.885244369506836, 'learning_rate':

  0%|          | 0/1150 [00:00<?, ?it/s]

{'eval_accuracy': 0.4936929099608525, 'eval_f1': 0.4954601263039024, 'eval_loss': 1.1860566139221191, 'eval_runtime': 84.9502, 'eval_samples_per_second': 27.063, 'eval_steps_per_second': 13.537, 'epoch': 4.0}
{'loss': 1.1134, 'grad_norm': 75.09806060791016, 'learning_rate': 2.769046985955281e-07, 'epoch': 4.05}
{'loss': 1.17, 'grad_norm': 117.3279800415039, 'learning_rate': 2.6107252600573235e-07, 'epoch': 4.1}
{'loss': 1.1759, 'grad_norm': 65.0226058959961, 'learning_rate': 2.4524035341593654e-07, 'epoch': 4.16}
{'loss': 1.152, 'grad_norm': 47.28899383544922, 'learning_rate': 2.294398451713204e-07, 'epoch': 4.21}
{'loss': 1.1272, 'grad_norm': 104.80470275878906, 'learning_rate': 2.136076725815246e-07, 'epoch': 4.26}
{'loss': 1.159, 'grad_norm': 38.04895782470703, 'learning_rate': 1.9777549999172885e-07, 'epoch': 4.32}
{'loss': 1.1884, 'grad_norm': 24.521554946899414, 'learning_rate': 1.8194332740193307e-07, 'epoch': 4.37}
{'loss': 1.1683, 'grad_norm': 28.663904190063477, 'learning_rat

  0%|          | 0/1150 [00:00<?, ?it/s]

{'eval_accuracy': 0.4989125706829056, 'eval_f1': 0.5003226595967735, 'eval_loss': 1.1893906593322754, 'eval_runtime': 83.5258, 'eval_samples_per_second': 27.524, 'eval_steps_per_second': 13.768, 'epoch': 5.0}
{'train_runtime': 12375.9476, 'train_samples_per_second': 7.389, 'train_steps_per_second': 3.695, 'train_loss': 1.275919856868721, 'epoch': 5.0}


  0%|          | 0/1150 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-16 21:51:31,490] Trial 1 finished with value: 0.4989125706829056 and parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.3042227136021964e-06, 'batch_size': 2, 'warmup_ratio': 0.09918174110702827, 'weight_decay': 0.014766619671606473, 'adam_beta1': 0.925261304375699, 'adam_beta2': 0.9972694011663767, 'adam_epsilon': 6.348691828190262e-07, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.6755110917790343.


Current Trial 2 parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 6.896556694878205e-07, 'batch_size': 4, 'warmup_ratio': 0.7952567679467826, 'weight_decay': 0.04803187586959426, 'adam_beta1': 0.9150496535084768, 'adam_beta2': 0.9965449660871344, 'adam_epsilon': 5.402942880312419e-07, 'lr_scheduler_type': 'cosine'}


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/22865 [00:00<?, ?it/s]

{'loss': 1.611, 'grad_norm': 1.9610719680786133, 'learning_rate': 1.896325532027663e-08, 'epoch': 0.11}
{'loss': 1.6129, 'grad_norm': 2.407881021499634, 'learning_rate': 3.792651064055326e-08, 'epoch': 0.22}
{'loss': 1.6111, 'grad_norm': 1.9186650514602661, 'learning_rate': 5.6889765960829894e-08, 'epoch': 0.33}
{'loss': 1.611, 'grad_norm': 1.6168291568756104, 'learning_rate': 7.585302128110653e-08, 'epoch': 0.44}
{'loss': 1.6091, 'grad_norm': 1.6504346132278442, 'learning_rate': 9.47783500907426e-08, 'epoch': 0.55}
{'loss': 1.6091, 'grad_norm': 1.8088207244873047, 'learning_rate': 1.1374160541101924e-07, 'epoch': 0.66}
{'loss': 1.607, 'grad_norm': 2.33923077583313, 'learning_rate': 1.3270486073129587e-07, 'epoch': 0.77}
{'loss': 1.5897, 'grad_norm': 1.8429104089736938, 'learning_rate': 1.516681160515725e-07, 'epoch': 0.87}
{'loss': 1.5725, 'grad_norm': 2.1944947242736816, 'learning_rate': 1.7059344486120857e-07, 'epoch': 0.98}


  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.31056981296215747, 'eval_f1': 0.31138836928944935, 'eval_loss': 1.546942114830017, 'eval_runtime': 93.8825, 'eval_samples_per_second': 24.488, 'eval_steps_per_second': 6.125, 'epoch': 1.0}
Stopping training: eval_accuracy below threshold of 0.35
{'train_runtime': 3231.6143, 'train_samples_per_second': 28.299, 'train_steps_per_second': 7.075, 'train_loss': 1.6029558046208725, 'epoch': 1.0}


  0%|          | 0/575 [00:00<?, ?it/s]

Stopping training: eval_accuracy below threshold of 0.35
Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-16 22:46:58,164] Trial 2 finished with value: 0.31056981296215747 and parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 6.896556694878205e-07, 'batch_size': 4, 'warmup_ratio': 0.7952567679467826, 'weight_decay': 0.04803187586959426, 'adam_beta1': 0.9150496535084768, 'adam_beta2': 0.9965449660871344, 'adam_epsilon': 5.402942880312419e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.6755110917790343.


Current Trial 3 parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.0149372419374312e-07, 'batch_size': 4, 'warmup_ratio': 0.7256166381179319, 'weight_decay': 0.21189248409312114, 'adam_beta1': 0.9391653610469487, 'adam_beta2': 0.9903297892881989, 'adam_epsilon': 1.392138634568796e-07, 'lr_scheduler_type': 'linear'}


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/22865 [00:00<?, ?it/s]

{'loss': 1.6108, 'grad_norm': 1.986233115196228, 'learning_rate': 3.0585138679406674e-09, 'epoch': 0.11}
{'loss': 1.61, 'grad_norm': 2.2650704383850098, 'learning_rate': 6.117027735881335e-09, 'epoch': 0.22}
{'loss': 1.611, 'grad_norm': 1.8638962507247925, 'learning_rate': 9.175541603822003e-09, 'epoch': 0.33}
{'loss': 1.6112, 'grad_norm': 1.607124924659729, 'learning_rate': 1.223405547176267e-08, 'epoch': 0.44}
{'loss': 1.6122, 'grad_norm': 1.6188691854476929, 'learning_rate': 1.5286452311967456e-08, 'epoch': 0.55}
{'loss': 1.6118, 'grad_norm': 1.650319218635559, 'learning_rate': 1.8344966179908126e-08, 'epoch': 0.66}
{'loss': 1.6099, 'grad_norm': 2.0125181674957275, 'learning_rate': 2.1403480047848793e-08, 'epoch': 0.77}
{'loss': 1.6109, 'grad_norm': 1.726709008216858, 'learning_rate': 2.4461993915789462e-08, 'epoch': 0.87}
{'loss': 1.6113, 'grad_norm': 2.0067360401153564, 'learning_rate': 2.7514390755994246e-08, 'epoch': 0.98}


  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.19138755980861244, 'eval_f1': 0.2157940352311797, 'eval_loss': 1.609500765800476, 'eval_runtime': 1380.2172, 'eval_samples_per_second': 1.666, 'eval_steps_per_second': 0.417, 'epoch': 1.0}
Stopping training: eval_accuracy below threshold of 0.35
{'train_runtime': 5685.4029, 'train_samples_per_second': 16.085, 'train_steps_per_second': 4.022, 'train_loss': 1.6109933325531653, 'epoch': 1.0}


  0%|          | 0/575 [00:00<?, ?it/s]

Stopping training: eval_accuracy below threshold of 0.35
Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-17 00:44:44,836] Trial 3 finished with value: 0.19138755980861244 and parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.0149372419374312e-07, 'batch_size': 4, 'warmup_ratio': 0.7256166381179319, 'weight_decay': 0.21189248409312114, 'adam_beta1': 0.9391653610469487, 'adam_beta2': 0.9903297892881989, 'adam_epsilon': 1.392138634568796e-07, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.6755110917790343.


Current Trial 4 parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 2.2101758678517027e-07, 'batch_size': 4, 'warmup_ratio': 0.01664485287339723, 'weight_decay': 0.05229210409552859, 'adam_beta1': 0.8293091118799443, 'adam_beta2': 0.991007938374325, 'adam_epsilon': 4.4675288273328307e-07, 'lr_scheduler_type': 'linear'}


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/22865 [00:00<?, ?it/s]

{'loss': 1.6108, 'grad_norm': 1.9842169284820557, 'learning_rate': 2.1984781749023008e-07, 'epoch': 0.11}
{'loss': 1.6098, 'grad_norm': 2.2542338371276855, 'learning_rate': 2.1493282045266626e-07, 'epoch': 0.22}
{'loss': 1.6106, 'grad_norm': 1.8546525239944458, 'learning_rate': 2.1001782341510243e-07, 'epoch': 0.33}
{'loss': 1.6105, 'grad_norm': 1.6134577989578247, 'learning_rate': 2.0510282637753858e-07, 'epoch': 0.44}
{'loss': 1.6085, 'grad_norm': 1.8253097534179688, 'learning_rate': 2.001976593340499e-07, 'epoch': 0.55}
{'loss': 1.5943, 'grad_norm': 1.73936128616333, 'learning_rate': 1.9528266229648605e-07, 'epoch': 0.66}
{'loss': 1.5684, 'grad_norm': 2.2078733444213867, 'learning_rate': 1.9037749525299734e-07, 'epoch': 0.77}
{'loss': 1.5338, 'grad_norm': 1.9535191059112549, 'learning_rate': 1.8546249821543352e-07, 'epoch': 0.87}
{'loss': 1.4973, 'grad_norm': 4.079860210418701, 'learning_rate': 1.8055733117194483e-07, 'epoch': 0.98}


  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.31361461505002175, 'eval_f1': 0.31443212285676564, 'eval_loss': 1.4577686786651611, 'eval_runtime': 120.0404, 'eval_samples_per_second': 19.152, 'eval_steps_per_second': 4.79, 'epoch': 1.0}
Stopping training: eval_accuracy below threshold of 0.35
{'train_runtime': 3808.4581, 'train_samples_per_second': 24.012, 'train_steps_per_second': 6.004, 'train_loss': 1.5810835167012902, 'epoch': 1.0}


  0%|          | 0/575 [00:00<?, ?it/s]

Stopping training: eval_accuracy below threshold of 0.35
Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-17 01:50:15,399] Trial 4 finished with value: 0.31361461505002175 and parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 2.2101758678517027e-07, 'batch_size': 4, 'warmup_ratio': 0.01664485287339723, 'weight_decay': 0.05229210409552859, 'adam_beta1': 0.8293091118799443, 'adam_beta2': 0.991007938374325, 'adam_epsilon': 4.4675288273328307e-07, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.6755110917790343.


Current Trial 5 parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/22865 [00:00<?, ?it/s]

{'loss': 1.6108, 'grad_norm': 1.9778715372085571, 'learning_rate': 5.766490247568446e-07, 'epoch': 0.11}
{'loss': 1.6044, 'grad_norm': 2.3704445362091064, 'learning_rate': 1.1532980495136891e-06, 'epoch': 0.22}
{'loss': 1.4953, 'grad_norm': 10.23387336730957, 'learning_rate': 1.7264871801219926e-06, 'epoch': 0.33}
{'loss': 1.4417, 'grad_norm': 11.970279693603516, 'learning_rate': 2.303136204878837e-06, 'epoch': 0.44}
{'loss': 1.4174, 'grad_norm': 13.763654708862305, 'learning_rate': 2.8797852296356817e-06, 'epoch': 0.55}
{'loss': 1.4072, 'grad_norm': 14.317955017089844, 'learning_rate': 3.4552809563430125e-06, 'epoch': 0.66}
{'loss': 1.3841, 'grad_norm': 16.474056243896484, 'learning_rate': 4.031929981099857e-06, 'epoch': 0.77}
{'loss': 1.3625, 'grad_norm': 13.519970893859863, 'learning_rate': 4.608579005856701e-06, 'epoch': 0.87}
{'loss': 1.3217, 'grad_norm': 19.089096069335938, 'learning_rate': 5.185228030613547e-06, 'epoch': 0.98}


  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.432361896476729, 'eval_f1': 0.43306610856151, 'eval_loss': 1.2588419914245605, 'eval_runtime': 97.0358, 'eval_samples_per_second': 23.692, 'eval_steps_per_second': 5.926, 'epoch': 1.0}
{'loss': 1.2863, 'grad_norm': 24.355783462524414, 'learning_rate': 5.7607237573208775e-06, 'epoch': 1.09}
{'loss': 1.2862, 'grad_norm': 13.579124450683594, 'learning_rate': 6.3373727820777214e-06, 'epoch': 1.2}
{'loss': 1.2524, 'grad_norm': 22.594913482666016, 'learning_rate': 6.914021806834566e-06, 'epoch': 1.31}
{'loss': 1.251, 'grad_norm': 38.666847229003906, 'learning_rate': 7.490670831591411e-06, 'epoch': 1.42}
{'loss': 1.1994, 'grad_norm': 17.02745819091797, 'learning_rate': 8.067319856348255e-06, 'epoch': 1.53}
{'loss': 1.1836, 'grad_norm': 52.12561798095703, 'learning_rate': 8.6439688811051e-06, 'epoch': 1.64}
{'loss': 1.1796, 'grad_norm': 22.33612632751465, 'learning_rate': 9.21946460781243e-06, 'epoch': 1.75}
{'loss': 1.1744, 'grad_norm': 22.792207717895508, 'learning_rate':

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.526750761200522, 'eval_f1': 0.529128899201623, 'eval_loss': 1.0664671659469604, 'eval_runtime': 96.9607, 'eval_samples_per_second': 23.711, 'eval_steps_per_second': 5.93, 'epoch': 2.0}
{'loss': 1.0124, 'grad_norm': 9.407689094543457, 'learning_rate': 1.0949411682082965e-05, 'epoch': 2.08}
{'loss': 0.9633, 'grad_norm': 24.681081771850586, 'learning_rate': 1.152606070683981e-05, 'epoch': 2.19}
{'loss': 0.9638, 'grad_norm': 30.624353408813477, 'learning_rate': 1.2100403135497626e-05, 'epoch': 2.3}
{'loss': 0.9796, 'grad_norm': 60.10416793823242, 'learning_rate': 1.2677052160254471e-05, 'epoch': 2.41}
{'loss': 0.9645, 'grad_norm': 17.69792366027832, 'learning_rate': 1.3253701185011316e-05, 'epoch': 2.51}
{'loss': 0.9201, 'grad_norm': 67.5637435913086, 'learning_rate': 1.383035020976816e-05, 'epoch': 2.62}
{'loss': 0.9832, 'grad_norm': 19.638622283935547, 'learning_rate': 1.4406999234525005e-05, 'epoch': 2.73}
{'loss': 0.957, 'grad_norm': 32.09751892089844, 'learning_rat

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.6207046541974771, 'eval_f1': 0.6225858799770051, 'eval_loss': 1.0125455856323242, 'eval_runtime': 96.9248, 'eval_samples_per_second': 23.719, 'eval_steps_per_second': 5.932, 'epoch': 3.0}
{'loss': 0.766, 'grad_norm': 77.90558624267578, 'learning_rate': 1.57691034101029e-05, 'epoch': 3.06}
{'loss': 0.7241, 'grad_norm': 15.309263229370117, 'learning_rate': 1.5522322889961592e-05, 'epoch': 3.17}
{'loss': 0.7278, 'grad_norm': 16.297252655029297, 'learning_rate': 1.5051732904565056e-05, 'epoch': 3.28}
{'loss': 0.683, 'grad_norm': 28.09387969970703, 'learning_rate': 1.4372963891239031e-05, 'epoch': 3.39}
{'loss': 0.725, 'grad_norm': 22.27782440185547, 'learning_rate': 1.3503217961023888e-05, 'epoch': 3.5}
{'loss': 0.6945, 'grad_norm': 81.18882751464844, 'learning_rate': 1.2469170719621684e-05, 'epoch': 3.61}
{'loss': 0.6719, 'grad_norm': 23.0159854888916, 'learning_rate': 1.1301162423864706e-05, 'epoch': 3.72}
{'loss': 0.6976, 'grad_norm': 50.78999710083008, 'learning_rat

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.6698564593301436, 'eval_f1': 0.6717159944115402, 'eval_loss': 1.168811559677124, 'eval_runtime': 96.7147, 'eval_samples_per_second': 23.771, 'eval_steps_per_second': 5.945, 'epoch': 4.0}
{'loss': 0.5326, 'grad_norm': 9.531551361083984, 'learning_rate': 7.352317793748273e-06, 'epoch': 4.05}
{'loss': 0.4342, 'grad_norm': 44.113094329833984, 'learning_rate': 6.01484139636786e-06, 'epoch': 4.15}
{'loss': 0.4232, 'grad_norm': 33.592132568359375, 'learning_rate': 4.732782148555083e-06, 'epoch': 4.26}
{'loss': 0.4256, 'grad_norm': 4.531754016876221, 'learning_rate': 3.5437572937930357e-06, 'epoch': 4.37}
{'loss': 0.4175, 'grad_norm': 3.8217132091522217, 'learning_rate': 2.484627454241376e-06, 'epoch': 4.48}
{'loss': 0.3758, 'grad_norm': 32.461368560791016, 'learning_rate': 1.5822342634285167e-06, 'epoch': 4.59}
{'loss': 0.373, 'grad_norm': 4.0994672775268555, 'learning_rate': 8.653165342848131e-07, 'epoch': 4.7}
{'loss': 0.3716, 'grad_norm': 14.042684555053711, 'learning_r

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.6824706394084384, 'eval_f1': 0.685762574950659, 'eval_loss': 1.8748891353607178, 'eval_runtime': 97.2477, 'eval_samples_per_second': 23.641, 'eval_steps_per_second': 5.913, 'epoch': 5.0}
{'train_runtime': 19770.9209, 'train_samples_per_second': 4.625, 'train_steps_per_second': 1.156, 'train_loss': 0.9428062133655594, 'epoch': 5.0}


  0%|          | 0/575 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-17 07:21:25,018] Trial 5 finished with value: 0.6824706394084384 and parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 5 with value: 0.6824706394084384.


Current Trial 6 parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.0052258285035737e-05, 'batch_size': 3, 'warmup_ratio': 0.04149176551014211, 'weight_decay': 0.006685281279171756, 'adam_beta1': 0.9429922176765829, 'adam_beta2': 0.9918592948813898, 'adam_epsilon': 8.867767549079712e-08, 'lr_scheduler_type': 'cosine_with_restarts'}


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/30485 [00:00<?, ?it/s]

{'loss': 1.5661, 'grad_norm': 11.711294174194336, 'learning_rate': 3.949385270879653e-06, 'epoch': 0.08}
{'loss': 1.4489, 'grad_norm': 10.554149627685547, 'learning_rate': 7.914663440233671e-06, 'epoch': 0.16}
{'loss': 1.4098, 'grad_norm': 11.79371166229248, 'learning_rate': 1.0050708238967809e-05, 'epoch': 0.25}
{'loss': 1.3533, 'grad_norm': 19.431964874267578, 'learning_rate': 1.0036743199974956e-05, 'epoch': 0.33}
{'loss': 1.3473, 'grad_norm': 24.68314552307129, 'learning_rate': 1.0008301590380624e-05, 'epoch': 0.41}
{'loss': 1.3161, 'grad_norm': 25.692615509033203, 'learning_rate': 9.965565545397013e-06, 'epoch': 0.49}
{'loss': 1.3102, 'grad_norm': 10.560772895812988, 'learning_rate': 9.908487291771306e-06, 'epoch': 0.57}
{'loss': 1.2576, 'grad_norm': 11.055680274963379, 'learning_rate': 9.83730302206875e-06, 'epoch': 0.66}
{'loss': 1.2612, 'grad_norm': 17.875919342041016, 'learning_rate': 9.752218400524156e-06, 'epoch': 0.74}
{'loss': 1.2479, 'grad_norm': 17.603532791137695, 'lear

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.5289256198347108, 'eval_f1': 0.5308621643875746, 'eval_loss': 1.0884441137313843, 'eval_runtime': 82.5838, 'eval_samples_per_second': 27.838, 'eval_steps_per_second': 9.288, 'epoch': 1.0}
{'loss': 1.0655, 'grad_norm': 31.870115280151367, 'learning_rate': 9.278955692616672e-06, 'epoch': 1.07}
{'loss': 1.0042, 'grad_norm': 46.548980712890625, 'learning_rate': 9.12888096412232e-06, 'epoch': 1.15}
{'loss': 1.0272, 'grad_norm': 28.26569175720215, 'learning_rate': 8.966952643288838e-06, 'epoch': 1.23}
{'loss': 0.9901, 'grad_norm': 21.341144561767578, 'learning_rate': 8.793638570333422e-06, 'epoch': 1.31}
{'loss': 0.9863, 'grad_norm': 66.85433197021484, 'learning_rate': 8.609439480971216e-06, 'epoch': 1.39}
{'loss': 0.9759, 'grad_norm': 31.788536071777344, 'learning_rate': 8.41488755969923e-06, 'epoch': 1.48}
{'loss': 0.9488, 'grad_norm': 55.67478561401367, 'learning_rate': 8.210544902219039e-06, 'epoch': 1.56}
{'loss': 0.9794, 'grad_norm': 58.71052551269531, 'learning_rat

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6133101348412353, 'eval_f1': 0.6170144726614226, 'eval_loss': 1.0821328163146973, 'eval_runtime': 82.5989, 'eval_samples_per_second': 27.833, 'eval_steps_per_second': 9.286, 'epoch': 2.0}
{'loss': 0.7402, 'grad_norm': 31.724708557128906, 'learning_rate': 6.814713924071282e-06, 'epoch': 2.05}
{'loss': 0.7246, 'grad_norm': 82.34474182128906, 'learning_rate': 6.5602605664758775e-06, 'epoch': 2.13}
{'loss': 0.7114, 'grad_norm': 241.4402313232422, 'learning_rate': 6.300869824375521e-06, 'epoch': 2.21}
{'loss': 0.6798, 'grad_norm': 59.85123062133789, 'learning_rate': 6.037796125801591e-06, 'epoch': 2.3}
{'loss': 0.7688, 'grad_norm': 161.48974609375, 'learning_rate': 5.771799538268193e-06, 'epoch': 2.38}
{'loss': 0.6825, 'grad_norm': 29.87432861328125, 'learning_rate': 5.503648574044489e-06, 'epoch': 2.46}
{'loss': 0.7242, 'grad_norm': 13.752564430236816, 'learning_rate': 5.234117969783694e-06, 'epoch': 2.54}
{'loss': 0.6424, 'grad_norm': 11.418335914611816, 'learning_rate

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6528925619834711, 'eval_f1': 0.6548668627985723, 'eval_loss': 1.3163256645202637, 'eval_runtime': 82.5619, 'eval_samples_per_second': 27.846, 'eval_steps_per_second': 9.29, 'epoch': 3.0}
{'loss': 0.5888, 'grad_norm': 19.721939086914062, 'learning_rate': 3.632084900798461e-06, 'epoch': 3.03}
{'loss': 0.4963, 'grad_norm': 3.8470921516418457, 'learning_rate': 3.375142285260742e-06, 'epoch': 3.12}
{'loss': 0.5199, 'grad_norm': 138.81040954589844, 'learning_rate': 3.1224504777332095e-06, 'epoch': 3.2}
{'loss': 0.5351, 'grad_norm': 79.50592041015625, 'learning_rate': 2.8752587423080687e-06, 'epoch': 3.28}
{'loss': 0.5114, 'grad_norm': 42.25868225097656, 'learning_rate': 2.634281260634312e-06, 'epoch': 3.36}
{'loss': 0.4829, 'grad_norm': 0.0002444460988044739, 'learning_rate': 2.400214260257463e-06, 'epoch': 3.44}
{'loss': 0.5622, 'grad_norm': 1.629701018333435, 'learning_rate': 2.1746239364265767e-06, 'epoch': 3.53}
{'loss': 0.4635, 'grad_norm': 103.116943359375, 'learnin

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6702914310569813, 'eval_f1': 0.6729194953170986, 'eval_loss': 1.876667857170105, 'eval_runtime': 82.6202, 'eval_samples_per_second': 27.826, 'eval_steps_per_second': 9.283, 'epoch': 4.0}
{'loss': 0.508, 'grad_norm': 17.28328514099121, 'learning_rate': 1.0100251870693411e-06, 'epoch': 4.02}
{'loss': 0.367, 'grad_norm': 149.6023712158203, 'learning_rate': 8.534461084606884e-07, 'epoch': 4.1}
{'loss': 0.3991, 'grad_norm': 183.7718963623047, 'learning_rate': 7.089226661168624e-07, 'epoch': 4.18}
{'loss': 0.4, 'grad_norm': 0.056669510900974274, 'learning_rate': 5.76872414411971e-07, 'epoch': 4.26}
{'loss': 0.4622, 'grad_norm': 64.31035614013672, 'learning_rate': 4.5767687041554545e-07, 'epoch': 4.35}
{'loss': 0.3811, 'grad_norm': 53.15037155151367, 'learning_rate': 3.516804116206939e-07, 'epoch': 4.43}
{'loss': 0.3977, 'grad_norm': 18.294050216674805, 'learning_rate': 2.5953197557920225e-07, 'epoch': 4.51}
{'loss': 0.4058, 'grad_norm': 22.009178161621094, 'learning_rate'

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6733362331448456, 'eval_f1': 0.6759954248736463, 'eval_loss': 2.0366921424865723, 'eval_runtime': 82.0636, 'eval_samples_per_second': 28.015, 'eval_steps_per_second': 9.346, 'epoch': 5.0}
{'train_runtime': 11591.6283, 'train_samples_per_second': 7.889, 'train_steps_per_second': 2.63, 'train_loss': 0.7826298665351236, 'epoch': 5.0}


  0%|          | 0/767 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-17 10:36:00,662] Trial 6 finished with value: 0.6733362331448456 and parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.0052258285035737e-05, 'batch_size': 3, 'warmup_ratio': 0.04149176551014211, 'weight_decay': 0.006685281279171756, 'adam_beta1': 0.9429922176765829, 'adam_beta2': 0.9918592948813898, 'adam_epsilon': 8.867767549079712e-08, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 5 with value: 0.6824706394084384.


Current Trial 7 parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 2.7102291331781023e-06, 'batch_size': 4, 'warmup_ratio': 0.0871495018092514, 'weight_decay': 0.08376945393642041, 'adam_beta1': 0.8976771551751836, 'adam_beta2': 0.9922976863312725, 'adam_epsilon': 6.461136389064594e-07, 'lr_scheduler_type': 'cosine'}


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/22865 [00:00<?, ?it/s]

{'loss': 1.6107, 'grad_norm': 1.969455361366272, 'learning_rate': 6.799370630150783e-07, 'epoch': 0.11}
{'loss': 1.5503, 'grad_norm': 4.61126708984375, 'learning_rate': 1.3585142519041265e-06, 'epoch': 0.22}
{'loss': 1.4651, 'grad_norm': 18.674701690673828, 'learning_rate': 2.0343716925411145e-06, 'epoch': 0.33}
{'loss': 1.43, 'grad_norm': 10.058382034301758, 'learning_rate': 2.7102289950251617e-06, 'epoch': 0.44}
{'loss': 1.415, 'grad_norm': 12.71101188659668, 'learning_rate': 2.706347217095712e-06, 'epoch': 0.55}
{'loss': 1.4003, 'grad_norm': 11.993844985961914, 'learning_rate': 2.6948158737756076e-06, 'epoch': 0.66}
{'loss': 1.3736, 'grad_norm': 21.316970825195312, 'learning_rate': 2.675700246024967e-06, 'epoch': 0.77}
{'loss': 1.3569, 'grad_norm': 12.652982711791992, 'learning_rate': 2.6491691021865293e-06, 'epoch': 0.87}
{'loss': 1.3312, 'grad_norm': 15.776825904846191, 'learning_rate': 2.6152663527710614e-06, 'epoch': 0.98}


  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.4197477163984341, 'eval_f1': 0.418994982174714, 'eval_loss': 1.2638765573501587, 'eval_runtime': 122.7548, 'eval_samples_per_second': 18.728, 'eval_steps_per_second': 4.684, 'epoch': 1.0}
{'loss': 1.299, 'grad_norm': 33.74323654174805, 'learning_rate': 2.5742296627370517e-06, 'epoch': 1.09}
{'loss': 1.3051, 'grad_norm': 13.195956230163574, 'learning_rate': 2.526291347995245e-06, 'epoch': 1.2}
{'loss': 1.2894, 'grad_norm': 21.00675392150879, 'learning_rate': 2.4718383505988317e-06, 'epoch': 1.31}
{'loss': 1.2724, 'grad_norm': 37.94491958618164, 'learning_rate': 2.4109607932708704e-06, 'epoch': 1.42}
{'loss': 1.2248, 'grad_norm': 24.363128662109375, 'learning_rate': 2.344105904984801e-06, 'epoch': 1.53}
{'loss': 1.2372, 'grad_norm': 46.987159729003906, 'learning_rate': 2.271652163006873e-06, 'epoch': 1.64}
{'loss': 1.2278, 'grad_norm': 36.3315315246582, 'learning_rate': 2.194169916339691e-06, 'epoch': 1.75}
{'loss': 1.2346, 'grad_norm': 21.15753746032715, 'learning_ra

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.4810787298825576, 'eval_f1': 0.480769738520469, 'eval_loss': 1.1494512557983398, 'eval_runtime': 108.5904, 'eval_samples_per_second': 21.171, 'eval_steps_per_second': 5.295, 'epoch': 2.0}
{'loss': 1.1635, 'grad_norm': 12.913232803344727, 'learning_rate': 1.9346620333868667e-06, 'epoch': 2.08}
{'loss': 1.1512, 'grad_norm': 24.09821128845215, 'learning_rate': 1.841112320828192e-06, 'epoch': 2.19}
{'loss': 1.1273, 'grad_norm': 42.238067626953125, 'learning_rate': 1.7446269228696428e-06, 'epoch': 2.3}
{'loss': 1.1456, 'grad_norm': 35.595550537109375, 'learning_rate': 1.6459364270408495e-06, 'epoch': 2.41}
{'loss': 1.1086, 'grad_norm': 27.671863555908203, 'learning_rate': 1.5455995375745695e-06, 'epoch': 2.51}
{'loss': 1.0868, 'grad_norm': 28.301483154296875, 'learning_rate': 1.4441842792271468e-06, 'epoch': 2.62}
{'loss': 1.1415, 'grad_norm': 31.132102966308594, 'learning_rate': 1.342468740463127e-06, 'epoch': 2.73}
{'loss': 1.1231, 'grad_norm': 64.490234375, 'learning_

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.5176163549369291, 'eval_f1': 0.518378985518002, 'eval_loss': 1.1023606061935425, 'eval_runtime': 130.0531, 'eval_samples_per_second': 17.677, 'eval_steps_per_second': 4.421, 'epoch': 3.0}


[W 2024-10-17 14:46:12,377] Trial 7 failed with parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 2.7102291331781023e-06, 'batch_size': 4, 'warmup_ratio': 0.0871495018092514, 'weight_decay': 0.08376945393642041, 'adam_beta1': 0.8976771551751836, 'adam_beta2': 0.9922976863312725, 'adam_epsilon': 6.461136389064594e-07, 'lr_scheduler_type': 'cosine'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\OEM\AppData\Local\Temp\ipykernel_27760\1569860036.py", line 53, in objective
    trainer.train()
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\transformers\trainer.py", line 1938, in train
    return inner_training_loop(
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\transformers\trainer.py", line 2279, in _inner_training_l

KeyboardInterrupt: 

## 4.3 Optuna Hyperparameters Tuning 2

In [14]:
# Create a study object and optimize the objective
study = optuna.create_study(direction='maximize')
#study.enqueue_trial({'model_name': './squad-trained-model', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 3, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'})
study.enqueue_trial({'model_name': './MCQA-Combined/Optuna/trial_5/checkpoint-22865', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'})
study.enqueue_trial({'model_name': './MCQA-Combined/Optuna/trial_0/checkpoint-30485', 'learning_rate': 9.891138752479374e-06, 'batch_size': 3, 'warmup_ratio': 0.5982282303832456, 'weight_decay': 0.17633588993115804, 'adam_beta1': 0.8747290421857349, 'adam_beta2': 0.9927786970263835, 'adam_epsilon': 9.90768817706196e-07, 'lr_scheduler_type': 'linear'})
study.enqueue_trial({'model_name': './MCQA-Combined/Optuna/trial_6/checkpoint-30485', 'learning_rate': 1.0052258285035737e-05, 'batch_size': 3, 'warmup_ratio': 0.04149176551014211, 'weight_decay': 0.006685281279171756, 'adam_beta1': 0.9429922176765829, 'adam_beta2': 0.9918592948813898, 'adam_epsilon': 8.867767549079712e-08, 'lr_scheduler_type': 'cosine_with_restarts'})
study.enqueue_trial({'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'})

study.optimize(objective, n_trials=20)


[I 2024-10-18 01:48:33,486] A new study created in memory with name: no-name-48d19931-bfa9-4c3d-b317-7013bfdd5859


Current Trial 0 parameters: {'model_name': './MCQA-Combined/Optuna/trial_5/checkpoint-22865', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/137190 [00:00<?, ?it/s]

{'loss': 0.2705, 'grad_norm': 99.15497589111328, 'learning_rate': 9.514940314205971e-08, 'epoch': 0.11}
{'loss': 0.2782, 'grad_norm': 0.18223179876804352, 'learning_rate': 1.9125991136636245e-07, 'epoch': 0.22}
{'loss': 0.3224, 'grad_norm': 1.258844256401062, 'learning_rate': 2.873704195906652e-07, 'epoch': 0.33}
{'loss': 0.302, 'grad_norm': 67.60765838623047, 'learning_rate': 3.8348092781496795e-07, 'epoch': 0.44}
{'loss': 0.2937, 'grad_norm': 0.0005371381412260234, 'learning_rate': 4.793992150228221e-07, 'epoch': 0.55}
{'loss': 0.3094, 'grad_norm': 0.0005397977656684816, 'learning_rate': 5.755097232471248e-07, 'epoch': 0.66}
{'loss': 0.3291, 'grad_norm': 14.216509819030762, 'learning_rate': 6.716202314714275e-07, 'epoch': 0.77}
{'loss': 0.3382, 'grad_norm': 1.9903104305267334, 'learning_rate': 7.677307396957303e-07, 'epoch': 0.87}
{'loss': 0.3188, 'grad_norm': 0.03658886253833771, 'learning_rate': 8.636490269035844e-07, 'epoch': 0.98}


  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.683340582862114, 'eval_f1': 0.6857499996570497, 'eval_loss': 1.9124321937561035, 'eval_runtime': 94.5023, 'eval_samples_per_second': 24.327, 'eval_steps_per_second': 6.085, 'epoch': 1.0}
{'loss': 0.3088, 'grad_norm': 16.608346939086914, 'learning_rate': 9.597595351278871e-07, 'epoch': 1.09}
{'loss': 0.3221, 'grad_norm': 22.766817092895508, 'learning_rate': 1.05587004335219e-06, 'epoch': 1.2}
{'loss': 0.3189, 'grad_norm': 10.460604667663574, 'learning_rate': 1.1519805515764925e-06, 'epoch': 1.31}
{'loss': 0.3254, 'grad_norm': 1.3408642189460807e-05, 'learning_rate': 1.2480910598007954e-06, 'epoch': 1.42}
{'loss': 0.3043, 'grad_norm': 1.1166165769793679e-09, 'learning_rate': 1.3442015680250982e-06, 'epoch': 1.53}
{'loss': 0.2966, 'grad_norm': 50.4769401550293, 'learning_rate': 1.4403120762494008e-06, 'epoch': 1.64}
{'loss': 0.2969, 'grad_norm': 10.994010925292969, 'learning_rate': 1.5364225844737036e-06, 'epoch': 1.75}
{'loss': 0.2914, 'grad_norm': 0.00241814344190061

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.6963897346672466, 'eval_f1': 0.6987776690853735, 'eval_loss': 2.284477472305298, 'eval_runtime': 94.5198, 'eval_samples_per_second': 24.323, 'eval_steps_per_second': 6.083, 'epoch': 2.0}
{'loss': 0.2865, 'grad_norm': 289.2450256347656, 'learning_rate': 1.8245618881301632e-06, 'epoch': 2.08}
{'loss': 0.253, 'grad_norm': 0.013860290870070457, 'learning_rate': 1.9206723963544663e-06, 'epoch': 2.19}
{'loss': 0.2606, 'grad_norm': 19.288745880126953, 'learning_rate': 2.016782904578769e-06, 'epoch': 2.3}
{'loss': 0.2748, 'grad_norm': 26.506500244140625, 'learning_rate': 2.1128934128030715e-06, 'epoch': 2.41}
{'loss': 0.267, 'grad_norm': 76.48756408691406, 'learning_rate': 2.209003921027374e-06, 'epoch': 2.51}
{'loss': 0.2425, 'grad_norm': 35.718902587890625, 'learning_rate': 2.305114429251677e-06, 'epoch': 2.62}
{'loss': 0.2879, 'grad_norm': 5.830728054046631, 'learning_rate': 2.4012249374759797e-06, 'epoch': 2.73}
{'loss': 0.321, 'grad_norm': 0.0029808077961206436, 'learn

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.6911700739451936, 'eval_f1': 0.6930771667119316, 'eval_loss': 2.2938365936279297, 'eval_runtime': 94.5937, 'eval_samples_per_second': 24.304, 'eval_steps_per_second': 6.079, 'epoch': 3.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 10293.2707, 'train_samples_per_second': 53.307, 'train_steps_per_second': 13.328, 'train_loss': 0.29866341761123233, 'epoch': 3.0}


  0%|          | 0/575 [00:00<?, ?it/s]

[I 2024-10-18 04:41:41,547] Trial 0 finished with value: 0.6963897346672466 and parameters: {'model_name': './MCQA-Combined/Optuna/trial_5/checkpoint-22865', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.6963897346672466.


Stopping training: No improvement in eval_accuracy for 1 epochs
Current Trial 1 parameters: {'model_name': './MCQA-Combined/Optuna/trial_0/checkpoint-30485', 'learning_rate': 9.891138752479374e-06, 'batch_size': 3, 'warmup_ratio': 0.5982282303832456, 'weight_decay': 0.17633588993115804, 'adam_beta1': 0.8747290421857349, 'adam_beta2': 0.9927786970263835, 'adam_epsilon': 9.90768817706196e-07, 'lr_scheduler_type': 'linear'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/182910 [00:00<?, ?it/s]

{'loss': 0.375, 'grad_norm': 2.2848993808111118e-07, 'learning_rate': 4.465484586029145e-08, 'epoch': 0.08}
{'loss': 0.3694, 'grad_norm': 264.77593994140625, 'learning_rate': 8.985205826949333e-08, 'epoch': 0.16}
{'loss': 0.3431, 'grad_norm': 126.29911041259766, 'learning_rate': 1.350492706786952e-07, 'epoch': 0.25}
{'loss': 0.3766, 'grad_norm': 0.0011504247086122632, 'learning_rate': 1.8024648308789708e-07, 'epoch': 0.33}
{'loss': 0.45, 'grad_norm': 2.490936040878296, 'learning_rate': 2.2544369549709894e-07, 'epoch': 0.41}
{'loss': 0.3445, 'grad_norm': 0.02701166644692421, 'learning_rate': 2.706409079063008e-07, 'epoch': 0.49}
{'loss': 0.3742, 'grad_norm': 22.46174430847168, 'learning_rate': 3.158381203155027e-07, 'epoch': 0.57}
{'loss': 0.4265, 'grad_norm': 3.9365717384498566e-05, 'learning_rate': 3.610353327247045e-07, 'epoch': 0.66}
{'loss': 0.4518, 'grad_norm': 23.836828231811523, 'learning_rate': 4.060517562842696e-07, 'epoch': 0.74}
{'loss': 0.3876, 'grad_norm': 0.36213865876197

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6746411483253588, 'eval_f1': 0.6783092901670232, 'eval_loss': 1.972519040107727, 'eval_runtime': 82.756, 'eval_samples_per_second': 27.78, 'eval_steps_per_second': 9.268, 'epoch': 1.0}
{'loss': 0.4086, 'grad_norm': 168.22952270507812, 'learning_rate': 5.867502114962587e-07, 'epoch': 1.07}
{'loss': 0.3838, 'grad_norm': 0.47670459747314453, 'learning_rate': 6.319474239054606e-07, 'epoch': 1.15}
{'loss': 0.4116, 'grad_norm': 183.18838500976562, 'learning_rate': 6.771446363146624e-07, 'epoch': 1.23}
{'loss': 0.4401, 'grad_norm': 35.64690017700195, 'learning_rate': 7.223418487238643e-07, 'epoch': 1.31}
{'loss': 0.3961, 'grad_norm': 0.5133939385414124, 'learning_rate': 7.675390611330661e-07, 'epoch': 1.39}
{'loss': 0.3798, 'grad_norm': 5.298861651681364e-06, 'learning_rate': 8.126458791174496e-07, 'epoch': 1.48}
{'loss': 0.4052, 'grad_norm': 38.95729446411133, 'learning_rate': 8.578430915266515e-07, 'epoch': 1.56}
{'loss': 0.3824, 'grad_norm': 23.23221778869629, 'learning

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6755110917790343, 'eval_f1': 0.6781748576728076, 'eval_loss': 2.0128607749938965, 'eval_runtime': 82.5231, 'eval_samples_per_second': 27.859, 'eval_steps_per_second': 9.294, 'epoch': 2.0}
{'loss': 0.4027, 'grad_norm': 52.395896911621094, 'learning_rate': 1.1289359715570443e-06, 'epoch': 2.05}
{'loss': 0.392, 'grad_norm': 2.4953598976135254, 'learning_rate': 1.1741331839662462e-06, 'epoch': 2.13}
{'loss': 0.3836, 'grad_norm': 24.53590202331543, 'learning_rate': 1.219330396375448e-06, 'epoch': 2.21}
{'loss': 0.3746, 'grad_norm': 35.88473892211914, 'learning_rate': 1.26452760878465e-06, 'epoch': 2.3}
{'loss': 0.375, 'grad_norm': 7.686323165893555, 'learning_rate': 1.3097248211938518e-06, 'epoch': 2.38}
{'loss': 0.3613, 'grad_norm': 0.017652036622166634, 'learning_rate': 1.3548316391782352e-06, 'epoch': 2.46}
{'loss': 0.3839, 'grad_norm': 0.1621805727481842, 'learning_rate': 1.4000288515874371e-06, 'epoch': 2.54}
{'loss': 0.3689, 'grad_norm': 16.461877822875977, 'learni

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6859504132231405, 'eval_f1': 0.6884094783241329, 'eval_loss': 2.1828103065490723, 'eval_runtime': 82.6416, 'eval_samples_per_second': 27.819, 'eval_steps_per_second': 9.281, 'epoch': 3.0}
{'loss': 0.3601, 'grad_norm': 0.02080305479466915, 'learning_rate': 1.6711217316178298e-06, 'epoch': 3.03}
{'loss': 0.329, 'grad_norm': 0.0005194161203689873, 'learning_rate': 1.7162285496022134e-06, 'epoch': 3.12}
{'loss': 0.3386, 'grad_norm': 138.39730834960938, 'learning_rate': 1.7614257620114153e-06, 'epoch': 3.2}
{'loss': 0.3487, 'grad_norm': 15.076576232910156, 'learning_rate': 1.8066229744206172e-06, 'epoch': 3.28}
{'loss': 0.3628, 'grad_norm': 0.3702249526977539, 'learning_rate': 1.851820186829819e-06, 'epoch': 3.36}
{'loss': 0.2977, 'grad_norm': 26.885936737060547, 'learning_rate': 1.8969270048142025e-06, 'epoch': 3.44}
{'loss': 0.3142, 'grad_norm': 30.011932373046875, 'learning_rate': 1.942124217223404e-06, 'epoch': 3.53}
{'loss': 0.3755, 'grad_norm': 46.05055618286133, '

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6911700739451936, 'eval_f1': 0.6935897092331728, 'eval_loss': 2.3350729942321777, 'eval_runtime': 82.6281, 'eval_samples_per_second': 27.823, 'eval_steps_per_second': 9.283, 'epoch': 4.0}
{'loss': 0.3271, 'grad_norm': 40.06705856323242, 'learning_rate': 2.213126702828979e-06, 'epoch': 4.02}
{'loss': 0.3236, 'grad_norm': 30.957353591918945, 'learning_rate': 2.2583239152381807e-06, 'epoch': 4.1}
{'loss': 0.321, 'grad_norm': 5.337359887391813e-09, 'learning_rate': 2.3035211276473828e-06, 'epoch': 4.18}
{'loss': 0.329, 'grad_norm': 27.053117752075195, 'learning_rate': 2.3487183400565844e-06, 'epoch': 4.26}
{'loss': 0.3242, 'grad_norm': 0.3099638521671295, 'learning_rate': 2.3939155524657865e-06, 'epoch': 4.35}
{'loss': 0.2886, 'grad_norm': 0.00022847039508633316, 'learning_rate': 2.439112764874988e-06, 'epoch': 4.43}
{'loss': 0.324, 'grad_norm': 1.5601292848587036, 'learning_rate': 2.4843099772841903e-06, 'epoch': 4.51}
{'loss': 0.3594, 'grad_norm': 0.000706507067661732

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6911700739451936, 'eval_f1': 0.6939035399771951, 'eval_loss': 2.580587387084961, 'eval_runtime': 82.6908, 'eval_samples_per_second': 27.802, 'eval_steps_per_second': 9.276, 'epoch': 5.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 11595.5841, 'train_samples_per_second': 47.32, 'train_steps_per_second': 15.774, 'train_loss': 0.3771073816720826, 'epoch': 5.0}


  0%|          | 0/767 [00:00<?, ?it/s]

[I 2024-10-18 07:56:21,309] Trial 1 finished with value: 0.6911700739451936 and parameters: {'model_name': './MCQA-Combined/Optuna/trial_0/checkpoint-30485', 'learning_rate': 9.891138752479374e-06, 'batch_size': 3, 'warmup_ratio': 0.5982282303832456, 'weight_decay': 0.17633588993115804, 'adam_beta1': 0.8747290421857349, 'adam_beta2': 0.9927786970263835, 'adam_epsilon': 9.90768817706196e-07, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.6963897346672466.


Stopping training: No improvement in eval_accuracy for 1 epochs
Current Trial 2 parameters: {'model_name': './MCQA-Combined/Optuna/trial_6/checkpoint-30485', 'learning_rate': 1.0052258285035737e-05, 'batch_size': 3, 'warmup_ratio': 0.04149176551014211, 'weight_decay': 0.006685281279171756, 'adam_beta1': 0.9429922176765829, 'adam_beta2': 0.9918592948813898, 'adam_epsilon': 8.867767549079712e-08, 'lr_scheduler_type': 'cosine_with_restarts'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/182910 [00:00<?, ?it/s]

{'loss': 0.445, 'grad_norm': 0.0, 'learning_rate': 6.542576538613509e-07, 'epoch': 0.08}
{'loss': 0.3755, 'grad_norm': 7.25158166885376, 'learning_rate': 1.3164617569598844e-06, 'epoch': 0.16}
{'loss': 0.3561, 'grad_norm': 125.56159210205078, 'learning_rate': 1.9786658600584178e-06, 'epoch': 0.25}
{'loss': 0.4039, 'grad_norm': 43.95442199707031, 'learning_rate': 2.6408699631569514e-06, 'epoch': 0.33}
{'loss': 0.4612, 'grad_norm': 2.5171901143039577e-05, 'learning_rate': 3.3017496580492876e-06, 'epoch': 0.41}
{'loss': 0.374, 'grad_norm': 0.1422954499721527, 'learning_rate': 3.963953761147821e-06, 'epoch': 0.49}
{'loss': 0.4267, 'grad_norm': 36.26209259033203, 'learning_rate': 4.626157864246355e-06, 'epoch': 0.57}
{'loss': 0.4454, 'grad_norm': 0.000985930673778057, 'learning_rate': 5.288361967344887e-06, 'epoch': 0.66}
{'loss': 0.5264, 'grad_norm': 31.491241455078125, 'learning_rate': 5.95056607044342e-06, 'epoch': 0.74}
{'loss': 0.466, 'grad_norm': 19.953073501586914, 'learning_rate': 6

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6681165724227925, 'eval_f1': 0.6712466084338798, 'eval_loss': 2.0135676860809326, 'eval_runtime': 84.2165, 'eval_samples_per_second': 27.299, 'eval_steps_per_second': 9.107, 'epoch': 1.0}
{'loss': 0.456, 'grad_norm': 5.765288352966309, 'learning_rate': 8.598058074631356e-06, 'epoch': 1.07}
{'loss': 0.4585, 'grad_norm': 0.012237763032317162, 'learning_rate': 9.260262177729892e-06, 'epoch': 1.15}
{'loss': 0.5119, 'grad_norm': 150.01853942871094, 'learning_rate': 9.922466280828424e-06, 'epoch': 1.23}
{'loss': 0.5092, 'grad_norm': 19.789443969726562, 'learning_rate': 1.0052127881100565e-05, 'epoch': 1.31}
{'loss': 0.578, 'grad_norm': 0.06391139328479767, 'learning_rate': 1.0051603225735257e-05, 'epoch': 1.39}
{'loss': 0.4865, 'grad_norm': 0.017514964565634727, 'learning_rate': 1.0050674508406943e-05, 'epoch': 1.48}
{'loss': 0.5037, 'grad_norm': 38.18337631225586, 'learning_rate': 1.0049342451633382e-05, 'epoch': 1.56}
{'loss': 0.5309, 'grad_norm': 110.04790496826172, 'l

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6733362331448456, 'eval_f1': 0.6743090396682621, 'eval_loss': 2.1987509727478027, 'eval_runtime': 84.2601, 'eval_samples_per_second': 27.285, 'eval_steps_per_second': 9.103, 'epoch': 2.0}
{'loss': 0.5713, 'grad_norm': 42.225181579589844, 'learning_rate': 1.00328961344583e-05, 'epoch': 2.05}
{'loss': 0.4159, 'grad_norm': 0.39606156945228577, 'learning_rate': 1.0028746310826174e-05, 'epoch': 2.13}
{'loss': 0.4835, 'grad_norm': 0.4710776209831238, 'learning_rate': 1.0024194908008965e-05, 'epoch': 2.21}
{'loss': 0.4383, 'grad_norm': 54.79790115356445, 'learning_rate': 1.001924229136516e-05, 'epoch': 2.3}
{'loss': 0.422, 'grad_norm': 87.28656768798828, 'learning_rate': 1.0013899965060368e-05, 'epoch': 2.38}
{'loss': 0.4584, 'grad_norm': 1.9011500626220368e-05, 'learning_rate': 1.0008146945957569e-05, 'epoch': 2.46}
{'loss': 0.3971, 'grad_norm': 1.7128010988235474, 'learning_rate': 1.0001994001259376e-05, 'epoch': 2.54}
{'loss': 0.4821, 'grad_norm': 6.297253131866455, 'le

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6859504132231405, 'eval_f1': 0.689047034019059, 'eval_loss': 2.972546100616455, 'eval_runtime': 96.1115, 'eval_samples_per_second': 23.92, 'eval_steps_per_second': 7.98, 'epoch': 3.0}
{'loss': 0.4082, 'grad_norm': 0.004036388825625181, 'learning_rate': 9.956742750059558e-06, 'epoch': 3.03}
{'loss': 0.3907, 'grad_norm': 0.017696648836135864, 'learning_rate': 9.947807530220855e-06, 'epoch': 3.12}
{'loss': 0.4468, 'grad_norm': 0.003038992639631033, 'learning_rate': 9.938477228462098e-06, 'epoch': 3.2}
{'loss': 0.4029, 'grad_norm': 47.63961410522461, 'learning_rate': 9.928752593762235e-06, 'epoch': 3.28}
{'loss': 0.3898, 'grad_norm': 341.1247253417969, 'learning_rate': 9.918655035359445e-06, 'epoch': 3.36}
{'loss': 0.4115, 'grad_norm': 104.11070251464844, 'learning_rate': 9.908144892918616e-06, 'epoch': 3.44}
{'loss': 0.4193, 'grad_norm': 54.144039154052734, 'learning_rate': 9.897242852429097e-06, 'epoch': 3.53}
{'loss': 0.3899, 'grad_norm': 85.18841552734375, 'learning

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.6759460635058722, 'eval_f1': 0.6767204573821777, 'eval_loss': 3.629800319671631, 'eval_runtime': 86.6431, 'eval_samples_per_second': 26.534, 'eval_steps_per_second': 8.852, 'epoch': 4.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 9487.1783, 'train_samples_per_second': 57.836, 'train_steps_per_second': 19.28, 'train_loss': 0.4590179983936452, 'epoch': 4.0}


  0%|          | 0/767 [00:00<?, ?it/s]

[I 2024-10-18 10:35:55,111] Trial 2 finished with value: 0.6859504132231405 and parameters: {'model_name': './MCQA-Combined/Optuna/trial_6/checkpoint-30485', 'learning_rate': 1.0052258285035737e-05, 'batch_size': 3, 'warmup_ratio': 0.04149176551014211, 'weight_decay': 0.006685281279171756, 'adam_beta1': 0.9429922176765829, 'adam_beta2': 0.9918592948813898, 'adam_epsilon': 8.867767549079712e-08, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.6963897346672466.


Stopping training: No improvement in eval_accuracy for 1 epochs
Current Trial 3 parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/137190 [00:00<?, ?it/s]

{'loss': 1.6108, 'grad_norm': 2.020512819290161, 'learning_rate': 9.611050822430274e-08, 'epoch': 0.11}
{'loss': 1.6101, 'grad_norm': 2.085341691970825, 'learning_rate': 1.9222101644860547e-07, 'epoch': 0.22}
{'loss': 1.6122, 'grad_norm': 1.6762933731079102, 'learning_rate': 2.883315246729082e-07, 'epoch': 0.33}
{'loss': 1.6112, 'grad_norm': 2.2332630157470703, 'learning_rate': 3.8444203289721095e-07, 'epoch': 0.44}
{'loss': 1.5889, 'grad_norm': 1.8620704412460327, 'learning_rate': 4.803603201050651e-07, 'epoch': 0.55}
{'loss': 1.5185, 'grad_norm': 6.934852123260498, 'learning_rate': 5.762786073129193e-07, 'epoch': 0.66}
{'loss': 1.4754, 'grad_norm': 10.272505760192871, 'learning_rate': 6.720046735043248e-07, 'epoch': 0.77}
{'loss': 1.4694, 'grad_norm': 8.192824363708496, 'learning_rate': 7.681151817286275e-07, 'epoch': 0.87}
{'loss': 1.4503, 'grad_norm': 9.931807518005371, 'learning_rate': 8.640334689364817e-07, 'epoch': 0.98}


  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.3501522401043932, 'eval_f1': 0.3514140656822659, 'eval_loss': 1.3924517631530762, 'eval_runtime': 108.3799, 'eval_samples_per_second': 21.212, 'eval_steps_per_second': 5.305, 'epoch': 1.0}
Stopping training: eval_accuracy below manual min_acc of 0.4
{'train_runtime': 3702.9615, 'train_samples_per_second': 148.179, 'train_steps_per_second': 37.049, 'train_loss': 1.5478570347095042, 'epoch': 1.0}


  0%|          | 0/575 [00:00<?, ?it/s]

[I 2024-10-18 11:39:21,823] Trial 3 finished with value: 0.3501522401043932 and parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.6963897346672466.


Stopping training: No improvement in eval_accuracy for 1 epochs
Stopping training: eval_accuracy below manual min_acc of 0.4


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at ./squad-trained-model and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Current Trial 4 parameters: {'model_name': './squad-trained-model', 'learning_rate': 2.2913924741371357e-07, 'batch_size': 3, 'warmup_ratio': 0.6782879296196984, 'weight_decay': 0.04860407678647671, 'adam_beta1': 0.8164084430629853, 'adam_beta2': 0.9906390554740927, 'adam_epsilon': 6.695655421893776e-07, 'lr_scheduler_type': 'linear'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/182910 [00:00<?, ?it/s]

{'loss': 1.61, 'grad_norm': 2.1761417388916016, 'learning_rate': 9.19763232569998e-10, 'epoch': 0.08}
{'loss': 1.6096, 'grad_norm': 2.805415153503418, 'learning_rate': 1.8432202933832488e-09, 'epoch': 0.16}
{'loss': 1.6099, 'grad_norm': 2.8612136840820312, 'learning_rate': 2.7666773541964996e-09, 'epoch': 0.25}
{'loss': 1.6118, 'grad_norm': 2.628831386566162, 'learning_rate': 3.6901344150097503e-09, 'epoch': 0.33}
{'loss': 1.6102, 'grad_norm': 3.427438259124756, 'learning_rate': 4.613591475823001e-09, 'epoch': 0.41}
{'loss': 1.613, 'grad_norm': 3.6097707748413086, 'learning_rate': 5.535201622514626e-09, 'epoch': 0.49}
{'loss': 1.6107, 'grad_norm': 2.4153101444244385, 'learning_rate': 6.458658683327877e-09, 'epoch': 0.57}
{'loss': 1.6116, 'grad_norm': 3.0242533683776855, 'learning_rate': 7.3821157441411286e-09, 'epoch': 0.66}
{'loss': 1.61, 'grad_norm': 2.5526537895202637, 'learning_rate': 8.305572804954378e-09, 'epoch': 0.74}
{'loss': 1.6113, 'grad_norm': 6.2558794021606445, 'learning_

  0%|          | 0/767 [00:00<?, ?it/s]

{'eval_accuracy': 0.1644193127446716, 'eval_f1': 0.1907047402973053, 'eval_loss': 1.6099488735198975, 'eval_runtime': 83.3643, 'eval_samples_per_second': 27.578, 'eval_steps_per_second': 9.201, 'epoch': 1.0}
Stopping training: eval_accuracy below manual min_acc of 0.4
{'train_runtime': 2326.3998, 'train_samples_per_second': 235.858, 'train_steps_per_second': 78.624, 'train_loss': 1.6111839360754392, 'epoch': 1.0}


  0%|          | 0/767 [00:00<?, ?it/s]

[I 2024-10-18 12:19:34,622] Trial 4 finished with value: 0.1644193127446716 and parameters: {'model_name': './squad-trained-model', 'learning_rate': 2.2913924741371357e-07, 'batch_size': 3, 'warmup_ratio': 0.6782879296196984, 'weight_decay': 0.04860407678647671, 'adam_beta1': 0.8164084430629853, 'adam_beta2': 0.9906390554740927, 'adam_epsilon': 6.695655421893776e-07, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.6963897346672466.


Stopping training: No improvement in eval_accuracy for 1 epochs
Stopping training: eval_accuracy below manual min_acc of 0.4
Current Trial 5 parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.2781743075162943e-06, 'batch_size': 4, 'warmup_ratio': 0.6682042061326278, 'weight_decay': 0.11855116033570309, 'adam_beta1': 0.8482673079863498, 'adam_beta2': 0.9938674227791334, 'adam_epsilon': 8.072037956756882e-08, 'lr_scheduler_type': 'linear'}


Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/137190 [00:00<?, ?it/s]

{'loss': 1.6108, 'grad_norm': 1.8368643522262573, 'learning_rate': 6.971530295929434e-09, 'epoch': 0.11}


[W 2024-10-18 12:28:51,841] Trial 5 failed with parameters: {'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.2781743075162943e-06, 'batch_size': 4, 'warmup_ratio': 0.6682042061326278, 'weight_decay': 0.11855116033570309, 'adam_beta1': 0.8482673079863498, 'adam_beta2': 0.9938674227791334, 'adam_epsilon': 8.072037956756882e-08, 'lr_scheduler_type': 'linear'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\OEM\AppData\Local\Temp\ipykernel_4464\1873140353.py", line 59, in objective
    trainer.train()
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\transformers\trainer.py", line 1938, in train
    return inner_training_loop(
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\transformers\trainer.py", line 2279, in _inner_training_lo

KeyboardInterrupt: 

## 4.4 Optuna Hyperparameters Tuning 3

## 4.4 Evaluation of Best Model

In [32]:
# Create the Trainer
trainer = create_trainer(model_name="./MCQA-Combined/Optuna/trial_5/checkpoint-22865", run_name="Optuna")
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/576 [00:00<?, ?it/s]

Test Results:
Accuracy: 0.6484
F1 Score: 0.6501


## 4.3 Evaluate SQUAD DeBERTa (Acc=21.9%)

In [16]:
# Create the Trainer
trainer = create_trainer(model_name="./squad-trained-model", run_name="Squad-Run")
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at ./squad-trained-model and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/576 [00:00<?, ?it/s]

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


Test Results:
Accuracy: 0.2190
F1 Score: 0.2305


## 4.4 Evaluate Trained SQUAD DeBERTa


In [12]:
path = "./squad-trained-model"
model =  (path)
# Create the Trainer
squad_trainer = create_trainer(run_name="Squad-Run2", batch_size=3, num_train_epochs=10)

# Train the model
squad_trainer.train()

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at ./squad-trained-model and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


  0%|          | 0/60970 [00:00<?, ?it/s]

{'loss': 1.5257, 'grad_norm': 7.596715450286865, 'learning_rate': 4.919968223047422e-05, 'epoch': 0.08}
{'loss': 1.5156, 'grad_norm': 9.40099811553955, 'learning_rate': 4.918816880581236e-05, 'epoch': 0.16}
{'loss': 1.4678, 'grad_norm': 3.4478559494018555, 'learning_rate': 4.916012062578594e-05, 'epoch': 0.25}
{'loss': 1.5513, 'grad_norm': 17.781841278076172, 'learning_rate': 4.911566214483556e-05, 'epoch': 0.33}
{'loss': 1.539, 'grad_norm': 2.640469789505005, 'learning_rate': 4.905492193723485e-05, 'epoch': 0.41}
{'loss': 1.5319, 'grad_norm': 4.278200626373291, 'learning_rate': 4.897752559048373e-05, 'epoch': 0.49}
{'loss': 1.6115, 'grad_norm': 4.821766376495361, 'learning_rate': 4.8883736139178334e-05, 'epoch': 0.57}
{'loss': 1.6179, 'grad_norm': 2.340773344039917, 'learning_rate': 4.877361665371831e-05, 'epoch': 0.66}
{'loss': 1.6139, 'grad_norm': 1.792155146598816, 'learning_rate': 4.864724118592782e-05, 'epoch': 0.74}
{'loss': 1.612, 'grad_norm': 1.8655813932418823, 'learning_rate

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.23053501522401043, 'eval_f1': 0.08637932701924746, 'eval_loss': 1.609375, 'eval_runtime': 84.1784, 'eval_samples_per_second': 27.311, 'eval_steps_per_second': 6.831, 'epoch': 1.0}
{'loss': 1.5381, 'grad_norm': 4.530053615570068, 'learning_rate': 4.7981041883623604e-05, 'epoch': 1.07}
{'loss': 1.5394, 'grad_norm': 6.36260461807251, 'learning_rate': 4.7774877735027944e-05, 'epoch': 1.15}
{'loss': 1.5295, 'grad_norm': 6.978030681610107, 'learning_rate': 4.755312922401073e-05, 'epoch': 1.23}
{'loss': 1.5172, 'grad_norm': 4.914771556854248, 'learning_rate': 4.731594546933761e-05, 'epoch': 1.31}
{'loss': 1.5025, 'grad_norm': 5.194087505340576, 'learning_rate': 4.706348596948085e-05, 'epoch': 1.39}
{'loss': 1.5084, 'grad_norm': 6.125194549560547, 'learning_rate': 4.679592049536167e-05, 'epoch': 1.48}
{'loss': 1.5085, 'grad_norm': 7.808128356933594, 'learning_rate': 4.651342897618479e-05, 'epoch': 1.56}
{'loss': 1.5049, 'grad_norm': 4.580046653747559, 'learning_rate': 4.621

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.23053501522401043, 'eval_f1': 0.08637932701924746, 'eval_loss': 1.609375, 'eval_runtime': 83.064, 'eval_samples_per_second': 27.677, 'eval_steps_per_second': 6.922, 'epoch': 2.0}
{'loss': 1.5065, 'grad_norm': 3.979177474975586, 'learning_rate': 4.4516356996006614e-05, 'epoch': 2.05}
{'loss': 1.503, 'grad_norm': 4.245189666748047, 'learning_rate': 4.413602280000609e-05, 'epoch': 2.13}
{'loss': 1.4982, 'grad_norm': 4.549670219421387, 'learning_rate': 4.374180237162288e-05, 'epoch': 2.21}
{'loss': 1.473, 'grad_norm': 4.50761604309082, 'learning_rate': 4.3334709694722466e-05, 'epoch': 2.3}
{'loss': 1.491, 'grad_norm': 5.555902004241943, 'learning_rate': 4.291501852608098e-05, 'epoch': 2.38}
{'loss': 1.4926, 'grad_norm': 7.867809772491455, 'learning_rate': 4.2483887209316017e-05, 'epoch': 2.46}
{'loss': 1.4895, 'grad_norm': 4.849914073944092, 'learning_rate': 4.2039877782223496e-05, 'epoch': 2.54}
{'loss': 1.4876, 'grad_norm': 6.086119651794434, 'learning_rate': 4.158414

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.23140495867768596, 'eval_f1': 0.0997437610992934, 'eval_loss': 1.6093804836273193, 'eval_runtime': 82.8693, 'eval_samples_per_second': 27.742, 'eval_steps_per_second': 6.939, 'epoch': 3.0}
{'loss': 1.4853, 'grad_norm': 4.019048690795898, 'learning_rate': 3.914155816324496e-05, 'epoch': 3.03}
{'loss': 1.4793, 'grad_norm': 3.2914605140686035, 'learning_rate': 3.862217079335065e-05, 'epoch': 3.12}
{'loss': 1.475, 'grad_norm': 4.0273590087890625, 'learning_rate': 3.809442077199905e-05, 'epoch': 3.2}
{'loss': 1.4893, 'grad_norm': 4.321653366088867, 'learning_rate': 3.755654788239679e-05, 'epoch': 3.28}
{'loss': 1.4797, 'grad_norm': 3.2571210861206055, 'learning_rate': 3.7009962129657125e-05, 'epoch': 3.36}
{'loss': 1.4827, 'grad_norm': 4.648759841918945, 'learning_rate': 3.6455031075178315e-05, 'epoch': 3.44}
{'loss': 1.4983, 'grad_norm': 4.272787094116211, 'learning_rate': 3.589326140412408e-05, 'epoch': 3.53}
{'loss': 1.4997, 'grad_norm': 4.295674800872803, 'learning_r

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.23053501522401043, 'eval_f1': 0.08637932701924746, 'eval_loss': 1.609375, 'eval_runtime': 83.0768, 'eval_samples_per_second': 27.673, 'eval_steps_per_second': 6.921, 'epoch': 4.0}
{'loss': 1.4719, 'grad_norm': 2.996605634689331, 'learning_rate': 3.23712560831127e-05, 'epoch': 4.02}
{'loss': 1.4712, 'grad_norm': 6.47822904586792, 'learning_rate': 3.1763434772831785e-05, 'epoch': 4.1}
{'loss': 1.4711, 'grad_norm': 4.895840644836426, 'learning_rate': 3.1152026093494284e-05, 'epoch': 4.18}
{'loss': 1.468, 'grad_norm': 5.00419807434082, 'learning_rate': 3.0534990799659266e-05, 'epoch': 4.26}
{'loss': 1.5023, 'grad_norm': 6.500093460083008, 'learning_rate': 2.9913964414753657e-05, 'epoch': 4.35}
{'loss': 1.4751, 'grad_norm': 3.58735728263855, 'learning_rate': 2.9289364559114705e-05, 'epoch': 4.43}
{'loss': 1.4775, 'grad_norm': 3.1968586444854736, 'learning_rate': 2.86616112561242e-05, 'epoch': 4.51}
{'loss': 1.4799, 'grad_norm': 4.51552152633667, 'learning_rate': 2.803112

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.23053501522401043, 'eval_f1': 0.08637932701924746, 'eval_loss': 1.609375, 'eval_runtime': 82.9285, 'eval_samples_per_second': 27.723, 'eval_steps_per_second': 6.934, 'epoch': 5.0}
{'loss': 1.4922, 'grad_norm': 4.3420538902282715, 'learning_rate': 2.485389750418722e-05, 'epoch': 5.0}
{'loss': 1.4729, 'grad_norm': 3.0050296783447266, 'learning_rate': 2.4217248592186236e-05, 'epoch': 5.08}
{'loss': 1.4857, 'grad_norm': 2.575566053390503, 'learning_rate': 2.3579581247030347e-05, 'epoch': 5.17}
{'loss': 1.482, 'grad_norm': 3.6931982040405273, 'learning_rate': 2.2942600100776998e-05, 'epoch': 5.25}
{'loss': 1.4651, 'grad_norm': 4.2977800369262695, 'learning_rate': 2.2306733502829025e-05, 'epoch': 5.33}
{'loss': 1.4858, 'grad_norm': 3.5150718688964844, 'learning_rate': 2.167367587920281e-05, 'epoch': 5.41}
{'loss': 1.4576, 'grad_norm': 3.327179431915283, 'learning_rate': 2.1041315778235267e-05, 'epoch': 5.49}
{'loss': 1.4876, 'grad_norm': 4.94452428817749, 'learning_rate':

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.23053501522401043, 'eval_f1': 0.08637932701924746, 'eval_loss': 1.609375, 'eval_runtime': 83.1284, 'eval_samples_per_second': 27.656, 'eval_steps_per_second': 6.917, 'epoch': 6.0}
{'loss': 1.4649, 'grad_norm': 3.833265781402588, 'learning_rate': 1.670537689695318e-05, 'epoch': 6.07}
{'loss': 1.4657, 'grad_norm': 3.4999027252197266, 'learning_rate': 1.6105094870624732e-05, 'epoch': 6.15}
{'loss': 1.4825, 'grad_norm': 5.763112545013428, 'learning_rate': 1.5509316981925158e-05, 'epoch': 6.23}
{'loss': 1.4832, 'grad_norm': 2.866100549697876, 'learning_rate': 1.491965228615947e-05, 'epoch': 6.31}
{'loss': 1.4803, 'grad_norm': 2.8617475032806396, 'learning_rate': 1.4337656865767735e-05, 'epoch': 6.4}
{'loss': 1.4757, 'grad_norm': 7.2106709480285645, 'learning_rate': 1.376138957802862e-05, 'epoch': 6.48}
{'loss': 1.4663, 'grad_norm': 5.259767532348633, 'learning_rate': 1.3192410908373847e-05, 'epoch': 6.56}
{'loss': 1.4785, 'grad_norm': 4.387902736663818, 'learning_rate': 

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.23183993040452372, 'eval_f1': 0.10354681077762767, 'eval_loss': 1.6093758344650269, 'eval_runtime': 83.0025, 'eval_samples_per_second': 27.698, 'eval_steps_per_second': 6.928, 'epoch': 7.0}
{'loss': 1.4672, 'grad_norm': 4.07132625579834, 'learning_rate': 9.95367645532406e-06, 'epoch': 7.05}
{'loss': 1.4828, 'grad_norm': 4.484358787536621, 'learning_rate': 9.44610498263851e-06, 'epoch': 7.13}
{'loss': 1.455, 'grad_norm': 5.40916109085083, 'learning_rate': 8.94872401860413e-06, 'epoch': 7.22}
{'loss': 1.4818, 'grad_norm': 3.5796735286712646, 'learning_rate': 8.461868035976024e-06, 'epoch': 7.3}
{'loss': 1.4889, 'grad_norm': 7.282806873321533, 'learning_rate': 7.986805392342239e-06, 'epoch': 7.38}
{'loss': 1.4453, 'grad_norm': 3.1540772914886475, 'learning_rate': 7.522869949373102e-06, 'epoch': 7.46}
{'loss': 1.464, 'grad_norm': 3.1888012886047363, 'learning_rate': 7.069477321731563e-06, 'epoch': 7.54}
{'loss': 1.4721, 'grad_norm': 4.016075611114502, 'learning_rate': 6

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.23053501522401043, 'eval_f1': 0.08637932701924746, 'eval_loss': 1.609375, 'eval_runtime': 83.0326, 'eval_samples_per_second': 27.688, 'eval_steps_per_second': 6.925, 'epoch': 8.0}
{'loss': 1.4533, 'grad_norm': 3.5164966583251953, 'learning_rate': 4.607533233848777e-06, 'epoch': 8.04}
{'loss': 1.477, 'grad_norm': 3.4276087284088135, 'learning_rate': 4.2425795917773645e-06, 'epoch': 8.12}
{'loss': 1.4612, 'grad_norm': 6.394713401794434, 'learning_rate': 3.891315662386942e-06, 'epoch': 8.2}
{'loss': 1.4453, 'grad_norm': 4.075328826904297, 'learning_rate': 3.5539776594105037e-06, 'epoch': 8.28}
{'loss': 1.4689, 'grad_norm': 3.991068124771118, 'learning_rate': 3.230792431841896e-06, 'epoch': 8.36}
{'loss': 1.4974, 'grad_norm': 4.339132785797119, 'learning_rate': 2.9219773113871763e-06, 'epoch': 8.45}
{'loss': 1.4617, 'grad_norm': 7.477202415466309, 'learning_rate': 2.628313759225937e-06, 'epoch': 8.53}
{'loss': 1.455, 'grad_norm': 3.3329036235809326, 'learning_rate': 2.3

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.23053501522401043, 'eval_f1': 0.08637932701924746, 'eval_loss': 1.6093724966049194, 'eval_runtime': 83.0172, 'eval_samples_per_second': 27.693, 'eval_steps_per_second': 6.926, 'epoch': 9.0}
{'loss': 1.4614, 'grad_norm': 4.2972731590271, 'learning_rate': 1.1799981822759164e-06, 'epoch': 9.02}
{'loss': 1.4505, 'grad_norm': 4.579303741455078, 'learning_rate': 9.926857147363265e-07, 'epoch': 9.1}
{'loss': 1.4744, 'grad_norm': 2.0948286056518555, 'learning_rate': 8.212484093368643e-07, 'epoch': 9.18}
{'loss': 1.4763, 'grad_norm': 2.1727755069732666, 'learning_rate': 6.658015521700958e-07, 'epoch': 9.27}
{'loss': 1.4798, 'grad_norm': 4.078769683837891, 'learning_rate': 5.264496762647245e-07, 'epoch': 9.35}
{'loss': 1.4521, 'grad_norm': 5.194500923156738, 'learning_rate': 4.0328649129046553e-07, 'epoch': 9.43}
{'loss': 1.4733, 'grad_norm': 4.355737686157227, 'learning_rate': 2.9639482054129713e-07, 'epoch': 9.51}
{'loss': 1.4531, 'grad_norm': 2.2465782165527344, 'learning_

  0%|          | 0/575 [00:00<?, ?it/s]

{'eval_accuracy': 0.23053501522401043, 'eval_f1': 0.08637932701924746, 'eval_loss': 1.609375, 'eval_runtime': 82.9839, 'eval_samples_per_second': 27.704, 'eval_steps_per_second': 6.929, 'epoch': 10.0}
{'train_runtime': 23204.0495, 'train_samples_per_second': 7.882, 'train_steps_per_second': 2.628, 'train_loss': 1.4887918260501452, 'epoch': 10.0}


TrainOutput(global_step=60970, training_loss=1.4887918260501452, metrics={'train_runtime': 23204.0495, 'train_samples_per_second': 7.882, 'train_steps_per_second': 2.628, 'total_flos': 2.40617214893568e+17, 'train_loss': 1.4887918260501452, 'epoch': 10.0})

In [13]:
# Evaluate the model on the test set
test_results = squad_trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

  0%|          | 0/576 [00:00<?, ?it/s]

Test Results:
Accuracy: 0.2190
F1 Score: 0.0898


In [27]:
path = "./MCQA-Combined/Squad-Run/microsoft-deberta-v3-base/checkpoint-18291"
model =  AutoModelForMultipleChoice.from_pretrained(path)
# Create the Trainer
squad_trainer = create_trainer(run_name="Squad-Run2", batch_size=3)

# Evaluate the model on the test set
test_results = squad_trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


Test Results:
Accuracy: 0.2873
F1 Score: 0.2872


In [45]:
# Train the model
squad_trainer.train()

# Evaluate the model on the test set
test_results = squad_trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

# End of NoteBook