# 0. Imports, libraries and rusable functions

In [1]:
from project_imports import *
import use_gpu
# Clear any cached memory to start fresh for each trial
torch.cuda.empty_cache()
gc.collect()

Project libraries imported!
GPU: NVIDIA GeForce RTX 4070 Ti SUPER is available.
Device:cuda


26

# 1. Global Variables

In [2]:
## Arguments and global vriables
dataset_name="MCQA-Combined-8"
global_run_name="Optuna-1"
pretrained_model_name = "microsoft/deberta-v3-base"
#pretrained_model_name = "sentence-transformers/all-mpnet-base-v2"
normalized_model_name = pretrained_model_name.replace("/", "-")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
assert isinstance( tokenizer, PreTrainedTokenizerFast )
data_collator = DefaultDataCollator()
max_length = 512 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
pad_on_right = right_padding = tokenizer.padding_side == 'right'
global_counter = 0
traing_answer_mismatches = []
logger = logging.getLogger(__name__)



# 2. Prepare the Dataset 

In [3]:
# Load the combined dataset
combined_dataset = load_from_disk('cleaned_dataset')

combined_dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 1072514
    })
    validation: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 118521
    })
    test: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 200566
    })
})

In [4]:
# Initialize lists to hold datasets
train_datasets = []
val_datasets = []
test_datasets = []

# List of datasets to combine without 'ReClor'
datasets_to_combine = ['AR-LSAT', 'LogiQA 2.0', 'RTE', 'FOLIO', 'PrOntoQA', 'MRPC', 'Adversarial NLI', 'ConTRoL' ]

# Loop through each dataset and filter
for ds_name in datasets_to_combine:
    train_datasets.append(combined_dataset['train'].filter(lambda x: x['Source Dataset'] == ds_name))
    val_datasets.append(combined_dataset['validation'].filter(lambda x: x['Source Dataset'] == ds_name))
    test_datasets.append(combined_dataset['test'].filter(lambda x: x['Source Dataset'] == ds_name))

# Concatenate datasets
combined_train = concatenate_datasets(train_datasets)
combined_val = concatenate_datasets(val_datasets)
combined_test = concatenate_datasets(test_datasets)

# Concatenate validation data into the training data and use the Test dataset for validation 
combined_train = concatenate_datasets([combined_train, combined_val])
combined_val = combined_test

# Shuffle the combined dataset
# To ensure that each training batch has a chance to contain a mix of examples from all sources. 
# This helps in reducing variance and improving the generalization of the model.
combined_train = combined_train.shuffle(seed=42)

In [5]:
def mcqa_preprocess_function(examples):
    # Determine the maximum number of choices
    max_num_choices = 5  # Since AR-LSAT has 5 options, we'll pad others to 5
    contexts = examples['Context']
    questions = examples['Question']
    options_list = examples['Options']
    labels = examples['Label']
    
    first_sentences = []
    second_sentences = []
    labels_adjusted = []
    
    for context, question, options, label in zip(contexts, questions, options_list, labels):
        num_choices = len(options)
        # Pad options to have max_num_choices
        if num_choices < max_num_choices:
            options += [''] * (max_num_choices - num_choices)
        first_sentences.append([context] * max_num_choices)
        second_sentences.append([f"{question} {option}" for option in options])
        labels_adjusted.append(label)
    
    # Flatten the lists
    first_sentences = [item for sublist in first_sentences for item in sublist]
    second_sentences = [item for sublist in second_sentences for item in sublist]
    
    # Tokenize the inputs
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        max_length=512,
        padding='max_length',
    )
    
    # Un-flatten to shape (num_examples, max_num_choices, seq_length)
    tokenized_inputs = {
        k: [v[i:i + max_num_choices] for i in range(0, len(v), max_num_choices)]
        for k, v in tokenized_examples.items()
    }
    
    # Labels
    tokenized_inputs["labels"] = labels_adjusted
    
    return tokenized_inputs




In [6]:
# Apply the preprocessing function to the combined datasets
encoded_train = combined_train.map(mcqa_preprocess_function, batched=True)
encoded_val = combined_val.map(mcqa_preprocess_function, batched=True)
encoded_test = combined_test.map(mcqa_preprocess_function, batched=True)


Map:   0%|          | 0/31993 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [8]:
# Set the format of the datasets to PyTorch tensors
encoded_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


def get_train_encoded():
    return encoded_train

def get_val_encoded():
    return encoded_val

def get_test_encoded():
    return encoded_test


print("Number of training examples:", len(encoded_train))
print("Number of validation examples:", len(encoded_val))
print("Number of test examples:", len(encoded_test))

Number of training examples: 31993
Number of validation examples: 5069
Number of test examples: 5069


# 3. Reusable Functions

In [9]:
# Load the accuracy metric
accuracy = evaluate.load('accuracy')

# Define the compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    acc = accuracy.compute(predictions=predictions, references=labels)['accuracy']
    f1 = f1_score(labels, predictions, average='weighted')
    return {'eval_accuracy': acc, 'eval_f1': f1}

In [10]:
def create_training_args(run_name="Default-Run", num_train_epochs=3, learning_rate=4.92e-05, batch_size=3):
    """
    Generates training arguments for training a machine learning model.

    Parameters:
    - dataset_name (str): The name of the dataset.
    - run_name (str): The name of the run, useful for logging and saving models.
    - model_name (str): The name of the model, typically including its configuration.
    - num_train_epochs (int): The number of epochs to train for.
    - learning_rate (float): The learning rate for training.
    - batch_size (int): The batch size used for training.

    Returns:
    - TrainingArguments: A configured TrainingArguments instance.
    """    
    output_dir = f"./{dataset_name}/{run_name}/{normalized_model_name}"
    
    training_args = TrainingArguments(
        output_dir=output_dir,
        report_to="none",  # Disable all integrations
        overwrite_output_dir=True,
        metric_for_best_model='eval_accuracy',
        greater_is_better=True,
        load_best_model_at_end=True,
        save_total_limit=3,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=4,
        warmup_steps=398,
        weight_decay=0.194,
        adam_beta1=0.837,
        adam_beta2=0.997,
        adam_epsilon=5.87e-07,
        lr_scheduler_type='cosine',
        fp16=True,  # Enable mixed-precision training
    )
    
    return training_args


In [11]:
def model_init(model_name=pretrained_model_name, dropout_rate=0.1):
    model = AutoModelForMultipleChoice.from_pretrained(model_name)
    model.config.hidden_dropout_prob = dropout_rate
    return model

In [12]:
def create_trainer(model_name=pretrained_model_name,run_name="Default-Run", num_train_epochs=3, learning_rate=4.92e-05, batch_size=4):
    trainer = Trainer(
        model=model_init(model_name),
        args=create_training_args(run_name=run_name, num_train_epochs=num_train_epochs, learning_rate=learning_rate, batch_size=batch_size),
        train_dataset=get_train_encoded(),
        eval_dataset=get_val_encoded(),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    
    return trainer


In [13]:
class AdvancedEarlyStoppingCallback(TrainerCallback):
    """
    A callback to stop training when either the performance falls below a certain threshold
    or if there is no improvement over a set number of epochs.
    """
    def __init__(self, metric_name, patience):
        self.metric_name = metric_name
        self.patience = patience        
        self.best_score = None
        self.no_improve_epochs = 0
        self.config_file = "early_stopping_config.json"  # Config file for early stopping values

    def read_early_stopping_config(self):
        """
        Reads the early stopping configuration from the file system.
        Returns the configuration as a dictionary.
        """
        if os.path.exists(self.config_file):
            with open(self.config_file, 'r') as file:
                config = json.load(file)
            return config
        else:
            raise FileNotFoundError(f"Config file not found: {self.config_file}")
    def reset_manual_stop_flag(self):
        """
        Resets the manual stop flag to False in the early stopping config file.
        """
        config = self.read_early_stopping_config()
        config['manual_stop'] = False
        with open(self.config_file, 'w') as file:
            json.dump(config, file, indent=4)

    def on_evaluate(self, args, state, control, **kwargs):
        metric_value = kwargs['metrics'].get(self.metric_name)

        if self.best_score is None or metric_value > self.best_score:
            self.best_score = metric_value
            self.no_improve_epochs = 0
        else:
            self.no_improve_epochs += 1

        # Check if no improvement has been seen over the allowed patience
        if self.no_improve_epochs >= self.patience:
            control.should_training_stop = True
            print(f"Stopping training: No improvement in {self.metric_name} for {self.patience} epochs")


        # Read the early stopping configuration
        config = self.read_early_stopping_config()
        min_accuracy = config.get("min_accuracy", 0.35)                
        num_epochs_min_acc = config.get("num_epochs_min_acc", 2)  
        max_variance = config.get("max_variance", 0.2)  

        # Check if performance is below the threshold
        if metric_value < min_accuracy:
            control.should_training_stop = True
            print(f"Stopping training: {self.metric_name} below manual min_acc of {min_accuracy}")

         # Manual stop from config
        if config.get("manual_stop", False):
            control.should_training_stop = True
            print(f"Manual early stopping triggered!!")
            self.reset_manual_stop_flag()  # Reset the flag for future runs
            


In [14]:
# Optuna objective function for hyperparameter tuning
def objective(trial):
    # Clear any cached memory to start fresh for each trial
    torch.cuda.empty_cache()
    gc.collect()

    
    model_name = trial.suggest_categorical('model_name', [pretrained_model_name])     
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-7, 1e-4, log=True)
    batch_size = trial.suggest_categorical('batch_size', [3, 3])
    #warmup_steps = trial.suggest_int('warmup_steps', 0, 1000)
    warmup_ratio= trial.suggest_float('warmup_ratio', 0.0, 1.0)
    weight_decay = trial.suggest_float('weight_decay', 0.0, 0.25)
    adam_beta1 = trial.suggest_float('adam_beta1', 0.8, 0.95)
    adam_beta2 = trial.suggest_float('adam_beta2', 0.990, 0.999)
    adam_epsilon = trial.suggest_float('adam_epsilon', 1e-8, 1e-6)
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine', 'cosine_with_restarts']) #,'constant_with_warmup'   
    

    output_dir = f"./{dataset_name}/{global_run_name}/trial_{trial.number}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        report_to="none",  # Disable all integrations
        overwrite_output_dir=True,
        metric_for_best_model='eval_accuracy',
        greater_is_better=True,
        load_best_model_at_end=True,
        save_total_limit=3,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=30,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_ratio=warmup_ratio,
        weight_decay=weight_decay,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        lr_scheduler_type=lr_scheduler_type,
        fp16=True,  # Enable mixed-precision training
    ) 
    
    # Print trial parameters
    print(f"Current Trial {trial.number} parameters: {trial.params}")
    
    trainer = Trainer(
        model=model_init(model_name, dropout_rate),
        args=training_args,
        train_dataset=get_train_encoded(),
        eval_dataset=get_val_encoded(),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[AdvancedEarlyStoppingCallback(metric_name='eval_accuracy', patience=1)]
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
        
    torch.cuda.empty_cache()  # Clear cache after evaluation
    gc.collect()  # Collect garbage

    return eval_results['eval_accuracy']


# 4. DeBERTa Joint MTL Training on the Dataset

## 4.1 Optuna Hyperparameters Tuning 1

In [15]:
# Create a study object and optimize the objective
global_run_name="Optuna-1"
study = optuna.create_study(direction='maximize')
study.enqueue_trial({'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.0052258285035737e-05, 'batch_size': 3, 'warmup_ratio': 0.04149176551014211, 'weight_decay': 0.006685281279171756, 'adam_beta1': 0.9429922176765829, 'adam_beta2': 0.9918592948813898, 'adam_epsilon': 8.867767549079712e-08, 'lr_scheduler_type': 'cosine_with_restarts'})
#study.enqueue_trial({'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'})
study.optimize(objective, n_trials=20)


[I 2024-10-22 13:46:11,719] A new study created in memory with name: no-name-29336d1f-a95e-497e-b1de-30ae37010f92
[W 2024-10-22 13:46:11,858] Trial 0 failed with parameters: {} because of the following error: ValueError("'microsoft/deberta-v3-base' not in ('sentence-transformers/all-mpnet-base-v2',).").
Traceback (most recent call last):
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\OEM\AppData\Local\Temp\ipykernel_34512\2979044354.py", line 8, in objective
    model_name = trial.suggest_categorical('model_name', [pretrained_model_name])
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\optuna\trial\_trial.py", line 402, in suggest_categorical
    return self._suggest(name, CategoricalDistribution(choices=choices))
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\optuna\trial\_trial.py", line 623, in _suggest
    if self._is_f

ValueError: 'microsoft/deberta-v3-base' not in ('sentence-transformers/all-mpnet-base-v2',).

## 4.3 Optuna Hyperparameters Tuning 2

In [25]:
# Create a study object and optimize the objective for pretrained_model_name = "sentence-transformers/all-mpnet-base-v2"
global_run_name="Optuna-2"
study = optuna.create_study(direction='maximize')
study.enqueue_trial({'model_name': pretrained_model_name, 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'})
study.optimize(objective, n_trials=20)


[I 2024-10-22 01:57:26,347] A new study created in memory with name: no-name-198a630a-9e5c-4ede-ae18-36aabe3fac1a


Current Trial 0 parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.15718330934471478, 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


Some weights of MPNetForMultipleChoice were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/274740 [00:00<?, ?it/s]

{'loss': 1.6088, 'grad_norm': 0.9182241559028625, 'learning_rate': 4.780025580617451e-08, 'epoch': 0.05}
{'loss': 1.6088, 'grad_norm': 0.7863313555717468, 'learning_rate': 9.569649606175901e-08, 'epoch': 0.11}
{'loss': 1.6093, 'grad_norm': 0.7911553978919983, 'learning_rate': 1.4368872076675347e-07, 'epoch': 0.16}
{'loss': 1.6085, 'grad_norm': 0.8689716458320618, 'learning_rate': 1.91488976572928e-07, 'epoch': 0.22}
{'loss': 1.6076, 'grad_norm': 0.8607678413391113, 'learning_rate': 2.3948120127792246e-07, 'epoch': 0.27}
{'loss': 1.6065, 'grad_norm': 1.0950431823730469, 'learning_rate': 2.8737744153350694e-07, 'epoch': 0.33}
{'loss': 1.6024, 'grad_norm': 1.1426678895950317, 'learning_rate': 3.3536966623850147e-07, 'epoch': 0.38}
{'loss': 1.5868, 'grad_norm': 1.5567299127578735, 'learning_rate': 3.8336189094349596e-07, 'epoch': 0.44}
{'loss': 1.5336, 'grad_norm': 2.8619110584259033, 'learning_rate': 4.3135411564849044e-07, 'epoch': 0.49}
{'loss': 1.4498, 'grad_norm': 3.4570772647857666, 

  0%|          | 0/1393 [00:00<?, ?it/s]

{'eval_accuracy': 0.45789190159813253, 'eval_f1': 0.4441435889737021, 'eval_loss': 1.154205560684204, 'eval_runtime': 130.2404, 'eval_samples_per_second': 42.759, 'eval_steps_per_second': 10.696, 'epoch': 1.0}
Stopping training: eval_accuracy below manual min_acc of 0.5
{'train_runtime': 2960.9734, 'train_samples_per_second': 371.128, 'train_steps_per_second': 92.787, 'train_loss': 1.4549762686675072, 'epoch': 1.0}


  0%|          | 0/1393 [00:00<?, ?it/s]

[I 2024-10-22 02:48:58,638] Trial 0 finished with value: 0.45789190159813253 and parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.15718330934471478, 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.45789190159813253.


Stopping training: No improvement in eval_accuracy for 1 epochs
Stopping training: eval_accuracy below manual min_acc of 0.5
Current Trial 1 parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.3046487955829154, 'learning_rate': 4.1206992440781206e-07, 'batch_size': 3, 'warmup_ratio': 0.07116169962743613, 'weight_decay': 0.2388626303853753, 'adam_beta1': 0.8422162512825583, 'adam_beta2': 0.9904831834704457, 'adam_epsilon': 5.201468169219309e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


Some weights of MPNetForMultipleChoice were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/366300 [00:00<?, ?it/s]

{'loss': 1.6105, 'grad_norm': 1.0038748979568481, 'learning_rate': 7.888245378428597e-09, 'epoch': 0.04}
{'loss': 1.6097, 'grad_norm': 0.9802391529083252, 'learning_rate': 1.5776490756857193e-08, 'epoch': 0.08}
{'loss': 1.6096, 'grad_norm': 0.9342759847640991, 'learning_rate': 2.364892802831499e-08, 'epoch': 0.12}
{'loss': 1.6092, 'grad_norm': 0.9232449531555176, 'learning_rate': 3.155298151371439e-08, 'epoch': 0.16}
{'loss': 1.6099, 'grad_norm': 0.9416474103927612, 'learning_rate': 3.945703499911378e-08, 'epoch': 0.2}
{'loss': 1.6095, 'grad_norm': 0.9696550369262695, 'learning_rate': 4.734528037754238e-08, 'epoch': 0.25}
{'loss': 1.6097, 'grad_norm': 1.3210350275039673, 'learning_rate': 5.524933386294177e-08, 'epoch': 0.29}
{'loss': 1.6095, 'grad_norm': 0.9429779648780823, 'learning_rate': 6.315338734834116e-08, 'epoch': 0.33}
{'loss': 1.6089, 'grad_norm': 1.0847434997558594, 'learning_rate': 7.105744083374056e-08, 'epoch': 0.37}
{'loss': 1.6081, 'grad_norm': 1.0135345458984375, 'lear

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_accuracy': 0.4410127491470641, 'eval_f1': 0.43217578765164366, 'eval_loss': 1.4569653272628784, 'eval_runtime': 129.4868, 'eval_samples_per_second': 43.008, 'eval_steps_per_second': 14.341, 'epoch': 1.0}
Stopping training: eval_accuracy below manual min_acc of 0.5
{'train_runtime': 2948.5397, 'train_samples_per_second': 372.693, 'train_steps_per_second': 124.231, 'train_loss': 1.6010893125791807, 'epoch': 1.0}


  0%|          | 0/1857 [00:00<?, ?it/s]

[I 2024-10-22 03:40:18,641] Trial 1 finished with value: 0.4410127491470641 and parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.3046487955829154, 'learning_rate': 4.1206992440781206e-07, 'batch_size': 3, 'warmup_ratio': 0.07116169962743613, 'weight_decay': 0.2388626303853753, 'adam_beta1': 0.8422162512825583, 'adam_beta2': 0.9904831834704457, 'adam_epsilon': 5.201468169219309e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.45789190159813253.


Stopping training: No improvement in eval_accuracy for 1 epochs
Stopping training: eval_accuracy below manual min_acc of 0.5
Current Trial 2 parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.10926740728558144, 'learning_rate': 1.335608479306097e-05, 'batch_size': 3, 'warmup_ratio': 0.41012687080315513, 'weight_decay': 0.09479782297028039, 'adam_beta1': 0.912797523704336, 'adam_beta2': 0.997103226208837, 'adam_epsilon': 7.229757891985637e-07, 'lr_scheduler_type': 'linear'}


Some weights of MPNetForMultipleChoice were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/366300 [00:00<?, ?it/s]

{'loss': 1.6105, 'grad_norm': 1.0041431188583374, 'learning_rate': 4.418540998569728e-08, 'epoch': 0.04}
{'loss': 1.6095, 'grad_norm': 0.9794089198112488, 'learning_rate': 8.863753270772673e-08, 'epoch': 0.08}
{'loss': 1.6093, 'grad_norm': 0.9342834949493408, 'learning_rate': 1.330896554297562e-07, 'epoch': 0.12}
{'loss': 1.6086, 'grad_norm': 0.9255183935165405, 'learning_rate': 1.774528739063416e-07, 'epoch': 0.16}
{'loss': 1.609, 'grad_norm': 0.9468817710876465, 'learning_rate': 2.2190499662837103e-07, 'epoch': 0.2}
{'loss': 1.6078, 'grad_norm': 0.9842406511306763, 'learning_rate': 2.6626821510495643e-07, 'epoch': 0.25}
{'loss': 1.6068, 'grad_norm': 1.0521067380905151, 'learning_rate': 3.1072033782698587e-07, 'epoch': 0.29}
{'loss': 1.6038, 'grad_norm': 1.0336287021636963, 'learning_rate': 3.5517246054901536e-07, 'epoch': 0.33}
{'loss': 1.5938, 'grad_norm': 1.386725902557373, 'learning_rate': 3.996245832710448e-07, 'epoch': 0.37}
{'loss': 1.5667, 'grad_norm': 1.9762277603149414, 'lea

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_accuracy': 0.46004668701741785, 'eval_f1': 0.4511414731237447, 'eval_loss': 1.158411979675293, 'eval_runtime': 129.244, 'eval_samples_per_second': 43.089, 'eval_steps_per_second': 14.368, 'epoch': 1.0}
Stopping training: eval_accuracy below manual min_acc of 0.5
{'train_runtime': 2948.6868, 'train_samples_per_second': 372.674, 'train_steps_per_second': 124.225, 'train_loss': 1.434618428298238, 'epoch': 1.0}


  0%|          | 0/1857 [00:00<?, ?it/s]

[I 2024-10-22 04:31:38,277] Trial 2 finished with value: 0.46004668701741785 and parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.10926740728558144, 'learning_rate': 1.335608479306097e-05, 'batch_size': 3, 'warmup_ratio': 0.41012687080315513, 'weight_decay': 0.09479782297028039, 'adam_beta1': 0.912797523704336, 'adam_beta2': 0.997103226208837, 'adam_epsilon': 7.229757891985637e-07, 'lr_scheduler_type': 'linear'}. Best is trial 2 with value: 0.46004668701741785.


Stopping training: No improvement in eval_accuracy for 1 epochs
Stopping training: eval_accuracy below manual min_acc of 0.5
Current Trial 3 parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.1816376875451311, 'learning_rate': 5.4982191466617046e-05, 'batch_size': 4, 'warmup_ratio': 0.6229699420810516, 'weight_decay': 0.07934363728473695, 'adam_beta1': 0.9386008693968517, 'adam_beta2': 0.9935052156642877, 'adam_epsilon': 6.932517927990338e-07, 'lr_scheduler_type': 'cosine'}


Some weights of MPNetForMultipleChoice were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/274740 [00:00<?, ?it/s]

{'loss': 1.6095, 'grad_norm': 0.8916115760803223, 'learning_rate': 1.5997856533770726e-07, 'epoch': 0.05}
{'loss': 1.6094, 'grad_norm': 0.935311496257782, 'learning_rate': 3.2027837277448625e-07, 'epoch': 0.11}
{'loss': 1.6084, 'grad_norm': 0.7695035338401794, 'learning_rate': 4.808994223103369e-07, 'epoch': 0.16}
{'loss': 1.6056, 'grad_norm': 1.04118013381958, 'learning_rate': 6.415204718461876e-07, 'epoch': 0.22}
{'loss': 1.5815, 'grad_norm': 3.112593173980713, 'learning_rate': 8.021415213820383e-07, 'epoch': 0.27}
{'loss': 1.4552, 'grad_norm': 16.392295837402344, 'learning_rate': 9.62441328818817e-07, 'epoch': 0.33}
{'loss': 1.3405, 'grad_norm': 22.638620376586914, 'learning_rate': 1.1224198941565245e-06, 'epoch': 0.38}
{'loss': 1.2959, 'grad_norm': 16.598974227905273, 'learning_rate': 1.2823984594942319e-06, 'epoch': 0.44}
{'loss': 1.2856, 'grad_norm': 16.50101089477539, 'learning_rate': 1.4430195090300825e-06, 'epoch': 0.49}
{'loss': 1.288, 'grad_norm': 3.342400312423706, 'learnin

  0%|          | 0/1393 [00:00<?, ?it/s]

{'eval_accuracy': 0.4720775722750943, 'eval_f1': 0.46856230684954503, 'eval_loss': 1.133382797241211, 'eval_runtime': 129.7892, 'eval_samples_per_second': 42.908, 'eval_steps_per_second': 10.733, 'epoch': 1.0}
Stopping training: eval_accuracy below manual min_acc of 0.5
{'train_runtime': 2899.806, 'train_samples_per_second': 378.956, 'train_steps_per_second': 94.744, 'train_loss': 1.357816904021757, 'epoch': 1.0}


  0%|          | 0/1393 [00:00<?, ?it/s]

[I 2024-10-22 05:22:09,212] Trial 3 finished with value: 0.4720775722750943 and parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.1816376875451311, 'learning_rate': 5.4982191466617046e-05, 'batch_size': 4, 'warmup_ratio': 0.6229699420810516, 'weight_decay': 0.07934363728473695, 'adam_beta1': 0.9386008693968517, 'adam_beta2': 0.9935052156642877, 'adam_epsilon': 6.932517927990338e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 3 with value: 0.4720775722750943.


Stopping training: No improvement in eval_accuracy for 1 epochs
Stopping training: eval_accuracy below manual min_acc of 0.5
Current Trial 4 parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.2500228054734358, 'learning_rate': 6.331007353356651e-05, 'batch_size': 3, 'warmup_ratio': 0.31837920111611373, 'weight_decay': 0.09971031186766507, 'adam_beta1': 0.8682929292594384, 'adam_beta2': 0.9909672212836867, 'adam_epsilon': 5.424397228138408e-07, 'lr_scheduler_type': 'linear'}


Some weights of MPNetForMultipleChoice were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/366300 [00:00<?, ?it/s]

{'loss': 1.6105, 'grad_norm': 1.003128170967102, 'learning_rate': 2.703447572066927e-07, 'epoch': 0.04}
{'loss': 1.6088, 'grad_norm': 0.9811621904373169, 'learning_rate': 5.417752363298781e-07, 'epoch': 0.08}
{'loss': 1.6068, 'grad_norm': 1.012223720550537, 'learning_rate': 8.132057154530636e-07, 'epoch': 0.12}
{'loss': 1.5915, 'grad_norm': 1.8195114135742188, 'learning_rate': 1.0840933336180026e-06, 'epoch': 0.16}
{'loss': 1.472, 'grad_norm': 4.724691390991211, 'learning_rate': 1.3544380908246952e-06, 'epoch': 0.2}
{'loss': 1.3579, 'grad_norm': 15.955487251281738, 'learning_rate': 1.6258685699478809e-06, 'epoch': 0.25}
{'loss': 1.3038, 'grad_norm': 37.59272384643555, 'learning_rate': 1.8972990490710663e-06, 'epoch': 0.29}
{'loss': 1.2731, 'grad_norm': 18.960540771484375, 'learning_rate': 2.167643806277759e-06, 'epoch': 0.33}
{'loss': 1.2412, 'grad_norm': 6.070656776428223, 'learning_rate': 2.439074285400944e-06, 'epoch': 0.37}
{'loss': 1.2635, 'grad_norm': 17.87575912475586, 'learning

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_accuracy': 0.51212066798348, 'eval_f1': 0.5008762224153099, 'eval_loss': 1.0327595472335815, 'eval_runtime': 129.2988, 'eval_samples_per_second': 43.071, 'eval_steps_per_second': 14.362, 'epoch': 1.0}
{'loss': 1.1207, 'grad_norm': 25.40252113342285, 'learning_rate': 6.780333368497172e-06, 'epoch': 1.02}
{'loss': 1.1143, 'grad_norm': 19.99736213684082, 'learning_rate': 7.051220986662111e-06, 'epoch': 1.06}
{'loss': 1.1162, 'grad_norm': 15.271824836730957, 'learning_rate': 7.322108604827051e-06, 'epoch': 1.11}
{'loss': 1.1192, 'grad_norm': 30.091594696044922, 'learning_rate': 7.593539083950236e-06, 'epoch': 1.15}
{'loss': 1.1263, 'grad_norm': 20.144153594970703, 'learning_rate': 7.864969563073422e-06, 'epoch': 1.19}
{'loss': 1.0935, 'grad_norm': 9.870036125183105, 'learning_rate': 8.136400042196607e-06, 'epoch': 1.23}
{'loss': 1.1304, 'grad_norm': 19.627023696899414, 'learning_rate': 8.407830521319792e-06, 'epoch': 1.27}
{'loss': 1.1352, 'grad_norm': 30.71637535095215, 'learning_r

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_accuracy': 0.5421080984018675, 'eval_f1': 0.5427438398167861, 'eval_loss': 0.9257205128669739, 'eval_runtime': 129.1864, 'eval_samples_per_second': 43.108, 'eval_steps_per_second': 14.375, 'epoch': 2.0}
{'loss': 1.03, 'grad_norm': 6.124541759490967, 'learning_rate': 1.3293036284578884e-05, 'epoch': 2.01}
{'loss': 0.9802, 'grad_norm': 60.2114372253418, 'learning_rate': 1.3563923902743822e-05, 'epoch': 2.05}
{'loss': 0.9531, 'grad_norm': 64.00323486328125, 'learning_rate': 1.3835354381867009e-05, 'epoch': 2.09}
{'loss': 0.9351, 'grad_norm': 37.82403564453125, 'learning_rate': 1.4106242000031947e-05, 'epoch': 2.13}
{'loss': 0.9746, 'grad_norm': 16.772634506225586, 'learning_rate': 1.4377672479155133e-05, 'epoch': 2.17}
{'loss': 0.9465, 'grad_norm': 24.6521053314209, 'learning_rate': 1.4649102958278318e-05, 'epoch': 2.21}
{'loss': 0.9418, 'grad_norm': 69.53672790527344, 'learning_rate': 1.4920533437401504e-05, 'epoch': 2.25}
{'loss': 0.9725, 'grad_norm': 16.6578426361084, 'learning_

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_accuracy': 0.6186029807864967, 'eval_f1': 0.6182180568613557, 'eval_loss': 0.9976822137832642, 'eval_runtime': 129.1709, 'eval_samples_per_second': 43.113, 'eval_steps_per_second': 14.376, 'epoch': 3.0}
{'loss': 0.8005, 'grad_norm': 18.805315017700195, 'learning_rate': 2.007554109690904e-05, 'epoch': 3.03}
{'loss': 0.7618, 'grad_norm': 1.7467933893203735, 'learning_rate': 2.0346971576032227e-05, 'epoch': 3.07}
{'loss': 0.8173, 'grad_norm': 10.063373565673828, 'learning_rate': 2.0618402055155412e-05, 'epoch': 3.11}
{'loss': 0.8088, 'grad_norm': 4.948492050170898, 'learning_rate': 2.08898325342786e-05, 'epoch': 3.15}
{'loss': 0.7888, 'grad_norm': 45.62972640991211, 'learning_rate': 2.1161263013401785e-05, 'epoch': 3.19}
{'loss': 0.8674, 'grad_norm': 31.724933624267578, 'learning_rate': 2.143269349252497e-05, 'epoch': 3.24}
{'loss': 0.7657, 'grad_norm': 33.06605911254883, 'learning_rate': 2.1704123971648154e-05, 'epoch': 3.28}
{'loss': 0.8493, 'grad_norm': 16.989675521850586, 'lear

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_accuracy': 0.6415873585922068, 'eval_f1': 0.6415504437113901, 'eval_loss': 1.0187714099884033, 'eval_runtime': 130.4745, 'eval_samples_per_second': 42.683, 'eval_steps_per_second': 14.233, 'epoch': 4.0}
{'loss': 0.7835, 'grad_norm': 21.102596282958984, 'learning_rate': 2.6588244012990756e-05, 'epoch': 4.01}
{'loss': 0.6664, 'grad_norm': 5.044328212738037, 'learning_rate': 2.685967449211394e-05, 'epoch': 4.05}
{'loss': 0.6521, 'grad_norm': 16.94122314453125, 'learning_rate': 2.7131104971237125e-05, 'epoch': 4.1}
{'loss': 0.7038, 'grad_norm': 37.15906524658203, 'learning_rate': 2.740253545036031e-05, 'epoch': 4.14}
{'loss': 0.75, 'grad_norm': 18.979867935180664, 'learning_rate': 2.7673423068525248e-05, 'epoch': 4.18}
{'loss': 0.7511, 'grad_norm': 28.13322639465332, 'learning_rate': 2.794431068669019e-05, 'epoch': 4.22}
{'loss': 0.7576, 'grad_norm': 11.955854415893555, 'learning_rate': 2.8215741165813374e-05, 'epoch': 4.26}
{'loss': 0.7405, 'grad_norm': 9.219260215759277, 'learning

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_accuracy': 0.6437421440114922, 'eval_f1': 0.643986597412205, 'eval_loss': 1.344173789024353, 'eval_runtime': 130.3053, 'eval_samples_per_second': 42.738, 'eval_steps_per_second': 14.251, 'epoch': 5.0}
{'loss': 0.6788, 'grad_norm': 26.029659271240234, 'learning_rate': 3.337237740819565e-05, 'epoch': 5.04}
{'loss': 0.6854, 'grad_norm': 0.8334842324256897, 'learning_rate': 3.364326502636059e-05, 'epoch': 5.08}
{'loss': 0.6623, 'grad_norm': 29.628421783447266, 'learning_rate': 3.391469550548377e-05, 'epoch': 5.12}
{'loss': 0.6937, 'grad_norm': 42.893489837646484, 'learning_rate': 3.4186125984606964e-05, 'epoch': 5.16}
{'loss': 0.6594, 'grad_norm': 14.304396629333496, 'learning_rate': 3.445755646373015e-05, 'epoch': 5.2}
{'loss': 0.6919, 'grad_norm': 1.6768128871917725, 'learning_rate': 3.472898694285333e-05, 'epoch': 5.24}
{'loss': 0.759, 'grad_norm': 55.0181884765625, 'learning_rate': 3.499987456101827e-05, 'epoch': 5.28}
{'loss': 0.6734, 'grad_norm': 0.09906376153230667, 'learning

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_accuracy': 0.6502065002693482, 'eval_f1': 0.6480668146829563, 'eval_loss': 1.1717666387557983, 'eval_runtime': 130.254, 'eval_samples_per_second': 42.755, 'eval_steps_per_second': 14.257, 'epoch': 6.0}
{'loss': 0.7332, 'grad_norm': 38.68946838378906, 'learning_rate': 3.9883451741402624e-05, 'epoch': 6.02}
{'loss': 0.6181, 'grad_norm': 1.2194770574569702, 'learning_rate': 4.0154882220525815e-05, 'epoch': 6.06}
{'loss': 0.6082, 'grad_norm': 34.59031295776367, 'learning_rate': 4.042631269964899e-05, 'epoch': 6.1}
{'loss': 0.7004, 'grad_norm': 6.616634845733643, 'learning_rate': 4.069720031781393e-05, 'epoch': 6.14}
{'loss': 0.6777, 'grad_norm': 0.04622616246342659, 'learning_rate': 4.096863079693712e-05, 'epoch': 6.18}
{'loss': 0.6969, 'grad_norm': 103.87966918945312, 'learning_rate': 4.12400612760603e-05, 'epoch': 6.22}
{'loss': 0.7291, 'grad_norm': 16.712339401245117, 'learning_rate': 4.151149175518349e-05, 'epoch': 6.27}
{'loss': 0.7262, 'grad_norm': 35.68686294555664, 'learning

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_accuracy': 0.6414077931405997, 'eval_f1': 0.64276980382379, 'eval_loss': 1.5463054180145264, 'eval_runtime': 128.7807, 'eval_samples_per_second': 43.244, 'eval_steps_per_second': 14.42, 'epoch': 7.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 20758.428, 'train_samples_per_second': 52.938, 'train_steps_per_second': 17.646, 'train_loss': 0.9223125685833206, 'epoch': 7.0}


  0%|          | 0/1857 [00:00<?, ?it/s]

[I 2024-10-22 11:10:20,203] Trial 4 finished with value: 0.6502065002693482 and parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.2500228054734358, 'learning_rate': 6.331007353356651e-05, 'batch_size': 3, 'warmup_ratio': 0.31837920111611373, 'weight_decay': 0.09971031186766507, 'adam_beta1': 0.8682929292594384, 'adam_beta2': 0.9909672212836867, 'adam_epsilon': 5.424397228138408e-07, 'lr_scheduler_type': 'linear'}. Best is trial 4 with value: 0.6502065002693482.


Stopping training: No improvement in eval_accuracy for 1 epochs
Current Trial 5 parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.43619344239874647, 'learning_rate': 4.744804985319591e-07, 'batch_size': 3, 'warmup_ratio': 0.6610094216704753, 'weight_decay': 0.214015165889555, 'adam_beta1': 0.8312852854211105, 'adam_beta2': 0.996036397101193, 'adam_epsilon': 5.420035889344036e-08, 'lr_scheduler_type': 'linear'}


Some weights of MPNetForMultipleChoice were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/366300 [00:00<?, ?it/s]

{'loss': 1.6085, 'grad_norm': 1.0757840871810913, 'learning_rate': 9.798133601482668e-10, 'epoch': 0.04}
{'loss': 1.6087, 'grad_norm': 1.0348371267318726, 'learning_rate': 1.957667093576237e-09, 'epoch': 0.08}
{'loss': 1.609, 'grad_norm': 0.9857816696166992, 'learning_rate': 2.9355208270042074e-09, 'epoch': 0.12}
{'loss': 1.609, 'grad_norm': 0.9480171799659729, 'learning_rate': 3.915334187152474e-09, 'epoch': 0.16}
{'loss': 1.6091, 'grad_norm': 0.9768244624137878, 'learning_rate': 4.895147547300741e-09, 'epoch': 0.2}
{'loss': 1.6093, 'grad_norm': 0.982633650302887, 'learning_rate': 5.8749609074490075e-09, 'epoch': 0.25}
{'loss': 1.6088, 'grad_norm': 1.8424819707870483, 'learning_rate': 6.8528146408769786e-09, 'epoch': 0.29}
{'loss': 1.609, 'grad_norm': 0.988273024559021, 'learning_rate': 7.832628001025244e-09, 'epoch': 0.33}
{'loss': 1.6085, 'grad_norm': 1.1059460639953613, 'learning_rate': 8.81244136117351e-09, 'epoch': 0.37}
{'loss': 1.6093, 'grad_norm': 1.0883785486221313, 'learning

  0%|          | 0/1857 [00:00<?, ?it/s]

{'eval_accuracy': 0.31836954569940745, 'eval_f1': 0.32442948394051324, 'eval_loss': 1.607865333557129, 'eval_runtime': 130.1834, 'eval_samples_per_second': 42.778, 'eval_steps_per_second': 14.264, 'epoch': 1.0}
Stopping training: eval_accuracy below manual min_acc of 0.5
{'train_runtime': 2976.1961, 'train_samples_per_second': 369.23, 'train_steps_per_second': 123.077, 'train_loss': 1.6087190195344492, 'epoch': 1.0}


  0%|          | 0/1857 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs
Stopping training: eval_accuracy below manual min_acc of 0.5


[I 2024-10-22 12:02:06,610] Trial 5 finished with value: 0.31836954569940745 and parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.43619344239874647, 'learning_rate': 4.744804985319591e-07, 'batch_size': 3, 'warmup_ratio': 0.6610094216704753, 'weight_decay': 0.214015165889555, 'adam_beta1': 0.8312852854211105, 'adam_beta2': 0.996036397101193, 'adam_epsilon': 5.420035889344036e-08, 'lr_scheduler_type': 'linear'}. Best is trial 4 with value: 0.6502065002693482.


Current Trial 6 parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.23732993865505386, 'learning_rate': 1.1685926112932391e-07, 'batch_size': 4, 'warmup_ratio': 0.8400211050841219, 'weight_decay': 0.05868908832126296, 'adam_beta1': 0.8519566435750197, 'adam_beta2': 0.9913554074386653, 'adam_epsilon': 7.201851832756769e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


Some weights of MPNetForMultipleChoice were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/274740 [00:00<?, ?it/s]

{'loss': 1.6096, 'grad_norm': 0.892684817314148, 'learning_rate': 2.5165542741075785e-10, 'epoch': 0.05}


[W 2024-10-22 12:04:46,078] Trial 6 failed with parameters: {'model_name': 'sentence-transformers/all-mpnet-base-v2', 'dropout_rate': 0.23732993865505386, 'learning_rate': 1.1685926112932391e-07, 'batch_size': 4, 'warmup_ratio': 0.8400211050841219, 'weight_decay': 0.05868908832126296, 'adam_beta1': 0.8519566435750197, 'adam_beta2': 0.9913554074386653, 'adam_epsilon': 7.201851832756769e-07, 'lr_scheduler_type': 'cosine_with_restarts'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\OEM\AppData\Local\Temp\ipykernel_29776\2979044354.py", line 60, in objective
    trainer.train()
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\transformers\trainer.py", line 1938, in train
    return inner_training_loop(
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-p

KeyboardInterrupt: 

## 4.4 Optuna Hyperparameters Tuning 3

# End of NoteBook