# 0. Imports, libraries and rusable functions

In [1]:
from project_imports import *
import use_gpu

Project libraries imported!
GPU: NVIDIA GeForce RTX 4070 Ti SUPER is available.
Device:cuda


# 1. Global Variables

In [2]:
## Arguments and global vriables
dataset_name="ReClor"
pretrained_model_name = "microsoft/deberta-v3-base"
normalized_model_name = pretrained_model_name.replace("/", "-")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
assert isinstance( tokenizer, PreTrainedTokenizerFast )
data_collator = DefaultDataCollator()
max_length = 512 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.
pad_on_right = right_padding = tokenizer.padding_side == 'right'
global_counter = 0
traing_answer_mismatches = []
logger = logging.getLogger(__name__)



# 2. Prepare the AR-LSAT Dataset 

In [3]:
# Load the combined dataset
combined_dataset = load_from_disk('cleaned_dataset')

combined_dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 1072514
    })
    validation: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 118521
    })
    test: Dataset({
        features: ['Context', 'Question', 'Options', 'Label_Text', 'Label', 'Type', 'Source Dataset'],
        num_rows: 200566
    })
})

In [4]:
# Filter the dataset to only include AR-LSAT data
reclor_train = combined_dataset['train'].filter(lambda x: x['Source Dataset'] == 'ReClor')
reclor_val = combined_dataset['validation'].filter(lambda x: x['Source Dataset'] == 'ReClor')
reclor_test = combined_dataset['test'].filter(lambda x: x['Source Dataset'] == 'ReClor')


# Concatenate test data into the training data
reclor_train = concatenate_datasets([reclor_train, reclor_test])

In [5]:
# Preprocessing function for multiple-choice tasks
def mcqa_preprocess_function(examples):
    num_choices = num_choices = len(examples['Options'][0])    
    first_sentences = [[context] * num_choices for context in examples['Context']]  # Repeat context for each option
    question_headers = examples['Question']
    options_list = examples['Options']
    
    second_sentences = []
    for question, options in zip(question_headers, options_list):
        # Combine question with each option
        second_sentences.append([f"{question} {option}" for option in options])
    
    # Flatten the lists
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])
    
    # Tokenize the inputs
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        max_length=512,
        padding='max_length',
    )
    
    # Un-flatten the tokenized inputs to have shape (num_examples, num_choices, seq_length)
    tokenized_inputs = {k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)] for k, v in tokenized_examples.items()}
    
    # Labels
    tokenized_inputs["labels"] = examples["Label"]
    
    return tokenized_inputs


In [6]:
def mcqa_preprocess_function(examples):
    # Determine the maximum number of choices
    max_num_choices = 5  
    contexts = examples['Context']
    questions = examples['Question']
    options_list = examples['Options']
    labels = examples['Label']
    
    first_sentences = []
    second_sentences = []
    labels_adjusted = []
    
    for context, question, options, label in zip(contexts, questions, options_list, labels):
        num_choices = len(options)
        # Pad options to have max_num_choices
        if num_choices < max_num_choices:
            options += [''] * (max_num_choices - num_choices)
        first_sentences.append([context] * max_num_choices)
        second_sentences.append([f"{question} {option}" for option in options])
        labels_adjusted.append(label)
    
    # Flatten the lists
    first_sentences = [item for sublist in first_sentences for item in sublist]
    second_sentences = [item for sublist in second_sentences for item in sublist]
    
    # Tokenize the inputs
    tokenized_examples = tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        max_length=512,
        padding='max_length',
    )
    
    # Un-flatten to shape (num_examples, max_num_choices, seq_length)
    tokenized_inputs = {
        k: [v[i:i + max_num_choices] for i in range(0, len(v), max_num_choices)]
        for k, v in tokenized_examples.items()
    }
    
    # Labels
    tokenized_inputs["labels"] = labels_adjusted
    
    return tokenized_inputs




In [7]:

# Apply the preprocessing function to the combined datasets
encoded_reclor_train = reclor_train.map(mcqa_preprocess_function, batched=True)
encoded_reclor_val = reclor_val.map(mcqa_preprocess_function, batched=True)
encoded_reclor_test = reclor_test.map(mcqa_preprocess_function, batched=True)

In [8]:
# Set the format of the datasets to PyTorch tensors
encoded_reclor_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_reclor_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
encoded_reclor_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


def get_train_encoded():
    return encoded_reclor_train

def get_val_encoded():
    return encoded_reclor_val

def get_test_encoded():
    return encoded_reclor_test


# 3. Reusable Functions

In [9]:
# Load the accuracy metric
accuracy = evaluate.load('accuracy')

# Define the compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    acc = accuracy.compute(predictions=predictions, references=labels)['accuracy']
    f1 = f1_score(labels, predictions, average='weighted')
    return {'eval_accuracy': acc, 'eval_f1': f1}

In [10]:
def create_training_args(run_name="Default-Run", num_train_epochs=3, learning_rate=4.92e-05, batch_size=3):
    """
    Generates training arguments for training a machine learning model.

    Parameters:
    - dataset_name (str): The name of the dataset.
    - run_name (str): The name of the run, useful for logging and saving models.
    - model_name (str): The name of the model, typically including its configuration.
    - num_train_epochs (int): The number of epochs to train for.
    - learning_rate (float): The learning rate for training.
    - batch_size (int): The batch size used for training.

    Returns:
    - TrainingArguments: A configured TrainingArguments instance.
    """    
    output_dir = f"./{dataset_name}/{run_name}/{normalized_model_name}"

    if run_name=="Optuna":
        training_args = TrainingArguments(
        output_dir=output_dir,
        report_to="none",  # Disable all integrations
        overwrite_output_dir=True,
        metric_for_best_model='eval_accuracy',
        greater_is_better=True,
        load_best_model_at_end=True,
        save_total_limit=3,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=num_train_epochs,
        learning_rate=1.5807103066634623e-05,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,        
        warmup_ratio = 0.5994150649377659,
        weight_decay=0.12506835879573128,
        adam_beta1=0.8136227307274486,
        adam_beta2=0.9924116710027883,
        adam_epsilon=1.9858068243318367e-07,
        lr_scheduler_type='cosine_with_restarts',
        fp16=True,  # Enable mixed-precision training
    )
    else:
        training_args = TrainingArguments(
            output_dir=output_dir,
            report_to="none",  # Disable all integrations
            overwrite_output_dir=True,
            metric_for_best_model='eval_accuracy',
            greater_is_better=True,
            load_best_model_at_end=True,
            save_total_limit=3,
            eval_strategy="epoch",
            save_strategy="epoch",
            num_train_epochs=num_train_epochs,
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=4,
            warmup_steps=398,
            weight_decay=0.194,
            adam_beta1=0.837,
            adam_beta2=0.997,
            adam_epsilon=5.87e-07,
            lr_scheduler_type='cosine',
            fp16=True,  # Enable mixed-precision training
        )
    
    return training_args


In [11]:
class AdvancedEarlyStoppingCallback(TrainerCallback):
    """
    A callback to stop training when either the performance falls below a certain threshold
    or if there is no improvement over a set number of epochs.
    """
    def __init__(self, metric_name, patience):
        self.metric_name = metric_name
        self.patience = patience        
        self.best_score = None
        self.no_improve_epochs = 0
        self.config_file = "early_stopping_config.json"  # Config file for early stopping values

    def read_early_stopping_config(self):
        """
        Reads the early stopping configuration from the file system.
        Returns the configuration as a dictionary.
        """
        if os.path.exists(self.config_file):
            with open(self.config_file, 'r') as file:
                config = json.load(file)
            return config
        else:
            raise FileNotFoundError(f"Config file not found: {self.config_file}")
    def reset_manual_stop_flag(self):
        """
        Resets the manual stop flag to False in the early stopping config file.
        """
        config = self.read_early_stopping_config()
        config['manual_stop'] = False
        with open(self.config_file, 'w') as file:
            json.dump(config, file, indent=4)

    def on_evaluate(self, args, state, control, **kwargs):
        metric_value = kwargs['metrics'].get(self.metric_name)

        if self.best_score is None or metric_value > self.best_score:
            self.best_score = metric_value
            self.no_improve_epochs = 0
        else:
            self.no_improve_epochs += 1

        # Check if no improvement has been seen over the allowed patience
        if self.no_improve_epochs >= self.patience:
            control.should_training_stop = True
            print(f"Stopping training: No improvement in {self.metric_name} for {self.patience} epochs")


        # Read the early stopping configuration
        config = self.read_early_stopping_config()
        min_accuracy = config.get("min_accuracy", 0.35)                
        num_epochs_min_acc = config.get("num_epochs_min_acc", 2)  
        max_variance = config.get("max_variance", 0.2)  

        # Check if performance is below the threshold
        if metric_value < min_accuracy:
            control.should_training_stop = True
            print(f"Stopping training: {self.metric_name} below manual min_acc of {min_accuracy}")

         # Manual stop from config
        if config.get("manual_stop", False):
            control.should_training_stop = True
            print(f"Manual early stopping triggered!!")
            self.reset_manual_stop_flag()  # Reset the flag for future runs
            


In [12]:
def model_init(model_name=pretrained_model_name):
    return AutoModelForMultipleChoice.from_pretrained(model_name)

In [13]:
def create_trainer(model_name=pretrained_model_name,run_name="Default-Run", num_train_epochs=3, learning_rate=4.92e-05, batch_size=4):
    trainer = Trainer(
        model=model_init(model_name),
        args=create_training_args(run_name=run_name, num_train_epochs=num_train_epochs, learning_rate=learning_rate, batch_size=batch_size),
        train_dataset=get_train_encoded(),
        eval_dataset=get_val_encoded(),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[AdvancedEarlyStoppingCallback(metric_name='eval_accuracy', patience=1)]
    )
    
    return trainer


In [14]:
# Optuna objective function for hyperparameter tuning
def objective(trial):
    # Clear any cached memory to start fresh for each trial
    torch.cuda.empty_cache()
    gc.collect()

    
    #model_name = trial.suggest_categorical('model_name', [pretrained_model_name, "./squad-trained-model", './MCQA-Combined/Optuna/trial_5/checkpoint-22865', './MCQA-Combined/Optuna/trial_0/checkpoint-30485','./MCQA-Combined/Optuna/trial_6/checkpoint-30485' ]) 
    model_name ="./MCQA-Combined/Optuna/trial_5/checkpoint-22865"
    learning_rate = trial.suggest_float('learning_rate', 1e-7, 1e-4, log=True)
    batch_size = trial.suggest_categorical('batch_size', [3, 4])
    #warmup_steps = trial.suggest_int('warmup_steps', 0, 1000)
    warmup_ratio= trial.suggest_float('warmup_ratio', 0.0, 1.0)
    weight_decay = trial.suggest_float('weight_decay', 0.0, 0.25)
    adam_beta1 = trial.suggest_float('adam_beta1', 0.8, 0.95)
    adam_beta2 = trial.suggest_float('adam_beta2', 0.990, 0.999)
    adam_epsilon = trial.suggest_float('adam_epsilon', 1e-8, 1e-6)
    lr_scheduler_type = trial.suggest_categorical('lr_scheduler_type', ['linear', 'cosine', 'cosine_with_restarts']) #,'constant_with_warmup'   
    

    output_dir = f"./{dataset_name}/Optuna3/trial_{trial.number}"

    training_args = TrainingArguments(
        output_dir=output_dir,
        report_to="none",  # Disable all integrations
        overwrite_output_dir=True,
        metric_for_best_model='eval_accuracy',
        greater_is_better=True,
        load_best_model_at_end=True,
        save_total_limit=3,
        eval_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=30,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        warmup_ratio=warmup_ratio,
        weight_decay=weight_decay,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        lr_scheduler_type=lr_scheduler_type,
        fp16=True,  # Enable mixed-precision training
    ) 
    
    # Print trial parameters
    print(f"Current Trial {trial.number} parameters: {trial.params}")
    
    trainer = Trainer(
        model=model_init(model_name),
        args=training_args,
        train_dataset=get_train_encoded(),
        eval_dataset=get_val_encoded(),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[AdvancedEarlyStoppingCallback(metric_name='eval_accuracy', patience=1)]
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
        
    torch.cuda.empty_cache()  # Clear cache after evaluation
    gc.collect()  # Collect garbage

    return eval_results['eval_accuracy']


# 4. Fine-tuning DeBERTa on MCQA task (AR-LSAT Dataset)

## 4.1 Evaluate Vanilla DeBERTa (Acc= 27.6%)

In [17]:
# Load the model
model = AutoModelForMultipleChoice.from_pretrained(pretrained_model_name)

# Create the Trainer
trainer = create_trainer()

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


Test Results: {'eval_accuracy': 0.276, 'eval_f1': 0.2767257999541722, 'eval_loss': 1.3862577676773071, 'eval_model_preparation_time': 0.002, 'eval_runtime': 15.3247, 'eval_samples_per_second': 32.627, 'eval_steps_per_second': 32.627}


## 4.2 Fine-Tune and Evaluate Vanilla DeBERTa (Acc=38.2%)

In [19]:
# Load the model
model = AutoModelForMultipleChoice.from_pretrained(pretrained_model_name)
# Create the Trainer
trainer = create_trainer()

# Train the model
trainer.train()
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.3868,1.385791,0.342685,0.342227
2,1.391,1.386014,0.296593,0.297131
3,1.3052,1.325571,0.372745,0.370938


Test Results: {'eval_accuracy': 0.382, 'eval_f1': 0.38155725956214587, 'eval_loss': 1.3181953430175781, 'eval_runtime': 14.3702, 'eval_samples_per_second': 34.794, 'eval_steps_per_second': 34.794, 'epoch': 3.0}


## 4.3 Evaluate SQUAD DeBERTa (Acc=25.2%)

In [21]:
path = "./squad-trained-model"
model =  AutoModelForMultipleChoice.from_pretrained(path)
# Create the Trainer
trainer = create_trainer(run_name="Squad-Run")
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at ./squad-trained-model and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Test Results: {'eval_accuracy': 0.252, 'eval_f1': 0.2527663131875577, 'eval_loss': 1.386056661605835, 'eval_model_preparation_time': 0.002, 'eval_runtime': 14.2706, 'eval_samples_per_second': 35.037, 'eval_steps_per_second': 35.037}


## 4.4 Fine-Tune and Evaluate  SQUAD DeBERTa (Acc=46.8%)


In [24]:
path = "./squad-trained-model"
model =  AutoModelForMultipleChoice.from_pretrained(path)
# Create the Trainer
trainer = create_trainer(run_name="Squad-Run")

# Train the model
trainer.train()

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

Some weights of DebertaV2ForMultipleChoice were not initialized from the model checkpoint at ./squad-trained-model and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.3723,1.32034,0.40481,0.404691
2,1.1193,1.094201,0.537074,0.537099
3,0.611,1.609691,0.551102,0.550685


Test Results: {'eval_accuracy': 0.468, 'eval_f1': 0.4681988749254519, 'eval_loss': 1.9973200559616089, 'eval_runtime': 14.2601, 'eval_samples_per_second': 35.063, 'eval_steps_per_second': 35.063, 'epoch': 3.0}


## 4.5 Evaluate Others

In [43]:
path = "./LogiQA/Squad-Run/microsoft-deberta-v3-base/checkpoint-12567"
model =  AutoModelForMultipleChoice.from_pretrained(path)
# Create the Trainer
trainer = create_trainer(run_name="LogiQA-Run")

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print(f"Test Results: {test_results}")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Test Results: {'eval_accuracy': 0.4, 'eval_f1': 0.4006366456726491, 'eval_loss': 1.3058828115463257, 'eval_model_preparation_time': 0.0, 'eval_runtime': 14.525, 'eval_samples_per_second': 34.423, 'eval_steps_per_second': 34.423}


In [10]:
path = "./MCQA-Combined/Squad-Run/microsoft-deberta-v3-base/checkpoint-18291"
model =  AutoModelForMultipleChoice.from_pretrained(path)
# Create the Trainer
trainer = create_trainer(run_name="MCQA-Combined-Run", batch_size=3,  num_train_epochs=1)

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/500 [00:00<?, ?it/s]

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: mzak071 (COMPSCI714). Use `wandb login --relogin` to force relogin


Test Results:
Accuracy: 0.3340
F1 Score: 0.3335


In [14]:
path = "./MCQA-Combined/Squad-Run/microsoft-deberta-v3-base/checkpoint-18291"
model =  AutoModelForMultipleChoice.from_pretrained(path)
# Create the Trainer
trainer = create_trainer(run_name="MCQA-Combined-Run", batch_size=4,  num_train_epochs=6)

# Train the model
trainer.train()

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/6210 [00:00<?, ?it/s]

{'loss': 1.3947, 'grad_norm': 1.515221357345581, 'learning_rate': 4.9164785591849574e-05, 'epoch': 0.48}
{'loss': 1.3877, 'grad_norm': 1.8046776056289673, 'learning_rate': 4.7921767366118935e-05, 'epoch': 0.97}


  0%|          | 0/499 [00:00<?, ?it/s]

{'eval_accuracy': 0.2985971943887776, 'eval_f1': 0.2984327577081756, 'eval_loss': 1.385642409324646, 'eval_runtime': 17.1848, 'eval_samples_per_second': 29.037, 'eval_steps_per_second': 29.037, 'epoch': 1.0}
{'loss': 1.3857, 'grad_norm': 1.6376599073410034, 'learning_rate': 4.4985562567144186e-05, 'epoch': 1.45}
{'loss': 1.3774, 'grad_norm': 1.8705437183380127, 'learning_rate': 4.0569342969768665e-05, 'epoch': 1.93}


  0%|          | 0/499 [00:00<?, ?it/s]

{'eval_accuracy': 0.23446893787575152, 'eval_f1': 0.08906774588137314, 'eval_loss': 1.38671875, 'eval_runtime': 18.0805, 'eval_samples_per_second': 27.599, 'eval_steps_per_second': 27.599, 'epoch': 2.0}
{'loss': 1.385, 'grad_norm': 1.643894076347351, 'learning_rate': 3.4993731089832955e-05, 'epoch': 2.42}
{'loss': 1.3834, 'grad_norm': 1.6159354448318481, 'learning_rate': 2.8663522628474263e-05, 'epoch': 2.9}


  0%|          | 0/499 [00:00<?, ?it/s]

{'eval_accuracy': 0.23446893787575152, 'eval_f1': 0.08906774588137314, 'eval_loss': 1.38671875, 'eval_runtime': 16.8904, 'eval_samples_per_second': 29.543, 'eval_steps_per_second': 29.543, 'epoch': 3.0}
{'loss': 1.3706, 'grad_norm': 2.1004326343536377, 'learning_rate': 2.203829784838659e-05, 'epoch': 3.38}
{'loss': 1.3742, 'grad_norm': 2.7644174098968506, 'learning_rate': 1.5599055529219255e-05, 'epoch': 3.86}


  0%|          | 0/499 [00:00<?, ?it/s]

{'eval_accuracy': 0.23647294589178355, 'eval_f1': 0.10052816315688022, 'eval_loss': 1.3867089748382568, 'eval_runtime': 17.1559, 'eval_samples_per_second': 29.086, 'eval_steps_per_second': 29.086, 'epoch': 4.0}
{'loss': 1.375, 'grad_norm': 1.9737707376480103, 'learning_rate': 9.813291914525339e-06, 'epoch': 4.35}
{'loss': 1.3733, 'grad_norm': 2.138916015625, 'learning_rate': 5.101059958035776e-06, 'epoch': 4.83}


  0%|          | 0/499 [00:00<?, ?it/s]

{'eval_accuracy': 0.23647294589178355, 'eval_f1': 0.18601877501047798, 'eval_loss': 1.3865934610366821, 'eval_runtime': 18.016, 'eval_samples_per_second': 27.698, 'eval_steps_per_second': 27.698, 'epoch': 5.0}
{'loss': 1.3697, 'grad_norm': 2.079101800918579, 'learning_rate': 1.8044730062000698e-06, 'epoch': 5.31}
{'loss': 1.3723, 'grad_norm': 2.223784923553467, 'learning_rate': 1.6286698398285957e-07, 'epoch': 5.8}


  0%|          | 0/499 [00:00<?, ?it/s]

{'eval_accuracy': 0.23246492985971945, 'eval_f1': 0.08845007087345423, 'eval_loss': 1.3867168426513672, 'eval_runtime': 17.7611, 'eval_samples_per_second': 28.095, 'eval_steps_per_second': 28.095, 'epoch': 6.0}
{'train_runtime': 5386.1301, 'train_samples_per_second': 4.61, 'train_steps_per_second': 1.153, 'train_loss': 1.3788943673101601, 'epoch': 6.0}


  0%|          | 0/500 [00:00<?, ?it/s]

Test Results:
Accuracy: 0.3340
F1 Score: 0.3315


# End of NoteBook

## First Tuninig

In [56]:
# Create the Trainer
#trainer = create_trainer(model_name="./MCQA-Combined/Optuna/trial_5/checkpoint-22865", run_name="Optuna")  - Official Score 51.22%
trainer = create_trainer(model_name="./ReClor/Optuna/trial_0/checkpoint-5800",  run_name="Optuna")
# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_val_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x0000027DA2B772B0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 27e37a07850, raw_cell="# Create the Trainer
#trainer = create_trainer(mod.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/c%3A/Users/OEM/Notebooks/COMPSCI764/Project/4.%20ReClor%20Baseline%20Training%20and%20Evaluation.ipynb#X43sZmlsZQ%3D%3D>,),kwargs {}:


ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

  0%|          | 0/125 [00:00<?, ?it/s]

KeyboardInterrupt: 

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x0000027DA2B772B0>> (for post_run_cell), with arguments args (<ExecutionResult object at 27e37a07610, execution_count=56 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 27e37a07850, raw_cell="# Create the Trainer
#trainer = create_trainer(mod.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/c%3A/Users/OEM/Notebooks/COMPSCI764/Project/4.%20ReClor%20Baseline%20Training%20and%20Evaluation.ipynb#X43sZmlsZQ%3D%3D> result=None>,),kwargs {}:


ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

In [19]:
# Create a study object and optimize the objective
study = optuna.create_study(direction='maximize')
#study.enqueue_trial({'model_name': 'microsoft/deberta-v3-base', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'})
study.enqueue_trial({'model_name': './MCQA-Combined/Optuna/trial_5/checkpoint-22865', 'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'})
study.optimize(objective, n_trials=20)


[I 2024-10-18 15:11:55,784] A new study created in memory with name: no-name-ea4bfe48-1cd2-49e9-baaa-1e5c2e900ae2


Current Trial 0 parameters: {'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.2007, 'grad_norm': 3.974223363911733e-05, 'learning_rate': 3.7509664515743714e-07, 'epoch': 0.43}
{'loss': 0.1361, 'grad_norm': 143.74293518066406, 'learning_rate': 7.539821453164645e-07, 'epoch': 0.86}


wandb: 500 encountered ({"errors":[{"message":"context deadline exceeded","path":["project"]}],"data":{"project":null}}), retrying request
wandb: Network error resolved after 0:00:27.526240, resuming normal operation.
wandb: 500 encountered ({"errors":[{"message":"context deadline exceeded","path":["project"]}],"data":{"project":null}}), retrying request
wandb: Network error resolved after 0:01:17.070744, resuming normal operation.
Exception in thread NetStatThr:
Traceback (most recent call last):
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\OEM\anaconda3\envs\compsci714win\lib\site-packages\wandb\sdk\wandb_run.py", line 281

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9398797595190381, 'eval_f1': 0.9399240439612142, 'eval_loss': 0.5564093589782715, 'eval_runtime': 23.9439, 'eval_samples_per_second': 20.84, 'eval_steps_per_second': 5.221, 'epoch': 1.0}
{'loss': 0.1219, 'grad_norm': 0.00015371701738331467, 'learning_rate': 1.132867645475492e-06, 'epoch': 1.29}
{'loss': 0.1184, 'grad_norm': 238.6856231689453, 'learning_rate': 1.5117531456345194e-06, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9378831817562621, 'eval_loss': 0.5600924491882324, 'eval_runtime': 21.802, 'eval_samples_per_second': 22.888, 'eval_steps_per_second': 5.733, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2678.9295, 'train_samples_per_second': 51.939, 'train_steps_per_second': 12.99, 'train_loss': 0.15260590849251582, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-18 15:56:58,228] Trial 0 finished with value: 0.9398797595190381 and parameters: {'learning_rate': 1.5807103066634623e-05, 'batch_size': 4, 'warmup_ratio': 0.5994150649377659, 'weight_decay': 0.12506835879573128, 'adam_beta1': 0.8136227307274486, 'adam_beta2': 0.9924116710027883, 'adam_epsilon': 1.9858068243318367e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.9398797595190381.


Current Trial 1 parameters: {'learning_rate': 2.081866957632205e-07, 'batch_size': 3, 'warmup_ratio': 0.6006070336374876, 'weight_decay': 0.0586348889480966, 'adam_beta1': 0.9050607005490999, 'adam_beta2': 0.9965461630006571, 'adam_epsilon': 1.4185782768203173e-07, 'lr_scheduler_type': 'linear'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/46380 [00:00<?, ?it/s]

{'loss': 0.2161, 'grad_norm': 138.05088806152344, 'learning_rate': 3.699336411056257e-09, 'epoch': 0.32}
{'loss': 0.186, 'grad_norm': 24.56732177734375, 'learning_rate': 7.428566449676605e-09, 'epoch': 0.65}
{'loss': 0.1055, 'grad_norm': 0.0011058965465053916, 'learning_rate': 1.1165269895187975e-08, 'epoch': 0.97}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.5544729828834534, 'eval_runtime': 17.9234, 'eval_samples_per_second': 27.841, 'eval_steps_per_second': 9.317, 'epoch': 1.0}
{'loss': 0.1216, 'grad_norm': 0.11031674593687057, 'learning_rate': 1.4901973340699347e-08, 'epoch': 1.29}
{'loss': 0.1579, 'grad_norm': 0.01069814432412386, 'learning_rate': 1.8638676786210716e-08, 'epoch': 1.62}
{'loss': 0.191, 'grad_norm': 0.003246413776651025, 'learning_rate': 2.2375380231722088e-08, 'epoch': 1.94}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.552856981754303, 'eval_runtime': 17.9306, 'eval_samples_per_second': 27.83, 'eval_steps_per_second': 9.314, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 1178.3674, 'train_samples_per_second': 118.079, 'train_steps_per_second': 39.36, 'train_loss': 0.1623492521579639, 'epoch': 2.0}


  0%|          | 0/167 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-18 16:16:55,579] Trial 1 finished with value: 0.935871743486974 and parameters: {'learning_rate': 2.081866957632205e-07, 'batch_size': 3, 'warmup_ratio': 0.6006070336374876, 'weight_decay': 0.0586348889480966, 'adam_beta1': 0.9050607005490999, 'adam_beta2': 0.9965461630006571, 'adam_epsilon': 1.4185782768203173e-07, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.9398797595190381.


Current Trial 2 parameters: {'learning_rate': 4.209283208471863e-07, 'batch_size': 3, 'warmup_ratio': 0.37357937635019467, 'weight_decay': 0.09502589043491191, 'adam_beta1': 0.8596182665473875, 'adam_beta2': 0.9962880203824281, 'adam_epsilon': 6.970115945418126e-07, 'lr_scheduler_type': 'cosine'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/46380 [00:00<?, ?it/s]

{'loss': 0.2162, 'grad_norm': 136.7920379638672, 'learning_rate': 1.2025135269773026e-08, 'epoch': 0.32}
{'loss': 0.1858, 'grad_norm': 23.006372451782227, 'learning_rate': 2.4147443349806846e-08, 'epoch': 0.65}
{'loss': 0.105, 'grad_norm': 0.0012465206673368812, 'learning_rate': 3.6294044632405864e-08, 'epoch': 0.97}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.5590081214904785, 'eval_runtime': 17.9605, 'eval_samples_per_second': 27.783, 'eval_steps_per_second': 9.298, 'epoch': 1.0}
{'loss': 0.1199, 'grad_norm': 0.0857333168387413, 'learning_rate': 4.844064591500488e-08, 'epoch': 1.29}
{'loss': 0.1567, 'grad_norm': 0.010123919695615768, 'learning_rate': 6.058724719760389e-08, 'epoch': 1.62}
{'loss': 0.1892, 'grad_norm': 0.004119736608117819, 'learning_rate': 7.273384848020291e-08, 'epoch': 1.94}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.5503807067871094, 'eval_runtime': 17.9472, 'eval_samples_per_second': 27.804, 'eval_steps_per_second': 9.305, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 1169.6738, 'train_samples_per_second': 118.956, 'train_steps_per_second': 39.652, 'train_loss': 0.16142211663615041, 'epoch': 2.0}


  0%|          | 0/167 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-18 16:36:44,213] Trial 2 finished with value: 0.935871743486974 and parameters: {'learning_rate': 4.209283208471863e-07, 'batch_size': 3, 'warmup_ratio': 0.37357937635019467, 'weight_decay': 0.09502589043491191, 'adam_beta1': 0.8596182665473875, 'adam_beta2': 0.9962880203824281, 'adam_epsilon': 6.970115945418126e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.9398797595190381.


Current Trial 3 parameters: {'learning_rate': 2.4320759012786312e-05, 'batch_size': 4, 'warmup_ratio': 0.43428494165126297, 'weight_decay': 0.17792509942077755, 'adam_beta1': 0.9072495493860744, 'adam_beta2': 0.9955090465942885, 'adam_epsilon': 8.072511769911557e-07, 'lr_scheduler_type': 'cosine'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.1993, 'grad_norm': 9.122314804699272e-05, 'learning_rate': 7.965314087157089e-07, 'epoch': 0.43}
{'loss': 0.1364, 'grad_norm': 143.51292419433594, 'learning_rate': 1.6011085892366268e-06, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9398797595190381, 'eval_f1': 0.9398895060983918, 'eval_loss': 0.5555239915847778, 'eval_runtime': 21.9241, 'eval_samples_per_second': 22.76, 'eval_steps_per_second': 5.701, 'epoch': 1.0}
{'loss': 0.1119, 'grad_norm': 2.6215633624815382e-05, 'learning_rate': 2.4056857697575452e-06, 'epoch': 1.29}
{'loss': 0.1145, 'grad_norm': 245.3953857421875, 'learning_rate': 3.2102629502784634e-06, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379038745902142, 'eval_loss': 0.5454299449920654, 'eval_runtime': 21.6573, 'eval_samples_per_second': 23.041, 'eval_steps_per_second': 5.772, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2297.0015, 'train_samples_per_second': 60.575, 'train_steps_per_second': 15.15, 'train_loss': 0.14942096348466544, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-18 17:15:23,910] Trial 3 finished with value: 0.9398797595190381 and parameters: {'learning_rate': 2.4320759012786312e-05, 'batch_size': 4, 'warmup_ratio': 0.43428494165126297, 'weight_decay': 0.17792509942077755, 'adam_beta1': 0.9072495493860744, 'adam_beta2': 0.9955090465942885, 'adam_epsilon': 8.072511769911557e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 0 with value: 0.9398797595190381.


Current Trial 4 parameters: {'learning_rate': 1.8780148645548947e-06, 'batch_size': 3, 'warmup_ratio': 0.11853606348503187, 'weight_decay': 0.12637484667500865, 'adam_beta1': 0.9354524312732975, 'adam_beta2': 0.9958991004371357, 'adam_epsilon': 3.051130836550833e-07, 'lr_scheduler_type': 'linear'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/46380 [00:00<?, ?it/s]

{'loss': 0.2156, 'grad_norm': 122.10905456542969, 'learning_rate': 1.6908282247265783e-07, 'epoch': 0.32}
{'loss': 0.1832, 'grad_norm': 10.739239692687988, 'learning_rate': 3.398735522430193e-07, 'epoch': 0.65}
{'loss': 0.1, 'grad_norm': 0.000938432349357754, 'learning_rate': 5.106642820133808e-07, 'epoch': 0.97}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379182832541295, 'eval_loss': 0.5556657910346985, 'eval_runtime': 17.8792, 'eval_samples_per_second': 27.91, 'eval_steps_per_second': 9.34, 'epoch': 1.0}
{'loss': 0.1, 'grad_norm': 0.006034303456544876, 'learning_rate': 6.814550117837422e-07, 'epoch': 1.29}
{'loss': 0.1449, 'grad_norm': 0.01698395237326622, 'learning_rate': 8.51904160094563e-07, 'epoch': 1.62}
{'loss': 0.18, 'grad_norm': 0.04308560490608215, 'learning_rate': 1.0226948898649244e-06, 'epoch': 1.94}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.9398797595190381, 'eval_f1': 0.9398730517318628, 'eval_loss': 0.5076422691345215, 'eval_runtime': 17.8638, 'eval_samples_per_second': 27.934, 'eval_steps_per_second': 9.349, 'epoch': 2.0}
{'loss': 0.0979, 'grad_norm': 0.0036783951800316572, 'learning_rate': 1.1934856196352858e-06, 'epoch': 2.26}
{'loss': 0.1148, 'grad_norm': 16.524320602416992, 'learning_rate': 1.3642763494056473e-06, 'epoch': 2.59}
{'loss': 0.155, 'grad_norm': 0.03502001240849495, 'learning_rate': 1.5343839162569273e-06, 'epoch': 2.91}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379318191574723, 'eval_loss': 0.5781373381614685, 'eval_runtime': 17.8984, 'eval_samples_per_second': 27.88, 'eval_steps_per_second': 9.33, 'epoch': 3.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 1757.4692, 'train_samples_per_second': 79.171, 'train_steps_per_second': 26.39, 'train_loss': 0.14347673171858893, 'epoch': 3.0}


  0%|          | 0/167 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-18 17:45:00,363] Trial 4 finished with value: 0.9398797595190381 and parameters: {'learning_rate': 1.8780148645548947e-06, 'batch_size': 3, 'warmup_ratio': 0.11853606348503187, 'weight_decay': 0.12637484667500865, 'adam_beta1': 0.9354524312732975, 'adam_beta2': 0.9958991004371357, 'adam_epsilon': 3.051130836550833e-07, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.9398797595190381.


Current Trial 5 parameters: {'learning_rate': 5.461764668132139e-05, 'batch_size': 4, 'warmup_ratio': 0.29748784861167366, 'weight_decay': 0.06446170389576267, 'adam_beta1': 0.9047555603231419, 'adam_beta2': 0.9929639240746806, 'adam_epsilon': 4.0218893028568194e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.1958, 'grad_norm': 0.00024149479577317834, 'learning_rate': 2.6113913944995737e-06, 'epoch': 0.43}
{'loss': 0.1548, 'grad_norm': 142.24588012695312, 'learning_rate': 5.249160479852679e-06, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9338677354709419, 'eval_f1': 0.9338928618680636, 'eval_loss': 0.5474833846092224, 'eval_runtime': 21.8632, 'eval_samples_per_second': 22.824, 'eval_steps_per_second': 5.717, 'epoch': 1.0}
{'loss': 0.1198, 'grad_norm': 0.043761178851127625, 'learning_rate': 7.886929565205784e-06, 'epoch': 1.29}
{'loss': 0.1582, 'grad_norm': 185.4772491455078, 'learning_rate': 1.0524698650558889e-05, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9078156312625251, 'eval_f1': 0.9078627685847167, 'eval_loss': 0.6732323169708252, 'eval_runtime': 21.4745, 'eval_samples_per_second': 23.237, 'eval_steps_per_second': 5.821, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2293.4304, 'train_samples_per_second': 60.669, 'train_steps_per_second': 15.174, 'train_loss': 0.17046518161379057, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-18 18:23:36,457] Trial 5 finished with value: 0.9338677354709419 and parameters: {'learning_rate': 5.461764668132139e-05, 'batch_size': 4, 'warmup_ratio': 0.29748784861167366, 'weight_decay': 0.06446170389576267, 'adam_beta1': 0.9047555603231419, 'adam_beta2': 0.9929639240746806, 'adam_epsilon': 4.0218893028568194e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.9398797595190381.


Current Trial 6 parameters: {'learning_rate': 1.9517139030724175e-07, 'batch_size': 4, 'warmup_ratio': 0.22736486112025234, 'weight_decay': 0.08517600500154998, 'adam_beta1': 0.838934795685261, 'adam_beta2': 0.9913452942598424, 'adam_epsilon': 6.029933770180071e-07, 'lr_scheduler_type': 'linear'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.2021, 'grad_norm': 2.802017297653947e-05, 'learning_rate': 1.2209002679399048e-08, 'epoch': 0.43}
{'loss': 0.1405, 'grad_norm': 146.70745849609375, 'learning_rate': 2.4541328618185966e-08, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.5523994565010071, 'eval_runtime': 73.9722, 'eval_samples_per_second': 6.746, 'eval_steps_per_second': 1.69, 'epoch': 1.0}
{'loss': 0.1509, 'grad_norm': 0.00556301511824131, 'learning_rate': 3.687365455697288e-08, 'epoch': 1.29}
{'loss': 0.1296, 'grad_norm': 233.63095092773438, 'learning_rate': 4.92059804957598e-08, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.5581141710281372, 'eval_runtime': 101.4891, 'eval_samples_per_second': 4.917, 'eval_steps_per_second': 1.232, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 19477.8021, 'train_samples_per_second': 7.144, 'train_steps_per_second': 1.787, 'train_loss': 0.16396001947337183, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-18 23:49:57,732] Trial 6 finished with value: 0.935871743486974 and parameters: {'learning_rate': 1.9517139030724175e-07, 'batch_size': 4, 'warmup_ratio': 0.22736486112025234, 'weight_decay': 0.08517600500154998, 'adam_beta1': 0.838934795685261, 'adam_beta2': 0.9913452942598424, 'adam_epsilon': 6.029933770180071e-07, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.9398797595190381.


Current Trial 7 parameters: {'learning_rate': 5.3331637660761855e-05, 'batch_size': 4, 'warmup_ratio': 0.5430317046572521, 'weight_decay': 0.19083919012158201, 'adam_beta1': 0.8101433445091133, 'adam_beta2': 0.9967761152201805, 'adam_epsilon': 8.263355819356603e-08, 'lr_scheduler_type': 'cosine_with_restarts'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.1975, 'grad_norm': 0.00017245460185222328, 'learning_rate': 1.396928809507732e-06, 'epoch': 0.43}
{'loss': 0.1424, 'grad_norm': 139.92591857910156, 'learning_rate': 2.8079680110306937e-06, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359036696491496, 'eval_loss': 0.5496513247489929, 'eval_runtime': 21.7858, 'eval_samples_per_second': 22.905, 'eval_steps_per_second': 5.738, 'epoch': 1.0}
{'loss': 0.111, 'grad_norm': 2.511823367967736e-05, 'learning_rate': 4.219007212553655e-06, 'epoch': 1.29}
{'loss': 0.1223, 'grad_norm': 228.8831024169922, 'learning_rate': 5.630046414076617e-06, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9318637274549099, 'eval_f1': 0.9319122892663356, 'eval_loss': 0.5565008521080017, 'eval_runtime': 21.7858, 'eval_samples_per_second': 22.905, 'eval_steps_per_second': 5.738, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2268.8974, 'train_samples_per_second': 61.325, 'train_steps_per_second': 15.338, 'train_loss': 0.15282705076809588, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 00:28:10,424] Trial 7 finished with value: 0.935871743486974 and parameters: {'learning_rate': 5.3331637660761855e-05, 'batch_size': 4, 'warmup_ratio': 0.5430317046572521, 'weight_decay': 0.19083919012158201, 'adam_beta1': 0.8101433445091133, 'adam_beta2': 0.9967761152201805, 'adam_epsilon': 8.263355819356603e-08, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.9398797595190381.


Current Trial 8 parameters: {'learning_rate': 6.290870893003492e-06, 'batch_size': 3, 'warmup_ratio': 0.8640569450830029, 'weight_decay': 0.0358800403430124, 'adam_beta1': 0.8889996603067263, 'adam_beta2': 0.992656822089469, 'adam_epsilon': 2.748398030680172e-07, 'lr_scheduler_type': 'linear'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/46380 [00:00<?, ?it/s]

{'loss': 0.2159, 'grad_norm': 130.80955505371094, 'learning_rate': 7.770383261476553e-08, 'epoch': 0.32}
{'loss': 0.1847, 'grad_norm': 15.719806671142578, 'learning_rate': 1.5619255242766002e-07, 'epoch': 0.65}
{'loss': 0.1022, 'grad_norm': 0.001390735269524157, 'learning_rate': 2.346812722405545e-07, 'epoch': 0.97}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379182832541295, 'eval_loss': 0.5640942454338074, 'eval_runtime': 18.1324, 'eval_samples_per_second': 27.52, 'eval_steps_per_second': 9.21, 'epoch': 1.0}
{'loss': 0.1096, 'grad_norm': 0.026386797428131104, 'learning_rate': 3.1316999205344896e-07, 'epoch': 1.29}
{'loss': 0.1506, 'grad_norm': 0.010197930037975311, 'learning_rate': 3.915017344267176e-07, 'epoch': 1.62}
{'loss': 0.1834, 'grad_norm': 0.01581771858036518, 'learning_rate': 4.6999045423961206e-07, 'epoch': 1.94}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9378944109059071, 'eval_loss': 0.5267962217330933, 'eval_runtime': 17.8775, 'eval_samples_per_second': 27.912, 'eval_steps_per_second': 9.341, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 1189.7615, 'train_samples_per_second': 116.948, 'train_steps_per_second': 38.983, 'train_loss': 0.15692259299955577, 'epoch': 2.0}


  0%|          | 0/167 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 00:48:19,257] Trial 8 finished with value: 0.9378757515030061 and parameters: {'learning_rate': 6.290870893003492e-06, 'batch_size': 3, 'warmup_ratio': 0.8640569450830029, 'weight_decay': 0.0358800403430124, 'adam_beta1': 0.8889996603067263, 'adam_beta2': 0.992656822089469, 'adam_epsilon': 2.748398030680172e-07, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.9398797595190381.


Current Trial 9 parameters: {'learning_rate': 1.332652302207946e-05, 'batch_size': 4, 'warmup_ratio': 0.3216606611220145, 'weight_decay': 0.0027023555623991724, 'adam_beta1': 0.864991780060698, 'adam_beta2': 0.9971022350324645, 'adam_epsilon': 4.665844108797715e-07, 'lr_scheduler_type': 'linear'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.2, 'grad_norm': 5.4490072216140106e-05, 'learning_rate': 5.89300419504139e-07, 'epoch': 0.43}
{'loss': 0.1359, 'grad_norm': 143.32614135742188, 'learning_rate': 1.184553368498219e-06, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9418837675350702, 'eval_f1': 0.9419072450828159, 'eval_loss': 0.5560993552207947, 'eval_runtime': 20.3371, 'eval_samples_per_second': 24.536, 'eval_steps_per_second': 6.146, 'epoch': 1.0}
{'loss': 0.1156, 'grad_norm': 5.551341746468097e-05, 'learning_rate': 1.779806317492299e-06, 'epoch': 1.29}
{'loss': 0.1151, 'grad_norm': 238.62818908691406, 'learning_rate': 2.3750592664863788e-06, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379038745902142, 'eval_loss': 0.5519194006919861, 'eval_runtime': 20.0127, 'eval_samples_per_second': 24.934, 'eval_steps_per_second': 6.246, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2097.3278, 'train_samples_per_second': 66.342, 'train_steps_per_second': 16.593, 'train_loss': 0.15026391950146906, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 01:23:38,081] Trial 9 finished with value: 0.9418837675350702 and parameters: {'learning_rate': 1.332652302207946e-05, 'batch_size': 4, 'warmup_ratio': 0.3216606611220145, 'weight_decay': 0.0027023555623991724, 'adam_beta1': 0.864991780060698, 'adam_beta2': 0.9971022350324645, 'adam_epsilon': 4.665844108797715e-07, 'lr_scheduler_type': 'linear'}. Best is trial 9 with value: 0.9418837675350702.


Current Trial 10 parameters: {'learning_rate': 1.740142214931175e-06, 'batch_size': 4, 'warmup_ratio': 0.02424349734159209, 'weight_decay': 0.24476976357878066, 'adam_beta1': 0.8619381387791497, 'adam_beta2': 0.9989940250760806, 'adam_epsilon': 4.978966294246238e-07, 'lr_scheduler_type': 'linear'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.1985, 'grad_norm': 0.00012113740376662463, 'learning_rate': 1.0205810383778811e-06, 'epoch': 0.43}
{'loss': 0.1383, 'grad_norm': 142.3710479736328, 'learning_rate': 1.7324039220093171e-06, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9398797595190381, 'eval_f1': 0.9398895060983918, 'eval_loss': 0.5556939244270325, 'eval_runtime': 19.5016, 'eval_samples_per_second': 25.588, 'eval_steps_per_second': 6.41, 'epoch': 1.0}
{'loss': 0.1081, 'grad_norm': 5.780799256172031e-05, 'learning_rate': 1.7067804355130989e-06, 'epoch': 1.29}
{'loss': 0.1028, 'grad_norm': 238.42990112304688, 'learning_rate': 1.6811569490168804e-06, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9358642859453613, 'eval_loss': 0.5408743023872375, 'eval_runtime': 20.4165, 'eval_samples_per_second': 24.441, 'eval_steps_per_second': 6.122, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2280.1806, 'train_samples_per_second': 61.021, 'train_steps_per_second': 15.262, 'train_loss': 0.1441633290257947, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 02:01:59,587] Trial 10 finished with value: 0.9398797595190381 and parameters: {'learning_rate': 1.740142214931175e-06, 'batch_size': 4, 'warmup_ratio': 0.02424349734159209, 'weight_decay': 0.24476976357878066, 'adam_beta1': 0.8619381387791497, 'adam_beta2': 0.9989940250760806, 'adam_epsilon': 4.978966294246238e-07, 'lr_scheduler_type': 'linear'}. Best is trial 9 with value: 0.9418837675350702.


Current Trial 11 parameters: {'learning_rate': 1.106211044117024e-05, 'batch_size': 4, 'warmup_ratio': 0.7432294920061084, 'weight_decay': 0.002607992266424136, 'adam_beta1': 0.8078751716319704, 'adam_beta2': 0.9901704676507107, 'adam_epsilon': 9.809578420447187e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.2014, 'grad_norm': 4.127285865251906e-05, 'learning_rate': 2.1170480063325998e-07, 'epoch': 0.43}
{'loss': 0.1372, 'grad_norm': 144.4433135986328, 'learning_rate': 4.2554803359614884e-07, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379182832541295, 'eval_loss': 0.5573342442512512, 'eval_runtime': 20.3195, 'eval_samples_per_second': 24.558, 'eval_steps_per_second': 6.152, 'epoch': 1.0}
{'loss': 0.13, 'grad_norm': 0.0004075548204127699, 'learning_rate': 6.393912665590377e-07, 'epoch': 1.29}
{'loss': 0.1199, 'grad_norm': 233.52256774902344, 'learning_rate': 8.532344995219265e-07, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.935875288174291, 'eval_loss': 0.5552846193313599, 'eval_runtime': 20.4634, 'eval_samples_per_second': 24.385, 'eval_steps_per_second': 6.108, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2095.8091, 'train_samples_per_second': 66.39, 'train_steps_per_second': 16.605, 'train_loss': 0.15540576638846562, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 02:37:16,839] Trial 11 finished with value: 0.9378757515030061 and parameters: {'learning_rate': 1.106211044117024e-05, 'batch_size': 4, 'warmup_ratio': 0.7432294920061084, 'weight_decay': 0.002607992266424136, 'adam_beta1': 0.8078751716319704, 'adam_beta2': 0.9901704676507107, 'adam_epsilon': 9.809578420447187e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 9 with value: 0.9418837675350702.


Current Trial 12 parameters: {'learning_rate': 1.3045906121744648e-05, 'batch_size': 4, 'warmup_ratio': 0.5949894141215617, 'weight_decay': 0.14050806151632145, 'adam_beta1': 0.8346666433206839, 'adam_beta2': 0.9939513160406481, 'adam_epsilon': 2.653443903719701e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.2011, 'grad_norm': 4.0176513721235096e-05, 'learning_rate': 3.1187692119499663e-07, 'epoch': 0.43}
{'loss': 0.1363, 'grad_norm': 143.80104064941406, 'learning_rate': 6.26904114321256e-07, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9398797595190381, 'eval_f1': 0.9399240439612142, 'eval_loss': 0.5572511553764343, 'eval_runtime': 19.6393, 'eval_samples_per_second': 25.408, 'eval_steps_per_second': 6.365, 'epoch': 1.0}
{'loss': 0.1244, 'grad_norm': 0.00022325461031869054, 'learning_rate': 9.419313074475151e-07, 'epoch': 1.29}
{'loss': 0.1186, 'grad_norm': 237.1048126220703, 'learning_rate': 1.2569585005737744e-06, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.935875288174291, 'eval_loss': 0.5554161071777344, 'eval_runtime': 19.5872, 'eval_samples_per_second': 25.476, 'eval_steps_per_second': 6.382, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2268.8914, 'train_samples_per_second': 61.325, 'train_steps_per_second': 15.338, 'train_loss': 0.15333370340281519, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 03:15:26,320] Trial 12 finished with value: 0.9398797595190381 and parameters: {'learning_rate': 1.3045906121744648e-05, 'batch_size': 4, 'warmup_ratio': 0.5949894141215617, 'weight_decay': 0.14050806151632145, 'adam_beta1': 0.8346666433206839, 'adam_beta2': 0.9939513160406481, 'adam_epsilon': 2.653443903719701e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 9 with value: 0.9418837675350702.


Current Trial 13 parameters: {'learning_rate': 5.1300745373433385e-06, 'batch_size': 4, 'warmup_ratio': 0.985166175210749, 'weight_decay': 0.0016266145488112876, 'adam_beta1': 0.8338783120227198, 'adam_beta2': 0.9985120969837219, 'adam_epsilon': 4.7414189394788616e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.2019, 'grad_norm': 2.7512667656992562e-05, 'learning_rate': 7.406915459062398e-08, 'epoch': 0.43}
{'loss': 0.1389, 'grad_norm': 145.76512145996094, 'learning_rate': 1.4888648245994114e-07, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.5545303225517273, 'eval_runtime': 21.2693, 'eval_samples_per_second': 23.461, 'eval_steps_per_second': 5.877, 'epoch': 1.0}
{'loss': 0.1416, 'grad_norm': 0.0016357159474864602, 'learning_rate': 2.2370381032925828e-07, 'epoch': 1.29}
{'loss': 0.1239, 'grad_norm': 227.5735626220703, 'learning_rate': 2.9852113819857544e-07, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379182832541295, 'eval_loss': 0.554935872554779, 'eval_runtime': 21.3722, 'eval_samples_per_second': 23.348, 'eval_steps_per_second': 5.849, 'epoch': 2.0}
{'loss': 0.1614, 'grad_norm': 1.0587824306185212e-07, 'learning_rate': 3.7333846606789263e-07, 'epoch': 2.16}
{'loss': 0.1203, 'grad_norm': 0.9098794460296631, 'learning_rate': 4.480061592814711e-07, 'epoch': 2.59}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9398797595190381, 'eval_f1': 0.9399240439612142, 'eval_loss': 0.5736071467399597, 'eval_runtime': 21.4387, 'eval_samples_per_second': 23.276, 'eval_steps_per_second': 5.831, 'epoch': 3.0}
{'loss': 0.1378, 'grad_norm': 2.27242112159729, 'learning_rate': 5.228234871507882e-07, 'epoch': 3.02}
{'loss': 0.0998, 'grad_norm': 0.011053909547626972, 'learning_rate': 5.976408150201054e-07, 'epoch': 3.45}
{'loss': 0.1407, 'grad_norm': 0.00028350844513624907, 'learning_rate': 6.724581428894226e-07, 'epoch': 3.88}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379182832541295, 'eval_loss': 0.573811948299408, 'eval_runtime': 21.3239, 'eval_samples_per_second': 23.401, 'eval_steps_per_second': 5.862, 'epoch': 4.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 4065.8596, 'train_samples_per_second': 34.222, 'train_steps_per_second': 8.559, 'train_loss': 0.13852832831185438, 'epoch': 4.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 04:23:34,427] Trial 13 finished with value: 0.9398797595190381 and parameters: {'learning_rate': 5.1300745373433385e-06, 'batch_size': 4, 'warmup_ratio': 0.985166175210749, 'weight_decay': 0.0016266145488112876, 'adam_beta1': 0.8338783120227198, 'adam_beta2': 0.9985120969837219, 'adam_epsilon': 4.7414189394788616e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 9 with value: 0.9418837675350702.


Current Trial 14 parameters: {'learning_rate': 2.2721897374500115e-05, 'batch_size': 4, 'warmup_ratio': 0.6768089280531311, 'weight_decay': 0.15917904097684543, 'adam_beta1': 0.9451949608014663, 'adam_beta2': 0.9941893702334875, 'adam_epsilon': 5.1208282088829624e-08, 'lr_scheduler_type': 'cosine'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.2002, 'grad_norm': 3.9290651329793036e-05, 'learning_rate': 4.775331889940797e-07, 'epoch': 0.43}
{'loss': 0.1348, 'grad_norm': 144.19581604003906, 'learning_rate': 9.59889945553756e-07, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9398797595190381, 'eval_f1': 0.9399240439612142, 'eval_loss': 0.560529351234436, 'eval_runtime': 20.8727, 'eval_samples_per_second': 23.907, 'eval_steps_per_second': 5.989, 'epoch': 1.0}
{'loss': 0.1181, 'grad_norm': 5.4279986215988174e-05, 'learning_rate': 1.4422467021134325e-06, 'epoch': 1.29}
{'loss': 0.1166, 'grad_norm': 241.52877807617188, 'learning_rate': 1.9246034586731085e-06, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.935875288174291, 'eval_loss': 0.5211436152458191, 'eval_runtime': 20.8974, 'eval_samples_per_second': 23.879, 'eval_steps_per_second': 5.982, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2133.5193, 'train_samples_per_second': 65.216, 'train_steps_per_second': 16.311, 'train_loss': 0.15080753030448124, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 04:59:29,714] Trial 14 finished with value: 0.9398797595190381 and parameters: {'learning_rate': 2.2721897374500115e-05, 'batch_size': 4, 'warmup_ratio': 0.6768089280531311, 'weight_decay': 0.15917904097684543, 'adam_beta1': 0.9451949608014663, 'adam_beta2': 0.9941893702334875, 'adam_epsilon': 5.1208282088829624e-08, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 0.9418837675350702.


Current Trial 15 parameters: {'learning_rate': 9.53255349064821e-05, 'batch_size': 4, 'warmup_ratio': 0.46745217436286496, 'weight_decay': 0.22363355062361143, 'adam_beta1': 0.8715256999111988, 'adam_beta2': 0.9979493417368412, 'adam_epsilon': 1.7114394380042275e-07, 'lr_scheduler_type': 'linear'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.1956, 'grad_norm': 0.0002564020687714219, 'learning_rate': 2.9005495315163905e-06, 'epoch': 0.43}
{'loss': 0.1597, 'grad_norm': 142.0081787109375, 'learning_rate': 5.8303975431491085e-06, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9298597194388778, 'eval_f1': 0.9299122950475985, 'eval_loss': 0.5481395125389099, 'eval_runtime': 20.8713, 'eval_samples_per_second': 23.908, 'eval_steps_per_second': 5.989, 'epoch': 1.0}
{'loss': 0.1217, 'grad_norm': 0.07999307662248611, 'learning_rate': 8.760245554781826e-06, 'epoch': 1.29}
{'loss': 0.1681, 'grad_norm': 217.4483642578125, 'learning_rate': 1.1690093566414543e-05, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9118236472945892, 'eval_f1': 0.911957823690246, 'eval_loss': 0.8085912466049194, 'eval_runtime': 20.8011, 'eval_samples_per_second': 23.989, 'eval_steps_per_second': 6.009, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2134.1065, 'train_samples_per_second': 65.198, 'train_steps_per_second': 16.307, 'train_loss': 0.17650205513526654, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 05:35:25,384] Trial 15 finished with value: 0.9298597194388778 and parameters: {'learning_rate': 9.53255349064821e-05, 'batch_size': 4, 'warmup_ratio': 0.46745217436286496, 'weight_decay': 0.22363355062361143, 'adam_beta1': 0.8715256999111988, 'adam_beta2': 0.9979493417368412, 'adam_epsilon': 1.7114394380042275e-07, 'lr_scheduler_type': 'linear'}. Best is trial 9 with value: 0.9418837675350702.


Current Trial 16 parameters: {'learning_rate': 1.1390437132148595e-06, 'batch_size': 4, 'warmup_ratio': 0.22472734071311423, 'weight_decay': 0.10563337518542765, 'adam_beta1': 0.8002699227072925, 'adam_beta2': 0.9920616635528605, 'adam_epsilon': 3.5758212131124344e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.2019, 'grad_norm': 2.765080535027664e-05, 'learning_rate': 7.209137425410504e-08, 'epoch': 0.43}
{'loss': 0.1391, 'grad_norm': 145.7869873046875, 'learning_rate': 1.4491094420774648e-07, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.5550214052200317, 'eval_runtime': 20.6904, 'eval_samples_per_second': 24.117, 'eval_steps_per_second': 6.041, 'epoch': 1.0}
{'loss': 0.1421, 'grad_norm': 0.001591749140061438, 'learning_rate': 2.1773051416138796e-07, 'epoch': 1.29}
{'loss': 0.1243, 'grad_norm': 228.28860473632812, 'learning_rate': 2.905500841150294e-07, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379182832541295, 'eval_loss': 0.5564877390861511, 'eval_runtime': 20.8246, 'eval_samples_per_second': 23.962, 'eval_steps_per_second': 6.003, 'epoch': 2.0}
{'loss': 0.1623, 'grad_norm': 7.613397912109576e-08, 'learning_rate': 3.6336965406867083e-07, 'epoch': 2.16}
{'loss': 0.1217, 'grad_norm': 0.963443398475647, 'learning_rate': 4.3604358488240495e-07, 'epoch': 2.59}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9398797595190381, 'eval_f1': 0.9399240439612142, 'eval_loss': 0.5736603736877441, 'eval_runtime': 20.7609, 'eval_samples_per_second': 24.036, 'eval_steps_per_second': 6.021, 'epoch': 3.0}
{'loss': 0.1393, 'grad_norm': 2.202064275741577, 'learning_rate': 5.088631548360464e-07, 'epoch': 3.02}
{'loss': 0.1015, 'grad_norm': 0.01070069894194603, 'learning_rate': 5.81682724789688e-07, 'epoch': 3.45}
{'loss': 0.1431, 'grad_norm': 0.0002620970772113651, 'learning_rate': 6.545022947433293e-07, 'epoch': 3.88}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379182832541295, 'eval_loss': 0.5775915384292603, 'eval_runtime': 21.1547, 'eval_samples_per_second': 23.588, 'eval_steps_per_second': 5.909, 'epoch': 4.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 4507.4996, 'train_samples_per_second': 30.869, 'train_steps_per_second': 7.72, 'train_loss': 0.13954102808031543, 'epoch': 4.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 06:50:56,015] Trial 16 finished with value: 0.9398797595190381 and parameters: {'learning_rate': 1.1390437132148595e-06, 'batch_size': 4, 'warmup_ratio': 0.22472734071311423, 'weight_decay': 0.10563337518542765, 'adam_beta1': 0.8002699227072925, 'adam_beta2': 0.9920616635528605, 'adam_epsilon': 3.5758212131124344e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 9 with value: 0.9418837675350702.


Current Trial 17 parameters: {'learning_rate': 5.4345030270771235e-06, 'batch_size': 4, 'warmup_ratio': 0.7665504200303693, 'weight_decay': 0.03516175614603054, 'adam_beta1': 0.8501054470277679, 'adam_beta2': 0.9950987378578932, 'adam_epsilon': 6.187976653469034e-07, 'lr_scheduler_type': 'cosine_with_restarts'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.2018, 'grad_norm': 4.270712815923616e-05, 'learning_rate': 1.0084266750649184e-07, 'epoch': 0.43}
{'loss': 0.1385, 'grad_norm': 145.46963500976562, 'learning_rate': 2.0270394781607956e-07, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.5553570985794067, 'eval_runtime': 20.4219, 'eval_samples_per_second': 24.435, 'eval_steps_per_second': 6.121, 'epoch': 1.0}
{'loss': 0.1387, 'grad_norm': 0.0011071932967752218, 'learning_rate': 3.045652281256672e-07, 'epoch': 1.29}
{'loss': 0.1227, 'grad_norm': 227.7855224609375, 'learning_rate': 4.0642650843525493e-07, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9379182832541295, 'eval_loss': 0.5530847907066345, 'eval_runtime': 20.4407, 'eval_samples_per_second': 24.412, 'eval_steps_per_second': 6.115, 'epoch': 2.0}
{'loss': 0.1597, 'grad_norm': 7.443355087843884e-08, 'learning_rate': 5.082877887448427e-07, 'epoch': 2.16}
{'loss': 0.116, 'grad_norm': 0.6248034834861755, 'learning_rate': 6.099453464938112e-07, 'epoch': 2.59}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9418837675350702, 'eval_f1': 0.9419072450828159, 'eval_loss': 0.5867544412612915, 'eval_runtime': 20.3915, 'eval_samples_per_second': 24.471, 'eval_steps_per_second': 6.13, 'epoch': 3.0}
{'loss': 0.1335, 'grad_norm': 3.347468614578247, 'learning_rate': 7.118066268033989e-07, 'epoch': 3.02}
{'loss': 0.0927, 'grad_norm': 0.009553579613566399, 'learning_rate': 8.136679071129867e-07, 'epoch': 3.45}
{'loss': 0.1315, 'grad_norm': 0.0001974794577108696, 'learning_rate': 9.155291874225744e-07, 'epoch': 3.88}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.9378757515030061, 'eval_f1': 0.9378980210766958, 'eval_loss': 0.5795443654060364, 'eval_runtime': 20.5329, 'eval_samples_per_second': 24.302, 'eval_steps_per_second': 6.088, 'epoch': 4.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 4200.5548, 'train_samples_per_second': 33.124, 'train_steps_per_second': 8.285, 'train_loss': 0.13497266666642552, 'epoch': 4.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 08:01:18,052] Trial 17 finished with value: 0.9418837675350702 and parameters: {'learning_rate': 5.4345030270771235e-06, 'batch_size': 4, 'warmup_ratio': 0.7665504200303693, 'weight_decay': 0.03516175614603054, 'adam_beta1': 0.8501054470277679, 'adam_beta2': 0.9950987378578932, 'adam_epsilon': 6.187976653469034e-07, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 9 with value: 0.9418837675350702.


Current Trial 18 parameters: {'learning_rate': 4.154648473414455e-06, 'batch_size': 3, 'warmup_ratio': 0.8254611076004349, 'weight_decay': 0.021100516165727612, 'adam_beta1': 0.8486817950435868, 'adam_beta2': 0.9975388465691606, 'adam_epsilon': 6.359404716640919e-07, 'lr_scheduler_type': 'linear'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/46380 [00:00<?, ?it/s]

{'loss': 0.2159, 'grad_norm': 132.9669952392578, 'learning_rate': 5.3716886361242135e-08, 'epoch': 0.32}
{'loss': 0.185, 'grad_norm': 17.82255744934082, 'learning_rate': 1.0786784857186805e-07, 'epoch': 0.65}
{'loss': 0.103, 'grad_norm': 0.0015064617618918419, 'learning_rate': 1.621273297448399e-07, 'epoch': 0.97}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.5643511414527893, 'eval_runtime': 17.8447, 'eval_samples_per_second': 27.963, 'eval_steps_per_second': 9.359, 'epoch': 1.0}
{'loss': 0.1128, 'grad_norm': 0.0397338792681694, 'learning_rate': 2.1638681091781175e-07, 'epoch': 1.29}
{'loss': 0.1521, 'grad_norm': 0.009844542481005192, 'learning_rate': 2.706462920907836e-07, 'epoch': 1.62}
{'loss': 0.1843, 'grad_norm': 0.011681566014885902, 'learning_rate': 3.2490577326375545e-07, 'epoch': 1.94}


  0%|          | 0/167 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.533847451210022, 'eval_runtime': 17.9076, 'eval_samples_per_second': 27.865, 'eval_steps_per_second': 9.326, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 1166.0003, 'train_samples_per_second': 119.331, 'train_steps_per_second': 39.777, 'train_loss': 0.1580897867294055, 'epoch': 2.0}


  0%|          | 0/167 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 08:21:03,138] Trial 18 finished with value: 0.935871743486974 and parameters: {'learning_rate': 4.154648473414455e-06, 'batch_size': 3, 'warmup_ratio': 0.8254611076004349, 'weight_decay': 0.021100516165727612, 'adam_beta1': 0.8486817950435868, 'adam_beta2': 0.9975388465691606, 'adam_epsilon': 6.359404716640919e-07, 'lr_scheduler_type': 'linear'}. Best is trial 9 with value: 0.9418837675350702.


Current Trial 19 parameters: {'learning_rate': 6.169994646982436e-07, 'batch_size': 4, 'warmup_ratio': 0.9886008700272123, 'weight_decay': 0.045635541062629625, 'adam_beta1': 0.8867276898827927, 'adam_beta2': 0.9950703113403668, 'adam_epsilon': 7.987487123856168e-07, 'lr_scheduler_type': 'cosine'}


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/34800 [00:00<?, ?it/s]

{'loss': 0.2021, 'grad_norm': 2.8235794161446393e-05, 'learning_rate': 8.8773030759688e-09, 'epoch': 0.43}
{'loss': 0.1406, 'grad_norm': 146.78402709960938, 'learning_rate': 1.7844275879977687e-08, 'epoch': 0.86}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.5521913766860962, 'eval_runtime': 20.5431, 'eval_samples_per_second': 24.29, 'eval_steps_per_second': 6.085, 'epoch': 1.0}
{'loss': 0.1515, 'grad_norm': 0.006157710216939449, 'learning_rate': 2.6811248683986577e-08, 'epoch': 1.29}
{'loss': 0.1301, 'grad_norm': 233.99411010742188, 'learning_rate': 3.5778221487995466e-08, 'epoch': 1.72}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_accuracy': 0.935871743486974, 'eval_f1': 0.9359157700799514, 'eval_loss': 0.557083785533905, 'eval_runtime': 20.4528, 'eval_samples_per_second': 24.398, 'eval_steps_per_second': 6.112, 'epoch': 2.0}
Stopping training: No improvement in eval_accuracy for 1 epochs
{'train_runtime': 2016.148, 'train_samples_per_second': 69.013, 'train_steps_per_second': 17.261, 'train_loss': 0.16425138341969459, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

Stopping training: No improvement in eval_accuracy for 1 epochs


[I 2024-10-19 08:55:00,648] Trial 19 finished with value: 0.935871743486974 and parameters: {'learning_rate': 6.169994646982436e-07, 'batch_size': 4, 'warmup_ratio': 0.9886008700272123, 'weight_decay': 0.045635541062629625, 'adam_beta1': 0.8867276898827927, 'adam_beta2': 0.9950703113403668, 'adam_epsilon': 7.987487123856168e-07, 'lr_scheduler_type': 'cosine'}. Best is trial 9 with value: 0.9418837675350702.


## Second Tuning

In [19]:
# Create the Trainer
check_point_path="./MCQA-Combined/Optuna2/trial_0/checkpoint-75540" # 58.6% ReClore Leaderboard
trainer = create_trainer(model_name=check_point_path,  run_name="Optuna3", num_train_epochs=10)

# Train the model
trainer.train()

# Evaluate the model on the test set
test_results = trainer.evaluate(eval_dataset=get_test_encoded())
print("Test Results:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  0%|          | 0/11600 [00:00<?, ?it/s]

KeyboardInterrupt: 

## LeaderBoard File generation

In [23]:
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForMultipleChoice
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Load the tokenizer and model from the same checkpoint used during training
#check_point_path="./ReClor/Optuna3/trial_17/checkpoint-3480" # 59.5% ReClore Leaderboard
#check_point_path="./MCQA-Combined/Optuna2/trial_0/checkpoint-62950" # 57.10% ReClore Leaderboard
#check_point_path="./MCQA-Combined/Optuna2/trial_0/checkpoint-75540" # 58.6% ReClore Leaderboard
#check_point_path="./MCQA-Combined/Optuna2/trial_0/checkpoint-125900" # 55.0% ReClore Leaderboard

#check_point_path="./MCQA-Combined-9/Optuna-1/trial_0/checkpoint-36630" # 71.04% Validation - 58% ReClore Leaderboard
#check_point_path="./MCQA-Combined-9/Optuna-1/trial_0/checkpoint-48840" # 72.60% Validation - 57.40% ReClore Leaderboard
#check_point_path="./MCQA-Combined-9/Optuna-1/trial_0/checkpoint-61050" # % Validation - 57.90% ReClore Leaderboard
#check_point_path="./MCQA-Combined-9/Optuna-1/trial_0/checkpoint-97680" # 75.20% Validation - 59.6% ReClore Leaderboard
#check_point_path="./MCQA-Combined-9/Optuna-1/trial_4/checkpoint-12210" # 60.78% Validation - 54.2% ReClore Leaderboard
check_point_path="./MCQA-Combined-9/Optuna-1/trial_4/checkpoint-24420" # 63.46% Validation - 54.9% ReClore Leaderboard


tokenizer = AutoTokenizer.from_pretrained(check_point_path)
model = AutoModelForMultipleChoice.from_pretrained(check_point_path)

# Load test data
with open('./ReClor/reclor_data/test.json', 'r') as file:
    test_data = json.load(file)

class MultipleChoiceDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512, max_num_choices=5):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.max_num_choices = max_num_choices

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['context']
        question = item['question']
        options = item['answers']
        num_choices = len(options)
        # Pad options to have max_num_choices
        if num_choices < self.max_num_choices:
            options += [''] * (self.max_num_choices - num_choices)
        num_choices = self.max_num_choices

        # Combine question with each option
        second_sentences = [f"{question} {option}" for option in options]

        encoded = self.tokenizer(
            [context] * num_choices,  # Context repeated for each option
            second_sentences,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'],  # Do not squeeze
            'attention_mask': encoded['attention_mask'],  # Do not squeeze
            'labels': torch.tensor(0)  # Dummy label
        }

# Create dataset and DataLoader
test_dataset = MultipleChoiceDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Prediction loop with tqdm for progress bar
predictions = []
model.eval()
for batch in tqdm(test_loader, desc="Predicting", unit="batch"):
    with torch.no_grad():
        outputs = model(
            input_ids=batch['input_ids'],  # Do not squeeze
            attention_mask=batch['attention_mask']  # Do not squeeze
        )
    pred = torch.argmax(outputs.logits, dim=1)
    predictions.extend(pred.cpu().numpy())

# Create ID string mapping to predictions
predicted_labels = {f"test_{i}": int(pred) for i, pred in enumerate(predictions)}

# Save predictions to .npy file
np.save('test_predictions.npy', np.array(predictions))

print("Predictions saved successfully.")


Predicting: 100%|██████████| 1000/1000 [17:47<00:00,  1.07s/batch]

Predictions saved successfully.





In [20]:
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForMultipleChoice
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Load the tokenizer and model from the same checkpoint used during training
tokenizer = AutoTokenizer.from_pretrained("./ReClor/Optuna/trial_0/checkpoint-5800")
model = AutoModelForMultipleChoice.from_pretrained("./ReClor/Optuna/trial_0/checkpoint-5800")

# Load test data
with open('./ReClor/reclor_data/test.json', 'r') as file:
    test_data = json.load(file)

class MultipleChoiceDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512, max_num_choices=5):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.max_num_choices = max_num_choices

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['context']
        question = item['question']
        options = item['answers']
        num_choices = len(options)
        # Pad options to have max_num_choices
        if num_choices < self.max_num_choices:
            options += [''] * (self.max_num_choices - num_choices)
        num_choices = self.max_num_choices

        # Combine question with each option
        second_sentences = [f"{question} {option}" for option in options]

        encoded = self.tokenizer(
            [context] * num_choices,  # Context repeated for each option
            second_sentences,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'],  # Do not squeeze
            'attention_mask': encoded['attention_mask'],  # Do not squeeze
            'labels': torch.tensor(0)  # Dummy label
        }

# Create dataset and DataLoader
test_dataset = MultipleChoiceDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Prediction loop with tqdm for progress bar
predictions = []
model.eval()
for batch in tqdm(test_loader, desc="Predicting", unit="batch"):
    with torch.no_grad():
        outputs = model(
            input_ids=batch['input_ids'],  # Do not squeeze
            attention_mask=batch['attention_mask']  # Do not squeeze
        )
    pred = torch.argmax(outputs.logits, dim=1)
    predictions.extend(pred.cpu().numpy())

# Create ID string mapping to predictions
predicted_labels = {f"test_{i}": int(pred) for i, pred in enumerate(predictions)}

# Save predictions to .npy file
np.save('test_predictions.npy', np.array(predictions))

print("Predictions saved successfully.")


Predicting: 100%|██████████| 1000/1000 [16:47<00:00,  1.01s/batch]

Predictions saved successfully.





In [15]:
print(len(test_data))  # This will print the number of entries in the test.json
print(len(test_dataset))  # This will also show the number of entries loaded into your dataset class


1000
1000


In [15]:
import json
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForMultipleChoice
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torch.nn as nn


# Load the base DeBERTa model
model = AutoModel.from_pretrained("microsoft/deberta-base-mnli")
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base-mnli")



class DeBERTaForMultipleChoice(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.deberta = base_model
        self.classifier = nn.Linear(self.deberta.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        batch_size = input_ids.size(0)
        num_choices = input_ids.size(1)
        seq_length = input_ids.size(2)

        # Flatten the batch and choice dimensions
        flat_input_ids = input_ids.view(batch_size * num_choices, seq_length)
        flat_attention_mask = attention_mask.view(batch_size * num_choices, seq_length)

        outputs = self.deberta(input_ids=flat_input_ids, attention_mask=flat_attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # [batch_size*num_choices, hidden_size]

        logits = self.classifier(pooled_output)  # [batch_size*num_choices, 1]

        # Reshape logits back to [batch_size, num_choices]
        logits = logits.view(batch_size, num_choices)
        return logits


# Initialize the custom multiple choice model
mc_model = DeBERTaForMultipleChoice(model)

# Load test data
with open('./ReClor/reclor_data/test.json', 'r') as file:
    test_data = json.load(file)

class MultipleChoiceDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['context']
        question = item['question']
        options = item['answers']
        encoded = self.tokenizer(
            [context] * len(options),  # Context repeated for each option
            options,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'],  # Keep the dimensions as is
            'attention_mask': encoded['attention_mask'],
            'labels': torch.tensor(item.get('question_type', 0))  # Dummy labels
        }


# Create dataset and DataLoader
test_dataset = MultipleChoiceDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Prediction loop with tqdm for progress bar
mc_model.eval()
predictions = []
for batch in tqdm(test_loader, desc="Predicting", unit="batch"):
    with torch.no_grad():
        logits = mc_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
    pred = torch.argmax(logits, dim=1)
    predictions.extend(pred.numpy())


# Create ID string mapping to predictions
predicted_labels = {f"test_{i}": pred for i, pred in enumerate(predictions)}

# Save predictions to .npy file
np.save('test_predictions.npy', np.array(predictions))

print("Predictions saved successfully.")


Predicting:  66%|██████▌   | 662/1000 [09:53<05:02,  1.12batch/s]


KeyboardInterrupt: 