In [None]:
import torch
import numpy as np  # Import numpy
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments
from transformers import DistilBertConfig
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import Trainer
from transformers import  get_cosine_schedule_with_warmup
from transformers import DistilBertModel
from torch.optim import AdamW
from collections import Counter

class CustomTrainer(Trainer):
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        super().create_optimizer_and_scheduler(num_training_steps)  # Call to superclass method
        self.optimizer = AdamW(self.model.parameters(), betas=(0.9, 0.999), eps=1e-08, lr=5e-5)
        self.lr_scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
        print(f'Optimizer: {self.optimizer}')  # Debugging line
        print(f'Scheduler: {self.lr_scheduler}')  # Debugging line

class CustomDistillBERTClass(torch.nn.Module):
    def __init__(self, num_labels, class_weights):
        super(CustomDistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, num_labels)
        self.class_weights = torch.tensor(class_weights).to('cuda')        
    
    def forward(self, input_ids, attention_mask, labels=None):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        
        # For token classification, we apply the classifier to each token in the sequence
        sequence_output = self.pre_classifier(hidden_state)
        sequence_output = torch.nn.ReLU()(sequence_output)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        
        # If labels are provided, compute the loss as well
        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)     
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.classifier.out_features)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
        
        return logits if loss is None else (loss, logits)
        
def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=2)

    # Remove ignored index (special tokens)
    true_labels = p.label_ids
    true_preds = [
        [id_to_tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, true_labels)
    ]
    true_labels = [
        [id_to_tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, true_labels)
    ]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
        "accuracy": accuracy_score(true_labels, true_preds),
    }


# Load the dataset
dataset = load_dataset("wnut_17")
tag_to_id = dataset["train"].features["ner_tags"].feature.names
id_to_tag = {i: tag for i, tag in enumerate(tag_to_id)}

unique_tags = set()
for entry in dataset['train']['ner_tags']:
    unique_tags.update(entry)
print(f'Min tag: {min(unique_tags)}, Max tag: {max(unique_tags)}')

# Calculate class weights based on their frequency
counter = Counter({0: 59570, 9: 660, 7: 548, 10: 335, 5: 264, 8: 245, 1: 221, 4: 206, 12: 203, 6: 150, 11: 142, 3: 140, 2: 46})
total_count = sum(counter.values())
class_weights = [total_count / counter[i] for i in range(len(counter))]

# Normalize the class weights so they sum to 1
weight_sum = sum(class_weights)
normalized_class_weights = [w / weight_sum for w in class_weights]

# Multiply every label except the most common one by a factor of 10
most_common_label = counter.most_common(1)[0][0]
for i in range(len(class_weights)):
    if i != most_common_label:
        class_weights[i] *= 10

# Load the tokenizer and model using a DistilBERT checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
config = DistilBertConfig.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))
model = CustomDistillBERTClass(num_labels=len(unique_tags), class_weights=normalized_class_weights)
model.to('cuda')


def tokenize_function(examples):
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_attention_mask=True,
        return_tensors="pt"
    )
    encoding["labels"] = examples["ner_tags"]
    return encoding
    
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Prepare the data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    output_dir='./results',
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=5e-05,  
    push_to_hub=False,  
    logging_first_step=True,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',  
    greater_is_better=False,  
    warmup_steps=0,  
    save_total_limit=2,  
    seed=42
)

# Training
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# Evaluation
results = trainer.evaluate()

# Print the results
print(results)

Min tag: 0, Max tag: 12
Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7ff7ced34430>


[34m[1mwandb[0m: Currently logged in as: [33mthejosephloy[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [1]:
import torch
import numpy as np  # Import numpy
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments
from transformers import DistilBertConfig
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import Trainer
from transformers import  get_cosine_schedule_with_warmup
from transformers import DistilBertModel
from torch.optim import AdamW
from collections import Counter

class CustomTrainer(Trainer):
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        super().create_optimizer_and_scheduler(num_training_steps)
        self.optimizer = AdamW(self.model.parameters(), betas=(0.9, 0.999), eps=1e-08, lr=2.44e-5)
        self.lr_scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

class CustomDistillBERTClass(torch.nn.Module):
    def __init__(self, num_labels):
        super(CustomDistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        sequence_output = self.pre_classifier(hidden_state)
        sequence_output = torch.nn.ReLU()(sequence_output)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        
        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()  # No weight argument
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.classifier.out_features)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
        
        return logits if loss is None else (loss, logits)
        
def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=2)

    # Remove ignored index (special tokens)
    true_labels = p.label_ids
    true_preds = [
        [id_to_tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, true_labels)
    ]
    true_labels = [
        [id_to_tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, true_labels)
    ]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
        "accuracy": accuracy_score(true_labels, true_preds),
    }


# Load the dataset
dataset = load_dataset("wnut_17")
tag_to_id = dataset["train"].features["ner_tags"].feature.names
id_to_tag = {i: tag for i, tag in enumerate(tag_to_id)}

unique_tags = set()
for entry in dataset['train']['ner_tags']:
    unique_tags.update(entry)
print(f'Min tag: {min(unique_tags)}, Max tag: {max(unique_tags)}')

# Load the tokenizer and model using a DistilBERT checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
config = DistilBertConfig.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))
model = CustomDistillBERTClass(num_labels=len(unique_tags))
model.to('cuda')


def tokenize_function(examples):
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_attention_mask=True,
        return_tensors="pt"
    )
    encoding["labels"] = examples["ner_tags"]
    return encoding
    
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Prepare the data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    output_dir='./results',
    num_train_epochs=7,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=2.44e-05,  
    push_to_hub=False,  
    logging_first_step=True,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',  
    greater_is_better=False,  
    warmup_steps=0,  
    save_total_limit=2,  
    seed=42
)

# Training
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# Evaluation
results = trainer.evaluate()

# Print the results
print(results)

Min tag: 0, Max tag: 12


[34m[1mwandb[0m: Currently logged in as: [33mthejosephloy[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0807,0.610826,0.542484,0.099282,0.167846,0.92627
2,0.2141,0.488645,0.52439,0.205742,0.295533,0.931609
3,0.2141,0.564926,0.504559,0.198565,0.284979,0.930782
4,0.0709,0.760129,0.421875,0.19378,0.265574,0.929575
5,0.081,0.775639,0.413793,0.229665,0.295385,0.928685
6,0.05,0.803822,0.368522,0.229665,0.282977,0.927414
7,0.0495,0.827177,0.372816,0.229665,0.284234,0.927795


{'eval_loss': 0.488645076751709, 'eval_precision': 0.524390243902439, 'eval_recall': 0.20574162679425836, 'eval_f1': 0.29553264604810997, 'eval_accuracy': 0.9316087205237399, 'eval_runtime': 10.0002, 'eval_samples_per_second': 100.898, 'eval_steps_per_second': 100.898, 'epoch': 7.0}


In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import DistilBertTokenizerFast, DistilBertModel, Trainer, TrainingArguments, DataCollatorForTokenClassification
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from collections import Counter
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
import torch
import numpy as np

# Custom Trainer class
class CustomTrainer(Trainer):
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        super().create_optimizer_and_scheduler(num_training_steps)
        self.optimizer = AdamW(self.model.parameters(), betas=(0.9, 0.999), eps=1e-08, lr=5e-5)
        self.lr_scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Custom DistillBERT model class
class CustomDistillBERTClass(torch.nn.Module):
    def __init__(self, num_labels):
        super(CustomDistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        sequence_output = self.pre_classifier(hidden_state)
        sequence_output = torch.nn.ReLU()(sequence_output)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        
        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.classifier.out_features)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
        
        return logits if loss is None else (loss, logits)

# Function to compute metrics
def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=2)
    true_labels = p.label_ids
    true_preds = [
        [id_to_tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, true_labels)
    ]
    true_labels = [
        [id_to_tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, true_labels)
    ]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
        "accuracy": accuracy_score(true_labels, true_preds),
    }

# Load the dataset
dataset = load_dataset("wnut_17")
tag_to_id = dataset["train"].features["ner_tags"].feature.names
id_to_tag = {i: tag for i, tag in enumerate(tag_to_id)}

# Upsampling smaller classes
counter = Counter()
for entry in dataset['train']['ner_tags']:
    counter.update(entry)

most_common_count = counter.most_common(1)[0][1]
upsampled_entries = []

for example in dataset['train']:
    tokens = example['tokens']
    ner_tags = example['ner_tags']
    example_counter = Counter(ner_tags)
    least_common_count = min([example_counter[tag] for tag in ner_tags if tag in example_counter])
    upsample_factor = most_common_count // least_common_count
    upsampled_entries.extend([example] * upsample_factor)

# Convert upsampled entries to a Dataset
upsampled_entries_dataset = Dataset.from_dict({
    'tokens': [entry['tokens'] for entry in upsampled_entries],
    'ner_tags': [entry['ner_tags'] for entry in upsampled_entries]
})

# Concatenate the original dataset with the upsampled dataset
upsampled_dataset = concatenate_datasets([dataset['train'], upsampled_entries_dataset])

# Tokenization function
def tokenize_function(examples):
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_attention_mask=True,
        return_tensors="pt"
    )
    encoding["labels"] = examples["ner_tags"]
    return encoding

# Tokenize the upsampled dataset
tokenized_upsampled_dataset = upsampled_dataset.map(tokenize_function, batched=True)

# Prepare the data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    output_dir='./results',
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=5e-05,
    push_to_hub=False,
    logging_first_step=True,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    warmup_steps=0,
    save_total_limit=2,
    seed=42
)

# Initialize the Trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_upsampled_dataset,
    eval_dataset=dataset["validation"].map(tokenize_function, batched=True),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

# Print the results
print(results)

In [1]:
import optuna
import torch
import numpy as np  # Import numpy
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments
from transformers import DistilBertConfig
from transformers import DataCollatorForTokenClassification
from datasets import load_dataset
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import Trainer
from transformers import  get_cosine_schedule_with_warmup
from transformers import DistilBertModel
from torch.optim import AdamW
from collections import Counter

class CustomTrainer(Trainer):
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        super().create_optimizer_and_scheduler(num_training_steps)  # Call to superclass method
        self.optimizer = AdamW(self.model.parameters(), betas=(0.9, 0.999), eps=1e-08, lr=5e-5)
        self.lr_scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
        print(f'Optimizer: {self.optimizer}')  # Debugging line
        print(f'Scheduler: {self.lr_scheduler}')  # Debugging line

class CustomDistillBERTClass(torch.nn.Module):
    def __init__(self, num_labels):
        super(CustomDistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, num_labels)
          
    
    def forward(self, input_ids, attention_mask, labels=None):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        
        # For token classification, we apply the classifier to each token in the sequence
        sequence_output = self.pre_classifier(hidden_state)
        sequence_output = torch.nn.ReLU()(sequence_output)
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        
        # If labels are provided, compute the loss as well
        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()     
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.classifier.out_features)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
        
        return logits if loss is None else (loss, logits)
        
def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=2)

    # Remove ignored index (special tokens)
    true_labels = p.label_ids
    true_preds = [
        [id_to_tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, true_labels)
    ]
    true_labels = [
        [id_to_tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, true_labels)
    ]

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
        "accuracy": accuracy_score(true_labels, true_preds),
    }


# Load the dataset
dataset = load_dataset("wnut_17")
tag_to_id = dataset["train"].features["ner_tags"].feature.names
id_to_tag = {i: tag for i, tag in enumerate(tag_to_id)}

unique_tags = set()
for entry in dataset['train']['ner_tags']:
    unique_tags.update(entry)
print(f'Min tag: {min(unique_tags)}, Max tag: {max(unique_tags)}')



# Load the tokenizer and model using a DistilBERT checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
config = DistilBertConfig.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))
model = CustomDistillBERTClass(num_labels=len(unique_tags))
model.to('cuda')


def tokenize_function(examples):
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_attention_mask=True,
        return_tensors="pt"
    )
    encoding["labels"] = examples["ner_tags"]
    return encoding
    
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Prepare the data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    output_dir='./results',
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=5e-05,  
    push_to_hub=False,  
    logging_first_step=True,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',  
    greater_is_better=False,  
    warmup_steps=0,  
    save_total_limit=2,  
    seed=42
)

# Training
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

def objective(trial):
    # Hyperparameters to be tuned
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    dropout = trial.suggest_float("dropout", 0.1, 0.3, step=0.1)

    # Initialize custom model with hyperparameters and other static settings
    num_labels = len(tag_to_id)
    model = CustomDistillBERTClass(num_labels=num_labels)
    model.dropout.p = dropout  # Set dropout rate
    model.to('cuda')  # Make sure the model is on the same device as the data

    # Initialize the Trainer with hyperparameters
    custom_trainer = CustomTrainer(
        model=model,
        args=TrainingArguments(
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            output_dir='./results',
            num_train_epochs=5,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_dir='./logs',
            logging_steps=10,
            learning_rate=lr,  # Set learning rate
            push_to_hub=False,
            logging_first_step=True,
            load_best_model_at_end=True,
            metric_for_best_model='eval_loss',
            greater_is_better=False,
            warmup_steps=0,
            save_total_limit=2,
            seed=42
        ),
        train_dataset=tokenized_dataset["train"],  # Corrected dataset
        eval_dataset=tokenized_dataset["validation"],  # Corrected dataset
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Train the model
    custom_trainer.train()

    # Evaluate the model
    results = custom_trainer.evaluate()

    return results["eval_loss"]  # Objective metric to be minimized

# Initialize Optuna study
study = optuna.create_study(direction="minimize")  # Minimize evaluation loss
study.optimize(objective, n_trials=50)  # Number of trials

# Access the optimal hyperparameters
best_params = study.best_params
best_value = study.best_value

print(f"Best parameters: {best_params}")
print(f"Best evaluation loss: {best_value}")

Min tag: 0, Max tag: 12


[I 2023-10-05 02:06:08,403] A new study created in memory with name: no-name-368c5a93-e58e-4077-98f9-2555584bce6f


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fba1a7075e0>


[34m[1mwandb[0m: Currently logged in as: [33mthejosephloy[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.305,0.457105,0.0,0.0,0.0,0.920549
2,0.2021,0.369927,0.0,0.0,0.0,0.920549
3,0.2051,0.359084,0.571429,0.038278,0.071749,0.923155
4,0.137,0.366107,0.513208,0.162679,0.247048,0.927604
5,0.195,0.365924,0.468468,0.186603,0.266895,0.928049


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 02:13:40,822] Trial 0 finished with value: 0.3590839207172394 and parameters: {'learning_rate': 1.9558330933151116e-05, 'dropout': 0.3}. Best is trial 0 with value: 0.3590839207172394.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f8e26710>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2973,0.452113,0.0,0.0,0.0,0.920549
2,0.1999,0.363565,0.274194,0.020335,0.037862,0.92252
3,0.1886,0.36778,0.604938,0.058612,0.10687,0.924681
4,0.1289,0.379684,0.432331,0.13756,0.208711,0.926969
5,0.1698,0.37992,0.398827,0.162679,0.231096,0.926842


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 02:21:01,918] Trial 1 finished with value: 0.36356452107429504 and parameters: {'learning_rate': 4.615675725295668e-05, 'dropout': 0.3}. Best is trial 0 with value: 0.3590839207172394.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fba06c0a230>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 02:28:14,929] Trial 2 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 2.3587016751616624e-05, 'dropout': 0.2}. Best is trial 0 with value: 0.3590839207172394.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fba06c38dc0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2973,0.452113,0.0,0.0,0.0,0.920549
2,0.1999,0.363565,0.274194,0.020335,0.037862,0.92252
3,0.1886,0.36778,0.604938,0.058612,0.10687,0.924681
4,0.1289,0.379684,0.432331,0.13756,0.208711,0.926969
5,0.1698,0.37992,0.398827,0.162679,0.231096,0.926842


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 02:35:28,993] Trial 3 finished with value: 0.36356452107429504 and parameters: {'learning_rate': 1.5039698279063257e-05, 'dropout': 0.3}. Best is trial 0 with value: 0.3590839207172394.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f84f4e20>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 02:42:43,041] Trial 4 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 1.6407027410060587e-05, 'dropout': 0.2}. Best is trial 0 with value: 0.3590839207172394.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f8478d60>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2973,0.452113,0.0,0.0,0.0,0.920549
2,0.1999,0.363565,0.274194,0.020335,0.037862,0.92252
3,0.1886,0.36778,0.604938,0.058612,0.10687,0.924681
4,0.1289,0.379684,0.432331,0.13756,0.208711,0.926969
5,0.1698,0.37992,0.398827,0.162679,0.231096,0.926842


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 02:49:57,889] Trial 5 finished with value: 0.36356452107429504 and parameters: {'learning_rate': 1.0801115389037237e-05, 'dropout': 0.3}. Best is trial 0 with value: 0.3590839207172394.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f8dd7f10>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2973,0.452113,0.0,0.0,0.0,0.920549
2,0.1999,0.363565,0.274194,0.020335,0.037862,0.92252
3,0.1886,0.36778,0.604938,0.058612,0.10687,0.924681
4,0.1289,0.379684,0.432331,0.13756,0.208711,0.926969
5,0.1698,0.37992,0.398827,0.162679,0.231096,0.926842


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 02:57:13,056] Trial 6 finished with value: 0.36356452107429504 and parameters: {'learning_rate': 4.732066560061395e-05, 'dropout': 0.3}. Best is trial 0 with value: 0.3590839207172394.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f8dd6ce0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2973,0.452113,0.0,0.0,0.0,0.920549
2,0.1999,0.363565,0.274194,0.020335,0.037862,0.92252
3,0.1886,0.36778,0.604938,0.058612,0.10687,0.924681
4,0.1289,0.379684,0.432331,0.13756,0.208711,0.926969
5,0.1698,0.37992,0.398827,0.162679,0.231096,0.926842


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 03:04:28,063] Trial 7 finished with value: 0.36356452107429504 and parameters: {'learning_rate': 1.6728831489491233e-05, 'dropout': 0.3}. Best is trial 0 with value: 0.3590839207172394.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f847b730>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 03:11:41,028] Trial 8 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 3.128259977774797e-05, 'dropout': 0.2}. Best is trial 0 with value: 0.3590839207172394.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fba1a707970>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 03:18:54,013] Trial 9 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.436768325486564e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fba06c3bfa0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 03:26:06,950] Trial 10 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.8031015700625902e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f8e47cd0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 03:33:19,669] Trial 11 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.723826824898403e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fba1a7043d0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 03:40:33,385] Trial 12 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 3.473674833136144e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f8dd7d60>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 03:47:46,292] Trial 13 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.6994724018645316e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f84a6aa0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 03:54:58,703] Trial 14 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.2376870455123204e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fba06c3b280>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 04:02:11,842] Trial 15 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 3.5712781630592634e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e44614e0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 04:09:24,318] Trial 16 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.6836315325057337e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f8de30d0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 04:16:37,807] Trial 17 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 2.047204457255112e-05, 'dropout': 0.2}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e4389d50>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 04:23:50,289] Trial 18 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 3.759421955314817e-05, 'dropout': 0.2}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9ec18ece0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 04:31:03,245] Trial 19 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.9540905196716654e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fba06c38970>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 04:38:15,764] Trial 20 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.3650766087242067e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e4295900>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 04:45:28,164] Trial 21 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.6379048216321092e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9dc425930>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 04:52:40,884] Trial 22 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 3.0984518971183424e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e43569e0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 04:59:53,931] Trial 23 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.511786263608811e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9f84f5f90>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 05:07:05,779] Trial 24 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 2.035408204842762e-05, 'dropout': 0.2}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fba06c39ba0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 05:14:18,991] Trial 25 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.9898962858508633e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e4357b20>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 05:21:31,707] Trial 26 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 3.860435741228116e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9dc425720>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 05:28:44,814] Trial 27 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 2.5999066297233968e-05, 'dropout': 0.2}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e4460340>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 05:35:57,687] Trial 28 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.2327626181615418e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9ec19aef0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 05:43:10,623] Trial 29 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 3.298382267125809e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9dc4264d0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 05:50:23,346] Trial 30 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 4.103481347563509e-05, 'dropout': 0.2}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e4295390>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 05:57:35,569] Trial 31 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 3.3491261523883714e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e438b400>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 06:04:48,149] Trial 32 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 4.2067149817530413e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9ec19b8b0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 06:12:06,136] Trial 33 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.8887242028855094e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9dc426980>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 06:19:18,091] Trial 34 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 3.512219605505925e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9ec19a800>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 06:26:31,180] Trial 35 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.8713772416018355e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9ec18f850>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 06:33:44,752] Trial 36 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 3.2922435549474384e-05, 'dropout': 0.2}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e4462cb0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 06:40:57,999] Trial 37 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.452140235817654e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e426bac0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 06:48:10,779] Trial 38 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 4.4681397069543635e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9dc4269e0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 06:55:23,184] Trial 39 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 1.8712989299466186e-05, 'dropout': 0.2}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9be67a140>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 07:02:36,758] Trial 40 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.810788190605936e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9ec18c340>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 07:09:50,180] Trial 41 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.7169122494303167e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9a2701cf0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 07:17:02,726] Trial 42 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.4440048280396553e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e4357850>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 07:24:16,270] Trial 43 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 4.9409272120775824e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e438a6b0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 07:31:28,435] Trial 44 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 3.0320262463950622e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e4295540>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 07:38:41,558] Trial 45 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.243952565783021e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e43569b0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2973,0.452113,0.0,0.0,0.0,0.920549
2,0.1999,0.363565,0.274194,0.020335,0.037862,0.92252
3,0.1886,0.36778,0.604938,0.058612,0.10687,0.924681
4,0.1289,0.379684,0.432331,0.13756,0.208711,0.926969
5,0.1698,0.37992,0.398827,0.162679,0.231096,0.926842


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 07:45:54,479] Trial 46 finished with value: 0.36356452107429504 and parameters: {'learning_rate': 2.701851066970501e-05, 'dropout': 0.3}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9ec198ca0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 07:53:05,724] Trial 47 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 3.133011468479822e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9ec18f2b0>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2907,0.430406,0.0,0.0,0.0,0.920549
2,0.1954,0.363416,0.425,0.020335,0.038813,0.922265
3,0.1835,0.36673,0.517647,0.052632,0.095548,0.924109
4,0.128,0.381459,0.453125,0.173445,0.250865,0.927922
5,0.1643,0.38195,0.396325,0.180622,0.248151,0.927096


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 08:00:18,248] Trial 48 finished with value: 0.36341598629951477 and parameters: {'learning_rate': 2.5556128158905093e-05, 'dropout': 0.2}. Best is trial 9 with value: 0.3561481833457947.


Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 5e-05
    lr: 5e-05
    maximize: False
    weight_decay: 0.01
)
Scheduler: <torch.optim.lr_scheduler.LambdaLR object at 0x7fb9e4269120>


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2912,0.437413,0.0,0.0,0.0,0.920549
2,0.1943,0.361094,0.315789,0.028708,0.052632,0.922774
3,0.1824,0.356148,0.533333,0.076555,0.133891,0.92557
4,0.1251,0.383346,0.447099,0.156699,0.232064,0.927986
5,0.1621,0.383473,0.407311,0.186603,0.255947,0.927541


  _warn_prf(average, modifier, msg_start, len(result))


[I 2023-10-05 08:07:31,491] Trial 49 finished with value: 0.3561481833457947 and parameters: {'learning_rate': 2.8119082565489062e-05, 'dropout': 0.1}. Best is trial 9 with value: 0.3561481833457947.


Best parameters: {'learning_rate': 2.436768325486564e-05, 'dropout': 0.1}
Best evaluation loss: 0.3561481833457947
