<a href="https://colab.research.google.com/github/zpuiy/Emotion-Recognition/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q datasets evaluate
!pip install -q transformers[torch]
!pip install -q accelerate -U
! pip install -q optuna

In [None]:
from transformers import AutoTokenizer, TrainingArguments, Trainer
from torch.utils.data import DataLoader
from datasets import Dataset
import pandas as pd
from datasets import load_dataset
import random
import os
import numpy as np
import torch
from tqdm.auto import tqdm
import evaluate

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub from inside the notebook; the training
# configurations further down set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and configures cuDNN
    for repeatable kernel selection.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only influences Python processes started after this point; the hash seed of
    # the already-running kernel is fixed at interpreter start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can pick a different convolution algorithm from run to run, so it
    # is switched off explicitly in case an earlier cell enabled it.
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds up front so later random operations are repeatable.
seed_everything(seed=7)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load Data

Download all datasets

In [None]:
# Load the GoEmotions dataset in its simplified configuration and keep a handle
# on each of its three splits.
dataset = load_dataset("go_emotions", "simplified")
ds_train = dataset['train']
ds_test = dataset['test']
ds_validation = dataset['validation']

# Create smaller (1/20th) shuffled subsets for quick experiments and tuning.
# len(split) gives the row count directly; indexing the 'text' column first would
# load every string of the split just to count them.
small_train_ds = ds_train.shuffle(seed=7).select(range(len(ds_train) // 20))
small_val_ds = ds_validation.shuffle(seed=7).select(range(len(ds_validation) // 20))

In [None]:
# Emotion names listed in label-id order: the position in this tuple is the label id.
_EMOTION_NAMES = (
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
)

# Integer label id -> emotion name.
id2label = dict(enumerate(_EMOTION_NAMES))

# Number of output classes: 27 emotion categories plus "neutral".
N_LABELS = len(id2label)

# Data Cleaning and Preprocessing

Converting emojis to descriptive text.

In [None]:
!pip install -q --upgrade emoji

In [None]:
import emoji


def _demojize_example(example):
    """Return the example's 'text' with every emoji replaced by emoji.demojize output."""
    return {'text': emoji.demojize(example['text'])}


# Rewrite the 'text' column of the three full splits ...
ds_train = ds_train.map(_demojize_example)
ds_test = ds_test.map(_demojize_example)
ds_validation = ds_validation.map(_demojize_example)

# ... and of the two small tuning subsets.
small_train_ds = small_train_ds.map(_demojize_example)
small_val_ds = small_val_ds.map(_demojize_example)

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

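Remove non-alphabetical characters: every character outside `a-z` / `A-Z` (digits, punctuation, other symbols) is replaced with a space.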


In [None]:
import re

# Matches any single character that is not an ASCII letter.
_NON_LETTER = re.compile(r'[^a-zA-Z]')


def _letters_only(example):
    """Return the example's 'text' with each non-letter character turned into a space."""
    return {'text': _NON_LETTER.sub(' ', example['text'])}


# Full splits.
ds_train = ds_train.map(_letters_only)
ds_test = ds_test.map(_letters_only)
ds_validation = ds_validation.map(_letters_only)

# Small tuning subsets.
small_train_ds = small_train_ds.map(_letters_only)
small_val_ds = small_val_ds.map(_letters_only)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Remove extra spaces: collapse every run of whitespace into a single space and trim the ends.

In [None]:
def remove_spaces(string):
    """Collapse every run of whitespace in `string` into one space and trim both ends."""
    # str.split() without arguments splits on any whitespace and drops empty pieces.
    words = string.split()
    return ' '.join(words)

def _collapse_whitespace(example):
    """Return the example's 'text' after passing it through remove_spaces."""
    return {'text': remove_spaces(example['text'])}


# Full splits.
ds_train = ds_train.map(_collapse_whitespace)
ds_test = ds_test.map(_collapse_whitespace)
ds_validation = ds_validation.map(_collapse_whitespace)

# Small tuning subsets.
small_train_ds = small_train_ds.map(_collapse_whitespace)
small_val_ds = small_val_ds.map(_collapse_whitespace)

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

# Model training and fine-tuning

In [None]:
def custom_dataloader(data, tokenizer, batch_size, shuffle, max_length=128):
    """Tokenize a dataset split and attach multi-hot label vectors.

    Despite its name, this returns a ``datasets.Dataset`` formatted as torch
    tensors, not a ``DataLoader``; batching is left to whoever consumes it.

    Args:
        data: Dataset exposing ``to_pandas()``, with a ``text`` column and a
            ``labels`` column holding the label ids of each example. An ``id``
            column is dropped when present.
        tokenizer: Tokenizer callable applied to batches of ``text``.
        batch_size: Unused; kept so existing call sites keep working.
        shuffle: Unused; kept so existing call sites keep working.
        max_length: Length every sequence is padded / truncated to.

    Returns:
        The tokenized dataset with ``input_ids``, ``attention_mask``, ``labels``
        and ``token_type_ids`` exposed as torch tensors. ``labels`` is a 0/1
        vector of length ``N_LABELS``.
    """
    frame = data.to_pandas()

    def multi_label(label_ids):
        # Build the id set once per example instead of once per class index.
        active = {int(label) for label in label_ids}
        return [1 if i in active else 0 for i in range(N_LABELS)]

    def tokenize_function(batch):
        return tokenizer(batch["text"], padding="max_length", truncation=True,
                         max_length=max_length, return_tensors="pt", return_token_type_ids=True)

    # Column-wise apply: one list per row, and it also copes with an empty frame.
    frame['labels'] = frame['labels'].apply(multi_label)
    # The id column is not needed for training; tolerate splits that lack it.
    frame = frame.drop(columns='id', errors='ignore')

    tokenized_datasets = Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    tokenized_datasets.set_format("torch", columns=['input_ids', 'attention_mask', 'labels', 'token_type_ids'])

    return tokenized_datasets

# F1 metric object from the `evaluate` library; compute_metrics() calls its compute().
metric = evaluate.load("f1")

def compute_metrics(eval_pred, threshold=None):
    """Compute F1 over all (example, label) decisions of a multi-label prediction.

    Logits are passed through a sigmoid and binarised at ``threshold``; predictions
    and references are then flattened so every label slot counts as one binary
    decision for the F1 metric.

    Args:
        eval_pred: ``(logits, labels)`` pair; both are array-like with one row
            per example and one column per label.
        threshold: Probability above which a label counts as predicted. When
            omitted, the notebook-level ``thres`` is used if it exists, otherwise 0.5.

    Returns:
        The dict produced by ``metric.compute``.
    """
    logits, labels = eval_pred
    if threshold is None:
        # Previously a missing global `thres` raised NameError when this function
        # ran before the cell that defines it.
        threshold = globals().get("thres", 0.5)
    predictions = (torch.sigmoid(torch.tensor(logits)) > threshold).int().cpu().numpy()
    # Flatten to 1-D so each label slot is scored as an independent binary decision.
    return metric.compute(predictions=predictions.reshape(-1), references=labels.reshape(-1))


class CustomTrainer(Trainer):
    """Trainer that optimises a multi-label objective with BCE-with-logits loss."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on a batch and return its binary cross-entropy loss.

        Args:
            model: Model called with ``input_ids`` and ``attention_mask``; its
                output must expose ``'logits'``.
            inputs: Batch mapping with ``input_ids``, ``attention_mask`` and
                ``labels`` (0/1 label vectors).
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted and ignored. Newer ``transformers``
                releases pass this keyword to ``compute_loss``; without it the
                override raises TypeError there.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask']
        )
        logits = outputs['logits']
        # BCEWithLogitsLoss needs float targets.
        labels = inputs['labels'].float()

        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [None]:
def train_and_eval(base, sequence_classification, tokenizer, n_batch, max_length, thres=0.5, n_epoch=5, lr=2e-5):
    """Tune hyperparameters on a slice of the training set, then fine-tune on all of it.

    Steps:
      1. Tokenize the training and validation splits.
      2. Run a 10-trial hyperparameter search (default search space) on one tenth
         of the training data, maximising validation F1.
      3. Train a fresh model on the full training set with the best hyperparameters.
      4. Evaluate on the validation set and save the model's state dict.

    Args:
        base: Checkpoint name passed to ``from_pretrained``; also prefixes the
            output directory and the saved weights file.
        sequence_classification: Model class exposing ``from_pretrained``.
        tokenizer: Tokenizer matching ``base``.
        n_batch: Initial per-device train / eval batch size.
        max_length: Length sequences are padded / truncated to.
        thres: Sigmoid probability above which a label counts as predicted.
        n_epoch: Initial number of training epochs.
        lr: Initial learning rate.

    Returns:
        The validation metrics dict returned by ``trainer.evaluate()``.
    """
    train_dataloader = custom_dataloader(ds_train, tokenizer, n_batch, True, max_length)
    valid_dataloader = custom_dataloader(ds_validation, tokenizer, n_batch, False, max_length)

    def metrics_at_threshold(eval_pred):
        # Score with this call's `thres`; compute_metrics() is not told about it,
        # so passing that function straight to the trainer silently ignored the argument.
        logits, labels = eval_pred
        predictions = (torch.sigmoid(torch.tensor(logits)) > thres).int().cpu().numpy()
        return metric.compute(predictions=predictions.reshape(-1), references=labels.reshape(-1))

    args = TrainingArguments(
        f"{base}-finetuned",
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        learning_rate=lr,
        per_device_train_batch_size=n_batch,
        per_device_eval_batch_size=n_batch,
        num_train_epochs=n_epoch,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        push_to_hub=True,
    )

    def model_init():
        # A fresh model for every search trial.
        return sequence_classification.from_pretrained(base, num_labels=N_LABELS)

    trainer_tune = CustomTrainer(
        model_init=model_init,
        args=args,
        train_dataset=train_dataloader.shard(index=1, num_shards=10), #find hyperparameters based on a portion of the training dataset
        eval_dataset=valid_dataloader,
        tokenizer=tokenizer,
        compute_metrics=metrics_at_threshold
    )

    best_run = trainer_tune.hyperparameter_search(n_trials=10, direction="maximize")

    model = sequence_classification.from_pretrained(base, num_labels=N_LABELS)

    print('Use fine-tuned hyperparameter to train on the full train set')
    print('-'*50)

    trainer = CustomTrainer(
      model=model,
      args=args,
      train_dataset=train_dataloader,
      eval_dataset=valid_dataloader,
      tokenizer=tokenizer,
      compute_metrics=metrics_at_threshold
    )

    # Overwrite the initial arguments with the values found by the search.
    for n, v in best_run.hyperparameters.items():
        setattr(trainer.args, n, v)

    trainer.train()

    print('Evaluation on validation set')
    print('-'*50)

    # Inside a function the result is not echoed by the notebook, so print it.
    eval_metrics = trainer.evaluate()
    print(eval_metrics)

    # NOTE: the file name records the arguments passed in, not the tuned values.
    torch.save(model.state_dict(), f"{base}-max-len={max_length}_thres={thres}_lr={lr}_batch={n_batch}.pt")

    return eval_metrics

# ALBERT

**Trial 1:**

batch size 256

max sequence length 32

epoch 5

learning rate 2e-5

threshold 0.5


In [None]:
from transformers import AlbertForSequenceClassification

# Settings for ALBERT trial 1.
base = "albert-base-v2"
n_batch = 256
max_length = 32
lr = 2e-5
thres = 0.5
albert_tokenizer = AutoTokenizer.from_pretrained(base, use_fast=True, model_max_length=max_length, do_lower_case=False)

# thres and lr are passed by keyword: train_and_eval's signature is
# (..., thres, n_epoch, lr), so a positional `lr` ended up as the epoch count.
train_and_eval(base, AlbertForSequenceClassification, albert_tokenizer, n_batch, max_length, thres=thres, lr=lr)


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-11-30 05:50:52,576] A new study created in memory with name: no-name-548c0bd6-76c7-4ec3-ba64-14e642fd63cc
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.15214,0.0
2,No log,0.13697,0.227273
3,No log,0.127925,0.301235
4,0.158100,0.125322,0.360277
5,0.158100,0.12408,0.368664


[I 2023-11-30 05:52:07,148] Trial 0 finished with value: 0.3686635944700461 and parameters: {'learning_rate': 2.9896994592031926e-05, 'num_train_epochs': 5, 'seed': 24, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.3686635944700461.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.2225,0.161658,0.0


[I 2023-11-30 05:53:15,577] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 8.248052596267117e-06, 'num_train_epochs': 1, 'seed': 32, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.3686635944700461.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.297475,0.0


[I 2023-11-30 05:54:19,309] Trial 2 finished with value: 0.0 and parameters: {'learning_rate': 3.528423200744271e-06, 'num_train_epochs': 1, 'seed': 11, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.3686635944700461.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.146968,0.0
2,0.158600,0.128263,0.137313
3,0.158600,0.120287,0.375
4,0.125900,0.117181,0.443564
5,0.125900,0.114271,0.417204


[I 2023-11-30 05:56:06,231] Trial 3 finished with value: 0.4172043010752688 and parameters: {'learning_rate': 6.30253082597539e-05, 'num_train_epochs': 5, 'seed': 37, 'per_device_train_batch_size': 8}. Best is trial 3 with value: 0.4172043010752688.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.23239,0.0


[I 2023-11-30 05:56:56,500] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 2.253848886893945e-05, 'num_train_epochs': 1, 'seed': 23, 'per_device_train_batch_size': 32}. Best is trial 3 with value: 0.4172043010752688.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.136538,0.120482


[I 2023-11-30 05:57:49,700] Trial 5 finished with value: 0.12048192771084336 and parameters: {'learning_rate': 7.047319696950707e-05, 'num_train_epochs': 1, 'seed': 12, 'per_device_train_batch_size': 8}. Best is trial 3 with value: 0.4172043010752688.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.376554,0.100279
2,No log,0.266935,0.0
3,No log,0.23545,0.0


[I 2023-11-30 05:58:09,226] Trial 6 pruned. 
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.154446,0.0
2,No log,0.143691,0.0


[I 2023-11-30 05:58:22,395] Trial 7 pruned. 
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.150087,0.0
2,No log,0.137445,0.0


[I 2023-11-30 05:58:36,873] Trial 8 pruned. 
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.351142,0.0


[I 2023-11-30 05:59:01,660] Trial 9 finished with value: 0.0 and parameters: {'learning_rate': 4.530700130011341e-06, 'num_train_epochs': 1, 'seed': 18, 'per_device_train_batch_size': 16}. Best is trial 3 with value: 0.4172043010752688.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Use fine-tuned hyperparameter to train on the full train set
--------------------------------------------------


Epoch,Training Loss,Validation Loss,F1
1,0.1497,0.149777,0.0
2,0.1505,0.149597,0.0
3,0.151,0.149563,0.0
4,0.1504,0.149275,0.0
5,0.1517,0.149217,0.0


Evaluation on validation set
--------------------------------------------------


RuntimeError: ignored

In [None]:
from transformers import AlbertForSequenceClassification

# Stand-alone ALBERT classifier with one output per label (28), placed on `device`.
albert_model = AlbertForSequenceClassification.from_pretrained(
    base,
    num_labels=N_LABELS,
).to(device)

# Training configuration: evaluate and checkpoint once per epoch, restore the
# checkpoint with the best F1 at the end, and push the result to the Hub.
args = TrainingArguments(
    output_dir=f"{base}-finetuned",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=n_batch,
    per_device_eval_batch_size=n_batch,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=True,
)


def model_init():
    """Return a newly loaded ALBERT classifier with N_LABELS outputs."""
    fresh_model = AlbertForSequenceClassification.from_pretrained(base, num_labels=N_LABELS)
    return fresh_model


# Tokenize the splits at notebook scope. These names were otherwise only defined
# as locals of train_and_eval, so this cell failed with NameError on a fresh kernel.
train_dataloader = custom_dataloader(ds_train, albert_tokenizer, n_batch, True, max_length)
valid_dataloader = custom_dataloader(ds_validation, albert_tokenizer, n_batch, False, max_length)

# Trainer used only for the hyperparameter search: model_init supplies a fresh
# model per trial.
trainer = CustomTrainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataloader.shard(index=1, num_shards=10), #find good hyperparameter based on a portion of the training dataset
    eval_dataset=valid_dataloader,
    tokenizer=albert_tokenizer,
    compute_metrics=compute_metrics
)



Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run 10 search trials and keep the run with the highest objective value.
best_run = trainer.hyperparameter_search(direction="maximize", n_trials=10)

[I 2023-11-29 07:00:29,059] A new study created in memory with name: no-name-140418dc-e747-4080-b4f6-92efcc474de8
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.325649,0.0
2,No log,0.228538,0.0
3,No log,0.203591,0.0
4,No log,0.198173,0.0


[I 2023-11-29 07:03:46,354] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 7.563796206106837e-06, 'num_train_epochs': 4, 'seed': 39, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 0.0.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1538,0.126355,0.318019


[I 2023-11-29 07:04:57,758] Trial 1 finished with value: 0.3180186647523331 and parameters: {'learning_rate': 6.505436739874777e-05, 'num_train_epochs': 1, 'seed': 40, 'per_device_train_batch_size': 8}. Best is trial 1 with value: 0.3180186647523331.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.16011,0.0
2,No log,0.14422,0.209381
3,No log,0.134863,0.287477
4,0.171200,0.129917,0.338675
5,0.171200,0.128521,0.335441


[I 2023-11-29 07:09:00,198] Trial 2 finished with value: 0.335440772992962 and parameters: {'learning_rate': 1.915714298344707e-05, 'num_train_epochs': 5, 'seed': 22, 'per_device_train_batch_size': 32}. Best is trial 2 with value: 0.335440772992962.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1354,0.12176,0.32112
2,0.1125,0.108964,0.437466
3,0.0973,0.103929,0.461439
4,0.0847,0.105625,0.489217
5,0.0723,0.104232,0.490736


[I 2023-11-29 07:17:24,487] Trial 3 finished with value: 0.49073600163783393 and parameters: {'learning_rate': 2.3277568221232515e-05, 'num_train_epochs': 5, 'seed': 33, 'per_device_train_batch_size': 4}. Best is trial 3 with value: 0.49073600163783393.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.137399,0.189943
2,0.159400,0.116552,0.398853
3,0.159400,0.109593,0.428106
4,0.105500,0.106768,0.460397
5,0.105500,0.106678,0.468855


[I 2023-11-29 07:20:03,211] Trial 4 finished with value: 0.4688552188552189 and parameters: {'learning_rate': 2.8893186822992632e-05, 'num_train_epochs': 5, 'seed': 31, 'per_device_train_batch_size': 16}. Best is trial 3 with value: 0.49073600163783393.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1635,0.154479,0.0


[I 2023-11-29 07:20:55,069] Trial 5 pruned. 
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.317634,0.0
2,No log,0.237463,0.0


[I 2023-11-29 07:21:45,842] Trial 6 pruned. 
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.501274,0.042249


[I 2023-11-29 07:22:30,895] Trial 7 finished with value: 0.04224909690592115 and parameters: {'learning_rate': 3.504167610201941e-06, 'num_train_epochs': 1, 'seed': 22, 'per_device_train_batch_size': 64}. Best is trial 3 with value: 0.49073600163783393.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.275482,0.0


[I 2023-11-29 07:22:55,755] Trial 8 pruned. 
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1338,0.119979,0.292844


[I 2023-11-29 07:23:48,761] Trial 9 pruned. 


In [None]:
# Fresh ALBERT classifier for the final run on the full training set.
final_model = AlbertForSequenceClassification.from_pretrained(base, num_labels=N_LABELS)

trainer = CustomTrainer(
    model=final_model,
    args=args,
    train_dataset=train_dataloader,
    eval_dataset=valid_dataloader,
    tokenizer=albert_tokenizer,
    compute_metrics=compute_metrics,
)

# Copy each hyperparameter of the best search run onto the trainer's arguments.
for hp_name, hp_value in best_run.hyperparameters.items():
    setattr(trainer.args, hp_name, hp_value)

trainer.train()

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.0902,0.090101,0.543497
2,0.0837,0.086553,0.572334
3,0.0722,0.085003,0.571588
4,0.0652,0.089507,0.588297
5,0.0507,0.095328,0.58453


TrainOutput(global_step=54265, training_loss=0.07688760443743413, metrics={'train_runtime': 2172.2619, 'train_samples_per_second': 99.919, 'train_steps_per_second': 24.981, 'total_flos': 650051817292800.0, 'train_loss': 0.07688760443743413, 'epoch': 5.0})

evaluation

In [None]:
# Evaluate on the trainer's eval_dataset; the returned metrics dict is the cell output.
trainer.evaluate()

{'eval_loss': 0.08950689435005188,
 'eval_f1': 0.5882974043114826,
 'eval_runtime': 8.1555,
 'eval_samples_per_second': 665.322,
 'eval_steps_per_second': 5.273,
 'epoch': 5.0}

# RoBERTa

**Trial 1:**

batch size 128

max sequence length 64

epoch 5

learning rate 2e-5

threshold 0.5


In [None]:
from transformers import RobertaForSequenceClassification

# Settings for RoBERTa trial 1.
base = "roberta-base"
n_batch = 128
max_length = 64
lr = 2e-5
thres = 0.5
roberta_tokenizer = AutoTokenizer.from_pretrained(base, use_fast=True, model_max_length=max_length, do_lower_case=False)

# thres and lr are passed by keyword: train_and_eval's signature is
# (..., thres, n_epoch, lr), so a positional `lr` ended up as the epoch count.
train_and_eval(base, RobertaForSequenceClassification, roberta_tokenizer, n_batch, max_length, thres=thres, lr=lr)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Use the `Trainer` class from Hugging Face.

In [None]:
# Stand-alone RoBERTa classifier with one output per label (28), placed on `device`.
roberta_model = RobertaForSequenceClassification.from_pretrained(
    base,
    num_labels=N_LABELS,
).to(device)

# Training configuration: evaluate and checkpoint once per epoch, restore the
# checkpoint with the best F1 at the end, and push the result to the Hub.
args = TrainingArguments(
    output_dir=f"{base}-finetuned",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=n_batch,
    per_device_eval_batch_size=n_batch,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=True,
)

def model_init():
    """Build a new RoBERTa sequence classifier from the pretrained checkpoint.

    Reads the notebook-level names `base` (checkpoint id) and `N_LABELS`
    (size of the classification head) at call time, so the result follows
    whatever those names are bound to when the function is invoked.

    Returns:
        A RobertaForSequenceClassification instance loaded via `from_pretrained`.
    """
    fresh_model = RobertaForSequenceClassification.from_pretrained(
        base,
        num_labels=N_LABELS,
    )
    return fresh_model


# Trainer used for the hyperparameter search. A `model_init` callable is passed
# instead of a model instance, so the model is built by calling it.
# NOTE(review): `tokenizer` is not assigned anywhere in this section -- the RoBERTa
# tokenizer created above is bound to `roberta_tokenizer`. Confirm which tokenizer
# is intended here.
trainer = CustomTrainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataloader.shard(index=1, num_shards=10), # search for good hyperparameters on one of ten shards of the training data
    eval_dataset=valid_dataloader,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Hyperparameter tuning

In [None]:
# Run 10 search trials and keep the one with the highest objective value.
# The log below shows each trial sampling learning_rate, num_train_epochs, seed
# and per_device_train_batch_size.
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

[I 2023-11-29 08:57:14,205] A new study created in memory with name: no-name-2f959f8c-655c-4745-bf79-b526da19d6cf
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1472,0.133363,0.275801
2,0.126,0.121379,0.393381


[I 2023-11-29 09:01:13,936] Trial 0 finished with value: 0.3933814961547425 and parameters: {'learning_rate': 1.2720124877887197e-05, 'num_train_epochs': 2, 'seed': 35, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.3933814961547425.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.31018,0.0


[I 2023-11-29 09:04:15,636] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 2.9899754919502976e-06, 'num_train_epochs': 1, 'seed': 23, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.3933814961547425.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.164696,0.0
2,No log,0.152055,0.0
3,No log,0.143758,0.004066
4,0.184300,0.141393,0.178242


[I 2023-11-29 09:07:46,122] Trial 2 finished with value: 0.17824233255398156 and parameters: {'learning_rate': 1.926138732305906e-05, 'num_train_epochs': 4, 'seed': 17, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.3933814961547425.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.264144,0.0
2,0.343300,0.213004,0.0
3,0.343300,0.192594,0.0
4,0.206500,0.183693,0.0
5,0.206500,0.181062,0.0


[I 2023-11-29 09:13:46,968] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 2.4573800348719033e-06, 'num_train_epochs': 5, 'seed': 4, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.3933814961547425.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.198011,0.0
2,0.272400,0.166557,0.0
3,0.272400,0.158069,0.0
4,0.164400,0.153374,0.0
5,0.164400,0.151642,0.0


[I 2023-11-29 09:20:41,663] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 4.84261888689438e-06, 'num_train_epochs': 5, 'seed': 2, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.3933814961547425.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1729,0.162502,0.0


[I 2023-11-29 09:21:46,350] Trial 5 pruned. 
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.225041,0.0
2,0.292300,0.184603,0.0
3,0.292300,0.172122,0.0
4,0.181300,0.168883,0.0


[I 2023-11-29 09:30:41,415] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 3.834659722691893e-06, 'num_train_epochs': 4, 'seed': 8, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.3933814961547425.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.4203,0.243949,0.0
2,0.2344,0.198187,0.0


[I 2023-11-29 09:32:03,786] Trial 7 pruned. 
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.157234,0.0
2,No log,0.15093,0.0
3,No log,0.148772,0.0
4,No log,0.144957,0.0


[I 2023-11-29 09:40:58,092] Trial 8 finished with value: 0.0 and parameters: {'learning_rate': 4.7656763033114183e-05, 'num_train_epochs': 4, 'seed': 35, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 0.3933814961547425.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1675,0.135661,0.208538


[I 2023-11-29 09:52:17,173] Trial 9 finished with value: 0.2085383694640245 and parameters: {'learning_rate': 3.863265673217337e-05, 'num_train_epochs': 1, 'seed': 13, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.3933814961547425.


In [None]:
# Rebuild the trainer for the final run: same configuration as the search trainer,
# but `train_dataset` is the whole `train_dataloader` rather than a single shard.
trainer = CustomTrainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataloader,
    eval_dataset=valid_dataloader,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
# Copy every hyperparameter of the best search trial onto the trainer's arguments.
# NOTE(review): `trainer.args` is presumably the same object as `args`, in which
# case these overrides also persist on `args` for any later cell -- confirm.
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

# Train with the overridden settings; the cell displays the returned TrainOutput.
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.0897,0.086281,0.563834
2,0.0796,0.08357,0.586397


TrainOutput(global_step=21706, training_loss=0.09429715695298126, metrics={'train_runtime': 1145.1487, 'train_samples_per_second': 75.815, 'train_steps_per_second': 18.955, 'total_flos': 2856079305861120.0, 'train_loss': 0.09429715695298126, 'epoch': 2.0})

Evaluation

In [None]:
# Report evaluation metrics for the trainer fitted in the preceding cell.
# NOTE(review): the saved output of this cell is a NameError, so it was last run
# on a kernel where a required name was missing -- re-run after the training cell.
trainer.evaluate()

NameError: ignored

# XLNet


In [None]:
from transformers import XLNetForSequenceClassification

# XLNet sweep: only the maximum sequence length varies between trials; batch size,
# threshold and learning rate are held fixed.
base = "xlnet-base-cased"
n_batch = 256
count = 1  # 1-based trial number printed in the banner
thres = 0.5
lr = 2e-5

# To sweep the threshold or learning rate as well, nest additional loops here
# (values tried previously: thres in [0.25, 0.5, 0.75], lr in [2e-3, 2e-5]).
for max_length in [32, 64, 128, 512]:
    # Build the tokenizer inside the loop so its model_max_length tracks the length
    # under test. Creating it once before the loop read `max_length` before this
    # cell had assigned it, i.e. a stale value left by an earlier cell (or a
    # NameError on a fresh kernel), and it never followed the loop variable.
    xlnet_tokenizer = AutoTokenizer.from_pretrained(base, use_fast=True, model_max_length=max_length, do_lower_case=False)

    print(f'Trial {count}: max sequence length: {max_length}, threshold: {thres}, learning rate: {lr}')
    print('+'*50)
    print('+'*50)
    count += 1
    train_and_eval(base, XLNetForSequenceClassification, xlnet_tokenizer, n_batch, max_length, thres, lr)

Trial 1: max sequence length: 32, threshold: 0.5, learning rate: 2e-05
++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-11-30 03:40:20,240] A new study created in memory with name: no-name-6c48b0b8-9f2f-47be-bf4d-c82a5648a783
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a XLNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the tex

Epoch,Training Loss,Validation Loss,F1
1,No log,0.202329,0.0
2,No log,0.17784,0.0
3,No log,0.173961,0.0


[I 2023-11-30 03:48:12,275] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 3.631827074569141e-06, 'num_train_epochs': 3, 'seed': 2, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.150801,0.0
2,No log,0.148925,0.0


[I 2023-11-30 03:55:06,171] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 2.9962561727013183e-05, 'num_train_epochs': 2, 'seed': 33, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.22794,0.0
2,No log,0.202605,0.0


[I 2023-11-30 04:02:08,725] Trial 2 finished with value: 0.0 and parameters: {'learning_rate': 1.1106756991011496e-05, 'num_train_epochs': 2, 'seed': 30, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.2813,0.165695,0.0
2,0.1686,0.155884,0.0
3,0.1599,0.1541,0.0


[I 2023-11-30 04:10:11,457] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 2.8017294367254946e-06, 'num_train_epochs': 3, 'seed': 20, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.16155,0.0


[I 2023-11-30 04:17:39,246] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 1.0647375447282852e-05, 'num_train_epochs': 1, 'seed': 19, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.255052,0.0
2,No log,0.224298,0.0


[I 2023-11-30 04:24:53,570] Trial 5 finished with value: 0.0 and parameters: {'learning_rate': 4.435168252016349e-06, 'num_train_epochs': 2, 'seed': 37, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.3189,0.20162,0.0


[I 2023-11-30 04:32:41,259] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 2.0638572113320786e-06, 'num_train_epochs': 1, 'seed': 9, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.2428,0.157867,0.0
2,0.1591,0.151178,0.0
3,0.1544,0.149838,0.0


[I 2023-11-30 04:40:36,931] Trial 7 finished with value: 0.0 and parameters: {'learning_rate': 4.223147672245012e-06, 'num_train_epochs': 3, 'seed': 27, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.349383,0.006173
2,No log,0.294334,0.0


[I 2023-11-30 04:49:03,390] Trial 8 finished with value: 0.0 and parameters: {'learning_rate': 1.941367997192519e-06, 'num_train_epochs': 2, 'seed': 35, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.174923,0.0


[I 2023-11-30 04:57:41,037] Trial 9 finished with value: 0.0 and parameters: {'learning_rate': 1.4217840456298608e-05, 'num_train_epochs': 1, 'seed': 28, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Use fine-tuned hyperparameter to train on the full train set
--------------------------------------------------


Epoch,Training Loss,Validation Loss,F1
1,0.1274,0.116658,0.43881
2,0.1115,0.10307,0.454365
3,0.104,0.10045,0.472618


Evaluation on validation set
--------------------------------------------------


Trial 2: max sequence length: 64, threshold: 0.5, learning rate: 2e-05
++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-11-30 05:09:15,395] A new study created in memory with name: no-name-2d07ae06-bc6f-4741-9d47-526a58242a30
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.152777,0.0
2,No log,0.148611,0.0
3,No log,0.145939,0.0
4,No log,0.140332,0.0


[I 2023-11-30 05:18:03,132] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 4.198390968640704e-05, 'num_train_epochs': 4, 'seed': 16, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.15375,0.0


[I 2023-11-30 05:27:34,724] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 1.7209619589768727e-05, 'num_train_epochs': 1, 'seed': 29, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.155819,0.0


[I 2023-11-30 05:36:04,436] Trial 2 finished with value: 0.0 and parameters: {'learning_rate': 2.720308038638284e-05, 'num_train_epochs': 1, 'seed': 1, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.153823,0.0


[W 2023-11-30 05:40:08,569] Trial 3 failed with parameters: {'learning_rate': 3.209140855991639e-05, 'num_train_epochs': 1, 'seed': 6, 'per_device_train_batch_size': 16} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/integration_utils.py", line 195, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1546, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2001, in _inner_training_loop
    self._finish_current_push()
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 3657, in _finish_current_push
    self.push_in_progress.wait_until_done()
 

KeyboardInterrupt: ignored