In [2]:
!pip install -q datasets evaluate
!pip install -q torch
!pip install -q transformers[torch]
!pip install -q accelerate -U
!pip install -q optuna

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/521.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/521.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90

In [3]:
import torch
from transformers import AutoTokenizer, TrainingArguments, Trainer, set_seed
from torch.utils.data import DataLoader
from datasets import Dataset
import pandas as pd
from datasets import load_dataset
import random
import os
import numpy as np
from tqdm.auto import tqdm
import evaluate

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(7)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

set_seed(7)

# Load Data

Download all datasets

In [6]:
# Loading the GoEmotions dataset in its simplified version.
dataset = load_dataset("go_emotions", "simplified")
ds_train = dataset['train']
ds_test = dataset['test']
ds_validation = dataset['validation']

# create smaller dataset for experiments and tuning
small_train_ds = ds_train.shuffle(seed=7).select(range(int(len(ds_train['text']) / 20)))
small_val_ds = ds_validation.shuffle(seed=7).select(range(int(len(ds_validation['text']) / 20)))

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.03k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.12k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/203k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/201k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [7]:
id2label = {0:"admiration",
            1:"amusement",
            2:"anger",
            3:"annoyance",
            4:"approval",
            5:"caring",
            6:"confusion",
            7:"curiosity",
            8:"desire",
            9:"disappointment",
            10:"disapproval",
            11:"disgust",
            12:"embarrassment",
            13:"excitement",
            14:"fear",
            15:"gratitude",
            16:"grief",
            17:"joy",
            18:"love",
            19:"nervousness",
            20:"optimism",
            21:"pride",
            22:"realization",
            23:"relief",
            24:"remorse",
            25:"sadness",
            26:"surprise",
            27:"neutral"}

N_LABELS = len(id2label)   #27 emotion categories or Neutral

# Data Cleaning and Preprocessing

Converting emojis to descriptive text.

In [8]:
!pip install -q --upgrade emoji

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/358.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/358.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
import emoji

# Apply the function to the 'text' column
ds_train = ds_train.map(lambda x: {'text': emoji.demojize(x['text'])})
ds_test = ds_test.map(lambda x: {'text': emoji.demojize(x['text'])})
ds_validation = ds_validation.map(lambda x: {'text': emoji.demojize(x['text'])})

small_train_ds = small_train_ds.map(lambda x: {'text': emoji.demojize(x['text'])})
small_val_ds = small_val_ds.map(lambda x: {'text': emoji.demojize(x['text'])})

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

remove non-alphabetical characters


In [10]:
import re

ds_train = ds_train.map(lambda x: {'text': re.sub(r'[^a-zA-Z]', ' ', x['text'])})
ds_test = ds_test.map(lambda x: {'text': re.sub(r'[^a-zA-Z]', ' ', x['text'])})
ds_validation = ds_validation.map(lambda x: {'text': re.sub(r'[^a-zA-Z]', ' ', x['text'])})

small_train_ds = small_train_ds.map(lambda x: {'text': re.sub(r'[^a-zA-Z]', ' ', x['text'])})
small_val_ds = small_val_ds.map(lambda x: {'text': re.sub(r'[^a-zA-Z]', ' ', x['text'])})

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

remove extra spaces

In [11]:
def remove_spaces(string):
    return ' '.join(string.split())

ds_train = ds_train.map(lambda x: {'text': remove_spaces(x['text'])})
ds_test = ds_test.map(lambda x: {'text': remove_spaces(x['text'])})
ds_validation = ds_validation.map(lambda x: {'text': remove_spaces(x['text'])})

small_train_ds = small_train_ds.map(lambda x: {'text': remove_spaces(x['text'])})
small_val_ds = small_val_ds.map(lambda x: {'text': remove_spaces(x['text'])})

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

In [12]:
len(ds_train['text'])

43410

# Model training and fine-tuning

In [13]:
def custom_dataloader(data, tokenizer, batch_size, shuffle, max_length=128):
    data = data.to_pandas()

    def multi_label(x):
        return [1 if i in [int(label) for label in x] else 0 for i in range(N_LABELS)]

    def tokenize_function(x):
        return tokenizer(x["text"], padding="max_length", truncation=True,
                         max_length=max_length, return_tensors="pt", return_token_type_ids=True)

    data['labels'] = data.apply(lambda x: multi_label(x['labels']), axis=1)
    data = data.drop('id',axis=1)
    datasets = Dataset.from_pandas(data)
    tokenized_datasets = datasets.map(tokenize_function, batched=True)
    tokenized_datasets.set_format("torch", columns=['input_ids', 'attention_mask', 'labels', 'token_type_ids'])
    # dataloader = DataLoader(tokenized_datasets, shuffle=shuffle, batch_size=batch_size)

    return tokenized_datasets

metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = (torch.sigmoid(torch.tensor(logits)) > thres).int().cpu().numpy()
    # Flatten the lists of lists to single lists
    flat_predictions = [item for sublist in predictions for item in sublist]
    flat_labels = [item for sublist in labels.tolist() for item in sublist]
    formatted_predictions = {"predictions": flat_predictions, "references": flat_labels}
    return metric.compute(**formatted_predictions)


class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
            outputs = model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask']
            )
            logits = outputs['logits']
            labels = inputs['labels'].float()

            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)

            return (loss, outputs) if return_outputs else loss

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [14]:
def train_and_eval(base, sequence_classification, tokenizer, n_batch, max_length, thres=0.5, n_epoch=5, lr=2e-5):
    train_dataloader = custom_dataloader(ds_train, tokenizer, n_batch, True, max_length)
    # test_dataloader = custom_dataloader(ds_test, tokenizer, n_batch, False, max_length)
    valid_dataloader = custom_dataloader(ds_validation, tokenizer, n_batch, False, max_length)

    small_train_ld = custom_dataloader(small_train_ds, tokenizer, n_batch, True, max_length)
    small_val_ld = custom_dataloader(small_val_ds, tokenizer, n_batch, True, max_length)

    args = TrainingArguments(
        f"{base}-finetuned",
        evaluation_strategy = "epoch",
        save_strategy = "epoch",
        learning_rate=lr,
        per_device_train_batch_size=n_batch,
        per_device_eval_batch_size=n_batch,
        num_train_epochs=n_epoch,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        push_to_hub=True,
    )

    def model_init():
        return sequence_classification.from_pretrained(base, num_labels=N_LABELS)

    trainer_tune = CustomTrainer(
        model_init=model_init,
        args=args,
        # train_dataset=train_dataloader.shard(index=10, num_shards=20), #find hyperparameters based on a portion of the training dataset
        # eval_dataset=valid_dataloader,
        train_dataset=small_train_ld,
        eval_dataset=small_val_ld,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    best_run = trainer_tune.hyperparameter_search(n_trials=10, direction="maximize")

    # model = sequence_classification.from_pretrained(base, num_labels=N_LABELS)

    # print('-'*100)
    # print('Use fine-tuned hyperparameter to train on the full train set')

    # trainer = CustomTrainer(
    #     model=model,
    #     args=args,
    #     train_dataset=train_dataloader,
    #     eval_dataset=valid_dataloader,
    #     tokenizer=tokenizer,
    #     compute_metrics=compute_metrics
    # )

    # for n, v in best_run.hyperparameters.items():
    #     setattr(trainer.args, n, v)

    # trainer.train()

    # print('-'*100)
    # print('Evaluation on validation set')

    # trainer.evaluate()

    # torch.save(model.state_dict(), f"{base}-max-len={max_length}_thres={thres}_lr={lr}_batch={n_batch}.pt")

    return best_run

# ALBERT

**Trial 1:**

batch size 256

max sequence length 128

epoch 5

learning rate 2e-5

threshold 0.5


In [None]:
from transformers import AlbertForSequenceClassification

base = "albert-base-v2"
n_batch = 256
max_length = 128
lr = 2e-5
thres = 0.5
n_epoch = 5
albert_tokenizer = AutoTokenizer.from_pretrained(base, use_fast=True, model_max_length=max_length, do_lower_case=False)
# albert_model = AlbertForSequenceClassification.from_pretrained(base, num_labels=N_LABELS)

# base, sequence_classification, tokenizer, n_batch, max_length, thres=0.5, n_epoch=5, lr=2e-5
best_run = train_and_eval(base, AlbertForSequenceClassification, albert_tokenizer, n_batch, max_length, thres, n_epoch, lr)


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-12-01 05:54:06,226] A new study created in memory with name: no-name-c1950e95-382b-4065-86d1-6bb31dc48613
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.148729,0.0
2,No log,0.135925,0.094955
3,No log,0.132888,0.228571


[I 2023-12-01 05:57:05,532] Trial 0 finished with value: 0.22857142857142856 and parameters: {'learning_rate': 3.3262207749436694e-05, 'num_train_epochs': 3, 'seed': 6, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.22857142857142856.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.17508,0.0
2,No log,0.157886,0.0
3,No log,0.15263,0.0


[I 2023-12-01 06:00:05,554] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 1.2540187226296499e-05, 'num_train_epochs': 3, 'seed': 34, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.22857142857142856.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.39035,0.012658
2,No log,0.281571,0.0
3,No log,0.24413,0.0
4,No log,0.228421,0.0
5,No log,0.22425,0.0


[I 2023-12-01 06:04:32,236] Trial 2 finished with value: 0.0 and parameters: {'learning_rate': 7.728558213353496e-06, 'num_train_epochs': 5, 'seed': 2, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 0.22857142857142856.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.18011,0.0
2,0.239000,0.158246,0.0
3,0.239000,0.150254,0.0
4,0.155000,0.145891,0.084592
5,0.155000,0.144421,0.061538


[I 2023-12-01 06:09:49,485] Trial 3 finished with value: 0.06153846153846153 and parameters: {'learning_rate': 5.106249997146786e-06, 'num_train_epochs': 5, 'seed': 2, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.22857142857142856.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.188127,0.0
2,No log,0.16036,0.0
3,No log,0.157106,0.0


[I 2023-12-01 06:12:49,135] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 4.572438057246573e-05, 'num_train_epochs': 3, 'seed': 31, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 0.22857142857142856.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.646425,0.062402
2,No log,0.597724,0.064711
3,No log,0.570639,0.065961
4,No log,0.561811,0.066914


[I 2023-12-01 06:16:37,574] Trial 5 finished with value: 0.06691449814126393 and parameters: {'learning_rate': 1.0404077577621375e-06, 'num_train_epochs': 4, 'seed': 36, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 0.22857142857142856.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.2403,0.155754,0.0
2,0.1509,0.138496,0.214286
3,0.1382,0.131981,0.345291
4,0.1276,0.128234,0.335766
5,0.1254,0.126945,0.352113


[I 2023-12-01 06:21:59,894] Trial 6 finished with value: 0.35211267605633806 and parameters: {'learning_rate': 5.853134071527912e-06, 'num_train_epochs': 5, 'seed': 40, 'per_device_train_batch_size': 4}. Best is trial 6 with value: 0.35211267605633806.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.258183,0.0
2,No log,0.22339,0.0


[I 2023-12-01 06:24:16,743] Trial 7 finished with value: 0.0 and parameters: {'learning_rate': 1.0663539632784356e-05, 'num_train_epochs': 2, 'seed': 21, 'per_device_train_batch_size': 32}. Best is trial 6 with value: 0.35211267605633806.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.147075,0.0
2,0.174900,0.137968,0.060976


[I 2023-12-01 06:26:44,667] Trial 8 finished with value: 0.06097560975609755 and parameters: {'learning_rate': 2.128933168749278e-05, 'num_train_epochs': 2, 'seed': 11, 'per_device_train_batch_size': 8}. Best is trial 6 with value: 0.35211267605633806.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.443599,0.048565
2,No log,0.377464,0.006369


[I 2023-12-01 06:29:03,182] Trial 9 finished with value: 0.006369426751592357 and parameters: {'learning_rate': 7.904107748069728e-06, 'num_train_epochs': 2, 'seed': 4, 'per_device_train_batch_size': 64}. Best is trial 6 with value: 0.35211267605633806.


In [None]:
train_dataloader = custom_dataloader(ds_train, albert_tokenizer, n_batch, True, max_length)
# test_dataloader = custom_dataloader(ds_test, tokenizer, n_batch, False, max_length)
valid_dataloader = custom_dataloader(ds_validation, albert_tokenizer, n_batch, False, max_length)

args = TrainingArguments(
      f"{base}-finetuned",
      evaluation_strategy = "epoch",
      save_strategy = "epoch",
      learning_rate=lr,
      per_device_train_batch_size=n_batch,
      per_device_eval_batch_size=n_batch,
      num_train_epochs=n_epoch,
      weight_decay=0.01,
      load_best_model_at_end=True,
      metric_for_best_model="f1",
      push_to_hub=True,
  )

model=AlbertForSequenceClassification.from_pretrained(base, num_labels=N_LABELS)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataloader,
    eval_dataset=valid_dataloader,
    tokenizer=albert_tokenizer,
    compute_metrics=compute_metrics
)
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.0952,0.093627,0.51536
2,0.0851,0.087808,0.563423
3,0.0793,0.086407,0.572637
4,0.0729,0.088644,0.577032
5,0.0657,0.088974,0.579229


TrainOutput(global_step=54265, training_loss=0.08452786619867761, metrics={'train_runtime': 5565.3263, 'train_samples_per_second': 39.0, 'train_steps_per_second': 9.751, 'total_flos': 1300103634585600.0, 'train_loss': 0.08452786619867761, 'epoch': 5.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.0889739841222763,
 'eval_f1': 0.5792290249433106,
 'eval_runtime': 46.3956,
 'eval_samples_per_second': 116.951,
 'eval_steps_per_second': 0.474,
 'epoch': 5.0}

In [None]:
torch.save(model.state_dict(), f"{base}-max-len={max_length}_thres={thres}_lr={lr}_batch={n_batch}.pt")

# RoBERTa

**Trial 1:**

batch size 256

max sequence length 128

epoch 5

learning rate 2e-5

threshold 0.5


In [16]:
from transformers import RobertaForSequenceClassification

base = "roberta-base"
n_batch = 256
max_length = 128
lr = 2e-5
thres = 0.5
n_epoch = 5
roberta_tokenizer = AutoTokenizer.from_pretrained(base, use_fast=True, model_max_length=max_length, do_lower_case=False)

# base, sequence_classification, tokenizer, n_batch, max_length, thres=0.5, n_epoch=5, lr=2e-5
best_run = train_and_eval(base, RobertaForSequenceClassification, roberta_tokenizer, n_batch, max_length, thres, n_epoch, lr)


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-12-02 03:40:54,728] A new study created in memory with name: no-name-1d7f92bc-75b9-4a20-aaa8-2e6313b3a8eb
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text 

Epoch,Training Loss,Validation Loss,F1
1,0.1628,0.147004,0.0
2,0.1515,0.146587,0.0


[I 2023-12-02 03:44:25,630] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 7.20934299457575e-05, 'num_train_epochs': 2, 'seed': 38, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 0.0.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.150834,0.0


[I 2023-12-02 03:46:23,229] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 6.326052953316336e-05, 'num_train_epochs': 1, 'seed': 18, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.0.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.146864,0.0
2,0.169500,0.131589,0.368889
3,0.169500,0.119986,0.310526
4,0.123800,0.113267,0.446389
5,0.123800,0.111391,0.467249


[I 2023-12-02 03:52:57,628] Trial 2 finished with value: 0.46724890829694316 and parameters: {'learning_rate': 3.470015441919893e-05, 'num_train_epochs': 5, 'seed': 7, 'per_device_train_batch_size': 8}. Best is trial 2 with value: 0.46724890829694316.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.251087,0.0
2,No log,0.215905,0.0


[I 2023-12-02 03:55:31,660] Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 3.441003253275865e-05, 'num_train_epochs': 2, 'seed': 31, 'per_device_train_batch_size': 64}. Best is trial 2 with value: 0.46724890829694316.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.161989,0.0


[I 2023-12-02 03:58:23,992] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 2.082301821400477e-05, 'num_train_epochs': 1, 'seed': 19, 'per_device_train_batch_size': 8}. Best is trial 2 with value: 0.46724890829694316.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.162825,0.0
2,No log,0.15479,0.0


[I 2023-12-02 04:01:13,581] Trial 5 finished with value: 0.0 and parameters: {'learning_rate': 2.4061789637885192e-05, 'num_train_epochs': 2, 'seed': 5, 'per_device_train_batch_size': 16}. Best is trial 2 with value: 0.46724890829694316.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.276231,0.0
2,No log,0.215329,0.0
3,No log,0.192684,0.0
4,No log,0.183096,0.0
5,No log,0.180336,0.0


[I 2023-12-02 04:08:16,794] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 9.656231101012835e-06, 'num_train_epochs': 5, 'seed': 13, 'per_device_train_batch_size': 32}. Best is trial 2 with value: 0.46724890829694316.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.146186,0.0
2,0.159500,0.135549,0.0


[I 2023-12-02 04:10:19,729] Trial 7 pruned. 
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.311668,0.0
2,No log,0.27883,0.0


[I 2023-12-02 04:12:22,771] Trial 8 finished with value: 0.0 and parameters: {'learning_rate': 3.930905316431344e-06, 'num_train_epochs': 2, 'seed': 13, 'per_device_train_batch_size': 16}. Best is trial 2 with value: 0.46724890829694316.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.5004,0.360226,0.0


[I 2023-12-02 04:17:48,799] Trial 9 finished with value: 0.0 and parameters: {'learning_rate': 1.0596050299315528e-06, 'num_train_epochs': 1, 'seed': 36, 'per_device_train_batch_size': 4}. Best is trial 2 with value: 0.46724890829694316.


In [17]:
train_dataloader = custom_dataloader(ds_train, roberta_tokenizer, n_batch, True, max_length)
# test_dataloader = custom_dataloader(ds_test, tokenizer, n_batch, False, max_length)
valid_dataloader = custom_dataloader(ds_validation, roberta_tokenizer, n_batch, False, max_length)

args = TrainingArguments(
      f"{base}-finetuned",
      evaluation_strategy = "epoch",
      save_strategy = "epoch",
      learning_rate=lr,
      per_device_train_batch_size=n_batch,
      per_device_eval_batch_size=n_batch,
      num_train_epochs=n_epoch,
      weight_decay=0.01,
      load_best_model_at_end=True,
      metric_for_best_model="f1",
      push_to_hub=True,
  )

model=RobertaForSequenceClassification.from_pretrained(base, num_labels=N_LABELS)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataloader,
    eval_dataset=valid_dataloader,
    tokenizer=roberta_tokenizer,
    compute_metrics=compute_metrics
)
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.095,0.087791,0.558829
2,0.0836,0.085134,0.572858
3,0.0722,0.085537,0.572271
4,0.0616,0.088741,0.582512
5,0.0528,0.093401,0.577369


TrainOutput(global_step=27135, training_loss=0.07582867520277685, metrics={'train_runtime': 5651.1733, 'train_samples_per_second': 38.408, 'train_steps_per_second': 4.802, 'total_flos': 1.42803965293056e+16, 'train_loss': 0.07582867520277685, 'epoch': 5.0})

In [18]:
trainer.evaluate()

{'eval_loss': 0.08874129503965378,
 'eval_f1': 0.5825124226309824,
 'eval_runtime': 34.8284,
 'eval_samples_per_second': 155.792,
 'eval_steps_per_second': 0.632,
 'epoch': 5.0}

In [19]:
torch.save(model.state_dict(), f"{base}-max-len={max_length}_thres={thres}_lr={lr}_batch={n_batch}.pt")

# XLNet


In [20]:
from transformers import XLNetForSequenceClassification
base = "xlnet-base-cased"
n_batch = 256
max_length = 128
lr = 2e-5
thres = 0.5
n_epoch = 5
xlnet_tokenizer = AutoTokenizer.from_pretrained(base, use_fast=True, model_max_length=max_length, do_lower_case=False)

# base, sequence_classification, tokenizer, n_batch, max_length, thres=0.5, n_epoch=5, lr=2e-5
best_run = train_and_eval(base, XLNetForSequenceClassification, xlnet_tokenizer, n_batch, max_length, thres, n_epoch, lr)


config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/2170 [00:00<?, ? examples/s]

Map:   0%|          | 0/271 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2023-12-02 05:57:24,069] A new study created in memory with name: no-name-4eccbdf8-767a-4cfe-bb05-4ace7770422f
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a XLNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the tex

Epoch,Training Loss,Validation Loss,F1
1,No log,0.254858,0.0


[I 2023-12-02 05:59:36,218] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 1.3630443103548274e-05, 'num_train_epochs': 1, 'seed': 28, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.169315,0.0
2,No log,0.161484,0.0


[I 2023-12-02 06:02:00,377] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 1.8240233707821746e-05, 'num_train_epochs': 2, 'seed': 29, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.198873,0.0
2,No log,0.177779,0.0
3,No log,0.173913,0.0


[I 2023-12-02 06:06:45,924] Trial 2 finished with value: 0.0 and parameters: {'learning_rate': 4.193235406461215e-06, 'num_train_epochs': 3, 'seed': 37, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.0.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.2208,0.152346,0.0
2,0.1537,0.148573,0.0
3,0.1522,0.14626,0.0
4,0.1485,0.140487,0.123529
5,0.1453,0.138334,0.13913


[I 2023-12-02 06:15:43,329] Trial 3 finished with value: 0.13913043478260873 and parameters: {'learning_rate': 5.099888719643852e-06, 'num_train_epochs': 5, 'seed': 24, 'per_device_train_batch_size': 4}. Best is trial 3 with value: 0.13913043478260873.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.147468,0.0
2,No log,0.14628,0.0
3,No log,0.146654,0.0


[I 2023-12-02 06:21:13,983] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 9.646501935103799e-05, 'num_train_epochs': 3, 'seed': 37, 'per_device_train_batch_size': 16}. Best is trial 3 with value: 0.13913043478260873.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.147267,0.0
2,0.161500,0.129651,0.152047
3,0.161500,0.120852,0.345499
4,0.125600,0.114162,0.426667
5,0.125600,0.112384,0.438596


[I 2023-12-02 06:27:22,890] Trial 5 finished with value: 0.43859649122807026 and parameters: {'learning_rate': 3.542605573277489e-05, 'num_train_epochs': 5, 'seed': 22, 'per_device_train_batch_size': 8}. Best is trial 5 with value: 0.43859649122807026.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.153568,0.0


[I 2023-12-02 06:31:33,510] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 1.680933470100455e-05, 'num_train_epochs': 1, 'seed': 13, 'per_device_train_batch_size': 8}. Best is trial 5 with value: 0.43859649122807026.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,No log,0.167562,0.0


[I 2023-12-02 06:35:04,966] Trial 7 finished with value: 0.0 and parameters: {'learning_rate': 1.51920227592585e-05, 'num_train_epochs': 1, 'seed': 2, 'per_device_train_batch_size': 16}. Best is trial 5 with value: 0.43859649122807026.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1909,0.144151,0.0
2,0.1449,0.131687,0.248101
3,0.1313,0.125298,0.28125
4,0.1278,0.123653,0.376471


[I 2023-12-02 06:42:28,383] Trial 8 finished with value: 0.3764705882352941 and parameters: {'learning_rate': 1.1179635573623456e-05, 'num_train_epochs': 4, 'seed': 28, 'per_device_train_batch_size': 4}. Best is trial 5 with value: 0.43859649122807026.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.2701,0.162667,0.0
2,0.1645,0.15365,0.0


[I 2023-12-02 06:45:14,834] Trial 9 pruned. 


In [21]:
train_dataloader = custom_dataloader(ds_train, xlnet_tokenizer, n_batch, True, max_length)
# test_dataloader = custom_dataloader(ds_test, tokenizer, n_batch, False, max_length)
valid_dataloader = custom_dataloader(ds_validation, xlnet_tokenizer, n_batch, False, max_length)

args = TrainingArguments(
      f"{base}-finetuned",
      evaluation_strategy = "epoch",
      save_strategy = "epoch",
      learning_rate=lr,
      per_device_train_batch_size=n_batch,
      per_device_eval_batch_size=n_batch,
      num_train_epochs=n_epoch,
      weight_decay=0.01,
      load_best_model_at_end=True,
      metric_for_best_model="f1",
      push_to_hub=True,
  )

model=XLNetForSequenceClassification.from_pretrained(base, num_labels=N_LABELS)

trainer = CustomTrainer(
    model=model,
    args=args,
    train_dataset=train_dataloader,
    eval_dataset=valid_dataloader,
    tokenizer=xlnet_tokenizer,
    compute_metrics=compute_metrics
)
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()

Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.1015,0.096203,0.504023
2,0.0892,0.087982,0.555012
3,0.0761,0.087583,0.563328
4,0.0646,0.091062,0.565829
5,0.0536,0.096055,0.569239


TrainOutput(global_step=27135, training_loss=0.08057146142678782, metrics={'train_runtime': 6698.0336, 'train_samples_per_second': 32.405, 'train_steps_per_second': 4.051, 'total_flos': 1.5461649057024e+16, 'train_loss': 0.08057146142678782, 'epoch': 5.0})

In [22]:
trainer.evaluate()

{'eval_loss': 0.09605488181114197,
 'eval_f1': 0.5692387185669997,
 'eval_runtime': 64.9656,
 'eval_samples_per_second': 83.521,
 'eval_steps_per_second': 0.339,
 'epoch': 5.0}

In [23]:
torch.save(model.state_dict(), f"{base}-max-len={max_length}_thres={thres}_lr={lr}_batch={n_batch}.pt")