In [1]:
!pip install -q torch torchinfo bitsandbytes peft trl accelerate datasets transformers sentencepiece

[0m

In [2]:
import torch
from torchinfo import summary
from datasets import load_dataset, Dataset
from torch.utils.data import ConcatDataset
import sys
# Make modules that live one directory above the working directory importable.
sys.path.append('../')

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import DistilBertTokenizer
# A notebook executes as a top-level script without a parent package, so a
# relative import ("from ..module import ...") cannot resolve here. The parent
# directory is appended to sys.path earlier in the notebook, which makes the
# module importable by its plain name.
from MDMT_DistilBert import MDMT_DistilBertConfig, MDMT_DistilBertWrapper

# One entry per task head: number of regression targets and the loss criterion.
# NOTE(review): the order presumably has to match the task_index values given to
# the datasets further down (1, 6 and 1 label columns respectively) -- confirm.
tasks_configs = [
    dict(target_features=1, criterion_type='MSE'),
    dict(target_features=6, criterion_type='MSE'),
    dict(target_features=1, criterion_type='MSE'),
]
config = MDMT_DistilBertConfig(tasks_configs=tasks_configs, max_position_embeddings=2048)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# ignore_mismatched_sizes=True lets the checkpoint load although its position
# embedding table (512 rows) differs from the configured one (2048 rows).
model = MDMT_DistilBertWrapper.from_pretrained("distilbert-base-uncased", config=config, ignore_mismatched_sizes=True).to(device)
# NOTE(review): the loading log reports the position embeddings as already
# instantiated with 2048 rows (and newly initialized), so this call may be
# redundant -- confirm before removing.
model.base_model.resize_position_embeddings(2048)

[{'target_features': 1, 'criterion_type': 'MSE'}, {'target_features': 6, 'criterion_type': 'MSE'}, {'target_features': 1, 'criterion_type': 'MSE'}]


Some weights of MDMT_DistilBertWrapper were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifiers.0.bias', 'classifiers.0.weight', 'classifiers.1.bias', 'classifiers.1.weight', 'classifiers.2.bias', 'classifiers.2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of MDMT_DistilBertWrapper were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized because the shapes did not match:
- distilbert.embeddings.position_embeddings.weight: found shape torch.Size([512, 768]) in the checkpoint and torch.Size([2048, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Smoke-test the freshly loaded model on a short prompt, then print a
# layer-by-layer summary (output shapes and parameter counts).
text = "Quote: Imagination is more"
inputs = tokenizer(text, return_tensors="pt").to(device)

# Forward pass only to verify that the model runs; the logits are not kept.
model(**inputs).logits

model_summary = summary(model, input_data=[inputs.input_ids, inputs.attention_mask])
print(model_summary)

Layer (type:depth-idx)                                  Output Shape              Param #
MDMT_DistilBertWrapper                                  [1, 8]                    --
├─DistilBertModel: 1-1                                  [1, 7, 768]               --
│    └─Embeddings: 2-1                                  [1, 7, 768]               --
│    │    └─Embedding: 3-1                              [1, 7, 768]               23,440,896
│    │    └─Embedding: 3-2                              [1, 7, 768]               1,572,864
│    │    └─LayerNorm: 3-3                              [1, 7, 768]               1,536
│    │    └─Dropout: 3-4                                [1, 7, 768]               --
│    └─Transformer: 2-2                                 [1, 7, 768]               --
│    │    └─ModuleList: 3-5                             --                        42,527,232
├─ModuleList: 1-2                                       --                        --
│    └─Linear: 2-3                

In [5]:
class MTMD_Dataset():
    """Wrap an indexable dataset of records and tag each record with task metadata.

    Every record returned by ``__getitem__`` keeps all of its original fields and
    additionally carries three normalized keys:

    * ``"task_index"`` -- the task identifier passed to the constructor,
    * ``"full_text"``  -- the value of the record's text column,
    * ``"labels"``     -- the values of the label columns, in the given order.
    """

    def __init__(self, dataset, task_index, text: str, labels: list[str]):
        """Store the wrapped dataset and the column mapping.

        Args:
            dataset: Object supporting ``len()`` and ``[]`` whose items are mappings.
            task_index: Identifier attached to every record as ``"task_index"``.
            text: Name of the column holding the input text.
            labels: Names of the columns holding the target values.
        """
        self.dataset = dataset
        self.task_index = task_index
        self.text = text
        self.labels = labels

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return a copy of record ``idx`` extended with the normalized keys."""
        # Shallow-copy first: writing the extra keys straight into the object
        # returned by the wrapped dataset would silently modify the source
        # records when that dataset hands out its own stored mappings.
        item = dict(self.dataset[idx])
        item["task_index"] = self.task_index
        item["full_text"] = item[self.text]
        item["labels"] = [item[label] for label in self.labels]
        return item

In [6]:
# Task 0: a single "score" target per essay.
essay_scoring_dataset = MTMD_Dataset(
    load_dataset("essay_scoring", data_files="train.csv", split="train"),
    task_index=0,
    text="full_text",
    labels=["score"],
)
# Task 1: six analytic targets per essay.
ellipse_scoring_dataset = MTMD_Dataset(
    load_dataset("ellipse", data_files="train.csv", split="train"),
    task_index=1,
    text="full_text",
    labels=["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"],
)
#short_answer_scoring_dataset = MTMD_Dataset(load_dataset(path="short_answer_scoring", data_files="train.tsv", split="train"), task_index=2, text="EssayText", labels=["Score1"])
# Task 2: a single "BT_easiness" target per excerpt.
clear_dataset = MTMD_Dataset(
    load_dataset(path="clear", data_files="CLEAR_corpus_final.csv", split="train"),
    task_index=2,
    text="Excerpt",
    labels=["BT_easiness"],
)

datasets = [essay_scoring_dataset, ellipse_scoring_dataset, clear_dataset]
dataset = ConcatDataset(datasets)

# Per-task loss weights: inverse dataset size, L1-normalized so they sum to one.
dataset_sizes = [len(task_dataset) for task_dataset in datasets]
inverse_sizes = 1 / torch.tensor(dataset_sizes, device=device, dtype=torch.float)
loss_weights = torch.nn.functional.normalize(inverse_sizes, p=1.0, dim=0)
print(f"dataset lenghts: {dataset_sizes}, loss weights: {loss_weights.data}")

dataset lenghts: [17307, 3911, 4724], loss weights: tensor([0.1100, 0.4869, 0.4031], device='cuda:0')


In [7]:
# Hold out 10% of the combined dataset for evaluation.
# A dedicated, seeded generator makes the split identical on every run, so the
# evaluation loss reported below can be reproduced.
SPLIT_SEED = 42
train_dataset, eval_dataset = torch.utils.data.random_split(
    dataset,
    [0.9, 0.1],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print(len(train_dataset), len(eval_dataset))

23348 2594


In [8]:
def collate_fn(batch):
    """Turn a list of example dicts into one model-input dict.

    Each example must provide "full_text", "labels" and "task_index".

    Returns a dict with:
      * "input_ids" / "attention_mask": tokenizer output for the batch texts,
        padded to the longest text and truncated to
        config.max_position_embeddings tokens (not moved to `device` here);
      * "labels": a list with one float32 tensor per example, created on
        `device` (examples may have different numbers of labels, so they are
        not stacked into a single tensor);
      * "task_indices": a long tensor on `device` with one task index per example;
      * "loss_weights": the module-level `loss_weights` tensor, passed through.
    """
    texts = []
    labels = []
    task_ids = []
    for example in batch:
        texts.append(example["full_text"])
        label_tensors = [
            torch.tensor(value, device=device, dtype=torch.float32)
            for value in example["labels"]
        ]
        labels.append(torch.stack(label_tensors))
        task_ids.append(example["task_index"])

    task_indices = torch.tensor(task_ids, device=device, dtype=torch.long)
    encoded = tokenizer(
        texts,
        return_tensors='pt',
        padding='longest',
        truncation=True,
        max_length=config.max_position_embeddings,
    )

    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
        "task_indices": task_indices,
        "loss_weights": loss_weights,
    }

In [9]:
import transformers

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    # optimisation
    optim="adamw_torch",
    learning_rate=1e-4,
    warmup_ratio=0.05,
    num_train_epochs=2,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=1,
    # data loading
    # collate_fn reads the raw example keys ("full_text", "labels", "task_index"),
    # so they must not be filtered out before collation.
    remove_unused_columns=False,
    # NOTE(review): presumably disabled because collate_fn already creates
    # tensors on `device` -- confirm.
    dataloader_pin_memory=False,
    # logging / evaluation / checkpoints
    logging_steps=100,
    torch_empty_cache_steps=100,
    prediction_loss_only=True,
    save_strategy="no",
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn,
)
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
100,0.4249
200,0.1366
300,0.1434
400,0.1149
500,0.098
600,0.0854
700,0.0943
800,0.0962
900,0.098
1000,0.0998


TrainOutput(global_step=7784, training_loss=0.06872757601590955, metrics={'train_runtime': 877.1461, 'train_samples_per_second': 53.236, 'train_steps_per_second': 8.874, 'total_flos': 8144707110633120.0, 'train_loss': 0.06872757601590955, 'epoch': 2.0})

In [10]:
# Evaluate on the held-out split passed to the Trainer; because the run uses
# prediction_loss_only=True, only the loss and runtime statistics are reported.
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.05439744144678116,
 'eval_runtime': 27.5214,
 'eval_samples_per_second': 94.254,
 'eval_steps_per_second': 11.809,
 'epoch': 2.0}