In [None]:
import sys

# Make the project checkout importable; later cells import from `src.*`.
app = "/app"
if sys.path.count(app) == 0:
    # Only append once so re-running the cell does not grow sys.path.
    sys.path.append(app)
sys.path

In [None]:
# Shell escape: print the current working directory.
!pwd

In [None]:
# Shell escape: list the directory that is passed as `data_dir` when loading the dataset.
!ls /data/emboj_abstracts

In [None]:
from datasets import load_dataset

# Load the train and test splits from plain-text files located under
# /data/emboj_abstracts/ using the generic "text" loader, then display the result.
dataset = load_dataset(
    path="text",
    data_files=dict(train="train/examples.txt", test="test/examples.txt"),
    data_dir="/data/emboj_abstracts/",
)
dataset

In [None]:
# Hugging Face model identifier; used below to load both the tokenizer and the base config.
MODEL_NAME = "roberta-base"

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that matches MODEL_NAME and report its basic limits.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"tokenizer vocab size: {tokenizer.vocab_size}")
# `model_max_length` is the per-instance input limit. The class-level
# `max_model_input_sizes` lookup is deprecated/removed in recent transformers
# releases and raises for checkpoints that are not in the static map.
print(f"max length: {tokenizer.model_max_length}")

In [None]:
# Batch tokenization function applied to the dataset with `map`.
def tokenization(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings,
        truncated to at most 64 tokens and without a special-tokens mask.
    """
    tokenizer_options = dict(
        max_length=64,  # deliberately short sequences instead of the tokenizer's own limit
        truncation=True,
        return_special_tokens_mask=False,
    )
    return tokenizer(examples["text"], **tokenizer_options)
# Apply `tokenization` to the dataset in batched mode and display the result.
tokenized = dataset.map(function=tokenization, batched=True)
tokenized

In [None]:
# Drop the raw "text" column so only tokenizer outputs reach the collators.
# The guard keeps the cell idempotent: re-running it after the column has
# already been removed would otherwise raise instead of being a no-op.
if all("text" in columns for columns in tokenized.column_names.values()):
    tokenized = tokenized.remove_columns(["text"])
tokenized

In [None]:
from transformers import DataCollatorForLanguageModeling
# Collator configured for masked-language modelling with a 0.15 masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    mlm=True,
)

In [None]:
from transformers import (
    # RobertaForMaskedLM,
    AutoConfig
)
from src.models.modeling_dendroberta import RobertaForMaskedLM

In [None]:
# Load the configuration published for MODEL_NAME and display it; it is
# customised in the next cell and then used to build the model.
config = AutoConfig.from_pretrained(MODEL_NAME)
config

In [None]:
# Override selected hyper-parameters of the loaded config before building the model.
config.num_hidden_layers=8
# config.intermediate_size=128
# NOTE(review): 66 presumably equals the 64-token max_length used at tokenization
# plus the 2 position slots RoBERTa-style position ids reserve for padding —
# confirm against src.models.modeling_dendroberta.
config.max_position_embeddings=66

In [None]:
# Build the project's RobertaForMaskedLM from the customised config only.
# NOTE(review): no checkpoint is passed, so weights are presumably freshly
# initialised rather than pretrained — confirm in modeling_dendroberta.
model = RobertaForMaskedLM(config=config)

In [None]:
# Count every scalar across all model parameters and report it in millions.
model_size = sum(param.numel() for param in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

In [None]:
import torch
# Report accelerator support as a tuple:
# (PyTorch was built with MPS support, a CUDA device is available).
# Access `torch.backends` directly instead of through the accidental
# `torch.torch` self-reference.
(
    torch.backends.mps.is_built(),
    torch.cuda.is_available(),
)

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 2

# Training batches: shuffled and collated by the masked-language-model collator.
train_dataloader = DataLoader(
    tokenized["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Evaluation batches must go through the same MLM collator. The tokenized
# examples carry no `labels`, so with `default_data_collator` the model would
# return `loss=None` (and unpadded sequences of different lengths could not be
# stacked at all). Note that masks are re-sampled on every pass, so the
# evaluation loss is slightly noisy from one evaluation to the next.
eval_dataloader = DataLoader(
    tokenized["test"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters with a learning rate of 5e-4.
optimizer = AdamW(model.parameters(), lr=5e-4)

In [None]:
from accelerate import Accelerator

# Hand the training objects to Accelerate; `prepare` returns its own versions
# of them in the same order, which replace the originals from here on.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

# Training length: one scheduler step is taken per batch of the training dataloader.
num_train_epochs = 5
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_update_steps_per_epoch * num_train_epochs

# Cosine learning-rate schedule with 1,000 warm-up steps over the whole run.
lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=1_000,
)

In [None]:
from tqdm.auto import tqdm
import torch
import math
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter

def log(self, loss, epoch, writer):
    """Write the training loss and a held-out loss to TensorBoard.

    NOTE(review): this helper is not called anywhere in the visible notebook
    and takes a ``self`` argument although it is defined at module level.

    Args:
        self: Object exposing ``model``, ``criterion``, ``X_test`` and ``y_test``.
        loss: Training loss; its ``.data`` attribute is logged as "Loss/train".
        epoch: Value used as the step for both scalars.
        writer: Object with an ``add_scalar(tag, value, step)`` method.
    """
    writer.add_scalar("Loss/train", loss.data, epoch)

    # Score self.X_test / self.y_test without tracking gradients; the model is
    # expected to return a pair whose first element is passed to the criterion.
    with torch.no_grad():
        predictions, _ = self.model(self.X_test)
        held_out_loss = self.criterion(predictions, self.y_test)

    writer.add_scalar("Loss/valid", held_out_loss, epoch)

# ---------------------------------------------------------------------------
# Training loop: periodic evaluation plus a checkpoint after every epoch.
# ---------------------------------------------------------------------------
progress_bar = tqdm(range(num_training_steps))
writer = SummaryWriter(log_dir="/runs")
eval_steps = 100  # evaluate and log every `eval_steps` optimizer steps
# NOTE(review): placeholder checkpoint location — point this at the desired directory.
output_dir = "/runs/checkpoints"
# Size of the evaluation split; used to trim the gathered per-example losses,
# because the last batch is padded out to `batch_size` by `repeat` below.
num_eval_examples = len(tokenized["test"])
step = 0

for epoch in range(num_train_epochs):
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        step += 1

        if step % eval_steps != 0:
            continue

        # Log against the global step so that train and eval curves share an x-axis.
        writer.add_scalar("Loss/train", loss.item(), step)

        # Evaluation
        model.eval()
        losses = []
        for eval_batch in tqdm(eval_dataloader, leave=False):
            with torch.no_grad():
                eval_outputs = model(**eval_batch)
            eval_loss = eval_outputs.loss
            if eval_loss is None:
                # Batches without labels yield no loss; skip them rather than
                # recording a fake zero that would bias the average.
                continue
            losses.append(accelerator.gather(eval_loss.repeat(batch_size)))
        # Switch back to training mode; otherwise the rest of the epoch would
        # run with dropout disabled.
        model.train()

        if not losses:
            # No evaluation loss could be computed, so there is nothing to log.
            continue

        losses = torch.cat(losses)[:num_eval_examples]
        avg_loss = torch.mean(losses).item()
        writer.add_scalar("Loss/eval", avg_loss, step)
        try:
            perplexity = math.exp(avg_loss)
        except OverflowError:
            perplexity = float("inf")
        writer.add_scalar("Perplexity/eval", perplexity, step)

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # repo.push_to_hub(
        #     commit_message=f"Training in progress epoch {epoch}", blocking=False
        # )

# Flush pending TensorBoard events and release the progress bar.
writer.close()
progress_bar.close()