In [None]:
# %load_ext lab_black
# %load_ext autoreload
# %autoreload 2

In [None]:
# Set for local or colab

import os
from os.path import join
import shutil
import sys

ASK_TO_DELETE_LOG_FOLDERS = False


def check_create_folder(path: str, ask_to_rm_if_exists=ASK_TO_DELETE_LOG_FOLDERS):
    """Make sure the directory `path` exists, optionally offering to wipe it.

    Args:
        path: Directory to create (intermediate directories are created too).
        ask_to_rm_if_exists: When true and `path` already exists, prompt the
            user; answering ``del`` (any letter case) removes the whole tree
            and recreates it empty. Any other answer leaves it untouched.

    Returns:
        None.
    """
    # Fresh path: just create it.
    if not os.path.exists(path):
        os.makedirs(path)
        return

    # Existing path and no prompting requested: keep whatever is there.
    if not ask_to_rm_if_exists:
        return

    answer = input(
        f"<{path}>: Already exists.\n\nWrite 'del' if you wish to delete other wise press any key: "
    )
    if answer.lower() != "del":
        return

    # User confirmed: drop the old contents and start from an empty folder.
    print(f"Deleting: {path}")
    shutil.rmtree(path)
    os.makedirs(path)


# Check if running in colab: the presence of `google.colab` in the already
# imported modules is treated as the signal.
IN_COLAB = "google.colab" in sys.modules

# Project defaults (project folder location depends on the environment)
if IN_COLAB:
    print("ENVIRONMENT: Colab")

    # Mount Google Drive so the project folder below becomes reachable
    from google.colab import drive

    drive.mount("/content/drive")

    # Set the project directory (lives inside the mounted drive)
    PROJECT_FOLDER = "/content/drive/MyDrive/MIDS/w266/w266-project-carlos"

    # Install dependencies (IPython shell escape; only runs on Colab).
    # NOTE(review): versions are not pinned, so a fresh runtime may pull
    # releases that differ from the ones this notebook was written against.
    !pip install -q transformers datasets pytorch-lightning SentencePiece #wandb
else:
    print("ENVIRONMENT: Local")
    # Set the project directory
    # NOTE(review): absolute, machine-specific path — adjust per machine.
    PROJECT_FOLDER = "/user/w266/w266-project-carlos"

# Make the project folder the working directory.
# NOTE(review): the later `t5_model_support_functions` import presumably
# resolves relative to this directory — confirm.
os.chdir(PROJECT_FOLDER)

# FOLDERS (all located under the project folder)
DATASET_FOLDER = join(PROJECT_FOLDER, "dataset/dataset_final")
CHECKPOINT_FOLDER = join(PROJECT_FOLDER, "checkpoints")
MODEL_FOLDER = join(PROJECT_FOLDER, "saved_models")
LOGGER_FOLDER = join(PROJECT_FOLDER, "logger")

# Only the output folders are created here; DATASET_FOLDER is not created
# and is therefore expected to exist already.
check_create_folder(CHECKPOINT_FOLDER)
check_create_folder(MODEL_FOLDER)
check_create_folder(LOGGER_FOLDER)

print(f"Working directory is: {os.getcwd()}")

In [None]:
import copy
import math

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from datasets import Dataset
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import (
    Callback,
    EarlyStopping,
    LearningRateMonitor,
    TQDMProgressBar,
)
from pytorch_lightning.loggers import CSVLogger
from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    RobertaTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup,
)


# Seed every RNG this notebook draws from. Seeding NumPy alone is not enough:
# the shuffling `DataLoader` and the model (dropout, freshly added embedding
# rows) use PyTorch's generator, so runs were not reproducible before.
SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
from t5_model_support_functions import load_csv_files, token_to_df, expand_tokenizer

### Load `csv` data as `dataframes`

In [None]:
# Columns requested from every CSV file
TARGET_FEATURES = ["source", "labels", "token_types"]

# One CSV per split, listed in the same order the frames are unpacked below
split_csv_paths = [
    join(DATASET_FOLDER, file_name)
    for file_name in ("train.csv", "dev.csv", "test.csv")
]

df_train, df_val, df_test = load_csv_files(
    split_csv_paths,
    focus_columns=TARGET_FEATURES,
    drop_duplicates=True,
    dropna=True,
    shuffle=False,
)

### Create the datasets (and set dataset length)

In [None]:
# Flip to True to work on only the first 10 rows of each split (quick smoke runs)
DEV_TESTING = False

# Frame that feeds each split: truncated in dev mode, complete otherwise
train_frame = df_train.head(10) if DEV_TESTING else df_train
val_frame = df_val.head(10) if DEV_TESTING else df_val
test_frame = df_test.head(10) if DEV_TESTING else df_test

train_dataset = Dataset.from_pandas(train_frame, split="train")
val_dataset = Dataset.from_pandas(val_frame, split="validation")
test_dataset = Dataset.from_pandas(test_frame, split="test")

# Show the three dataset summaries, then one raw training example
for preview in (train_dataset, val_dataset, test_dataset, train_dataset[0]):
    display(preview)

display(train_dataset[0])

### Pre-process and tokenize the data

#### Select the pretrained model and tokenizer type

In [None]:
# Checkpoints this notebook has been set up for, and the tokenizer class
# each one is loaded with
TOKENIZER_FOR_MODEL = {
    "Salesforce/codet5-small": RobertaTokenizer,
    "Salesforce/codet5-base": RobertaTokenizer,
    "Salesforce/codet5-large": RobertaTokenizer,
    "t5-small": T5Tokenizer,
    "t5-base": T5Tokenizer,
}

# Checkpoint to fine-tune (must be one of the keys above)
MODEL_TYPE = "Salesforce/codet5-large"
tokenizer_library = TOKENIZER_FOR_MODEL[MODEL_TYPE]

# Get Tokenizer
tokenizer = tokenizer_library.from_pretrained(MODEL_TYPE)

#### Declare and expand tokenizer

In [None]:
# Extra tokens handed to `expand_tokenizer` so they are added to the tokenizer.
# NOTE(review): presumably these are the markup tags/placeholders that appear
# in the dataset's source and label strings — confirm against the data.
# NOTE(review): keep the order stable; ids for added tokens are presumably
# assigned in list order, and the saved model depends on them — confirm in
# `expand_tokenizer`.
new_tokens = [
    "<N>",
    "</N>",
    "<C>",
    "[T]",
    "[D]",
    "[X]",
    "[AggFunction]",
    "[Y]",
    "[Z]",
    "[F]",
    "[G]",
    "[B]",
    "[S]",
    "[K]",
    "</C>",
    "<D>",
    "<COL>",
    "</COL>",
    "<VAL>",
    "</VAL>",
    "</D>",
]

# Expand the tokenizer with the new tokens and display whatever the helper returns
display(expand_tokenizer(new_tokens, tokenizer))

#### Explore tokenized lengths

In [None]:
# Determine the maximum input length to not truncate any of the inputs
# print("input token lengths")
# df_token_len = pd.DataFrame(
#     columns=["token_count"],
#     data=[
#         tokenizer(item, return_tensors="pt")["input_ids"].numpy().shape[1]
#         for item in df_dataset["source"]
#     ],
# )

# display(df_token_len.describe(percentiles=[0.90, 0.95, 0.99, 0.999]))

# display(df_token_len.query("token_count > 160"))

In [None]:
# Determine the maximum target length to not truncate any of the labels
# print("outputs token lengths")
# df_token_len = pd.DataFrame(
#     columns=["token_count"],
#     data=[
#         tokenizer(item, return_tensors="pt")["input_ids"].numpy().shape[1]
#         for item in df_dataset["labels"]
#     ],
# )

# display(df_token_len.describe(percentiles=[0.90, 0.95, 0.99, 0.999]))

#### Set sequence length

In [None]:
# Task prefix prepended to every source string in `preprocess_examples`
prefix = "Generate vega_zero code: "
# Token budgets used as `max_length` when tokenizing: sequences are padded
# to, and truncated at, these lengths (inputs and targets respectively).
# NOTE(review): values presumably come from the token-length exploration
# cells above — confirm they still cover the current dataset.
max_input_length = 162
max_target_length = 60

In [None]:
def preprocess_examples(examples):
    """
    Tokenize a batch of examples into model inputs and loss-ready labels.

    Inputs:
    - Adds the task prefix to each source (for t5)
    - Tokenizes, padding/truncating to `max_input_length`

    Targets (labels):
    - Tokenizes, padding/truncating to `max_target_length`
    - Replaces every padding token id with -100

    Args:
        examples: Batch mapping with a "source" and a "labels" entry, each a
            sequence of strings (the notebook calls this through
            `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the inputs, with an additional
        "label_tokens" entry holding one list of ints per example.
    """
    sources = examples["source"]  # inputs
    label_queries = examples["labels"]  # targets

    inputs = [prefix + source for source in sources]

    # Tokenize the inputs
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Tokenize the targets. Plain Python lists are requested (no
    # `return_tensors`) so the masking below works on ints instead of
    # iterating a tensor element by element.
    label_ids = tokenizer(
        label_queries,
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )["input_ids"]

    # important: we need to replace the index of the padding tokens by -100
    # such that they are not taken into account by the CrossEntropyLoss.
    # The pad id is read from the tokenizer rather than assumed to be 0, so
    # a tokenizer with a different pad id is masked correctly too.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["label_tokens"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in sequence]
        for sequence in label_ids
    ]

    return model_inputs


# Tokenize every split with the batched preprocessing function
train_dataset, val_dataset, test_dataset = [
    split.map(preprocess_examples, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

columns = ["input_ids", "attention_mask", "label_tokens"]

# `set_format` controls what `__getitem__` returns (type and columns); the
# conversion happens on-the-fly, and `__getitem__` is what feeds the batches
# during training.
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(type="torch", columns=columns)

# Summaries of the first two splits, each followed by a separator line
for title, split in (("Training", train_dataset), ("Validation", val_dataset)):
    print(title)
    print(split)
    print("*" * 100)

print("Test")
print(test_dataset)

# Without the `.set_format`, this would get you all the columns
print(train_dataset[0].keys())

### Check that the preprocessing works as intended

In [None]:
# Pull a single small batch from the validation split to eyeball the encoding
sample_dataloader = DataLoader(val_dataset, batch_size=4)
batch = next(iter(sample_dataloader))

separator = "*" * 100
first_input_ids = batch["input_ids"][0]
labels = batch["label_tokens"][0]

print("The keys for each batch are:")
print(batch.keys())
print(separator)

print("Input token ids:")
print(first_input_ids)
print(separator)

print("Decoded input tokens:")
print(tokenizer.decode(first_input_ids))
print(separator)

print("Label token ids:")
print(labels)
print(separator)

# Entries equal to -100 are dropped before decoding
print("Decoded label tokens:")
kept_label_ids = [label for label in labels if label != -100]
print(tokenizer.decode(kept_label_ids))

### Set hyper-parameters

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device type: {device}")

if device == "cuda":
    torch.cuda.empty_cache()


# hyper parameters
num_epochs = 5
batch_size = 15
test_batch_size = 3  # used for both the validation and the test loader
learning_rate = 5e-5

# Dataset sizes
training_set_len = len(train_dataset)
val_set_len = len(val_dataset)  # reporting only
test_set_len = len(test_dataset)  # reporting only

# Calculated values
batches_per_epoch = math.ceil(training_set_len / batch_size)
total_training_steps = num_epochs * batches_per_epoch
# Warm up for about a third of the epochs, rounded to whole epochs
warmup_epochs = int(np.round(num_epochs / 3))
warmup_steps = warmup_epochs * batches_per_epoch

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=test_batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size)

# The scheduler step counts above assume this many batches per epoch
assert len(train_dataloader) == batches_per_epoch

print("Training stats:")
print(f"learning_rate: {learning_rate}")

# Label -> value pairs for the summary table shown as the cell output
training_stats = {
    "TOTAL Samples": training_set_len + val_set_len + test_set_len,
    "training_set_len": training_set_len,
    "val_set_len": val_set_len,
    "test_set_len": test_set_len,
    "num_epochs": num_epochs,
    "batch_size": batch_size,
    "batches_per_epoch": batches_per_epoch,
    "total_training_steps": total_training_steps,
    "warmup_steps": warmup_steps,
}
pd.DataFrame(
    index=list(training_stats.keys()),
    columns=["training_value"],
    data=list(training_stats.values()),
)

### Train model

In [None]:
class EpochProgressBar(TQDMProgressBar):
    """
    Progress bar that leaves each epoch's finished bar visible.

    After the base class handles the end of a training epoch, a blank line is
    printed so the next epoch's bar starts on a fresh line instead of
    overwriting the previous one.
    """

    def on_train_epoch_end(
        self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"
    ) -> None:
        """Run the base end-of-epoch handling, then emit a line break.

        The original signature used the strings "pl.Trainer" and
        "pl.LightningModule" as default *values*; they were meant to be type
        annotations. The hook receives both objects from the trainer, so no
        defaults are needed.

        Args:
            trainer: The trainer driving the fit loop.
            pl_module: The module being trained.
        """
        super().on_train_epoch_end(trainer=trainer, pl_module=pl_module)
        print("\n")


class HistoryCallback(Callback):
    """Callback that records the trainer's metrics after every training epoch."""

    def __init__(self):
        super().__init__()
        # One snapshot of `trainer.callback_metrics` per finished epoch, oldest first
        self.history = []

    def on_train_epoch_end(self, trainer, pl_module):
        """Append an independent (deep) copy of the current callback metrics."""
        self.history.append(copy.deepcopy(trainer.callback_metrics))

    def history_dataframe(self):
        """Return the recorded snapshots as a float32 DataFrame, one row per epoch."""
        frame = pd.DataFrame(self.history)
        return frame.astype(np.float32)


class CodeT5(pl.LightningModule):
    """
    LightningModule that fine-tunes a pretrained `T5ForConditionalGeneration`.

    The class relies on notebook-level objects: the global `tokenizer` (to
    size the embedding matrix) and the global `train_dataloader`,
    `val_dataloader` and `test_dataloader` returned by the dataloader hooks.
    """

    # Options shared by every `self.log` call: aggregate over the epoch, show
    # the value in the progress bar and forward it to the logger.
    _EPOCH_LOG_KWARGS = dict(on_step=False, on_epoch=True, prog_bar=True, logger=True)

    def __init__(
        self,
        model_type,
        num_epochs,
        batch_size,
        lr,
        training_set_len,
        total_training_steps,
        warmup_steps,
    ):
        """
        Args:
            model_type: Name or path passed to `from_pretrained`.
            num_epochs: Only stored through `save_hyperparameters`.
            batch_size: Only stored through `save_hyperparameters`.
            lr: Learning rate for the AdamW optimizer.
            training_set_len: Only stored through `save_hyperparameters`.
            total_training_steps: Total number of scheduler steps.
            warmup_steps: Number of warm-up steps of the linear schedule.
        """
        super().__init__()

        # Model or layers
        self.model = T5ForConditionalGeneration.from_pretrained(model_type)

        # Match the embedding matrix to the current size of the global tokenizer
        self.model.resize_token_embeddings(len(tokenizer))

        # Exposes the constructor arguments as `self.hparams`
        self.save_hyperparameters()

        # Debugging counter (not needed for training)
        self.training_steps_completed = 0.0

    def forward(self, input_ids, attention_mask, label_tokens=None):
        """Run the wrapped model; `label_tokens` is passed on as `labels`."""
        outputs = self.model(
            input_ids=input_ids, attention_mask=attention_mask, labels=label_tokens
        )
        return outputs

    def common_step(self, batch, batch_idx):
        """Forward one batch and return the loss reported by the model."""
        # `batch` is a dictionary, the '**' before batch
        # allows the 'forward step' to directly unpack the dictionary
        outputs = self(**batch)

        # The loss is taken from the model output (`outputs.loss`)
        return outputs.loss

    def training_step(self, batch, batch_idx):
        """Compute the training loss and log it with the LR and a step counter."""
        loss = self.common_step(batch, batch_idx)
        self.log("training_loss", loss, **self._EPOCH_LOG_KWARGS)

        last_lr = self.lr_schedulers().get_last_lr()[0]
        self.log("learning_rate", last_lr, **self._EPOCH_LOG_KWARGS)

        self.training_steps_completed += 1.0  # logger don't like ints
        # NOTE(review): the value is logged with epoch aggregation and a +0.5
        # offset, so it is a debugging aid rather than an exact step count.
        self.log(
            "training_steps_completed",
            self.training_steps_completed + 0.5,
            **self._EPOCH_LOG_KWARGS,
        )

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self.common_step(batch, batch_idx)
        self.log("validation_loss", loss, **self._EPOCH_LOG_KWARGS)

        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch.

        The loss used to be returned without being logged, unlike in
        `validation_step`, so it never reached the logger or progress bar.
        """
        loss = self.common_step(batch, batch_idx)
        self.log("test_loss", loss, **self._EPOCH_LOG_KWARGS)

        return loss

    def configure_optimizers(self):
        """Create AdamW plus a linear warm-up schedule stepped every batch."""
        # create optimizer
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)

        # create learning rate scheduler
        lr_scheduler = {
            "scheduler": get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.hparams.warmup_steps,
                num_training_steps=self.hparams.total_training_steps,
            ),
            "name": "learning_rate",
            "interval": "step",
            "frequency": 1,
        }

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def train_dataloader(self):
        """Return the notebook-level training dataloader."""
        return train_dataloader

    def val_dataloader(self):
        """Return the notebook-level validation dataloader."""
        return val_dataloader

    def test_dataloader(self):
        """Return the notebook-level test dataloader."""
        return test_dataloader


# The embedding resize for the added tokens happens inside `CodeT5.__init__`
model = CodeT5(
    model_type=MODEL_TYPE,
    num_epochs=num_epochs,
    batch_size=batch_size,
    lr=learning_rate,
    training_set_len=training_set_len,
    total_training_steps=total_training_steps,
    warmup_steps=warmup_steps,
)

# -----------------------------------------------------------------------
# Trainer section
# -----------------------------------------------------------------------

# Logger
logger = CSVLogger(save_dir=LOGGER_FOLDER, name="CodeT5_Logger")

# Callbacks
progress_bar = EpochProgressBar()
history = HistoryCallback()

# Trainer
trainer = Trainer(
    default_root_dir=CHECKPOINT_FOLDER,
    callbacks=[progress_bar, history],
    logger=logger,
    max_epochs=num_epochs,
    # Use the device detected in the hyper-parameter cell. It was computed
    # but never handed to the Trainer, so the hardware choice was left to the
    # installed Lightning version's default (CPU-only in older releases).
    accelerator="gpu" if device == "cuda" else "cpu",
    # A single device keeps the run notebook-friendly (no multi-process launch)
    devices=1,
)

# The dataloaders come from the module's own `*_dataloader` hooks
trainer.fit(model)

# Per-epoch metrics collected by `HistoryCallback`, shown as the cell output
history.history_dataframe()

In [None]:
# Save the wrapped Hugging Face model (`model.model`), not the Lightning
# wrapper, into MODEL_FOLDER.
model.model.save_pretrained(MODEL_FOLDER)
# The tokenizer (expanded with the new tokens earlier in the notebook) is
# stored in a "tokenizer" subfolder, so both paths are needed when reloading.
tokenizer.save_pretrained(join(MODEL_FOLDER, "tokenizer"))