In [1]:
%load_ext lab_black

## Set up the environment

Let's first install the required libraries:
* HuggingFace Transformers (for the CodeT5 model)
* HuggingFace Datasets (for loading the dataset + preprocessing it)
* PyTorch Lightning (for training)
* Weights and Biases (for logging training metrics).

In [2]:
import os
from os.path import join
import shutil
import sys


def check_create_folder(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    ``exist_ok=True`` replaces the separate existence check, so the call can be
    repeated safely and cannot fail because the folder appeared between the
    check and the creation.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


# Check if running in colab: true only when the google.colab module is already
# loaded in this interpreter.
IN_COLAB = "google.colab" in sys.modules

# Project defaults
if IN_COLAB:
    from google.colab import drive

    # Mount Google Drive and work from the project folder stored on it.
    drive.mount("/content/drive")
    PROJECT_FOLDER = "/content/drive/MyDrive/w266/w266-project-carlos"
    os.chdir(PROJECT_FOLDER)

    # Install dependencies
    !pip install -q transformers datasets pytorch-lightning wandb
else:
    # Local run: only the project path is set; the working directory is not
    # changed and nothing is installed in this branch.
    PROJECT_FOLDER = "/user/w266/w266-project-carlos"


# FOLDERS (all located directly under PROJECT_FOLDER)
DATASET_FOLDER = join(PROJECT_FOLDER, "dataset")
CHECKPOINT_FOLDER = join(PROJECT_FOLDER, "checkpoints")
MODEL_FOLDER = join(PROJECT_FOLDER, "saved_models")
LOGGER_FOLDER = join(PROJECT_FOLDER, "logger")

# Output folders are created on demand; DATASET_FOLDER is not created here.
check_create_folder(CHECKPOINT_FOLDER)
check_create_folder(MODEL_FOLDER)
check_create_folder(LOGGER_FOLDER)

# Display the current working directory as the cell output.
os.getcwd()

'/user/w266/w266-project-carlos'

In [3]:
# response = input("Do you want to delete any of the logger folders: yes/no")

# if response == "yes":
#     print("deleteing")
#     !rm -rf $CHECKPOINT_FOLDER
#     !rm -rf $MODEL_FOLDER
#     !rm -rf $LOGGER_FOLDER
#     !pwd && ls -l
# else:
#     print("skipping")

In [4]:
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader
from transformers import (
    T5ForConditionalGeneration,
    AdamW,
    get_linear_schedule_with_warmup,
)
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger, CSVLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor


SEED = 123
# Seeding NumPy alone leaves Python's `random` and PyTorch unseeded, so the
# shuffled training DataLoader and the training run itself would differ between
# runs. `pl.seed_everything` seeds all of them in one call.
pl.seed_everything(SEED)
# Kept last so the cell still ends on a statement that returns None.
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm
2023-03-25 14:43:00.846399: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-25 14:43:00.846496: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


## Data load and prep

In [5]:
def combine_csv_files(csv_paths: list[str], shuffle=False):
    """Load several CSV files with identical columns into a single DataFrame.

    Files are read in the given order. Every file after the first is appended
    to the running result, fully duplicated rows are dropped and the index is
    reset; the number of rows dropped by each merge is printed. When only one
    file is given it is returned as read (no de-duplication takes place).

    Args:
        csv_paths: Paths of the CSV files to load, in merge order.
        shuffle: When True, the rows of the combined frame are permuted using
            NumPy's global random state.

    Returns:
        The combined ``pandas.DataFrame``.

    Raises:
        ValueError: If ``csv_paths`` is empty, or if a file's columns differ
            from those of the first file.
    """
    df_full = None
    columns_base = None

    for path in csv_paths:
        df = pd.read_csv(path)
        print(f"Number of records in {path}: {df.shape[0]}")

        if df_full is None:
            # The first file defines the reference column layout.
            df_full, columns_base = df, df.columns
            continue

        if not np.array_equal(df.columns, columns_base):
            raise ValueError("Columns do not match")

        total_records = df_full.shape[0] + df.shape[0]

        # Rows are compared on all columns, which is the default subset.
        df_full = pd.concat([df_full, df]).drop_duplicates().reset_index(drop=True)

        records_dropped = total_records - df_full.shape[0]
        print(f"-> Merged!!, {records_dropped} duplicates were found and dropped\n")

    if df_full is None:
        # Checked after the loop so that empty iterables of any kind are caught.
        raise ValueError("csv_paths must contain at least one CSV file path")

    if shuffle:
        shuffled_indices = np.random.permutation(np.arange(df_full.shape[0]))
        df_full = df_full.iloc[shuffled_indices, :].reset_index(drop=True)

    print(f"A total of {df_full.shape[0]} recrods were loaded")
    return df_full

In [6]:
# Merge the train/dev/test CSV files (in that order) into one frame, keeping
# the original row order.
dataset_csv_paths = [
    join(DATASET_FOLDER, file_name)
    for file_name in ("train.csv", "dev.csv", "test.csv")
]
df_dataset = combine_csv_files(dataset_csv_paths, shuffle=False)

df_dataset.head()

Number of records in /user/w266/w266-project-carlos/dataset/train.csv: 12621
Number of records in /user/w266/w266-project-carlos/dataset/dev.csv: 717
-> Merged!!, 0 duplicates were found and dropped

Number of records in /user/w266/w266-project-carlos/dataset/test.csv: 2461
-> Merged!!, 0 duplicates were found and dropped

A total of 15799 recrods were loaded


Unnamed: 0,tvBench_id,db_id,chart,hardness,query,question,vega_zero
0,1000@y_name@DESC,customers_and_products_contacts,Bar,Medium,"Visualize BAR SELECT product_name , COUNT(prod...",Bar chart x axis product name y axis how many ...,mark bar data products encoding x product_name...
1,2463@x_name@ASC,network_2,Bar,Easy,"Visualize BAR SELECT job , min(age) FROM Perso...",how old is the youngest person for each job ? ...,mark bar data person encoding x job y aggregat...
2,2545@y_name@DESC,pets_1,Bar,Medium,"Visualize BAR SELECT PetType , avg(pet_age) FR...",Please give me a bar chart to show the average...,mark bar data pets encoding x pettype y aggreg...
3,2615@y_name@ASC,products_for_hire,Bar,Extra Hard,"Visualize BAR SELECT payment_date , COUNT(paym...",What are the payment date of the payment with ...,mark bar data payments encoding x payment_date...
4,1304,election,Bar,Easy,"Visualize BAR SELECT County_name , Population ...",What are the name and population of each count...,mark bar data county encoding x county_name y ...


### Create the model dataset

In [7]:
def get_indexes(total_indices, splits=(0.70, 0.10, 0.20)):
    """Split the positions ``0 .. total_indices - 1`` into consecutive groups.

    Every group except the last receives ``int(total_indices * fraction)``
    positions. The last group receives all remaining positions, so positions
    left over by the truncation are never lost. A summary line is printed per
    group and reflects the positions the group actually receives.

    Args:
        total_indices: Number of positions to split.
        splits: Fraction of the positions assigned to each group, in order.

    Returns:
        A list with one NumPy integer array of positions per group.
    """
    indices = []
    last_group = len(splits) - 1
    start_index = 0

    for i, split in enumerate(splits):
        if i == last_group:
            # The final group absorbs whatever the truncated counts left over.
            end_index = total_indices
        else:
            end_index = start_index + int(total_indices * split)

        group = np.arange(start_index, end_index)
        print(f"Group {i} > {start_index}:{end_index-1}, {len(group)}")

        indices.append(group)
        start_index = end_index

    return indices

In [8]:
# indces = get_indexes(df_dataset.shape[0], splits=[0.75, 0.10, 0.15])

# Quick-iteration setting: only the first 100 rows are split. Swap in the
# commented line above to split the full dataset.
indces = get_indexes(100, splits=[0.75, 0.10, 0.15])

train_datset = Dataset.from_pandas(df_dataset.iloc[indces[0]], split="train")
valid_dataset = Dataset.from_pandas(df_dataset.iloc[indces[1]], split="validation")
# The held-out rows are tagged as the "test" split (they were mislabelled "train").
test_dataset = Dataset.from_pandas(df_dataset.iloc[indces[2]], split="test")

display(train_datset)
display(valid_dataset)
display(test_dataset)

display(train_datset[0])

Group 0 > 0:74, 75
Group 1 > 75:84, 10
Group 2 > 85:99, 15


Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', '__index_level_0__'],
    num_rows: 75
})

Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', '__index_level_0__'],
    num_rows: 10
})

Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', '__index_level_0__'],
    num_rows: 15
})

{'tvBench_id': '1000@y_name@DESC',
 'db_id': 'customers_and_products_contacts',
 'chart': 'Bar',
 'hardness': 'Medium',
 'query': 'Visualize BAR SELECT product_name , COUNT(product_name) FROM products GROUP BY product_name ORDER BY COUNT(product_name) DESC',
 'question': 'Bar chart x axis product name y axis how many product name , rank by the Y-axis in desc .',
 'vega_zero': 'mark bar data products encoding x product_name y aggregate count product_name transform group x sort y desc',
 '__index_level_0__': 0}

### Pre-process and tokenize the data

In [None]:
# Tokenizer files of the "Salesforce/codet5-small" checkpoint, loaded through
# the RobertaTokenizer class.
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

# Text prepended to every question before it is tokenized.
prefix = "Generate vega_zero code: "
# Token-length limits used when padding/truncating the inputs and the targets.
max_input_length = 100
max_target_length = 100

In [None]:
def preprocess_examples(examples, add_db_info=False):
    """Tokenize a batch of examples into model inputs and loss-ready labels.

    Relies on the notebook-level ``tokenizer``, ``prefix``, ``max_input_length``
    and ``max_target_length`` variables.

    Args:
        examples: Batch with a "question" and a "query" entry, each holding one
            string per example.
        add_db_info: Currently has no effect (placeholder branch).

    Returns:
        The tokenizer output for the prefixed questions, with an additional
        "labels" entry holding the tokenized queries in which padding ids are
        replaced by -100.
    """
    # NOTE(review): targets are read from "query" although the prefix mentions
    # vega_zero and the dataset also has a "vega_zero" column — confirm which
    # column is the intended target.
    inputs = [prefix + question for question in examples["question"]]
    targets = examples["query"]

    if add_db_info:
        pass

    # Tokenize the inputs
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, padding="max_length", truncation=True
    )

    # Tokenize the targets
    target_ids = tokenizer(
        targets, max_length=max_target_length, padding="max_length", truncation=True
    ).input_ids

    # Padding positions are replaced by -100 so that the loss ignores them.
    # The padding id is read from the tokenizer instead of being hard-coded.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_token_id else -100 for token_id in example_ids]
        for example_ids in target_ids
    ]

    return model_inputs

In [None]:
# Tokenize each split in batches; every name is rebound to its processed dataset.
train_datset, valid_dataset, test_dataset = (
    dataset_split.map(preprocess_examples, batched=True)
    for dataset_split in (train_datset, valid_dataset, test_dataset)
)

display(train_datset, valid_dataset, test_dataset)

In [None]:
# Return only the columns the model consumes, as torch tensors.
model_columns = ("input_ids", "attention_mask", "labels")
for dataset_split in (train_datset, valid_dataset, test_dataset):
    dataset_split.set_format(type="torch", columns=list(model_columns))


# Only the training loader shuffles its examples.
train_dataloader = DataLoader(train_datset, batch_size=8, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=4)
test_dataloader = DataLoader(test_dataset, batch_size=4)

In [None]:
# Pull one batch from the (shuffled) training loader and list its fields.
batch = next(iter(train_dataloader))
print(batch.keys())

In [None]:
# Sanity check: turn the first example's input ids back into text.
tokenizer.decode(batch["input_ids"][0])

In [None]:
# Sanity check: decode the first example's target, dropping the -100 entries
# that stand in for padding in the labels.
labels = batch["labels"][0]
tokenizer.decode([label for label in labels if label != -100])

## Fine-tune the model

In [None]:
from transformers import (
    T5ForConditionalGeneration,
    AdamW,
    get_linear_schedule_with_warmup,
)
import pytorch_lightning as pl


class CodeT5(pl.LightningModule):
    """Lightning module that fine-tunes the "Salesforce/codet5-small" checkpoint.

    The data loaders and the length of the learning-rate schedule come from the
    notebook-level ``train_dataloader``, ``valid_dataloader`` and
    ``test_dataloader`` variables, which must be defined before training.

    Args:
        lr: Learning rate passed to the AdamW optimizer.
        num_train_epochs: Number of epochs used to size the linear LR schedule.
        warmup_steps: Number of warmup steps of the linear LR schedule.
    """

    def __init__(self, lr=5e-5, num_train_epochs=15, warmup_steps=1000):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(
            "Salesforce/codet5-small"
        )
        # Makes lr / num_train_epochs / warmup_steps available via self.hparams.
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output object unchanged."""
        outputs = self.model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )
        return outputs

    def common_step(self, batch, batch_idx):
        """Return the model loss for ``batch`` (shared by train/val/test steps).

        The batch is unpacked into ``forward``, so its keys must match the
        ``forward`` parameters.
        """
        outputs = self(**batch)
        loss = outputs.loss

        return loss

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self.common_step(batch, batch_idx)
        # Logged with Lightning's defaults for this hook; no explicit epoch
        # aggregation is requested here.
        self.log("training_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and log it with epoch-level aggregation."""
        loss = self.common_step(batch, batch_idx)
        self.log("validation_loss", loss, on_epoch=True)

        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch."""
        loss = self.common_step(batch, batch_idx)
        # Without logging, the loss is computed and discarded, and a test run
        # has no metric to report.
        self.log("test_loss", loss)

        return loss

    def configure_optimizers(self):
        """Create the AdamW optimizer and a per-step linear warmup schedule."""
        # create optimizer
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        # create learning rate scheduler
        # `train_dataloader` below is the notebook-level DataLoader, not the
        # method of the same name.
        # NOTE(review): the schedule length depends on num_train_epochs only;
        # keep it in sync with the Trainer's max_epochs, and check that
        # warmup_steps does not exceed the resulting number of steps.
        num_train_optimization_steps = self.hparams.num_train_epochs * len(
            train_dataloader
        )
        lr_scheduler = {
            "scheduler": get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.hparams.warmup_steps,
                num_training_steps=num_train_optimization_steps,
            ),
            "name": "learning_rate",
            "interval": "step",
            "frequency": 1,
        }

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def train_dataloader(self):
        """Return the notebook-level training DataLoader."""
        return train_dataloader

    def val_dataloader(self):
        """Return the notebook-level validation DataLoader."""
        return valid_dataloader

    def test_dataloader(self):
        """Return the notebook-level test DataLoader."""
        return test_dataloader

In [None]:
# Uses the class defaults: lr=5e-5, num_train_epochs=15, warmup_steps=1000.
# NOTE(review): the Trainer in this notebook is created with max_epochs=5,
# while the LR schedule is sized with num_train_epochs=15 — confirm intended.
model = CodeT5()

In [None]:
# Training metrics are written by a CSV logger rooted at LOGGER_FOLDER.
logger = CSVLogger(name="My_Logger", save_dir=LOGGER_FOLDER)

# Early stopping watches "validation_loss" (lower is better) with a patience of 3.
early_stop_callback = EarlyStopping(
    monitor="validation_loss",
    mode="min",
    patience=3,
    strict=False,
    verbose=False,
)
# Records the learning rate at every step.
lr_monitor = LearningRateMonitor(logging_interval="step")

trainer = Trainer(
    logger=logger,
    callbacks=[early_stop_callback, lr_monitor],
    max_epochs=5,
    default_root_dir=CHECKPOINT_FOLDER,
)
trainer.fit(model)

In [None]:
# Save the wrapped T5ForConditionalGeneration model (not the Lightning module)
# into MODEL_FOLDER. The tokenizer is not saved by this call.
model.model.save_pretrained(MODEL_FOLDER)