In [24]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [25]:
# Set for local or colab

import os
from os.path import join
import shutil
import sys

ASK_TO_DELETE_LOG_FOLDERS = True


def check_create_folder(path: str, ask_to_rm_if_exists=ASK_TO_DELETE_LOG_FOLDERS):
    if os.path.exists(path):
        if ask_to_rm_if_exists:
            response = input(
                f"<{path}>: Already exists.\n\nWrite 'del' if you wish to delete other wise press any key"
            )
            if response.lower() == "del":
                print(f"Deleting: {path}")
                shutil.rmtree(path)

                os.makedirs(path)
        else:
            os.makedirs(path)


# Check if running in colab
IN_COLAB = "google.colab" in sys.modules

# Project defaults
if IN_COLAB:
    print("ENVIRONMENT: Colab")

    # Mount drive
    from google.colab import drive

    drive.mount("/content/drive")

    # Set the project directory
    PROJECT_FOLDER = "/content/drive/MyDrive/w266/w266-project-carlos"

    # Install dependencies
    !pip install -q transformers datasets pytorch-lightning wandb
else:
    print("ENVIRONMENT: Local")
    # Set the project directory
    PROJECT_FOLDER = "/user/w266/w266-project-carlos"

os.chdir(PROJECT_FOLDER)

# FOLDERS
DATASET_FOLDER = join(PROJECT_FOLDER, "dataset")
CHECKPOINT_FOLDER = join(PROJECT_FOLDER, "checkpoints")
MODEL_FOLDER = join(PROJECT_FOLDER, "saved_models")
LOGGER_FOLDER = join(PROJECT_FOLDER, "logger")

check_create_folder(CHECKPOINT_FOLDER)
check_create_folder(MODEL_FOLDER)
check_create_folder(LOGGER_FOLDER)

print(f"Working directory is: {os.getcwd()}")

ENVIRONMENT: Local
Working directory is: /user/w266/w266-project-carlos


In [26]:
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizer
from torch.utils.data import DataLoader
from transformers import (
    T5ForConditionalGeneration,
    AdamW,
    get_linear_schedule_with_warmup,
)
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger, CSVLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor


np.random.seed(123)

### Load `csv` data as a single `dataframe`

In [27]:
def combine_csv_files(csv_paths: list[str], shuffle=False):
    """
    Combine csv data into a single dataframe and checks for duplicate records.

    """
    for i, path in enumerate(csv_paths):
        df = pd.read_csv(path)
        columns = df.columns

        print(f"Number of records in {path}: {df.shape[0]}")

        if i == 0:
            df_full = df
            columns_base = columns
        else:
            if not np.array_equal(columns, columns_base):
                raise (Exception("Columns do not match"))

            total_records = df_full.shape[0] + df.shape[0]

            df_full = (
                pd.concat([df_full, df]).drop_duplicates(columns).reset_index(drop=True)
            )

            records_dropped = total_records - df_full.shape[0]

            print(f"-> Merged!!, {records_dropped} duplicates were found and dropped")

        print("")

    if shuffle:
        shuffled_indices = np.random.permutation(np.arange(df_full.shape[0]))
        df_full = df_full.iloc[shuffled_indices, :].reset_index(drop=True)

    print(f"A total of {df_full.shape[0]} recrods were loaded")
    return df_full


df_dataset = combine_csv_files(
    [
        join(DATASET_FOLDER, "train.csv"),
        join(DATASET_FOLDER, "dev.csv"),
        join(DATASET_FOLDER, "test.csv"),
    ],
    shuffle=False,
)

df_dataset.head()

Number of records in /user/w266/w266-project-carlos/dataset/train.csv: 12621

Number of records in /user/w266/w266-project-carlos/dataset/dev.csv: 717
-> Merged!!, 0 duplicates were found and dropped

Number of records in /user/w266/w266-project-carlos/dataset/test.csv: 2461
-> Merged!!, 0 duplicates were found and dropped

A total of 15799 recrods were loaded


Unnamed: 0,tvBench_id,db_id,chart,hardness,query,question,vega_zero
0,1000@y_name@DESC,customers_and_products_contacts,Bar,Medium,"Visualize BAR SELECT product_name , COUNT(prod...",Bar chart x axis product name y axis how many ...,mark bar data products encoding x product_name...
1,2463@x_name@ASC,network_2,Bar,Easy,"Visualize BAR SELECT job , min(age) FROM Perso...",how old is the youngest person for each job ? ...,mark bar data person encoding x job y aggregat...
2,2545@y_name@DESC,pets_1,Bar,Medium,"Visualize BAR SELECT PetType , avg(pet_age) FR...",Please give me a bar chart to show the average...,mark bar data pets encoding x pettype y aggreg...
3,2615@y_name@ASC,products_for_hire,Bar,Extra Hard,"Visualize BAR SELECT payment_date , COUNT(paym...",What are the payment date of the payment with ...,mark bar data payments encoding x payment_date...
4,1304,election,Bar,Easy,"Visualize BAR SELECT County_name , Population ...",What are the name and population of each count...,mark bar data county encoding x county_name y ...


### Create the datasets

In [28]:
# TOTAL_DATASET_SIZE = df_dataset.shape[0]
TOTAL_DATASET_SIZE = 10


def get_indexes(total_indices, splits=[0.70, 0.10, 0.20]):
    start_index = 0
    indices = []
    groups = len(splits)

    for i, split in enumerate(splits):
        count = int(total_indices * split)
        end_index = start_index + count

        print(f"Group {i} > {start_index}:{end_index-1}, {count}")

        if i + 1 < groups:
            indices.append(np.arange(start_index, end_index))
        else:
            indices.append(np.arange(start_index, total_indices))

        start_index = end_index

    return indices


# indces = get_indexes(df_dataset.shape[0], splits=[0.75, 0.10, 0.15])
indces = get_indexes(TOTAL_DATASET_SIZE, splits=[0.75, 0.10, 0.15])

train_datset = Dataset.from_pandas(df_dataset.iloc[indces[0]], split="train")
valid_dataset = Dataset.from_pandas(df_dataset.iloc[indces[1]], split="validation")
test_dataset = Dataset.from_pandas(df_dataset.iloc[indces[2]], split="train")

display(train_datset)
display(valid_dataset)
display(test_dataset)

display(train_datset[0])

Group 0 > 0:6, 7
Group 1 > 7:7, 1
Group 2 > 8:8, 1


Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', '__index_level_0__'],
    num_rows: 7
})

Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', '__index_level_0__'],
    num_rows: 1
})

Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', '__index_level_0__'],
    num_rows: 2
})

{'tvBench_id': '1000@y_name@DESC',
 'db_id': 'customers_and_products_contacts',
 'chart': 'Bar',
 'hardness': 'Medium',
 'query': 'Visualize BAR SELECT product_name , COUNT(product_name) FROM products GROUP BY product_name ORDER BY COUNT(product_name) DESC',
 'question': 'Bar chart x axis product name y axis how many product name , rank by the Y-axis in desc .',
 'vega_zero': 'mark bar data products encoding x product_name y aggregate count product_name transform group x sort y desc',
 '__index_level_0__': 0}

### Pre-process and tokenize the data

In [29]:
tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

prefix = "Generate vega_zero code: "
max_input_length = 100
max_target_length = 100

In [30]:
def preprocess_examples(examples, add_db_info=False):
    """
    This function process the input and targets (labels)

    Inputs:
    - Adds a prefix question (for t5)
    - Tokenizes the input

    Targets (labels):
    - Tokenizes
    - Replaces the padding token index from 0 to -100
    """
    questions = examples["question"]  # inputs
    queries = examples["query"]  # targets

    inputs = [prefix + question for question in questions]

    if add_db_info:
        pass

    # Tokenize the inputs
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, padding="max_length", truncation=True
    )

    # Tokenize the targets
    labels = tokenizer(
        queries, max_length=max_target_length, padding="max_length", truncation=True
    ).input_ids

    # important: we need to replace the index of the padding tokens by -100
    # such that they are not taken into account by the CrossEntropyLoss
    labels_with_ignore_index = []
    for labels_example in labels:
        labels_example = [label if label != 0 else -100 for label in labels_example]
        labels_with_ignore_index.append(labels_example)

    model_inputs["labels"] = labels_with_ignore_index

    return model_inputs


# Map the function to each dataset
train_datset = train_datset.map(preprocess_examples, batched=True)
valid_dataset = valid_dataset.map(preprocess_examples, batched=True)
test_dataset = test_dataset.map(preprocess_examples, batched=True)

columns = ["input_ids", "attention_mask", "labels"]

# This sets `__getitem__` return format (type and columns). The data formatting is applied on-the-fly.
# `__getitem__` is what pulls the batches during training
train_datset.set_format(type="torch", columns=columns)
valid_dataset.set_format(type="torch", columns=columns)
test_dataset.set_format(type="torch", columns=columns)

print("Training")
print(train_datset)
print("*" * 100)

print("Validation")
print(valid_dataset)
print("*" * 100)

print("Test")
print(test_dataset)

# Without the `.set_format`, this would get you all the columns
print(train_datset[0].keys())



Training
Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 7
})
****************************************************************************************************
Validation
Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1
})
****************************************************************************************************
Test
Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2
})
dict_keys(['input_ids', 'attention_mask', 'labels'])


### Check that the previous is working as intended

In [31]:
sample_dataloader = DataLoader(valid_dataset, batch_size=4)

batch = next(iter(sample_dataloader))

print(f"The keys for each batch are:")
print(batch.keys())
print("*" * 100)

print("Input token ids:")
print(batch["input_ids"][0])
print("*" * 100)

print("Decoded input tokens:")
print(tokenizer.decode(batch["input_ids"][0]))
print("*" * 100)

print("Label token ids:")
labels = batch["labels"][0]
print(labels)
print("*" * 100)

print("Decoded label tokens:")
print(tokenizer.decode([label for label in labels if label != -100]))

The keys for each batch are:
dict_keys(['input_ids', 'attention_mask', 'labels'])
****************************************************************************************************
Input token ids:
tensor([    1,  4625,   331, 11061,    67,  7124,   981,    30,  3756,   635,
          326,   563,   471,  1056,  2182,   635,   279,  4653,  4980,   269,
         3377,  1846,   666,   635,   326,   619,    17,  4890,   316, 17044,
          692,     2,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
*******************

### Declare and tune model

In [32]:
# Hyper parameters
train_batch_size = 8
test_batch_size = 4

train_dataloader = DataLoader(train_datset, shuffle=True, batch_size=train_batch_size)
valid_dataloader = DataLoader(valid_dataset, batch_size=test_batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size)

In [33]:
class CodeT5(pl.LightningModule):
    def __init__(self, lr=5e-5, num_train_epochs=5, warmup_steps=1000):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(
            "Salesforce/codet5-small"
        )
        self.save_hyperparameters()
        self.training_step_count = 0

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )
        return outputs

    def common_step(self, batch, batch_idx):
        # `batch` is a dictionary, the '**' before batch
        # allows the 'forward step' to directly unpack the dictionary

        outputs = self(**batch)
        loss = outputs.loss  # The pretrained model aut calcs the loss

        return loss

    def training_step(self, batch, batch_idx):
        self.training_step_count += 1  # for debugging

        loss = self.common_step(batch, batch_idx)
        self.log(
            "training_loss",
            loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)
        self.log(
            "validation_loss",
            loss,
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True,
        )

        return loss

    def test_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)
        return loss

    def configure_optimizers(self):
        # create optimizer
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr)

        # create learning rate scheduler
        # len(train_dataloader) is just the total number of batches

        num_train_optimization_steps = self.hparams.num_train_epochs * len(
            train_dataloader
        )
        lr_scheduler = {
            "scheduler": get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.hparams.warmup_steps,
                num_training_steps=num_train_optimization_steps,
            ),
            "name": "learning_rate",
            "interval": "step",
            "frequency": 1,
        }

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def train_dataloader(self):
        return train_dataloader

    def val_dataloader(self):
        return valid_dataloader

    def test_dataloader(self):
        return test_dataloader


model = CodeT5()

logger = CSVLogger(save_dir=LOGGER_FOLDER, name="My_Logger")

early_stop_callback = EarlyStopping(
    monitor="validation_loss", patience=3, strict=False, verbose=False, mode="min"
)
lr_monitor = LearningRateMonitor(logging_interval="step")

trainer = Trainer(
    default_root_dir=CHECKPOINT_FOLDER,
    callbacks=[early_stop_callback, lr_monitor],
    logger=logger,
)
trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
Missing logger folder: /user/w266/w266-project-carlos/logger/My_Logger

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
241.969   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(
  rank_zero_warn(


Epoch 147:   0%|          | 0/1 [00:00<?, ?it/s, v_num=0, validation_loss=3.060, training_loss=2.160]         

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
model.model.save_pretrained(MODEL_FOLDER)