In [None]:
# %load_ext lab_black
# %load_ext autoreload
# %autoreload 2

In [None]:
# Set for local or colab

import os
from os.path import join
import sys

# Check if running in colab
IN_COLAB = "google.colab" in sys.modules

# Project defaults
if IN_COLAB:
    print("ENVIRONMENT: Colab")

    # Mount drive
    from google.colab import drive

    drive.mount("/content/drive")

    # Set the project directory
    PROJECT_FOLDER = "/content/drive/MyDrive/MIDS/w266/w266-project-carlos"

    # Install dependencies
    !pip install -q transformers datasets pytorch-lightning SentencePiece #wandb
else:
    print("ENVIRONMENT: Local")
    # Set the project directory
    PROJECT_FOLDER = "/user/w266/w266-project-carlos"

os.chdir(PROJECT_FOLDER)

# FOLDERS
DATASET_FOLDER = join(PROJECT_FOLDER, "dataset/dataset_final")

print(f"Working directory is: {os.getcwd()}")

In [None]:
from pprint import pprint

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, T5ForConditionalGeneration, T5Tokenizer

from t5_model_support_functions import load_csv_files, token_to_df

### Set experiment folder and base model type

In [None]:
EXPERIMENT_FOLDER = join(PROJECT_FOLDER, "experiments/exp_01_t5-base/")

MODEL_TYPE = "t5-base"

# The model weights are read from the experiment folder itself,
# the tokenizer files from its "tokenizer" sub-folder.
model = T5ForConditionalGeneration.from_pretrained(EXPERIMENT_FOLDER)

# "codet5" model types are loaded with RobertaTokenizer, everything else with T5Tokenizer
tokenizer_folder = join(EXPERIMENT_FOLDER, "tokenizer")
tokenizer_class = RobertaTokenizer if "codet5" in MODEL_TYPE else T5Tokenizer
tokenizer = tokenizer_class.from_pretrained(tokenizer_folder)

### Load `csv` data as `dataframes`

In [None]:
TARGET_FEATURES = ["source", "labels", "token_types"]

# Paths are listed in train / dev / test order; the unpacking below relies on that order.
csv_paths = [
    join(DATASET_FOLDER, file_name)
    for file_name in ("train.csv", "dev.csv", "test.csv")
]

df_train, df_val, df_test = load_csv_files(
    csv_paths,
    focus_columns=TARGET_FEATURES,
    drop_duplicates=True,
    dropna=True,
    shuffle=False,
)

### Inference

#### Hyper-parameters

In [None]:
# Task prefix prepended to every source text before tokenization
prefix = "Generate vega_zero code: "

# Maximum tokenized lengths for inputs / targets, and the inference batch size
max_input_length, max_target_length = 162, 60
batch_size = 2

# Development switch: when enabled, only the first DEV_LENGTH rows of each split are used
DEV_TESTING = True
DEV_LENGTH = 4

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Calculated
# NOTE: this is derived from DEV_LENGTH, so it only matches the real number of
# batches while DEV_TESTING is enabled.
total_batches = (DEV_LENGTH + batch_size - 1) // batch_size

In [None]:
# In development mode every split is cut down to its first DEV_LENGTH rows
train_frame, val_frame, test_frame = df_train, df_val, df_test
if DEV_TESTING:
    train_frame = train_frame.head(DEV_LENGTH)
    val_frame = val_frame.head(DEV_LENGTH)
    test_frame = test_frame.head(DEV_LENGTH)

train_dataset = Dataset.from_pandas(train_frame, split="train")
val_dataset = Dataset.from_pandas(val_frame, split="validation")
test_dataset = Dataset.from_pandas(test_frame, split="test")


for built_dataset in (train_dataset, val_dataset, test_dataset):
    print(built_dataset)

In [None]:
def preprocess_examples(examples):
    """
    Tokenize a batch of examples: the inputs and the targets (labels).

    Inputs:
    - Adds the task `prefix` to every source (for t5)
    - Tokenizes the prefixed text, padded/truncated to `max_input_length`

    Targets (labels):
    - Tokenizes the label strings, padded/truncated to `max_target_length`
    - Replaces the padding token id with -100

    Args:
        examples: a batch mapping with a "source" and a "labels" entry, each a
            list of strings (this function is applied with `batched=True`).

    Returns:
        The tokenizer output for the inputs, extended with a "label_tokens"
        entry holding the masked target token ids (one list of ints per example).
    """
    sources = examples["source"]  # inputs
    label_queries = examples["labels"]  # targets

    inputs = [prefix + source for source in sources]

    # Tokenize the inputs.
    # The tensors are deliberately left on the CPU: the result is handed back
    # to `Dataset.map`, so moving it to an accelerator here would only add a
    # device round trip.
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Tokenize the targets
    label_ids = tokenizer(
        label_queries,
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).input_ids

    # important: we need to replace the index of the padding tokens by -100
    # such that they are not taken into account by the CrossEntropyLoss.
    # The padding id is taken from the tokenizer instead of assuming it is 0,
    # so the masking stays correct for whichever tokenizer was loaded.
    padding_positions = label_ids == tokenizer.pad_token_id
    model_inputs["label_tokens"] = label_ids.masked_fill(
        padding_positions, -100
    ).tolist()

    return model_inputs


# Map the function to each dataset (train, validation, test — in that order)
train_dataset, val_dataset, test_dataset = [
    split.map(preprocess_examples, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

columns = ["source", "input_ids", "labels"]

# This sets `__getitem__` return format (type and columns). The data formatting is applied on-the-fly.
# `__getitem__` is what pulls the batches during training
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(type="torch", columns=columns)

# Print every split under its title, with a line of stars between consecutive splits
separator = "*" * 100
titled_splits = (
    ("Training", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
)
for position, (title, split) in enumerate(titled_splits):
    if position > 0:
        print(separator)
        print()
    print(title)
    print(split)

# Without the `.set_format`, this would get you all the columns
print(train_dataset[0].keys())

In [None]:
# One DataLoader per split, all using the same batch size and default (sequential) order
train_dataloader, val_dataloader, test_dataloader = [
    DataLoader(split_dataset, batch_size=batch_size)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
]

In [None]:
# Generate predictions for the test split and collect them next to the
# original source text and the reference label.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The model has to live on the same device as the inputs fed to `generate`;
# without this the computed `device` is never actually used.
model.to(device)
model.eval()

sources = []
predictions = []
labels = []

# Take the batch count from the dataloader itself so the progress message is
# right whether or not the DEV_TESTING subset is active.
n_batches = len(test_dataloader)

for i, batch in enumerate(test_dataloader):
    print(f"Processing batch {i+1} of {n_batches}...", end="")

    input_ids = batch["input_ids"].to(device)
    # Only `input_ids` is exposed by the dataset format, so the padding mask is
    # rebuilt here from the tokenizer's padding id and passed explicitly.
    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            num_beams=3,
            min_length=15,
            max_length=max_target_length,
        )

    predictions.extend(
        tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    )

    # "source" and "labels" are the untouched text columns of the batch
    sources.extend(batch["source"])
    labels.extend(batch["labels"])
    print("COMPLETE!")

In [None]:
# One row per test example: input text, reference label and generated prediction
df_results = pd.DataFrame(
    {
        "source": sources,
        "labels": labels,
        "prediction": predictions,
    }
)

# Persist the results next to the model files of this experiment
results_path = join(EXPERIMENT_FOLDER, "results.csv")
df_results.to_csv(results_path)

df_results.head()