In [1]:
# %load_ext lab_black
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Set for local or colab

import os
from os.path import join
import sys

# Check if running in colab
IN_COLAB = "google.colab" in sys.modules

# Project defaults
if IN_COLAB:
    print("ENVIRONMENT: Colab")

    # Mount drive
    from google.colab import drive

    drive.mount("/content/drive")

    # Set the project directory
    PROJECT_FOLDER = "/content/drive/MyDrive/MIDS/w266/w266-project-carlos"

    # Install dependencies
    !pip install -q transformers datasets SentencePiece

    # Set timezone
    !rm /etc/localtime
    !ln -s /usr/share/zoneinfo/US/Pacific /etc/localtime

else:
    print("ENVIRONMENT: Local")
    # Set the project directory
    PROJECT_FOLDER = "/user/w266/w266-project-carlos"

os.chdir(PROJECT_FOLDER)

# FOLDERS
DATASET_FOLDER = join(PROJECT_FOLDER, "dataset/dataset_final")
EXPERIMENT_BASE_FOLDER = join(PROJECT_FOLDER, "experiments")
EXPERIMENT_RESULTS_FOLDER = join(PROJECT_FOLDER, "experiment_results")
NVBENCH_DIRECTORY = join(PROJECT_FOLDER, "ref_repos/nvBench/database")

print(f"Working directory is: {os.getcwd()}")

ENVIRONMENT: Colab
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Working directory is: /content/drive/MyDrive/MIDS/w266/w266-project-carlos


In [3]:
from pprint import pprint

from datetime import datetime
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, T5ForConditionalGeneration, T5Tokenizer

from t5_model_support_functions import (
    load_csv_files,
    token_to_df,
    get_string_equality,
    get_pd_row_accuracy,
    attach_nvBench_info,
    build_vega_zero_source,
)

### Load `csv` data as `dataframes`

In [4]:
# Columns handed to the loader as its focus columns
TARGET_FEATURES = ["source", "labels", "token_types"]

# Train / dev / test CSV paths, in the order the returned frames are unpacked
split_csv_paths = [
    join(DATASET_FOLDER, csv_name) for csv_name in ("train.csv", "dev.csv", "test.csv")
]

df_train, df_val, df_test = load_csv_files(
    split_csv_paths,
    focus_columns=TARGET_FEATURES,
    drop_duplicates=True,
    dropna=True,
    shuffle=False,
)

Loading 'train.csv'
Number of records in /content/drive/MyDrive/MIDS/w266/w266-project-carlos/dataset/dataset_final/train.csv: 25238

Loading 'dev.csv'
Number of records in /content/drive/MyDrive/MIDS/w266/w266-project-carlos/dataset/dataset_final/dev.csv: 1430
-> Merged!!

Loading 'test.csv'
Number of records in /content/drive/MyDrive/MIDS/w266/w266-project-carlos/dataset/dataset_final/test.csv: 4920
-> Merged!!

Focusing on the following columns: ['source', 'labels', 'token_types']

Searching for duplicate rows in focus columns...
A total of 31544 records were loaded (44 records dropped after duplicate filter)

Seaching for NaN fields in foclus columns...
Rows with NaN values: 0
Dropping NaN...

Final total records 31544

returning 3 files


#### Attaching nvBench info and building new sources

In [5]:
## nvBench Info
# Apply the same nvBench enrichment to every split, in train / val / test order
df_train, df_val, df_test = [
    attach_nvBench_info(split_frame, NVBENCH_DIRECTORY)
    for split_frame in (df_train, df_val, df_test)
]

In [6]:
# Rebuilding the source to only include the "used columns".
# Each split gets a new "source_new" column computed row by row.
for split_frame in (df_train, df_val, df_test):
    split_frame["source_new"] = split_frame.apply(
        build_vega_zero_source, axis=1, args=["columns_used", 2]
    )

### Inference

#### Set experiment folder and base model type

In [7]:
# Use the GPU when torch reports one is available, otherwise fall back to CPU.
# `device` stays a plain string because later cells compare it with "cuda".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device type: {device}")

Device type: cuda


In [8]:
# Experiment selection: keep exactly one EXPERIMENT_NAME / MODEL_TYPE pair active.
# EXPERIMENT_NAME = "exp_01_t5-base"
# MODEL_TYPE = "t5-base"

# EXPERIMENT_NAME = "exp_02_codet5-base"
# MODEL_TYPE = "codet5-base"

EXPERIMENT_NAME = "exp_03_codet5-large"
MODEL_TYPE = "codet5-large"

EXPERIMENT_FOLDER = join(EXPERIMENT_BASE_FOLDER, EXPERIMENT_NAME)

# Load the saved weights from the experiment folder and move them to the device
model = T5ForConditionalGeneration.from_pretrained(EXPERIMENT_FOLDER)
model = model.to(device)

# CodeT5 experiments are loaded with the Roberta tokenizer class, all others
# with the T5 tokenizer class; both read from the "tokenizer" sub-folder.
tokenizer_class = RobertaTokenizer if "codet5" in MODEL_TYPE else T5Tokenizer
tokenizer = tokenizer_class.from_pretrained(join(EXPERIMENT_FOLDER, "tokenizer"))

#### Hyper-parameters

In [9]:
# Prompt prefix and length / batching limits used at inference time
prefix = "Generate vega_zero code: "
max_input_length = 162
max_target_length = 60
batch_size = 30

# Dev mode: only DEV_LENGTH rows are taken into account
DEV_TESTING = False
DEV_LENGTH = 6

if device == "cuda":
    torch.cuda.empty_cache()

# Calculated

if DEV_TESTING:
    set_len = DEV_LENGTH
else:
    set_len = len(df_test)

# Ceiling division on integers: number of batches needed to cover set_len rows
total_batches = -(-set_len // batch_size)
print(f"Total number of batches: {total_batches}")

Total number of batches: 163


In [10]:
def _rows_for_run(frame):
    """Return the first DEV_LENGTH rows in dev mode, otherwise the frame itself."""
    return frame.head(DEV_LENGTH) if DEV_TESTING else frame


# One `Dataset` per split, tagged with its split name
train_dataset = Dataset.from_pandas(_rows_for_run(df_train), split="train")
val_dataset = Dataset.from_pandas(_rows_for_run(df_val), split="validation")
test_dataset = Dataset.from_pandas(_rows_for_run(df_test), split="test")


columns = ["tvBench_id", "hardness", "source", "labels", "source_new"]

# This sets what is pulled when batching
for split_dataset in (train_dataset, val_dataset, test_dataset):
    split_dataset.set_format(type="torch", columns=columns)

In [11]:
# Print a summary of each split; a row of asterisks separates the sections
divider = "*" * 100

for heading, split_dataset in (
    ("Training", train_dataset),
    ("Validation", val_dataset),
):
    print(heading)
    print(split_dataset)
    print(divider)

print("Test")
print(test_dataset)

# Without the `.set_format`, this would get you all the columns
first_train_row = train_dataset[0]
print(first_train_row.keys())

Training
Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', 'mentioned_columns', 'mentioned_values', 'query_template', 'source', 'labels', 'token_types', 'table', 'nvBench_column_names', 'columns_used', 'source_new', '__index_level_0__'],
    num_rows: 25238
})
****************************************************************************************************
Validation
Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', 'mentioned_columns', 'mentioned_values', 'query_template', 'source', 'labels', 'token_types', 'table', 'nvBench_column_names', 'columns_used', 'source_new', '__index_level_0__'],
    num_rows: 1430
})
****************************************************************************************************
Test
Dataset({
    features: ['tvBench_id', 'db_id', 'chart', 'hardness', 'query', 'question', 'vega_zero', 'mentioned_columns', 'mentioned_values', 'query_template', 's

In [12]:
# One loader per split, all using the shared batch size
train_dataloader, val_dataloader, test_dataloader = [
    DataLoader(split_dataset, batch_size=batch_size)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
]

### Get test predictions

In [13]:
# Generate a prediction for every test example and collect, in matching order,
# the example id, hardness, rebuilt source, reference label and decoded output.
# The accumulators are reset here so that re-running the cell starts clean.
tvBench_id = []
hardness = []
sources = []
labels = []
predictions = []

start_time = iter_start_time = datetime.now()
print(f"Processing started {start_time.strftime('%m/%d/%Y %I:%M %p')}")

# Take the batch count from the loader that is actually iterated, so the
# progress message cannot drift from a value computed in another cell.
n_batches = len(test_dataloader)

for i, batch in enumerate(test_dataloader):
    print(f"Processing batch {i+1} of {n_batches}...", end="")

    # Process batches: prepend the task prefix to each rebuilt source
    texts = [prefix + src for src in batch["source_new"]]

    # Pad / truncate every input to max_input_length and move tensors to the device
    encoding = tokenizer(
        texts,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(device)

    # Inference only: no gradients are needed for generation
    with torch.no_grad():
        generated_ids = model.generate(
            **encoding,
            num_beams=3,
            min_length=15,
            max_length=max_target_length,
        )

        predictions.extend(
            tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        )

    sources.extend(batch["source_new"])
    labels.extend(batch["labels"])
    hardness.extend(batch["hardness"])
    tvBench_id.extend(batch["tvBench_id"])

    # Update timers. `timedelta.seconds` is only the seconds component and
    # drops whole days, so the full elapsed time comes from total_seconds().
    iter_end_time = datetime.now()
    iter_elapsed = int((iter_end_time - iter_start_time).total_seconds())
    print(f"COMPLETE! ({iter_elapsed} seconds)")
    iter_start_time = datetime.now()

end_time = datetime.now()
time_span = end_time - start_time
print(
    f"Processing finished {end_time.strftime('%m/%d/%Y %I:%M %p')} ({int(time_span.total_seconds())} seconds)"
)

if device == "cuda":
    torch.cuda.empty_cache()

Processing started 04/08/2023 08:10 PM
Processing batch 1 of 163...COMPLETE! (15 seconds)
Processing batch 2 of 163...COMPLETE! (6 seconds)
Processing batch 3 of 163...COMPLETE! (6 seconds)
Processing batch 4 of 163...COMPLETE! (5 seconds)
Processing batch 5 of 163...COMPLETE! (6 seconds)
Processing batch 6 of 163...COMPLETE! (5 seconds)
Processing batch 7 of 163...COMPLETE! (6 seconds)
Processing batch 8 of 163...COMPLETE! (7 seconds)
Processing batch 9 of 163...COMPLETE! (5 seconds)
Processing batch 10 of 163...COMPLETE! (6 seconds)
Processing batch 11 of 163...COMPLETE! (7 seconds)
Processing batch 12 of 163...COMPLETE! (7 seconds)
Processing batch 13 of 163...COMPLETE! (6 seconds)
Processing batch 14 of 163...COMPLETE! (5 seconds)
Processing batch 15 of 163...COMPLETE! (6 seconds)
Processing batch 16 of 163...COMPLETE! (6 seconds)
Processing batch 17 of 163...COMPLETE! (6 seconds)
Processing batch 18 of 163...COMPLETE! (5 seconds)
Processing batch 19 of 163...COMPLETE! (5 seconds)


In [14]:
# Assemble the per-example results in a single constructor call
df_results = pd.DataFrame(
    {
        "tvBench_id": tvBench_id,
        "hardness": hardness,
        "source": sources,
        "labels": labels,
        "prediction": predictions,
    }
)

# Row-wise score from the project helper, then a 1.0 / 0.0 flag that is 1.0
# only when that score is exactly 1.0
df_results["percent_equal"] = df_results.apply(get_pd_row_accuracy, axis=1)
df_results["equal"] = df_results["percent_equal"].eq(1.0).astype(float)


df_results.head()

Unnamed: 0,tvBench_id,hardness,source,labels,prediction,percent_equal,equal
0,3092@y_name@DESC,Medium,<N> Give me the comparison about Team_ID over ...,mark bar data basketball_match encoding x all_...,mark bar data basketball_match encoding x all_...,1.0,1.0
1,3092@y_name@DESC,Medium,<N> Give me the comparison about Team_ID over ...,mark bar data basketball_match encoding x all_...,mark bar data basketball_match encoding x all_...,1.0,1.0
2,2818@y_name@DESC,Hard,<N> Give me a histogram for what is the number...,mark bar data player encoding x position y agg...,mark bar data player encoding x position y agg...,1.0,1.0
3,2818@y_name@DESC,Hard,<N> Give me a histogram for what is the number...,mark bar data player encoding x position y agg...,mark bar data player encoding x position y agg...,1.0,1.0
4,2681@x_name@ASC,Medium,<N> Show different occupations along with the ...,mark bar data player encoding x occupation y a...,mark bar data player encoding x occupation y a...,1.0,1.0


In [15]:
# Save the results as a CSV named after the experiment and the time the
# inference run finished.
file_name = f"{EXPERIMENT_NAME}_results_{end_time.strftime('%Y_%m_%d-%H_%M')}.csv"

# Create the results folder if it is missing, so a long inference run is not
# lost to a missing-directory error at write time.
os.makedirs(EXPERIMENT_RESULTS_FOLDER, exist_ok=True)

print(f"saving '{file_name}'")

df_results.to_csv(join(EXPERIMENT_RESULTS_FOLDER, file_name), index=False)

saving 'exp_03_codet5-large_results_2023_04_08-20_28.csv'
