In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
# Set for local or colab
#
# Detects the runtime (Colab vs. local), selects the matching project folder,
# makes it the working directory and derives the dataset folder from it.

import os
from os.path import join
import sys

# Check if running in colab
# (True when the `google.colab` module is already loaded in this interpreter)
IN_COLAB = "google.colab" in sys.modules

# Project defaults
if IN_COLAB:
    print("ENVIRONMENT: Colab")

    # Mount drive
    # (the project folder below lives under the mount point /content/drive)
    from google.colab import drive

    drive.mount("/content/drive")

    # Set the project directory
    PROJECT_FOLDER = "/content/drive/MyDrive/MIDS/w266/w266-project-carlos"

    # Install dependencies
    # (unpinned versions; the trailing `#wandb` is commented out for the shell)
    !pip install -q transformers datasets pytorch-lightning SentencePiece #wandb
else:
    print("ENVIRONMENT: Local")
    # Set the project directory
    # NOTE(review): hard-coded absolute path; must exist on the local machine.
    PROJECT_FOLDER = "/user/w266/w266-project-carlos"

# Work from the project root. Presumably this is what lets the project-local
# module `t5_model_support_functions` be imported later -- TODO confirm.
os.chdir(PROJECT_FOLDER)

# FOLDERS
# Folder holding train.csv / dev.csv / test.csv, which a later cell loads.
DATASET_FOLDER = join(PROJECT_FOLDER, "dataset/dataset_final")

print(f"Working directory is: {os.getcwd()}")

In [None]:
import numpy as np
import pandas as pd

from transformers import RobertaTokenizer, T5ForConditionalGeneration, T5Tokenizer
from datasets import Dataset

from pprint import pprint

from t5_model_support_functions import token_to_df, load_csv_files

### Set experiment folder and base model architecture type

In [None]:
# Folder holding the saved model weights and a `tokenizer` sub-folder.
EXPERIMENT_FOLDER = join(PROJECT_FOLDER, "experiments/exp_01_t5-base/")

# Name of the model type; only used here to pick the tokenizer class.
MODEL_TYPE = "t5-base"

model = T5ForConditionalGeneration.from_pretrained(EXPERIMENT_FOLDER)

# Model types containing "codet5" use RobertaTokenizer; all others use T5Tokenizer.
tokenizer_cls = RobertaTokenizer if "codet5" in MODEL_TYPE else T5Tokenizer
tokenizer = tokenizer_cls.from_pretrained(join(EXPERIMENT_FOLDER, "tokenizer"))

In [None]:
def test_inference_on_loaded_model():
    """Smoke-test the loaded model on a fixed news paragraph.

    Uses the notebook-level ``tokenizer`` and ``model``. The ``"summarize: "``
    prompt is prepended to a hard-coded text, the result is tokenized
    (truncated to at most 1024 tokens), a candidate is generated with beam
    search, and both the prompt and the decoded candidate are pretty-printed.

    Returns:
        None. The function only prints.
    """
    # Adjacent literals are concatenated implicitly, so every fragment ends
    # with exactly one space; otherwise two sentences get glued together.
    text_to_summarize = (
        "Nearly 800 thousand customers are scheduled to be affected by the shutoffs which are expected to last through at least midday tomorrow. "
        "PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. "
        "The aim is to reduce the risk of wildfires. "
        "If Pacific Gas & Electric Co, a unit of PG&E Corp, goes through with another public safety power shutoff, "
        "it would be the fourth round of mass blackouts imposed by the utility since Oct. 9, when some 730,000 customers were left in the dark. "
        "The recent wave of precautionary shutoffs have drawn sharp criticism from Governor Gavin Newsom, state regulators and consumer activists as being overly broad in scale. "
        "Newsom blames PG&E for doing too little to properly maintain and secure its power lines against wind damage. "
        "Utility executives have acknowledged room for improvement while defending the sprawling scope of the power cutoffs as a matter of public safety. "
        "The record breaking drought has made the current conditions even worse than in previous years. "
        "It exponentially increases the probability of large scale wildfires. "
    )

    text_and_prompt = "summarize: " + text_to_summarize

    print("Text to summarize:")
    pprint(text_and_prompt, width=100)

    # Tokenize
    inputs = tokenizer(
        text_and_prompt, max_length=1024, truncation=True, return_tensors="pt"
    )

    # Generate Summary
    # The attention mask is passed explicitly so generate() does not have to
    # infer it from the input ids.
    candidate_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=3,
        no_repeat_ngram_size=3,
        min_length=15,
        max_length=35,
    )

    # Get candidate
    candidate = tokenizer.batch_decode(
        candidate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    print("*" * 100)
    print("Candidate:")
    pprint(candidate, width=100)


test_inference_on_loaded_model()

### Load `csv` data as `dataframes`

In [None]:
# Forwarded to load_csv_files as `focus_columns`
# (presumably the columns to keep -- confirm in t5_model_support_functions).
TARGET_FEATURES = ["source", "labels", "token_types"]

# Train / dev / test files, in the same order the frames are unpacked below.
split_paths = [
    join(DATASET_FOLDER, file_name)
    for file_name in ("train.csv", "dev.csv", "test.csv")
]

df_train, df_val, df_test = load_csv_files(
    split_paths,
    focus_columns=TARGET_FEATURES,
    drop_duplicates=True,
    dropna=True,
    shuffle=False,
)

### Inference

#### Hyper-parameters

In [None]:
# Development switch: when True, later cells keep only the first DEV_LENGTH
# rows of each split.
DEV_TESTING = True
DEV_LENGTH = 2

# Prompt prepended to every source text before it is tokenized.
prefix = "Generate vega_zero code: "

# Token limits: inputs are truncated to max_input_length, and
# max_target_length is passed to generate() as max_length.
max_input_length = 162
max_target_length = 60

# Not referenced by the inference cells in this notebook.
batch_size = 5

In [None]:
# In development mode every split is cut down to its first DEV_LENGTH rows;
# otherwise the full frames are converted.
split_frames = {"train": df_train, "validation": df_val, "test": df_test}
if DEV_TESTING:
    split_frames = {
        split_name: frame.head(DEV_LENGTH)
        for split_name, frame in split_frames.items()
    }

train_dataset = Dataset.from_pandas(split_frames["train"], split="train")
val_dataset = Dataset.from_pandas(split_frames["validation"], split="validation")
test_dataset = Dataset.from_pandas(split_frames["test"], split="test")

# First test example, used by the single-example inference cells below.
input_txt = test_dataset["source"][0]
ground_truth = test_dataset["labels"][0]
input_and_prompt = prefix + input_txt

print(input_txt, ground_truth, sep="\n")

In [None]:
# Encode the prompted example as PyTorch tensors, truncating it to at most
# max_input_length tokens.
input_tokens = tokenizer(
    input_and_prompt,
    return_tensors="pt",
    truncation=True,
    max_length=max_input_length,
)

In [None]:
# Beam-search generation for the single example, then decode the ids to text.
generation_options = {
    "num_beams": 5,
    "min_length": 15,
    "max_length": max_target_length,
}
candidate_tokens = model.generate(input_tokens["input_ids"], **generation_options)
candidate = tokenizer.batch_decode(
    candidate_tokens,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

# Show the prediction next to the reference for a quick visual comparison.
print("Candidate", candidate[0], sep="\n")
print("\nGround truth", ground_truth, sep="\n")

In [None]:
# Decoded prediction for every test example, in dataset order.
candidate_list = []

print("Processing dataset")

# Fetch the column once, outside the loop, instead of indexing
# test_dataset["source"][i] (a fresh column access) on every iteration.
source_texts = test_dataset["source"]
total_examples = len(source_texts)

for i, input_txt in enumerate(source_texts):
    input_and_prompt = prefix + input_txt

    input_tokens = tokenizer(
        input_and_prompt,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt",
    )

    # The attention mask is passed explicitly so generate() does not have to
    # infer it from the input ids.
    candidate_tokens = model.generate(
        input_tokens["input_ids"],
        attention_mask=input_tokens["attention_mask"],
        num_beams=5,
        min_length=15,
        max_length=max_target_length,
    )
    candidate = tokenizer.batch_decode(
        candidate_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    # One input sequence goes in, so the decoded batch holds one string.
    candidate_list.append(candidate[0])

    # Progress is reported after every example.
    print(f"Processed {i+1} / {total_examples}")

In [None]:
# Put inputs, reference labels and model predictions side by side, then
# write them to results.csv inside the experiment folder.
df_results = pd.DataFrame(
    {
        "source": test_dataset["source"],
        "labels": test_dataset["labels"],
        "prediction": candidate_list,
    }
)

results_path = join(EXPERIMENT_FOLDER, "results.csv")
df_results.to_csv(results_path)