In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
import pandas as pd
import plotly.express as px

from spot.utils import (
    cst,
    read_file,
    write_file,
    seq_flatten,
    proj_root,
    tqdm,
)
from spot.type_env import (
    collect_annotations,
    MypyChecker,
    AnnotPath,
    mypy_checker,
    TypeInfEnv,
    TypeInfState,
    TypeInfAction,
    SelectAnnotations,
)
from spot.data import GitRepo, ModuleRemapUnpickler

from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from typing import *

# Work from the project root so relative paths (such as the checkpoint
# directories used in later cells) resolve the same way on every run.
os.chdir(proj_root())

# The data directory is taken from the `datadir` environment variable.
# Fail early with a readable message: `Path(None)` would otherwise raise an
# opaque TypeError.
_datadir_env = os.getenv("datadir")
if _datadir_env is None:
    raise EnvironmentError(
        "Environment variable `datadir` is not set; "
        "it must point to the directory that contains `SPOT-data`."
    )
datadir = Path(_datadir_env)
repos_dir = datadir / "SPOT-data/repos"

useful_repos_path = proj_root() / "scripts" / "useful_repos.pkl"
# Maps the module name `spot.data_prepare` to `spot.data` while unpickling
# (presumably the pickle was written before the module was renamed).
rename_module = lambda n: "spot.data" if n == "spot.data_prepare" else n
# NOTE(security): unpickling can execute arbitrary code; only load this file
# from a trusted source.
with useful_repos_path.open("rb") as f:
    useful_repos: list[GitRepo] = ModuleRemapUnpickler(f, rename_module).load()

import random

# Fixed seed so the shuffled repository order is reproducible across runs.
random.seed(42)
random.shuffle(useful_repos)


In [3]:
# Load the pre-trained model and tokenizer.

# Checkpoint directory; the path is relative to the current working directory.
model_dir = "./checkpoints/saved/SPOT-CodeT5-fine-tune/checkpoint-1500"

import torch
from transformers import (
    RobertaTokenizer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
)
from transformers.models.t5 import T5ForConditionalGeneration

# Use the first CUDA GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and model weights are both read from the same checkpoint directory.
tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(model_dir)
model: T5ForConditionalGeneration = T5ForConditionalGeneration.from_pretrained(model_dir)
model = model.to(device)

# Upper bound on the generated sequence length.
max_target_length = 128




In [4]:
from spot.data import tokenize_masked, mask_type_annots, output_ids_as_types

# Tiny hand-written snippet used as a smoke-test input for `run_model`.
test_code = """
# a, b, c are all ints.
def int_add(a: int, b: int, c: int) -> int:
    return a + b + c
"""


def run_model(code: str, num_beams: int = 16) -> dict:
    """Run the model on a single source snippet and report its predictions.

    The snippet is passed through ``mask_type_annots`` and ``tokenize_masked``
    (presumably masking its type annotations and producing model inputs plus
    labels), then the model is evaluated once for the loss and once with beam
    search for the generated output.

    Args:
        code: Source code of the snippet to run the model on.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        A dict with the keys ``"loss"`` (the model's loss tensor),
        ``"predicted types"`` (the generation decoded by
        ``output_ids_as_types``), ``"labels"`` (the label ids decoded the same
        way) and ``"generation"`` (the raw decoded output string).
    """
    tks = tokenize_masked(mask_type_annots(code), tokenizer, device)
    with torch.no_grad():
        # Call the module itself instead of `.forward` so that any registered
        # hooks are honoured.
        loss = model(**tks).loss
        # Only the first returned sequence is kept.
        dec = model.generate(
            tks["input_ids"],
            max_length=max_target_length,
            num_beams=num_beams,
        )[0]
    return {
        "loss": loss,
        "predicted types": output_ids_as_types(dec, tokenizer),
        "labels": output_ids_as_types(tks["labels"][0], tokenizer),
        "generation": tokenizer.decode(dec),
    }


# Smoke test: run the model on the toy snippet and display the result dict.
run_model(test_code, num_beams=10)


{'loss': tensor(3.8123, device='cuda:0'),
 'predicted types': [int[...], int[...], int[...], int[...]],
 'labels': [int, int, int, int],
 'generation': '<pad><s><extra_id_0>int [... ]<extra_id_1>int [... ]<extra_id_2>int [... ]<extra_id_3>int [... ]</s>'}

In [5]:
# Disabled experiment: flip the guard to run the model on a real project file.
if False:
    test_code_2 = read_file(proj_root() / "src" / "spot" / "type_env.py")
    # `str.replace` returns a new string and leaves the original untouched, so
    # the result has to be re-bound; previously it was discarded and the
    # replacement never took effect.
    test_code_2 = test_code_2.replace("SPOT_TYPE_MASK", "MAGIC_STR")
    run_model(test_code_2, num_beams=8)


In [6]:
# test accuracy on the test set
from datasets import Dataset
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EvalPrediction
from transformers.trainer import Trainer
import numpy as np


# Load the pre-processed test split from the data directory.
testset = Dataset.load_from_disk(datadir / "SPOT-data/repos-processed/test")
# Uncomment the next line to keep only the first 100 examples for a quick test.
# testset = Dataset.from_dict(testset[0:100])  # for quick test


def compute_metrics(evalp: EvalPrediction):
    """Compute type-prediction accuracy for one evaluation run.

    Every row of predicted ids and label ids is decoded with
    ``output_ids_as_types``; the resulting predictions and labels are then
    compared position by position.

    Args:
        evalp: Evaluation output; ``evalp.predictions`` must be a 2-D array
            with the same number of rows as ``evalp.label_ids``.

    Returns:
        A dict with:
        - ``"accuracy_partial"``: fraction of labels whose ``head_name()``
          equals that of the prediction at the same position.
        - ``"accuracy_full"``: fraction of labels equal to the prediction at
          the same position.
        - ``"n_predictions"`` / ``"n_labels"``: total decoded predictions and
          labels.
        Both accuracies are ``0.0`` when there are no labels at all.
    """
    # Apply the tokenizer decoder to each row.
    out_ids: np.ndarray = evalp.predictions
    label_ids: np.ndarray = evalp.label_ids
    assert len(out_ids.shape) == 2
    # Bind `n_rows` outside the assert: assert statements are stripped under
    # `python -O`, which would otherwise leave the name undefined.
    n_rows = out_ids.shape[0]
    assert n_rows == label_ids.shape[0]

    n_labels = 0
    n_preds = 0
    n_correct_partial = 0
    n_correct_full = 0
    for i in tqdm(range(n_rows), desc="decoding types"):
        pred = output_ids_as_types(out_ids[i, :], tokenizer)
        label = output_ids_as_types(label_ids[i, :], tokenizer)
        n_labels += len(label)
        n_preds += len(pred)
        # zip() stops at the shorter list; labels without a matching
        # prediction still count as wrong because we divide by n_labels.
        for p, lab in zip(pred, label):
            if p == lab:
                n_correct_full += 1
            if p.head_name() == lab.head_name():
                n_correct_partial += 1

    # Avoid ZeroDivisionError when nothing was labelled; both counters are 0
    # in that case, so the reported accuracies are 0.0.
    denom = max(n_labels, 1)
    return {
        "accuracy_partial": n_correct_partial / denom,
        "accuracy_full": n_correct_full / denom,
        "n_predictions": n_preds,
        "n_labels": n_labels,
    }


# Arguments for the evaluation run; predictions are produced with
# `generate` (beam search) rather than taken from teacher-forced logits.
test_args = Seq2SeqTrainingArguments(
    "checkpoints/test-SPOT-CodeT5-fine-tune",
    per_device_eval_batch_size=16,
    # Half precision is only usable on CUDA. The model is placed on the CPU
    # when no GPU is available, so enable fp16 under the same condition
    # instead of unconditionally.
    fp16=torch.cuda.is_available(),
    eval_accumulation_steps=8,
    predict_with_generate=True,
    # Reuse the generation length limit that `run_model` uses so the two
    # cannot drift apart.
    generation_max_length=max_target_length,
    generation_num_beams=10,
)

# Trainer built without any training dataset: it is only used to run
# evaluation with the metrics defined in `compute_metrics`.
test_trainer: Trainer = Seq2SeqTrainer(
    model=model,
    args=test_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    compute_metrics=compute_metrics,
)


Using amp half precision backend


In [7]:
# Evaluate on the test split; metric names are prefixed with "test_".
# The recorded run below took about 1370 seconds.
test_trainer.evaluate(testset, metric_key_prefix="test")

The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: types. If types are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3191
  Batch size = 16


decoding types:   0%|          | 0/3191 [00:00<?, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrvplusone[0m (use `wandb login --relogin` to force relogin)


{'test_loss': 0.4768402576446533,
 'test_accuracy_partial': 0.6298545923062765,
 'test_accuracy_full': 0.5494202098288239,
 'test_n_predictions': 16140,
 'test_n_labels': 16299,
 'test_runtime': 1370.2288,
 'test_samples_per_second': 2.329,
 'test_steps_per_second': 0.073}

wandb: Network error (ReadTimeout), entering retry loop.
