In [1]:
# Enable IPython's autoreload extension so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
import pandas as pd
import plotly.express as px

from spot.utils import (
    cst,
    read_file,
    write_file,
    seq_flatten,
    proj_root,
    tqdm,
)
from spot.type_env import (
    collect_annotations,
    MypyChecker,
    AnnotPath,
    mypy_checker,
    TypeInfEnv,
    TypeInfState,
    TypeInfAction,
    SelectAnnotations,
)
from spot.data_prepare import GitRepo

from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from typing import *

# Make the project root the working directory so relative paths used by the
# notebook resolve against it.
os.chdir(proj_root())

# The dataset root comes from the `datadir` environment variable. Fail with a
# readable message when it is missing instead of letting Path(None) raise an
# opaque TypeError.
if "datadir" not in os.environ:
    raise RuntimeError(
        "Environment variable 'datadir' is not set; "
        "point it at the directory that contains 'SPOT-data'."
    )
datadir = Path(os.environ["datadir"])
repos_dir = datadir / "SPOT-data/repos"

useful_repos_path = proj_root() / "scripts" / "useful_repos.pkl"
# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load this file when it was produced by this project's own scripts.
with useful_repos_path.open("rb") as f:
    useful_repos: list[GitRepo] = pickle.load(f)

import random

# Shuffle in place with a fixed seed so the resulting order is the same on
# every run of this cell.
random.seed(42)
random.shuffle(useful_repos)


In [46]:
# Load the fine-tuned checkpoint (model weights and tokenizer) from disk.

import torch
from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq
# NOTE(review): this appears to re-import the same class as the line above.
from transformers.models.t5 import T5ForConditionalGeneration

model_dir = "./checkpoints/SPOT-CodeT5-fine-tune/checkpoint-1500"

# Prefer the first CUDA device when one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained(model_dir)

# Load the weights first, then move the module to the selected device.
model: T5ForConditionalGeneration = T5ForConditionalGeneration.from_pretrained(model_dir)
model = model.to(device)

# Passed as `max_length` to `model.generate` when decoding.
max_target_length = 128

Didn't find file ./checkpoints/SPOT-CodeT5-fine-tune/checkpoint-1500/added_tokens.json. We won't load it.
loading file ./checkpoints/SPOT-CodeT5-fine-tune/checkpoint-1500/vocab.json
loading file ./checkpoints/SPOT-CodeT5-fine-tune/checkpoint-1500/merges.txt
loading file None
loading file ./checkpoints/SPOT-CodeT5-fine-tune/checkpoint-1500/special_tokens_map.json
loading file ./checkpoints/SPOT-CodeT5-fine-tune/checkpoint-1500/tokenizer_config.json
loading configuration file ./checkpoints/SPOT-CodeT5-fine-tune/checkpoint-1500/config.json
Model config T5Config {
  "_name_or_path": "Salesforce/codet5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "label2id": {
    

In [None]:
from spot.data_prepare import tokenize_masked, mask_type_annots, chunk_masked_code

# Small hand-written sample: a single function whose parameters and return
# value are all annotated as `int`. The leading `#` line inside the string is
# part of the sample text itself.
test_code = """
# a, b, c are all ints.
def int_add(a: int, b: int, c: int) -> int:
    return a + b + c
"""

def run_model(code: str, num_beams: int = 32):
    """Mask the type annotations in ``code`` and query the model on the result.

    Args:
        code: Source text handed to ``mask_type_annots`` before tokenization.
        num_beams: Beam width forwarded to ``model.generate``. Defaults to 32,
            the value this function previously hard-coded.

    Returns:
        A dict with two entries: ``'loss'`` is the ``.loss`` of one forward
        pass over the tokenized input (computed without gradient tracking),
        and ``'generation'`` is the decoded first sequence returned by
        ``model.generate``. Special tokens are kept in the decoded text.
    """
    tks = tokenize_masked(mask_type_annots(code), tokenizer, device)
    # The loss is only inspected, never back-propagated, so skip building the
    # autograd graph for the forward pass.
    with torch.no_grad():
        # Call the module itself rather than `.forward()` so that any
        # registered hooks run as usual.
        loss = model(**tks).loss
    # NOTE(review): only `input_ids` is passed to generate(); if `tks` also
    # carries an attention mask it is not used here. Confirm this is intended.
    dec = model.generate(
        tks['input_ids'],
        max_length=max_target_length,
        num_beams=num_beams,
    )[0]
    return {'loss': loss, 'generation': tokenizer.decode(dec)}

# Run the model on the small hand-written sample; as the cell's last
# expression, the returned dict is what the cell displays.
run_model(test_code)

In [45]:
# Second sample: the text of this project's own `utils.py`, read from disk.
test_code_2 = read_file(proj_root().joinpath("src", "spot", "utils.py"))
run_model(test_code_2)

{'loss': tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward0>),
 'generation': '<pad><s><extra_id_0>str<extra_id_1>str<extra_id_2>None<extra_id_3>Path<extra_id_4>Sequence[T1]<extra_id_5>Sequence[T2]<extra_id_6>Sequence[str]<extra_id_7>Sequence[str]<extra_id_8>str</s>'}