In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
import pandas as pd
import plotly.express as px

from spot.utils import (
    cst,
    read_file,
    write_file,
    seq_flatten,
    proj_root,
    tqdm,
)
from spot.type_env import (
    collect_annotations,
    MypyChecker,
    AnnotPath,
    mypy_checker,
    TypeInfEnv,
    TypeInfState,
    TypeInfAction,
    SelectAnnotations,
)
from spot.data import GitRepo

from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
from typing import *

os.chdir(proj_root())

datadir = Path(os.getenv("datadir"))
repos_dir = datadir / "SPOT-data/repos"

useful_repos_path = proj_root() / "scripts" / "useful_repos.pkl"
with useful_repos_path.open("rb") as f:
    useful_repos: list[GitRepo] = pickle.load(f)

import random

random.seed(42)
random.shuffle(useful_repos)


In [3]:
# loading pre-trained model and tokenizer

import torch
from transformers import RobertaTokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq
from transformers.models.t5 import T5ForConditionalGeneration

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer: RobertaTokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-base")
model: T5ForConditionalGeneration = T5ForConditionalGeneration.from_pretrained(
    "Salesforce/codet5-base"
).to(device)
max_target_length=128



In [4]:
from spot.data import tokenize_masked, mask_type_annots, chunk_masked_code

test_code = """
# a, b, c are all ints.
def int_add(a: int, b: int, c: int) -> int:
    return a + b + c
"""

def run_model(code: str):
    tks = tokenize_masked(mask_type_annots(code), tokenizer, device)
    loss = model.forward(**tks).loss
    dec = model.generate(
        tks['input_ids'],
        max_length=max_target_length,
        num_beams=32,
        # do_sample=True,
    )[0]
    return {'loss': loss, 'generation': tokenizer.decode(dec)}

run_model(test_code)


{'loss': tensor(4.6308, device='cuda:0', grad_fn=<NllLossBackward0>),
 'generation': '<pad><s><extra_id_0>int, b: int, c: int,<extra_id_1>int, c: int, a: int<extra_id_2>int, b: int, c: int<extra_id_3>int : # noinspection PyProtectedMember,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyTypeChecker,PyType</s>'}

In [18]:
from spot.data import load_or_process_datasets
from spot.utils import process_map
from datasets import Dataset

tk_datasets, repos_split = load_or_process_datasets(
    datadir / "SPOT-data/repos-processed",
    tokenizer,
    repos_dir,
    repos_test=useful_repos[0:30],
    repos_valid=useful_repos[30:60],
    repos_train = useful_repos[60:],
    regenerate=False,
)

Loading datasets from: /mnt/data0/jiayi/SPOT-data/repos-processed


In [6]:
import wandb
wandb.init(project="SPOT-CodeT5-fine-tune", config={"dataset": "full"})

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrvplusone[0m (use `wandb login --relogin` to force relogin)


In [7]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
from transformers.trainer import Trainer

data_collator = DataCollatorForSeq2Seq(tokenizer, model)
for d in tk_datasets.values():
    d.set_format('torch', columns=['input_ids', 'labels'])

args = Seq2SeqTrainingArguments(
    f"checkpoints/SPOT-CodeT5-fine-tune",
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    prediction_loss_only=True,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    learning_rate=2e-5,
    per_device_train_batch_size=30,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    num_train_epochs=3,
    # predict_with_generate=True,
    load_best_model_at_end=True,
    fp16=True,
    push_to_hub=False,
    report_to='wandb',
)

trainer: Trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tk_datasets['train'],
    eval_dataset=tk_datasets['valid'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(2)],
    # compute_metrics=compute_metrics,
)

max_target_length = 128

Using amp half precision backend


In [8]:
init_perf=trainer.evaluate(max_length=max_target_length)
print(init_perf)

The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: types. If types are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2524
  Batch size = 64


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 2.7134897708892822, 'eval_runtime': 21.3023, 'eval_samples_per_second': 118.485, 'eval_steps_per_second': 0.939}


In [9]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: types. If types are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 78603
  Num Epochs = 3
  Instantaneous batch size per device = 30
  Total train batch size (w. parallel, distributed & accumulation) = 60
  Gradient Accumulation steps = 1
  Total optimization steps = 3933


Step,Training Loss,Validation Loss
100,1.0005,0.759564
200,0.7978,0.715636
300,0.6994,0.689998
400,0.6671,0.668939
500,0.6417,0.654426
600,0.6374,0.644231
700,0.6141,0.631834
800,0.5918,0.622236
900,0.5692,0.620083
1000,0.5609,0.616649


The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: types. If types are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2524
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: types. If types are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2524
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: types. If types are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2524
  Batch size = 64
The following columns 

TrainOutput(global_step=1700, training_loss=0.6090784274830537, metrics={'train_runtime': 2162.1728, 'train_samples_per_second': 109.061, 'train_steps_per_second': 1.819, 'total_flos': 6.207899423735808e+16, 'train_loss': 0.6090784274830537, 'epoch': 1.3})

In [10]:
print(trainer.evaluate(max_length=max_target_length))

The following columns in the evaluation set  don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: types. If types are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2524
  Batch size = 64


{'eval_loss': 0.589619517326355, 'eval_runtime': 20.1007, 'eval_samples_per_second': 125.568, 'eval_steps_per_second': 0.995, 'epoch': 1.3}


In [14]:
test_code = '''
def int_add(a: int, b: int, c: int) -> str:
    """a, b, c are all ints."""
    ''.join(a, b, c)

from transformers import RobertaTokenizer

def tokenize_masked(masked: Dict, tokenizer: RobertaTokenizer, device: torch.device) -> Dict:
    mask_tokens = [f"<extra_id_{i}>" for i in range(len(masked['types']))]
    input_ids = tokenizer.encode(join_str(masked["code_segs"], mask_tokens), return_tensors="pt")
    label_str = "".join(a + str(b) for a, b in zip(mask_tokens, masked["types"]))
    labels = tokenizer.encode(label_str, return_tensors="pt")
    return {"input_ids": input_ids.to(device), "labels": labels.to(device)}
'''
run_model(test_code)

{'loss': tensor(1.1911, device='cuda:0', grad_fn=<NllLossBackward0>),
 'generation': '<pad><s><extra_id_0>typing.List[int]<extra_id_1>typing.List[int]<extra_id_2>typing.List[int]<extra_id_3>typing.List[int]<extra_id_4>typing.Dict[str, typing.Any]<extra_id_5>RobertaTokenizer<extra_id_6>typing.Any<extra_id_7>typing.Dict[str, typing.Any]</s>'}

In [45]:
test_code_2 = read_file(proj_root() / "src" / "spot" / "utils.py")
run_model(test_code_2)

{'loss': tensor(0.3881, device='cuda:0', grad_fn=<NllLossBackward0>),
 'generation': '<pad><s><extra_id_0>str<extra_id_1>str<extra_id_2>None<extra_id_3>Path<extra_id_4>Sequence[T1]<extra_id_5>Sequence[T2]<extra_id_6>Sequence[str]<extra_id_7>Sequence[str]<extra_id_8>str</s>'}

In [35]:
run_model(test_code_2)

{'loss': tensor(0.0269, device='cuda:0', grad_fn=<NllLossBackward0>),
 'generation': '<pad><s><extra_id_0>t.Callable<extra_id_1>None<extra_id_2>t.Callable<extra_id_3>None<extra_id_4>t.Callable<extra_id_5>t.Callable<extra_id_6>None<extra_id_7>t.Callable<extra_id_8>t.Callable<extra_id_9>None<extra_id_10>t.Callable<extra_id_11>t.Callable<extra_id_12>None<extra_id_13>None<extra_id_14>None<extra_id_15>None</s>'}