In [1]:
%load_ext autoreload
%autoreload 2

import os
import pickle
from pathlib import Path
from typing import *

import pandas as pd
import plotly.express as px

from spot.data import GitRepo
from spot.utils import cst, proj_root, pushover_alert, run_long_task, tqdm

os.chdir(proj_root())

datadir = Path(os.getenv("datadir"))
repos_dir = datadir / "SPOT-data/repos"

useful_repos_path = proj_root() / "scripts" / "useful_repos.pkl"
with useful_repos_path.open("rb") as f:
    useful_repos: list[GitRepo] = pickle.load(f)

repos_split_path = datadir / "SPOT-data/repos-processed-with_margin/repos_split.pkl"
with repos_split_path.open("rb") as f:
    repos_split: dict[str, list[GitRepo]] = pickle.load(f)

In [2]:
from spot.data import SrcDataset

src_datasets_path = datadir / f"SPOT-data/src_datasets"
src_datasets = dict[str, SrcDataset]()
for n in ["train", "valid", "test"]:
    with open(src_datasets_path / f"{n}.pkl", "rb") as f:
        src_datasets[n] = pickle.load(f)
        src_datasets[n].repos_root = repos_dir


In [3]:
import numpy as np
import torch
from datasets import Dataset

from spot.data import ChunkedDataset
from spot.model import CtxArgs, DecodingArgs, ModelSPOT, ModelWrapper, TokenizerSPOT
from spot.utils import TaskLoggingMonitor

train_r0 = False  # whether to train or load trained R0 model
with_margin = True
data_reduction = 1

margin_tag = "with_margin" if with_margin else "no_margin"
data_tag = "data_full" if data_reduction == 1 else f"data_1-{data_reduction}"

ctx_args = CtxArgs(
    ctx_size=512,
    ctx_margin=128,
    types_in_ctx=False,
)

tokenizer: TokenizerSPOT = TokenizerSPOT.from_pretrained("Salesforce/codet5-base")

r0_model_name = f"SPOT-R0-{margin_tag}-{data_tag}"

if train_r0:
    r0_model_path = "Salesforce/codet5-base"
else:
    r0_model_path = datadir / f"checkpoints/saved/{r0_model_name}"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

r0_model: ModelSPOT = ModelSPOT.from_pretrained(r0_model_path).to(device)
r0_monitor = TaskLoggingMonitor("R0")
r0_args = DecodingArgs(
    sampling_batch_size=512,
    ctx_args=ctx_args,
    max_workers=20,
)
r0_wrapper = ModelWrapper(r0_model, tokenizer, r0_args, r0_monitor)




In [4]:
r0_datasets: dict[str, ChunkedDataset] = {}

with run_long_task("Preparing chunked datasets", notify=False):
    for n, d in src_datasets.items():
        r0_datasets[n] = d.to_chunks(tokenizer, ctx_args, max_workers=20)

n_train = len(r0_datasets["train"].data) // data_reduction
r0_datasets["train"] = r0_datasets["train"][:n_train]

processing chunks:   0%|          | 0/108759 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/7607 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/5686 [00:00<?, ?it/s]

Pushover: (Finished: Preparing chunked datasets.) Time taken: 62.9s


In [5]:
import wandb
from spot.model import ModelTrainingArgs

r0_train_args = ModelTrainingArgs(
    train_batch_size=38,
    eval_batch_size=256,
    max_epochs=3,
)
r0_trainer = r0_wrapper.build_trainer(
    datadir / "checkpoints" / r0_model_name,
    r0_train_args,
    dataset=r0_datasets["train"].data,
    eval_dataset=r0_datasets["valid"].data,
)

if train_r0:
    wandb.init(
        project=r0_model_name,
        dir=str(datadir),
        config={"r0_decoding_args": r0_args, "r0_train_args": r0_train_args},
    )

    with run_long_task(f"Training {r0_model_name}"):
        init_perf = r0_trainer.evaluate(max_length=r0_args.generation_max_length)
        print("initial eval loss:", init_perf)
        r0_trainer.train()

    wandb.log({"time_stats": r0_monitor.timer.total_times()})

    final_perf = r0_trainer.evaluate(max_length=r0_args.generation_max_length)
    print("final eval loss:", final_perf)
    wandb.finish()


Using amp half precision backend


In [5]:
from spot.data import preds_to_accuracies, pretty_print_accuracies

r0_preds = r0_wrapper.predict(r0_datasets["test"], tqdm_args={})

pretty_print_accuracies(preds_to_accuracies(r0_preds, r0_datasets["test"]))


predict:   0%|          | 0/3429 [00:00<?, ?it/s]

partial_acc: 77.39%
partial_acc_wo_any: 77.90%
partial_accs:
   FuncArg: 73.52%
   FuncReturn: 84.09%
   ClassAtribute: 73.71%
   GlobalVar: 89.80%
   LocalVar: 81.63%
full_acc: 70.09%
full_accs:
   FuncArg: 67.42%
   FuncReturn: 78.43%
   ClassAtribute: 63.07%
   GlobalVar: 71.43%
   LocalVar: 57.60%
n_labels: 8831


In [7]:
from spot.data import preds_to_accuracies, pretty_print_accuracies

size_factors = [1, 2, 3, 4]
with run_long_task("Evaluate R0 accuracy vs ctx_size"):
    acc_series = []
    for factor in size_factors:
        wrapper = r0_wrapper.scale_ctx_size(factor)
        wrapper.args.ctx_args.types_in_ctx = True
        accs = wrapper.eval_on_dataset(src_datasets["test"], tqdm_args={"leave": False})
        acc_series.append(accs)


processing chunks:   0%|          | 0/5686 [00:00<?, ?it/s]

predict:   0%|          | 0/3429 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/2843 [00:00<?, ?it/s]

predict:   0%|          | 0/2223 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/1896 [00:00<?, ?it/s]

predict:   0%|          | 0/1624 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/1422 [00:00<?, ?it/s]

predict:   0%|          | 0/1277 [00:00<?, ?it/s]

Pushover: (Finished: Evaluate R0 accuracy vs ctx_size.) Time taken: 851.0s


In [11]:
import plotly.express as px

acc_df = pd.DataFrame(
    {
        "ctx_size": size_factors,
        "partial_acc": [x["partial_acc"] for x in acc_series],
        "full_acc": [x["full_acc"] for x in acc_series],
    }
)
px.line(acc_df, x="ctx_size", y=["partial_acc", "full_acc"], title=r0_model_name)


In [6]:
# Set this to the best ctx_size
best_r0_ctx_factor = 3

In [18]:
print("ctx_size factor: 1")
pretty_print_accuracies(acc_series[0])
print("ctx_size factor: 3")
pretty_print_accuracies(acc_series[best_r0_ctx_factor - 1])


ctx_size factor: 1
partial_acc: 79.91%
partial_acc_wo_any: 80.40%
partial_accs:
   FuncArg: 77.32%
   FuncReturn: 84.88%
   ClassAtribute: 76.66%
   GlobalVar: 91.84%
   LocalVar: 80.92%
full_acc: 73.46%
full_accs:
   FuncArg: 71.96%
   FuncReturn: 79.70%
   ClassAtribute: 67.87%
   GlobalVar: 71.43%
   LocalVar: 58.66%
n_labels: 8831
ctx_size factor: 3
partial_acc: 84.25%
partial_acc_wo_any: 84.78%
partial_accs:
   FuncArg: 84.07%
   FuncReturn: 87.44%
   ClassAtribute: 77.03%
   GlobalVar: 87.76%
   LocalVar: 87.94%
full_acc: 78.40%
full_accs:
   FuncArg: 79.65%
   FuncReturn: 81.77%
   ClassAtribute: 69.79%
   GlobalVar: 69.39%
   LocalVar: 67.73%
n_labels: 8824


KeyboardInterrupt: 

In [7]:
train_r1 = True

r1_model_name = f"SPOT-R1-{margin_tag}-{data_tag}"

if train_r1:
    r1_model_path = "Salesforce/codet5-base"
else:
    r1_model_path = datadir / f"checkpoints/saved/{r1_model_name}"

r1_model: ModelSPOT = ModelSPOT.from_pretrained(r1_model_path).to(device)
r1_monitor = TaskLoggingMonitor("R1")
r1_args = DecodingArgs(
    sampling_batch_size=512,
    ctx_args=CtxArgs(
        ctx_size=512,
        ctx_margin=128,
        types_in_ctx=False,
    ),
    max_workers=20,
)
r1_wrapper = ModelWrapper(r1_model, tokenizer, r1_args, r1_monitor)

loading configuration file https://huggingface.co/Salesforce/codet5-base/resolve/main/config.json from cache at /mnt/data0/jiayi/hugface_cache/transformers/f1adf9032ebe26d0dd0b9c4917416e2db960b7e8b8e68f0612e8e5d5379488f5.20220fde7ff6c94c24bdcd615678f6a4374f3176abdc061beecc43a906725837
Model config T5Config {
  "_name_or_path": "/content/drive/MyDrive/CodeT5/pretrained_models/codet5_base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 1,
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_atte

In [18]:
import pickle

from spot.data import ChunkedDataset, save_datasets

test_r1_generation = False
use_file_level_feedback = False

feedback_tag = "per_file" if use_file_level_feedback else "per_project"
r1_data_path = (
    datadir
    / f"SPOT-data/{'test-' if test_r1_generation else ''}src_datasets-R1-{feedback_tag}"
)
r1_src_datasets: Dict[str, SrcDataset]
load_r1_data = r1_data_path.exists() and not test_r1_generation

if load_r1_data:
    print(f"Loading R1 datasets from {r1_data_path}...")
    for n in ["train", "valid", "test"]:
        with open(r1_data_path / f"{n}.pkl", "rb") as f:
            r1_src_datasets[n] = pickle.load(f)
            r1_src_datasets[n].repos_root = repos_dir
else:
    # compute the r0_predictions first in case the next step fails
    r0_cache_path = r1_data_path / "r0_predictions.pkl"

    if r0_cache_path.exists():
        print(f"Loading R0 predictions from {r0_cache_path}...")
        with open(r0_cache_path, "rb") as f:
            r0_predictions = pickle.load(f)
    else:
        r0_predictions = dict()
        with run_long_task("Make R0 predictions", notify=False):
            for name in ["valid", "test", "train"]:
                print("Predicting on:", name)
                r0_data = r0_datasets[name]
                if test_r1_generation:
                    r0_data = r0_data[:64]
                r0_predictions[name] = r0_wrapper.scale_ctx_size(
                    best_r0_ctx_factor
                ).predict(r0_data, tqdm_args={"leave": False})

        r1_data_path.mkdir(parents=True, exist_ok=True)
        with open(r1_data_path / "r0_predictions.pkl", "wb") as f:
            pickle.dump(r0_predictions, f)


Predicting on: valid


predict:   0%|          | 0/4040 [00:00<?, ?it/s]

Predicting on: test


predict:   0%|          | 0/3429 [00:00<?, ?it/s]

Predicting on: train


predict:   0%|          | 0/61503 [00:00<?, ?it/s]

In [None]:
if not load_r1_data:
    r1_datasets = {}
    
    with run_long_task("Generate R1 inputs", notify=False):
        for name in ["test", "train", "valid"]:
            print("Processing dataset:", name)
            repos = [r.repo_dir(repos_dir) for r in repos_split[name]]
            r0_data = r0_datasets[name]
            r0_preds = r0_predictions[name]
            if test_r1_generation:
                r0_data = r0_data[:16]
                r0_preds = r0_preds[:16]
            r1_datasets[name] = r1_wrapper.generate_r1_inputs(
                r0_data,
                r0_preds,
                tqdm_args={"leave": False},
                use_file_level_feedback=use_file_level_feedback,
            )
    save_datasets(r1_datasets, repos_split, r1_data_path)


Processing dataset: test
[R1] Starting task: 'get_type_checked_inputs'


reading orginal srcs:   0%|          | 0/2 [00:00<?, ?it/s]

[R1] Starting task: 'get_type_checked_inputs > Call mypy'


calling mypy:   0%|          | 0/1 [00:00<?, ?it/s]

[R1] 'get_type_checked_inputs > Call mypy' finished in 0.3348526954650879 seconds


generating augmented inputs:   0%|          | 0/2 [00:00<?, ?it/s]

[R1] 'get_type_checked_inputs' finished in 0.889845609664917 seconds
[R1] Starting task: 'chunk_masked_code'


tokenizing sources:   0%|          | 0/2 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/27 [00:00<?, ?it/s]

[R1] 'chunk_masked_code' finished in 3.9471073150634766 seconds
Processing dataset: train
[R1] Starting task: 'get_type_checked_inputs'


reading orginal srcs:   0%|          | 0/10 [00:00<?, ?it/s]

[R1] Starting task: 'get_type_checked_inputs > Call mypy'


calling mypy:   0%|          | 0/1 [00:00<?, ?it/s]

[R1] 'get_type_checked_inputs > Call mypy' finished in 0.1553797721862793 seconds


generating augmented inputs:   0%|          | 0/10 [00:00<?, ?it/s]

[R1] 'get_type_checked_inputs' finished in 1.586441993713379 seconds
[R1] Starting task: 'chunk_masked_code'


tokenizing sources:   0%|          | 0/10 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/26 [00:00<?, ?it/s]

[R1] 'chunk_masked_code' finished in 4.452892541885376 seconds
Processing dataset: valid
[R1] Starting task: 'get_type_checked_inputs'


reading orginal srcs:   0%|          | 0/8 [00:00<?, ?it/s]

[R1] Starting task: 'get_type_checked_inputs > Call mypy'


calling mypy:   0%|          | 0/1 [00:00<?, ?it/s]

[R1] 'get_type_checked_inputs > Call mypy' finished in 1.1858718395233154 seconds


generating augmented inputs:   0%|          | 0/8 [00:00<?, ?it/s]

[R1] 'get_type_checked_inputs' finished in 2.1966168880462646 seconds
[R1] Starting task: 'chunk_masked_code'


tokenizing sources:   0%|          | 0/8 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/42 [00:00<?, ?it/s]

[R1] 'chunk_masked_code' finished in 5.094545125961304 seconds
Deleting old datasets at: /mnt/data0/jiayi/SPOT-data/test-src_datasets-R1-per_project
268K	/mnt/data0/jiayi/SPOT-data/test-src_datasets-R1-per_project


In [None]:
r1_train_args = ModelTrainingArgs(
    train_batch_size=38,
    eval_batch_size=256,
    max_epochs=3,
)
r1_trainer = r1_wrapper.build_trainer(
    datadir / "checkpoints" / r1_model_name,
    r1_train_args,
    dataset=r1_datasets["train"].data,
    eval_dataset=r1_datasets["valid"].data,
)

if train_r1:
    wandb.init(
        project=r1_model_name,
        dir=str(datadir),
        config={"r1_decoding_args": r1_args, "r1_train_args": r1_train_args},
    )

    with run_long_task(f"Training {r1_model_name}"):
        init_perf = r1_trainer.evaluate(max_length=r1_args.generation_max_length)
        print("initial performance:", init_perf)
        r1_trainer.train()

    wandb.log({"time_stats": r1_monitor.timer.total_times()})

    final_perf = r1_trainer.evaluate(max_length=r1_args.generation_max_length)
    print("final performance:", final_perf)
    wandb.finish()


PyTorch: setting up devices
Using amp half precision backend
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrvplusone[0m. Use [1m`wandb login --relogin`[0m to force relogin


***** Running Evaluation *****
  Num examples = 5347
  Batch size = 200


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
***** Running training *****
  Num examples = 81003
  Num Epochs = 3
  Instantaneous batch size per device = 36
  Total train batch size (w. parallel, distributed & accumulation) = 36
  Gradient Accumulation steps = 1
  Total optimization steps = 6753


initial performance: {'eval_loss': 2.967663049697876, 'eval_runtime': 45.7404, 'eval_samples_per_second': 116.899, 'eval_steps_per_second': 0.59}


Step,Training Loss,Validation Loss
500,0.5122,0.511515
1000,0.428,0.471736
1500,0.3989,0.462796
2000,0.3862,0.460489
2500,0.3513,0.457103
3000,0.329,0.445091
3500,0.3235,0.446647
4000,0.3172,0.43407
4500,0.3104,0.434677
5000,0.286,0.441188


***** Running Evaluation *****
  Num examples = 5347
  Batch size = 200
Saving model checkpoint to /mnt/data0/jiayi/checkpoints/SPOT-R1-with_margin-data_full/checkpoint-500
Configuration saved in /mnt/data0/jiayi/checkpoints/SPOT-R1-with_margin-data_full/checkpoint-500/config.json
Model weights saved in /mnt/data0/jiayi/checkpoints/SPOT-R1-with_margin-data_full/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /mnt/data0/jiayi/checkpoints/SPOT-R1-with_margin-data_full/checkpoint-500/tokenizer_config.json
Special tokens file saved in /mnt/data0/jiayi/checkpoints/SPOT-R1-with_margin-data_full/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 5347
  Batch size = 200
Saving model checkpoint to /mnt/data0/jiayi/checkpoints/SPOT-R1-with_margin-data_full/checkpoint-1000
Configuration saved in /mnt/data0/jiayi/checkpoints/SPOT-R1-with_margin-data_full/checkpoint-1000/config.json
Model weights saved in /mnt/data0/jiayi/checkpoints/SPOT-R1-with_

Alert: (Training SPOT-R1-with_margin-data_full finished) Time taken: 5170.9s


***** Running Evaluation *****
  Num examples = 5347
  Batch size = 200


final performance: {'eval_loss': 0.43407025933265686, 'eval_runtime': 45.9524, 'eval_samples_per_second': 116.36, 'eval_steps_per_second': 0.588, 'epoch': 2.44}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁█▇▇█▇▇█▇▇▇██
eval/samples_per_second,█▁▂▂▁▂▂▁▂▂▂▁▁
eval/steps_per_second,█▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▂▂▂▂▃▃▄▄▅▅▅▅▆▆▇▇▇▇████
train/global_step,▁▂▂▂▂▃▃▄▄▄▄▅▅▅▅▆▆▇▇▇▇████
train/learning_rate,█▇▇▆▅▅▄▃▂▂▁
train/loss,█▅▅▄▃▃▂▂▂▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.43407
eval/runtime,45.9524
eval/samples_per_second,116.36
eval/steps_per_second,0.588
train/epoch,2.44
train/global_step,5500.0
train/learning_rate,0.0
train/loss,0.279
train/total_flos,1.2053347111010304e+17
train/train_loss,0.35653


In [None]:
from spot.data import preds_to_accuracies, pretty_print_accuracies

r1_preds = r1_wrapper.predict(r1_datasets["test"], tqdm_args={})
r1_accs = preds_to_accuracies(r1_preds, r1_datasets["test"])
pretty_print_accuracies(r1_accs)


predict:   0%|          | 0/16 [00:00<?, ?it/s]

partial_acc: 2.50%
partial_acc_wo_any: 2.50%
partial_accs:
   FuncArg: 4.17%
full_acc: 2.50%
full_accs:
   FuncArg: 4.17%
n_labels: 40


In [17]:
from spot.data import preds_to_accuracies, pretty_print_accuracies

r1_wrapper_large = r1_wrapper.scale_ctx_size(3)
r1_test_data_large = r1_wrapper_large.generate_r1_inputs(
    [r.repo_dir(repos_dir) for r in repos_split["test"]],
    r0_datasets["test"],
    r0_preds,
    tqdm_args={"leave": False},
    use_file_level_feedback=use_file_level_feedback,
)

r1_preds_large = r1_wrapper_large.predict(r1_test_data_large, tqdm_args={})
r1_accs_large = preds_to_accuracies(r1_preds_large, r1_test_data_large)
pretty_print_accuracies(r1_accs_large)


[R1] Starting task: 'get_type_checked_inputs'


reading orginal srcs:   0%|          | 0/950 [00:00<?, ?it/s]

[R1] Starting task: 'get_type_checked_inputs > Call mypy'


calling mypy:   0%|          | 0/50 [00:00<?, ?it/s]



[R1] 'get_type_checked_inputs > Call mypy' finished in 3.115276336669922 seconds


generating augmented inputs:   0%|          | 0/950 [00:00<?, ?it/s]

[R1] 'get_type_checked_inputs' finished in 34.11896514892578 seconds
[R1] Starting task: 'chunk_masked_code'


tokenizing sources:   0%|          | 0/950 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/2164 [00:00<?, ?it/s]

[R1] 'chunk_masked_code' finished in 18.242862224578857 seconds


predict:   0%|          | 0/1920 [00:00<?, ?it/s]

partial_acc: 75.35%
partial_acc_wo_any: 76.03%
partial_accs:
   FuncArg: 73.97%
   FuncReturn: 82.43%
   ClassAtribute: 62.58%
   GlobalVar: 78.50%
   LocalVar: 81.77%
full_acc: 67.61%
full_accs:
   FuncArg: 67.72%
   FuncReturn: 75.93%
   ClassAtribute: 52.71%
   GlobalVar: 45.79%
   LocalVar: 53.25%
n_labels: 17747


In [13]:
from spot.visualization import display_code_sequence, visualize_batch

display_code_sequence(
    [
        visualize_batch(
            r0_datasets['test'],
            i,
            r0_predictions['test'],
            tokenizer,
            r0_wrapper.args.ctx_args,
        )
        for i in range(16)
    ]
)


Tab(children=(HTML(value="<pre style='line-height: 1.2; padding: 10px; color: rgb(212,212,212); background-col…

In [17]:
from spot.visualization import display_code_sequence, visualize_batch

display_code_sequence(
    [
        visualize_batch(
            r1_datasets['test'],
            i,
            r1_preds,
            tokenizer,
            r1_wrapper.args.ctx_args,
        )
        for i in range(16)
    ]
)


Tab(children=(HTML(value="<pre style='line-height: 1.2; padding: 10px; color: rgb(212,212,212); background-col…