In [1]:
%load_ext autoreload
%autoreload 2

import os
import pickle
from pathlib import Path
from typing import *

import pandas as pd
import plotly.express as px

from spot.utils import cst, proj_root, run_long_task, tqdm

# Make the directory returned by proj_root() the working directory.
os.chdir(proj_root())

# Root data directory, read from the `datadir` environment variable.
# Fail with an explicit message when it is unset: Path(None) would otherwise
# raise an opaque "expected str, bytes or os.PathLike" TypeError.
datadir_env = os.getenv("datadir")
if datadir_env is None:
    raise RuntimeError(
        "The 'datadir' environment variable is not set; "
        "point it at the directory that contains 'SPOT-data'."
    )
datadir = Path(datadir_env)
repos_dir = datadir / "SPOT-data/repos"

In [None]:
from spot.data import SrcDataset

# Unpickle the train/valid/test source datasets found under
# `src_datasets_path` and set each one's `repos_root` to `repos_dir`.
src_datasets_path = datadir / "SPOT-data/src_datasets"
src_datasets: dict[str, SrcDataset] = {}
for split in ("train", "valid", "test"):
    with (src_datasets_path / f"{split}.pkl").open("rb") as pkl_file:
        src_datasets[split] = pickle.load(pkl_file)
    src_datasets[split].repos_root = repos_dir


In [None]:
import torch

from spot.data import ChunkedDataset
from spot.model import CtxArgs, DecodingArgs, ModelSPOT, ModelWrapper, TokenizerSPOT
from spot.utils import TaskLoggingMonitor

# --- Experiment switches ---
train_r0 = True  # True: start from the base checkpoint and train; False: load a saved R0 model
with_margin = True
data_reduction = 1

# Tags derived from the switches; both end up in the model name below.
if with_margin:
    margin_tag, ctx_margin = "with_margin", 256
else:
    margin_tag, ctx_margin = "no_margin", 0

if data_reduction == 1:
    data_tag = "data_full"
else:
    data_tag = f"data_1-{data_reduction}"

ctx_args = CtxArgs(ctx_size=1024, ctx_margin=ctx_margin, types_in_ctx=False)

tokenizer: TokenizerSPOT = TokenizerSPOT.from_pretrained("Salesforce/codet5-base")

r0_model_name = f"SPOT-R0-{margin_tag}-{data_tag}"
print("R0 model name: ", r0_model_name)

# When training, load the pretrained base weights; otherwise load the
# checkpoint previously saved under `checkpoints/saved/<model name>`.
r0_model_path = (
    "Salesforce/codet5-base"
    if train_r0
    else datadir / f"checkpoints/saved/{r0_model_name}"
)

# Use the first CUDA device when one is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

r0_model: ModelSPOT = ModelSPOT.from_pretrained(r0_model_path).to(device)
r0_monitor = TaskLoggingMonitor("R0")
r0_args = DecodingArgs(sampling_batch_size=128, ctx_args=ctx_args, max_workers=20)
r0_wrapper = ModelWrapper(r0_model, tokenizer, r0_args, r0_monitor)


R0 model name:  SPOT-R0-with_margin-data_full




In [None]:
import wandb
from spot.model import ModelTrainingArgs

r0_train_args = ModelTrainingArgs(
    train_batch_size=8,
    eval_batch_size=64,
    max_epochs=3,
)

if train_r0:
    # Convert the validation and training source datasets into chunked
    # datasets using the tokenizer and context settings configured above.
    r0_chunks: dict[str, ChunkedDataset] = {}
    with run_long_task("Preparing chunked datasets", notify=False):
        for n in ["valid", "train"]:
            r0_chunks[n] = src_datasets[n].to_chunks(
                tokenizer, ctx_args, max_workers=20
            )

    # Keep only the first 1/data_reduction of the training chunks.
    n_train = len(r0_chunks["train"].data) // data_reduction
    r0_chunks["train"] = r0_chunks["train"][:n_train]

    r0_trainer = r0_wrapper.build_trainer(
        datadir / "checkpoints" / r0_model_name,
        r0_train_args,
        dataset=r0_chunks["train"].data,
        eval_dataset=r0_chunks["valid"].data,
    )

    wandb.init(
        project=r0_model_name,
        dir=str(datadir),
        config={"r0_decoding_args": r0_args, "r0_train_args": r0_train_args},
    )

    # Always close the W&B run, even when evaluation/training raises or the
    # cell is interrupted; otherwise the run stays open for the rest of the
    # kernel session. The run is reported as failed unless every step below
    # completes.
    wandb_exit_code = 1
    try:
        with run_long_task(f"Training {r0_model_name}"):
            # Evaluate once before training to record a baseline.
            init_perf = r0_trainer.evaluate(max_length=r0_args.generation_max_length)
            print("initial eval loss:", init_perf)
            r0_trainer.train()

        wandb.log({"time_stats": r0_monitor.timer.total_times()})

        final_perf = r0_trainer.evaluate(max_length=r0_args.generation_max_length)
        print("final eval loss:", final_perf)
        wandb_exit_code = 0
    finally:
        wandb.finish(exit_code=wandb_exit_code)

    # Persist the trained model where the `train_r0 = False` path loads it from.
    r0_wrapper.save_pretrained(datadir / "checkpoints/saved" / r0_model_name)


processing chunks:   0%|          | 0/3675 [00:00<?, ?it/s]

processing chunks:   0%|          | 0/51966 [00:00<?, ?it/s]

Pushover: (Finished: Preparing chunked datasets.) Time taken: 66.2s


Using amp half precision backend
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmrvplusone[0m. Use [1m`wandb login --relogin`[0m to force relogin


***** Running Evaluation *****
  Num examples = 2698
  Batch size = 64


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
***** Running training *****
  Num examples = 39054
  Num Epochs = 3
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 11718


initial eval loss: {'eval_loss': 1.8910937309265137, 'eval_runtime': 73.4316, 'eval_samples_per_second': 36.742, 'eval_steps_per_second': 0.586}


Step,Training Loss,Validation Loss


In [None]:
from spot.data import pretty_print_accuracies

# Evaluate the R0 model on the test set once per context-size factor and
# collect the first element of each evaluation result.
size_factors = [1, 2, 3, 4]
acc_series = []
with run_long_task("Evaluate R0 accuracy vs ctx_size"):
    for ctx_scale in size_factors:
        scaled_wrapper = r0_wrapper.scale_ctx_size(ctx_scale)
        # NOTE(review): if scale_ctx_size() shares `ctx_args` with r0_wrapper,
        # this also flips the flag on r0_wrapper itself -- confirm.
        scaled_wrapper.args.ctx_args.types_in_ctx = True
        eval_outputs = scaled_wrapper.eval_on_dataset(
            src_datasets["test"],
            tqdm_args={"leave": False},
        )
        acc_series.append(eval_outputs[0])

import plotly.express as px

# Tabulate both accuracy metrics against the context-size factor, then draw
# them as lines on a single figure titled with the model name.
metric_names = ["partial_acc", "full_acc"]
acc_df = pd.DataFrame(
    {
        "ctx_size": size_factors,
        **{
            metric: [factor_accs[metric] for factor_accs in acc_series]
            for metric in metric_names
        },
    }
)
px.line(acc_df, x="ctx_size", y=metric_names, title=r0_model_name)


In [None]:
# Pretty-print the accuracies recorded for each context-size factor.
for series_idx, size_factor in enumerate(size_factors):
    print(f"ctx_size factor: {size_factor}")
    factor_accs = acc_series[series_idx]
    pretty_print_accuracies(factor_accs)
