# Training

In [None]:
import locale
# Force the preferred encoding to UTF-8; this is needed to get rid of the weird Colab locale.
locale.getpreferredencoding = lambda: "UTF-8"
# If you still run into errors, restart the runtime to initialize a new environment.

# Download the dataset archive, unpack it, and decompress the training shard
# (python_train_0.jsonl) that grab_raw_dataset reads.
!wget https://zenodo.org/records/7908468/files/python.zip
!unzip python.zip
!gzip -d python/final/jsonl/train/python_train_0.jsonl.gz

In [None]:
import numpy as np
import json

def grab_raw_dataset(file="python/final/jsonl/train/python_train_0.jsonl"):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        file: Path of the ``.jsonl`` file to read. Defaults to the training
            shard at ``python/final/jsonl/train/python_train_0.jsonl``.

    Returns:
        list: One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # runtime's locale, and iterate the handle instead of calling readlines()
    # so the raw text of the whole file is never held in memory at once.
    with open(file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. an extra newline at the end of the file),
        # which json.loads would otherwise reject.
        raw_dataset = [json.loads(line) for line in f if line.strip()]
    print("grabbed data from file {}".format(file))
    print("Number of raw functions: {}".format(len(raw_dataset)))
    return raw_dataset

# Load the raw training records once; prepare_dataset consumes this list later.
raw_dataset = grab_raw_dataset()

In [None]:
import torch

from torch.utils.data import DataLoader
from transformers import AdamW, AutoTokenizer, T5ForConditionalGeneration

def load_base_model(model_name="t5-small"):
    """Load a pretrained T5 checkpoint together with its tokenizer.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the tokenizer and the model. Defaults to ``"t5-small"``.

    Returns:
        tuple: ``(model, tokenizer)`` -- note that the model comes first.
    """
    # Tokenizer and model are loaded from the same checkpoint name.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    return model, tokenizer

# Instantiate the base model (fine-tuned by train below) and its tokenizer.
model, tokenizer = load_base_model()

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of pre-computed encodings.

    ``encodings`` must provide ``'input_ids'`` and ``'labels'`` entries that
    can be indexed by sample; ``'input_ids'`` must also expose ``.shape``.
    """

    def __init__(self, encodings):
        # Keep a reference only; the encodings are not copied.
        self.encodings = encodings

    def __len__(self):
        # The sample count is the size of the leading axis of input_ids.
        input_ids = self.encodings['input_ids']
        return input_ids.shape[0]

    def __getitem__(self, i):
        # Return the i-th sample as an (input_ids, labels) pair.
        sample_input = self.encodings['input_ids'][i]
        sample_label = self.encodings['labels'][i]
        return sample_input, sample_label

def prepare_dataset(raw_dataset, tokenizer):
    """Build the training ``Dataset`` from the raw records (incomplete template).

    As written, ``input_ids`` and ``labels`` are never assigned, so calling
    this function raises ``NameError`` until the TODO below is implemented.

    Args:
        raw_dataset: Raw records to convert; the notebook passes the list
            returned by ``grab_raw_dataset``.
        tokenizer: Tokenizer to encode with; not used by the code yet.

    Returns:
        Dataset: Wraps ``{"input_ids": input_ids, "labels": labels}``.
    """
    # TODO: complete the implementation of this function
    # ....

    # NOTE(review): Dataset.__len__ reads input_ids.shape[0], so both values
    # presumably need to be tensors/arrays with a leading sample axis -- confirm.
    encodings = {"input_ids": input_ids, "labels": labels}
    dataset = Dataset(encodings)

    return dataset

In [None]:
# Obtain the training dataset by converting the raw records with the base model's tokenizer.
training_dataset = prepare_dataset(raw_dataset=raw_dataset, tokenizer=tokenizer)

In [None]:
from tqdm.auto import tqdm
from torch.utils.data import DataLoader

def train(model, train_dataset, lr, batch_size, num_epochs):
    """Fine-tune ``model`` on ``train_dataset`` and return it in eval mode.

    Args:
        model: Module called as ``model(input_ids=..., labels=...)`` whose
            result exposes a ``.loss`` tensor.
        train_dataset: Dataset yielding ``(input_ids, labels)`` pairs.
        lr: Learning rate passed to ``AdamW``.
        batch_size: Number of samples per optimisation step.
        num_epochs: Number of full passes over ``train_dataset``.

    Returns:
        The same ``model`` object, updated in place and switched to eval mode.
    """
    # Pick the device once so the model and every batch end up in the same
    # place; calling .cuda() on the batches unconditionally crashes on
    # CPU-only runtimes.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == "cuda":
        print("cuda is available")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)
    loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

    total_losses = []  # mean training loss of each completed epoch
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for input_ids, labels in tqdm(loader):
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            outputs = model(input_ids=input_ids, labels=labels)

            loss = outputs.loss
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            # .item() yields a plain Python float, so no autograd graph is
            # retained by the running sum.
            total_loss += loss.item()
            num_batches += 1

        # max(..., 1) guards against an empty loader.
        epoch_loss = total_loss / max(num_batches, 1)
        total_losses.append(epoch_loss)
        print("epoch {}/{} - mean training loss: {:.4f}".format(epoch + 1, num_epochs, epoch_loss))

    model.eval()
    return model

In [None]:
# TODO: Apply hyperparameters
# NOTE: you may want to use a smaller number of epochs to reduce the time cost of training
# NOTE: you may also want to add additional checks to observe the performance of the model
# (the validation dataset can be found at https://zenodo.org/records/7908468 in the same format
# as the training dataset). It has already been downloaded to this notebook if you ran
# the previous initialization steps.

# Fill in the three placeholders below; this cell is not valid Python until each has a value.
batch_size = # TODO
learning_rate = # TODO
num_epochs = # TODO

# Fine-tune the base model. train() returns the object it was given, so
# `trained_model` and `model` refer to the same fine-tuned model afterwards.
trained_model = train(model, training_dataset, lr=learning_rate, batch_size=batch_size, num_epochs=num_epochs)

# Inference

In [None]:
# Download the inference dataset that grab_inference_dataset reads.
!wget https://raw.githubusercontent.com/uiuc-cs598lmz-s24/hw1/main/hw_1_inference_dataset.jsonl

In [None]:
# Reload the base model: train() updates `model` in place and returns that same
# object, so a fresh copy is needed as the baseline. The tokenizer returned
# here is discarded; the one loaded earlier is reused.
base_model, _ = load_base_model()

In [None]:
def grab_inference_dataset(file="hw_1_inference_dataset.jsonl"):
    """Load the inference tasks from a JSON Lines file.

    Args:
        file: Path of the ``.jsonl`` file to read. Defaults to
            ``hw_1_inference_dataset.jsonl`` in the working directory.

    Returns:
        list: One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # runtime's locale, and iterate the handle instead of calling readlines().
    with open(file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. an extra newline at the end of the file),
        # which json.loads would otherwise reject.
        inference_dataset = [json.loads(line) for line in f if line.strip()]
    print("grabbed data from file {}".format(file))
    print("Number of tasks: {}".format(len(inference_dataset)))
    return inference_dataset

# Load the inference tasks once; evaluate() iterates over this list.
inference_dataset = grab_inference_dataset()

In [None]:
def code_infill(code_model, tokenizer, prefix, suffix):
    """Intended to return the model's infill between ``prefix`` and ``suffix`` (incomplete template).

    As written, ``model_output`` is never assigned, so calling this function
    raises ``NameError`` until the TODO below is implemented.

    Args:
        code_model: Model to run; ``evaluate`` forwards its own ``code_model``.
        tokenizer: Tokenizer forwarded by ``evaluate``.
        prefix: ``data['prefix']`` of an inference task.
        suffix: ``data['suffix']`` of an inference task.

    Returns:
        The model output that ``evaluate`` scores against ``data['gt']``.
    """
    # TODO: complete the implementation of this function
    return model_output


def evaluate(code_model, tokenizer, inference_dataset):
    """Return the mean edit-distance score of ``code_model`` over the tasks.

    Each task is a mapping with ``'prefix'``, ``'suffix'`` and ``'gt'`` keys.
    The output of ``code_infill`` for the prefix/suffix pair is scored against
    ``'gt'`` by the nested ``edit_distance`` helper, which is still a TODO stub.

    Raises:
        statistics.StatisticsError: If ``inference_dataset`` is empty.
    """
    from statistics import mean

    def edit_distance(model_output, gt):
        # TODO: implement this
        return score

    scores = []
    for task in inference_dataset:
        # Read all three fields up front, before running the model.
        prefix, suffix, gt = task['prefix'], task['suffix'], task['gt']
        prediction = code_infill(code_model, tokenizer, prefix, suffix)
        scores.append(edit_distance(prediction, gt))

    return mean(scores)


# Report the results
# NOTE(review): the two mean scores are only assigned here; nothing prints or
# displays them yet.
base_model_ed = evaluate(base_model, tokenizer, inference_dataset)
trained_model_ed = evaluate(trained_model, tokenizer, inference_dataset)