In [None]:
%export CUDA_VISIBLE_DEVICES=0,1,2,3,4

In [1]:
%%capture --no-display --no-stderr --no-stdout
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.optim import Adam
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import os
import pickle
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import matplotlib.pyplot as plt
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("DSL.txt", 'r') as f:
    prompt = f.read()

In [3]:
class CodeDataset(Dataset): # NOTE: this isn't very space-efficient since it has to load in the entire dataset at once
    def __init__(self, data_dir):
        self.data_dir = data_dir

        self.data = []
        for d in os.listdir(self.data_dir):
            code_dir = os.path.join(self.data_dir, d, "code.txt")
            prompts_dir = os.path.join(self.data_dir, d, "prompts.txt")

            with open(code_dir, 'r') as f:
                code = f.read()
            
            with open(prompts_dir, 'r') as f:
                prompts = f.read()
            
            prompts = prompts.split("\n")

            for p in prompts:
                self.data.append((code, prompt + p + "\n\t"))

    def __len__(self):
        return len(os.listdir(self.data_dir))
    def __getitem__(self, idx):
        input_ids = tokenizer(self.data[idx][0], padding='max_length', max_length=256)["input_ids"]
        labels = tokenizer(self.data[idx][1], padding='max_length', max_length=256)["input_ids"]
        return {"input_ids": torch.LongTensor(input_ids), "labels": torch.LongTensor(labels)}

In [4]:
checkpoint = "bigcode/santacoder"
revision = "dedup-alt"
device = "cuda" # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint,revision=revision)
model = AutoModelForCausalLM.from_pretrained(checkpoint, revision=revision, trust_remote_code=True).to(device)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
dataset = CodeDataset("./train")
max([len(dataset[i]['labels']) for i in range(len(dataset))])

256

In [6]:
batch_size = 4

args = Seq2SeqTrainingArguments(
    f"codegen_attempt_1",
    evaluation_strategy = "epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=50,
    predict_with_generate=True,
    push_to_hub=False,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [7]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=CodeDataset("./train"),
    eval_dataset=CodeDataset("./test"),
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [8]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,9.713099


TrainOutput(global_step=50, training_loss=4.42640380859375, metrics={'train_runtime': 122.8272, 'train_samples_per_second': 3.257, 'train_steps_per_second': 0.407, 'total_flos': 626544712089600.0, 'train_loss': 4.42640380859375, 'epoch': 50.0})