In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
)
from transformers.trainer_pt_utils import LengthGroupedSampler
import torch

import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.optim import Adam
from torch.utils.data import Dataset
import numpy as np
import os
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
from pathlib import Path
import wandb
from datasets import load_dataset
import datasets

from generate_with_embeddings import GenerateWithEmbeddings
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
os.environ["WANDB_MODE"] = "dryrun"

In [3]:
class ConstantLengthDataset(torch.utils.data.IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
        content_field="content",
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = (
            tokenizer.eos_token_id if tokenizer.eos_token_id else "<|endoftext|>"
        )
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.content_field = content_field

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(next(iterator)[self.content_field])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }



In [4]:
wandb.init(project="lua-training", 
           config={
                "batch_size": 32,
                "embedding_size": 64,
                "lm_prefix_size": 2048,
                "num_epochs": 30,
                "learning_rate": 3e-5,
                "checkpoint": "bigcode/santacoder",
                "revision": "dedup-alt",
                "device": "cuda:5",
                "time_tag": time.strftime("%Y%m%d-%H%M%S"),
                "record_step_every": 1000
           })

checkpoint = wandb.config["checkpoint"]
revision = wandb.config["revision"]
device = wandb.config["device"] 

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


In [6]:
access_token = "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"
lua_data = load_dataset("bigcode/the-stack-smol", data_dir="data/lua",split="train", use_auth_token=access_token)
lua_data.shuffle()
lua_data = lua_data.train_test_split(test_size=0.1)
train_data = lua_data["train"]
test_data = lua_data["test"]

Found cached dataset json (/home/saxenaya/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-88fa5373c749e3eb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [7]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint,revision=revision)
model = AutoModelForCausalLM.from_pretrained(checkpoint, revision=revision, trust_remote_code=True).to(device)
tokenizer.pad_token = tokenizer.eos_token


In [8]:
train_data_cl = ConstantLengthDataset(tokenizer, train_data, infinite=True, seq_length=1024, num_of_sequences=1024, chars_per_token=3.6)

In [9]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")
dataloader = torch.utils.data.DataLoader(train_data_cl, 
                                         batch_size=wandb.config["batch_size"],
                                         collate_fn=data_collator)

In [10]:
for item in dataloader:
    print(item)
    break

Token indices sequence length is longer than the specified maximum sequence length for this model (3724 > 2048). Running this sequence through the model will result in indexing errors
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  299, 10804,  3764,  ...,   450,   185,   185],
        [ 1357,   577,  1030,  ...,   515,   287, 35700],
        [  349,   287, 11574,  ...,    87,     8,   720],
        ...,
        [   62,   859,    76,  ...,  1050,   256, 24284],
        [   62,  6235,    62,  ...,  1615,  3127,  1189],
        [ 6582, 29868,   404,  ...,  3091,   258,   363]]), 'labels': tensor([[  299, 10804,  3764,  ...,   450,   185,   185],
        [ 1357,   577,  1030,  ...,   515,   287, 35700],
        [  349,   287, 11574,  ...,    87,     8,   720],
        ...,
        [   62,   859,    76,  ...,  1050,   256, 24284],
        [   62,  6235,    62,  ...,  1615,  3127,  1189],
        [ 6582, 29868,   404,  ...,  3091,   258,   363]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}


In [11]:
tokenized_code = tokenizer(train_data["content"], padding=True, truncation=True, return_tensors="pt")
labels = tokenized_code["input_ids"].clone()
if tokenizer._pad_token is not None:
    labels[labels == tokenizer.pad_token_id] = -100
tokenized_code["labels"] = labels

In [None]:
num_epochs = wandb.config["num_epochs"]
step = 0
train_table_cols = ["epoch", "step", "loss", "label", "predictions"]
next_step_in_table = wandb.config["record_step_every"]
train_table_rows = []
train_table = wandb.Table(columns=train_table_cols)

model.train()

In [13]:
optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config["learning_rate"])

In [17]:
import random

for epoch in tqdm(range(num_epochs), desc="Epoch", position=0):
    running_loss = 0

    stats = {"correct": 0, "total": 0}
    # Select a random minibatch of examples
    indices = random.sample(range(0, len(train_data)), wandb.config["batch_size"])

    for i in tqdm(indices, desc="Batch", position=1, leave=False):
        # Hack(ytzi): This is a hack to make sure that the model is not too big: just use 50 characters
        tokenized_code = tokenizer(tokenizer.bos_token + train_data[i]["content"][:50] + tokenizer.eos_token,
                                   return_tensors="pt",
                                   padding=True,
                                   truncation=True).to(device)
        inputs = data_collator(tokenized_code) 

        gwe = GenerateWithEmbeddings(model,
                                     tokenizer,
                                     None, 
                                     prompt=prompt,
                                     labels=labels,
                                     max_length=tokenized_code["input_ids"].shape[1] + 1,
                                     mode="train",
                                     device=device)
        

        losses = []
        while(gwe.can_generate_more()):
            optimizer.zero_grad()
            while(gwe.can_generate_more()):
                loss = gwe.generate_step()
                losses.append(loss)
                stats["total"] += 1
                if gwe.is_last_prediction_correct:
                    stats["correct"] += 1

            total_loss = torch.stack(losses).sum()

            if step > next_step_in_table:
                next_step_in_table += wandb.config["record_step_every"]
                train_table_rows.append([epoch, step, total_loss.item(), tokenizer.decode(tokenized_code["input_ids"][0]), gwe.last_predictions])
                train_table = wandb.Table(data=train_table_rows, columns=train_table_cols)
            wandb.log({"loss": total_loss.item(), "Training": train_table})
            step += 1

            total_loss.backward()
            optimizer.step()
            running_loss += total_loss.item()

    running_loss /= BATCH_SIZE
    wandb.log({"running_loss": running_loss, "learning_rate": lr_scheduler.get_lr()[0], "accuracy_per_epoch": stats["correct"] / stats["total"]})
    step += 1
    lr_scheduler.step()

time_tag = wandb.config["time_tag"]
model.save_pretrained(f"models/lua/{time_tag}.pt")

Epoch:   0%|          | 0/30 [00:03<?, ?it/s]


AttributeError: 'tokenizers.Encoding' object has no attribute 'size'