In [1]:
import wandb
from types import SimpleNamespace
from pathlib import Path
from datasets import load_dataset
import transformers

In [2]:
# W&B artifact reference for the dataset and the Hugging Face checkpoint
# whose tokenizer is loaded further down.
DATASET_AT = "capecape/wizard/lm_dataset:latest"
MODEL_NAME = "WizardLM/WizardCoder-15B-V1.0"

# Single settings object so later cells read `config.<field>` instead of the
# bare constants.
config = SimpleNamespace(dataset_at=DATASET_AT, model_name=MODEL_NAME)

In [3]:
# Optional: download the dataset artifact without creating a run (uncomment to fetch it)
# api = wandb.Api()
# artifact = api.artifact(config.dataset_at, type='dataset')
# artifact_dir = artifact.download()
# path = Path(artifact_dir)

In [4]:
# Local directory expected to hold the dataset artifact, and the JSONL file
# inside it that the next cell loads.
path = Path("artifacts/lm_dataset:v0")
dataset_file = path.joinpath("dataset.jsonl")

In [23]:
# Load the JSONL file with the "json" builder. Requesting split="train"
# yields a single Dataset (see the output below) rather than a dict of splits.
raw_train_dataset = load_dataset(
    "json",
    data_files=[str(dataset_file)],
    split="train",
)
raw_train_dataset

Found cached dataset json (/Users/tcapelle/.cache/huggingface/datasets/json/default-659b07a3902b3a7f/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


Dataset({
    features: ['source', 'text'],
    num_rows: 3086
})

## Splitting into Train/Val

In [None]:
# dataset = raw_train_dataset.train_test_split(test_size=0.1)

## Tokenizer

In [24]:
from utils import *

In [25]:
# Training settings; `model_max_length` is reused below when the tokenizer is
# loaded. NOTE(review): `TrainingArguments` is not imported explicitly in this
# notebook - presumably it comes from `from utils import *`; confirm.
training_args = TrainingArguments(
    "output/",
    model_max_length=2048,
    num_train_epochs=3,
)

In [26]:
# Tokenizer options: cache location and maximum length come from
# `training_args`; padding is applied on the right and the fast
# implementation is requested.
tokenizer_kwargs = {
    "cache_dir": training_args.cache_dir,
    "model_max_length": training_args.model_max_length,
    "padding_side": "right",
    "use_fast": True,
}

# Load the tokenizer that belongs to the checkpoint named in `config`.
tokenizer = transformers.AutoTokenizer.from_pretrained(config.model_name, **tokenizer_kwargs)

In [27]:
# Display the checkpoint name the tokenizer above was loaded from.
config.model_name

'WizardLM/WizardCoder-15B-V1.0'

In [28]:
# if tokenizer.pad_token is None:
#     smart_tokenizer_and_embedding_resize(
#         special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
#         tokenizer=tokenizer,
#         model=model,
#     )
# Only checkpoints whose name contains "Coder" get the explicit special-token
# mapping. NOTE(review): the DEFAULT_*_TOKEN constants are not defined in this
# notebook - presumably they come from `from utils import *`; confirm.
if "Coder" in config.model_name:
    tokenizer.add_special_tokens(
        dict(
            eos_token=DEFAULT_EOS_TOKEN,
            bos_token=DEFAULT_BOS_TOKEN,
            unk_token=DEFAULT_UNK_TOKEN,
            pad_token=DEFAULT_PAD_TOKEN,
        )
    )

In [29]:
# Display the raw columns; they are all dropped when the dataset is tokenized below.
raw_train_dataset.column_names

['source', 'text']

In [34]:
def _tokenize_batch(examples):
    """Run the tokenizer over the "text" field of a batch of examples."""
    return tokenizer(examples["text"])


# Tokenize in batches and drop the original columns so that only the
# tokenizer outputs remain. No truncation is requested here, so individual
# examples may be longer than the tokenizer's maximum length.
column_names = raw_train_dataset.column_names
tokenized_train_ds = raw_train_dataset.map(
    _tokenize_batch,
    batched=True,
    remove_columns=column_names,
)

Map:   0%|          | 0/3086 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2954 > 2048). Running this sequence through the model will result in indexing errors


In [37]:
from itertools import chain

# Number of tokens in every block produced by `group_texts`.
block_size = 2048

def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (one batch). It must contain an "input_ids" column.

    Returns:
        A dict with the same columns plus "labels". Each column is a list of
        blocks holding exactly `block_size` items. Items at the end of the
        batch that do not fill a whole block are dropped, so a batch with
        fewer than `block_size` items yields empty lists for every column.
        "labels" holds independent copies of the "input_ids" blocks.
    """
    # Concatenate all texts.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    # The length of the first column decides how many blocks are produced.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; if total_length < block_size nothing is left
    # and every column of the result is an empty list.
    # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Copy each block, not just the outer list: a shallow copy would leave
    # labels and input_ids sharing the same inner lists, so editing one in
    # place would silently change the other.
    result["labels"] = [list(block) for block in result["input_ids"]]
    return result

In [41]:
# Apply `group_texts` batch by batch across 4 worker processes, bypassing the
# on-disk cache so the grouping is always recomputed.
lm_dataset = tokenized_train_ds.map(
    group_texts,
    batched=True,
    num_proc=4,
    load_from_cache_file=False,
    desc=f"Grouping texts in chunks of {block_size}",
)

Grouping texts in chunks of 2048 (num_proc=4):   0%|          | 0/3086 [00:00<?, ? examples/s]