In [1]:
# !pip install -Uq wandb accelerate transformers datasets einops

In [2]:
import torch
import math
import wandb
from types import SimpleNamespace
from pathlib import Path

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import (
    get_cosine_schedule_with_warmup,
    set_seed,
    AutoModelForCausalLM,
    AutoTokenizer
)
from tqdm.auto import tqdm

In [3]:
config = SimpleNamespace(
    BATCH_SIZE = 1,
    PRECISION = "fp16", # on A100 you can use `bf16` instead
    SCALE = False, # Scale grads
    N_FREEZE = 28, # we will freeze up to the 28 layer
    TYPE = "instruct", # can be ["completion", "instruct"] depending on your dataset
    CHECKPOINTING_STEPS = 1000,
    SEED = 42,
    GRADIENT_ACCUMULATION_STEPS = 4,
    MAX_TRAIN_STEPS = None,
    OUTPUT_DIR = "./output_dir",
    WANDB_ENTITY = None,
    WANDB_PROJECT = "ft_llm",
    MODEL_NAME = "meta-llama/Llama-2-7b-hf",  # you need the license here!
    # DATASET_NAME = "Abirate/english_quotes",  # Replace "Your dataset name here" with the actual dataset name
    # DATASET_NAME = "vicgalle/alpaca-gpt4",
    DATASET_NAME = "knowrohit07/know_sql",
)

Load the model from the Huggingface Hub (you may need to accept the license of the model and the code execution)

In [4]:
# !huggingface-cli login

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    config.MODEL_NAME,
    torch_dtype=torch.bfloat16 if config.PRECISION=="bf16" else torch.float16,
    device_map=0,
    low_cpu_mem_usage=True,
    use_cache=False
)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 076d554f-ba3d-4b42-97c0-fccc600c0b35)')' thrown while requesting HEAD https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 993f4ea2-b3cc-4d49-9345-0307f6894f96)')' thrown while requesting HEAD https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/generation_config.json


In [6]:
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 36d6d0cd-2d89-4fe7-a622-30ca41c1b3d3)')' thrown while requesting HEAD https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/tokenizer_config.json


In [7]:
def generate(prompt, max_new_tokens):
    with torch.inference_mode():
        tokenized_prompt = tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
        output = model.generate(
            tokenized_prompt,
            max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0][len(tokenized_prompt[0]):], skip_special_tokens=True)

In [8]:
generate("What is the meaning of life? \nAnswer:", 50)

'\nI believe that the meaning of life is to be happy.\nAnd I believe that the meaning of life is to be happy.\nAnd I believe that the meaning of life is to be happy.\nAnd I believe that the meaning of life'

In [9]:
for param in model.parameters(): param.requires_grad = False
for param in model.model.layers[-1].parameters(): param.requires_grad = True # the head of the model
for param in model.model.layers[config.N_FREEZE:].parameters(): param.requires_grad = True

params = sum([p.numel() for p in model.parameters()])/1_000_000
trainable_params = sum([p.numel() for p in model.parameters() if p.requires_grad])/1_000_000
print(f"Total params: {params:.2f}M, Trainable: {trainable_params:.2f}M")

Total params: 6738.42M, Trainable: 809.53M


Create a DataLoader from your Dataset

In [10]:
train_dataset = load_dataset("knowrohit07/know_sql",
                       revision='f33425d13f9e8aab1b46fa945326e9356d6d5726', split="train")
train_dataset

Dataset({
    features: ['question', 'context', 'answer'],
    num_rows: 78562
})

In [11]:
def to_text(x):
    x['text'] = 'Context: ' + x['context'] + '\nQuestion: ' + x['question'] + '\nAnswer: ' + x['answer']
    # tokenize here?
    return x

to_text(train_dataset[0])

{'question': "Which head's name has the substring 'Ha'? List the id and name.",
 'context': 'CREATE TABLE head (head_id VARCHAR, name VARCHAR)',
 'answer': "SELECT head_id, name FROM head WHERE name LIKE '%Ha%'",
 'text': "Context: CREATE TABLE head (head_id VARCHAR, name VARCHAR)\nQuestion: Which head's name has the substring 'Ha'? List the id and name.\nAnswer: SELECT head_id, name FROM head WHERE name LIKE '%Ha%'"}

In [12]:
train_dataset = train_dataset.shuffle(42).map(to_text).filter(lambda x:len(x['text'])<380)

In [13]:
# Collate function for DataLoaders
# depending on the dataset and model input format these need to be adjusted
def collate_fn(examples, debug=False):
    batch_size = len(examples)
    input_ids = tokenizer([e['text'] for e in examples], return_tensors='pt', padding=True)['input_ids']
    batch = {'input_ids': input_ids[:, :-1], 'labels': input_ids[:, 1:]}
    return batch

In [14]:
train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=config.BATCH_SIZE,
)

# one batch of data!
b = next(iter(train_loader))
b

{'input_ids': tensor([[    1, 15228, 29901, 14602, 10911,  1591, 29918,   978, 29918, 29896,
          29906,   313,  1853, 21748, 29892,  6496, 21748, 29897,    13, 16492,
          29901,  8449,  1134,  6496,   297, 29871, 29896, 29929, 29947, 29906,
          29973,    13, 22550, 29901,  5097,  1134,  3895,  1591, 29918,   978,
          29918, 29896, 29906,  5754,  6496,   353,   376, 29896, 29929, 29947,
          29906]]),
 'labels': tensor([[15228, 29901, 14602, 10911,  1591, 29918,   978, 29918, 29896, 29906,
            313,  1853, 21748, 29892,  6496, 21748, 29897,    13, 16492, 29901,
           8449,  1134,  6496,   297, 29871, 29896, 29929, 29947, 29906, 29973,
             13, 22550, 29901,  5097,  1134,  3895,  1591, 29918,   978, 29918,
          29896, 29906,  5754,  6496,   353,   376, 29896, 29929, 29947, 29906,
          29908]])}

In [15]:
def to_gpu(tensor_dict):
    for key in tensor_dict.keys():
        if torch.is_tensor(tensor_dict[key]):
            tensor_dict[key] = tensor_dict[key].to('cuda')
    return tensor_dict

In [16]:
optim = torch.optim.SGD(model.parameters(), lr=5e-4)
scaler = torch.cuda.amp.GradScaler(enabled=(config.PRECISION == "fp16" and config.SCALE)) # no-op if enabled=False

# we could train for shorter than the full dataset...
if config.MAX_TRAIN_STEPS is None:
    config.MAX_TRAIN_STEPS = len(train_loader)

scheduler = get_cosine_schedule_with_warmup(
    optim,
    num_training_steps=config.MAX_TRAIN_STEPS,
    num_warmup_steps=100,
)

In [17]:
def save_model(model, model_name, models_folder="models"):
    """Save the model to wandb as an artifact
    Args:
        model (nn.Module): Model to save.
        model_name (str): Name of the model.
        models_folder (str, optional): Folder to save the model. Defaults to "models".
    """
    model_name = f"{wandb.run.id}_{model_name}"
    file_name = Path(f"{models_folder}/{model_name}.pth")
    file_name.parent.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), file_name)
    at = wandb.Artifact(model_name, type="model")
    at.add_file(file_name)
    wandb.log_artifact(at)

class TokenAccuracy:
    "A simple Accuracy function compatible with HF models"
    def __init__(self):
        self.count = 0
        self.sum = 0.
    def update(self, logits, labels):
        logits = logits.argmax(dim=-1).view(-1)
        labels = labels.view(-1)
        sum = (logits == labels).sum()
        self.count += len(logits)
        self.sum += sum
        return sum / len(logits)
    def compute(self):
        return self.sum / self.count

In [18]:
wandb.init(project=config.WANDB_PROJECT, # the project I am working on
           entity=config.WANDB_ENTITY, # the team or user where the project is
           config=config) # the Hyperparameters I want to keep track of

# Training
acc = TokenAccuracy()
ptdtype = {"bf16": torch.bfloat16, "fp16": torch.float16}[config.PRECISION]

model.train()
for step, batch in enumerate(tqdm(train_loader)):
    optim.zero_grad(set_to_none=True)
    for micro_step in range(config.GRADIENT_ACCUMULATION_STEPS):
        with torch.amp.autocast("cuda", dtype=ptdtype):
            out = model(**to_gpu(batch))
            loss = out.loss / config.GRADIENT_ACCUMULATION_STEPS
    scaler.scale(loss).backward()
    scaler.step(optim)
    scaler.update()
    scheduler.step()

    # we can log the metrics to W&B
    wandb.log({"loss": loss.item() * config.GRADIENT_ACCUMULATION_STEPS,
               "accuracy": acc.update(out.logits, batch["labels"])})

    # we save the model checkpoints every now and then =P
    if (step+1) % config.CHECKPOINTING_STEPS == 0 or step == config.MAX_TRAIN_STEPS:
        save_model(model, model_name=config.MODEL_NAME.replace("/", "_"), models_folder=config.OUTPUT_DIR)

    if step >= config.MAX_TRAIN_STEPS:
        break

wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/76319 [00:00<?, ?it/s]

KeyboardInterrupt: 