In [1]:
import re
import os
import sys
import json
import glob
import math
import logging
import argparse
import numpy as np
from typing import Dict, List, Optional, Sequence
from dataclasses import dataclass, field

import torch
import datasets
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaTokenizer,
    Trainer,
    HfArgumentParser,
    TrainingArguments,
    BitsAndBytesConfig,
)
from datasets import load_dataset, concatenate_datasets, DatasetDict

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    prepare_model_for_kbit_training,
)

In [17]:
# Checkpoint identifier shared by the tokenizer and model loading cells below.
model_name_or_path = "meta-llama/Llama-2-7b-hf"

# Quantization settings; handed to the model loader as `quantization_config`.
bnb_config = BitsAndBytesConfig(
    # load_in_8bit=True,
    # 4-bit loading is the active choice; the 8-bit option above is left disabled.
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

## Load tokenizer

In [11]:
# Load the tokenizer via AutoTokenizer and print its special-token map and vocabulary size.
# NOTE: `tokenizer` is rebound by the LlamaTokenizer cell that follows, so this
# instance is only used for the inspection printed here.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
print(json.dumps(tokenizer.special_tokens_map, indent=2))
print(tokenizer.vocab_size)

{
  "bos_token": "<s>",
  "eos_token": "</s>",
  "unk_token": "<unk>"
}
32000


In [12]:
# Reload the tokenizer through the explicit LlamaTokenizer class with left-side padding,
# then print the special-token map, the vocabulary size, and each special token
# (or a fallback message when that token is unset).
tokenizer = LlamaTokenizer.from_pretrained(
    model_name_or_path,
    padding_side="left",
    # NOTE(review): `use_fast` is normally an AutoTokenizer option; it looks
    # redundant when the class is named explicitly — confirm before removing.
    use_fast=False,
)
print(json.dumps(tokenizer.special_tokens_map, indent=2))
print(tokenizer.vocab_size)
print(tokenizer.eos_token or "No eos token")
print(tokenizer.bos_token or "No bos token")
print(tokenizer.unk_token or "No unk token")
# The recorded output shows this falling back to "No pad token" for this checkpoint.
print(tokenizer.pad_token or "No pad token")

Using pad_token, but it is not set yet.


{
  "bos_token": "<s>",
  "eos_token": "</s>",
  "unk_token": "<unk>"
}
32000
</s>
<s>
<unk>
No pad token


In [13]:
# num_new_tokens = tokenizer.add_special_tokens({
#     "pad_token": "<pad>",
#     "bos_token": "<s>",
#     "eos_token": "</s>",
#     "unk_token": "<unk>",
# })
# print(tokenizer.all_special_ids)
# print(tokenizer.vocab_size)

In [14]:
# The tokenizer has no pad token (the previous cell printed "No pad token"),
# so padding reuses the existing <unk> token instead of registering a new one.
# NOTE(review): the two assignments appear to set the same underlying value
# (id and string form of <unk>) — confirm whether one of them is sufficient.
tokenizer.pad_token_id = tokenizer.unk_token_id
tokenizer.pad_token = tokenizer.unk_token

In [15]:
# Sanity check: batch-encode two sentences of different lengths with padding enabled
# to inspect which side is padded and which id is used for padding.
# NOTE: despite its name, `input_ids` holds the full tokenizer output
# (the printed result contains both 'input_ids' and 'attention_mask').
input_ids = tokenizer(
    ["hello world", "hello world I am going to school"],
    return_tensors="pt",
    padding=True,
    add_special_tokens=True,
)
print(input_ids)

{'input_ids': tensor([[    0,     0,     0,     0,     0,     1, 22172,  3186],
        [    1, 22172,  3186,   306,   626,  2675,   304,  3762]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1]])}


In [16]:
# Decode the padded batch back to text so the padding tokens are visible as strings.
decoded_input_ids = tokenizer.batch_decode(input_ids["input_ids"],)
print(decoded_input_ids)

['<unk><unk><unk><unk><unk><s> hello world', '<s> hello world I am going to school']


## Load model

In [None]:
# Load the causal LM for `model_name_or_path`, applying the quantization settings
# in `bnb_config`; `cache_dir` points the download cache at ../cache.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=bnb_config,
    cache_dir="../cache"
)

## Load dataset

In [18]:
# Load the `b-mc2/sql-create-context` dataset, using the same ../cache directory
# that the model loading cell uses.
dataset = load_dataset(
    "b-mc2/sql-create-context",
    cache_dir="../cache",
)

In [19]:
# Peek at the first training record to see its fields (context, question, answer).
dataset['train'][0]

{'context': 'CREATE TABLE head (age INTEGER)',
 'question': 'How many heads of the departments are older than 56 ?',
 'answer': 'SELECT COUNT(*) FROM head WHERE age > 56'}

In [22]:
def generate_prompt_sql(question, context, answer=""):
    """Render the text-to-SQL prompt for one example.

    Args:
        question: Natural-language question, placed under ``### Question:``.
        context: Table definition text, placed under ``### Context:``.
        answer: Text placed under ``### Response:``. Defaults to an empty
            string, which leaves the response section open.

    Returns:
        The prompt as a single string: an instruction paragraph, the task
        sentence, then the three ``###`` sections, separated by blank lines.
    """
    # The trailing space after "tables." is part of the prompt text and is kept on purpose.
    instruction = (
        "You are a powerful text-to-SQL model. "
        "Your job is to answer questions about a database. "
        "You are given a question and context regarding one or more tables. "
    )
    sections = (
        instruction,
        "You must output the SQL query that answers the question.",
        f"### Question:\n{question}",
        f"### Context:\n{context}",
        f"### Response:\n{answer}",
    )
    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections)


def tokenize(prompt, add_eos_token=True, max_length=512):
    """Tokenize one prompt string into a causal-LM training example.

    Uses the notebook-level ``tokenizer``.

    Args:
        prompt: Text to encode.
        add_eos_token: When true, append ``tokenizer.eos_token_id`` if the
            encoding does not already end with it and is still shorter than
            ``max_length``.
        max_length: Truncation limit in tokens, also used as the cap for the
            EOS append. Defaults to 512 (the value that used to be
            hard-coded). ``None`` is forwarded to the tokenizer as-is and
            disables the length cap on the EOS append.

    Returns:
        The tokenizer output (un-padded, no tensor conversion) with an extra
        ``"labels"`` entry that is a copy of ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]

    # An empty encoding has no last token to inspect; treat it as "does not end in EOS"
    # instead of indexing [-1] and raising IndexError.
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    # A sequence already at the limit was (possibly) truncated, so EOS is not appended to it.
    has_room = max_length is None or len(input_ids) < max_length

    if add_eos_token and not ends_with_eos and has_room:
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    # Labels are a separate list with the same ids as the inputs (prompt tokens included).
    result["labels"] = input_ids.copy()

    return result

def generate_and_tokenize_prompt(data_point):
    """Turn one dataset record into a tokenized training example.

    Args:
        data_point: Mapping that provides the ``"question"``, ``"context"``
            and ``"answer"`` fields of a record.

    Returns:
        Whatever ``tokenize`` returns for the prompt rendered by
        ``generate_prompt_sql`` from those three fields.
    """
    # Fields are read in the order the prompt builder expects them.
    question, context, answer = (
        data_point[key] for key in ("question", "context", "answer")
    )
    return tokenize(generate_prompt_sql(question, context, answer))

In [23]:
# Build the training set: render and tokenize every record of the "train" split.
train_data = dataset["train"].map(
    generate_and_tokenize_prompt,
    num_proc=os.cpu_count(),
    # Drop the original columns of the split being mapped, so only the fields
    # produced by generate_and_tokenize_prompt remain. Taking the column names
    # from this same split (rather than from whichever split happens to come
    # first in the DatasetDict) keeps the cell correct if more splits are added.
    remove_columns=dataset["train"].column_names,
    desc="preprocess train data set",
)

preprocess train data set (num_proc=16):   0%|          | 0/78577 [00:00<?, ? examples/s]

In [46]:
# Batch collator built on `tokenizer`: pads with padding enabled, rounds the padded
# length up to a multiple of 8, and returns PyTorch tensors.
collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

In [49]:
# Smoke test: collate a single-example batch to inspect the padded tensors.
collator([train_data[0]])

{'input_ids': tensor([[    0,     0,     0,     0,     0,     0,     0,     1,   887,   526,
           263, 13988,  1426, 29899,   517, 29899,  4176,  1904, 29889,  3575,
          4982,   338,   304,  1234,  5155,  1048,   263,  2566, 29889,   887,
           526,  2183,   263,  1139,   322,  3030, 11211,   697,   470,   901,
          6131, 29889, 29871,    13,    13,  3492,  1818,  1962,   278,  3758,
          2346,   393,  6089,   278,  1139, 29889,    13,    13,  2277, 29937,
           894, 29901,    13,  5328,  1784, 15883,   310,   278,  5840,  1860,
           526,  9642,  1135, 29871, 29945, 29953,  1577,    13,    13,  2277,
         29937, 15228, 29901,    13, 27045, 10911,  2343,   313,   482,  2672,
          4330, 17070, 29897,    13,    13,  2277, 29937, 13291, 29901,    13,
          6404, 21122, 22798,  3895,  2343,  5754,  5046,  1405, 29871, 29945,
         29953,     2]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1