In [None]:
pip install "sagemaker>=2.140.0" "transformers==4.26.1" "datasets[s3]==2.10.1" --upgrade

In [None]:
!pip install torch tensorboard --quiet
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet
!pip install peft --quiet
!pip install datasets trl ninja packaging --quiet
!pip install diffusers safetensors  --quiet


In [None]:
from huggingface_hub import login
import os

# Read the Hugging Face access token from the HF_TOKEN environment variable.
# (The previous lookup used an empty variable name, which can never resolve to a token.)
# An unset or empty variable yields None, in which case login() falls back to its
# interactive prompt.
access_token = os.getenv("HF_TOKEN") or None
login(token=access_token)

In [None]:
import sagemaker

# Bootstrap session, used only to look up the account's default bucket.
sess = sagemaker.Session()

# Set this to a bucket name to override the default; None means "use the session default".
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Rebuild the session bound to the resolved bucket, then resolve the execution role.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
role = sagemaker.get_execution_role()

## Create dataset

Using the [sql-create-context](https://huggingface.co/datasets/b-mc2/sql-create-context) dataset, which is built specifically for text-to-SQL: each example supplies the relevant `CREATE TABLE` statements as context.

In [None]:
from datasets import load_dataset

# The model expects conversational input. The system prompt carries the table schema
# so the model can ground the generated SQL in the actual column names.
system_message = """You are an expert text-to-SQL query translator. Users will ask you questions in English and you will generate a syntactically correct SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
  """Convert one raw dataset row into a three-turn chat example.

  Args:
    sample: mapping with the keys "context" (schema text), "question"
      (natural-language question) and "answer" (target SQL query).

  Returns:
    A dict with a single "messages" key holding the system, user and
    assistant turns, in that order.
  """
  return {
    "messages": [
      {"role": "system", "content": system_message.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }

# Load the training split from the Hugging Face Hub and keep a 20,000-row subset.
# The shuffle is seeded so a fresh "Restart & Run All" reproduces the same subset.
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle(seed=42).select(range(20000))

# Convert every row to the chat "messages" format, dropping the raw columns.
dataset = dataset.map(create_conversation, remove_columns=dataset.column_names, batched=False)

# Hold out exactly 2,500 rows for testing (17,500 remain for training).
# An integer test_size is an absolute row count; the previous 2500/17500 was
# interpreted as a fraction (~14.3%) and produced a larger test split than intended.
dataset = dataset.train_test_split(test_size=2500, seed=42)

print(dataset["train"][1]["messages"])

# Persist both splits as JSON so later cells can reload them from disk.
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

In [None]:
import torch
# Use the first CUDA GPU when one is visible to torch; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

## Benchmarking Command R

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Checkpoint to benchmark. Alternative previously tried: "mistralai/Mistral-7B-Instruct-v0.2"
model_id = "CohereForAI/c4ai-command-r-v01-4bit"

# Tokenizer and model are loaded independently from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map=device,  # place the model on the device selected earlier
)

# Wrap both in a text-generation pipeline for the benchmark below.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

In [None]:
from datasets import load_dataset
from random import randint
from tqdm import tqdm

# Reload the held-out split written to disk by the dataset-creation cell.
test_dataset = load_dataset("json", data_files="test_dataset.json", split="train")

def test(sample):
    """Score one example by exact match between generated and reference SQL.

    Args:
        sample: a row with a "messages" list of [system, user, assistant] turns.

    Returns:
        1 if the stripped model output equals the reference answer exactly, else 0.
    """
    # Prompt with the system turn (which carries the schema) AND the user question.
    # Slicing [1:2] would send only the question, leaving the model without the schema.
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2], tokenize=False, add_generation_prompt=True)
    # Greedy decoding: deterministic, and avoids passing temperature=0 alongside sampling knobs.
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=False,
        eos_token_id=pipe.tokenizer.eos_token_id,
        pad_token_id=pipe.tokenizer.pad_token_id,
    )
    # The pipeline output is sliced by the prompt length to keep only the completion.
    pred = outputs[0]["generated_text"][len(prompt):].strip()
    return 1 if pred == sample["messages"][2]["content"] else 0

# Number of held-out examples to score.
n_test = 2500

# Draw a shuffled subset and collect one 0/1 exact-match score per example.
eval_subset = test_dataset.shuffle().select(range(n_test))
all_preds = [test(sample) for sample in tqdm(eval_subset)]

# Accuracy is the fraction of exact matches.
accuracy = sum(all_preds) / len(all_preds)

print(f"Accuracy: {accuracy*100:.2f}%")

## Finetuning Mistral 7b

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# BitsAndBytes settings: 4-bit NF4 weights with double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer first, then the quantized base model on the GPU.
# NOTE: this rebinds `model` / `tokenizer`, replacing the benchmark model's names.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
)

# Apply TRL's chat-format setup to both objects (the OpenAI-style chat format).
model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
from transformers import TrainingArguments
from peft import LoraConfig
# PEFT = parameter-efficient fine-tuning, of which LoRA is one particular method.

# LoRA adapter configuration for QLoRA (quantization + LoRA).
# NOTE(review): described as "config from QLoRA paper" — confirm r/alpha against the paper.
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

args = TrainingArguments(
    output_dir="walexand3r/Mistral-7B-v0.2-text-to-sql",  # local checkpoint dir; with push_to_hub it is also used to derive the Hub repo
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=2,          # micro-batches accumulated before each optimizer update
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant_with_warmup",  # constant LR after warmup; plain "constant" ignores warmup_ratio
    push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
)

In [None]:
from trl import SFTTrainer
# TRL is Hugging Face's Transformer Reinforcement Learning library;
# SFTTrainer is its supervised fine-tuning trainer.

trainer = SFTTrainer(
    model=model,
    args=args,
    # `dataset` is the train/test dict produced by train_test_split; the trainer
    # needs the train split itself, not the whole dict.
    train_dataset=dataset["train"],
    peft_config=peft_config,
    max_seq_length=3072,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()
# Save the trained model once training has finished.
trainer.save_model()

## Next steps

- Repeat benchmarking for finetuned model
- Save model to S3 rather than the Hugging Face Hub
- Convert to script and use SageMaker experiments to run managed training run