In [None]:
from unsloth import FastLanguageModel
import torch


In [None]:
# Maximum sequence length; reused by both the model loader and the SFT trainer below.
max_seq_length = 2048
# Left as None and forwarded unchanged as the loader's `dtype` argument
# (presumably lets the loader choose a dtype itself -- confirm against the Unsloth docs).
dtype = None
# Forwarded unchanged as the loader's `load_in_4bit` flag.
load_in_4bit = True


In [None]:
# Load the base checkpoint and its tokenizer using the settings chosen in the config cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

In [None]:
# Attach LoRA adapters to the loaded base model. Note that `model` is rebound to
# the adapter-wrapped result, so re-running this cell on its own feeds the
# already-wrapped model back in; reload the base model first.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    # Adapt all four attention projections (query, key, value, output) and the
    # three MLP projections. "v_proj" must be listed explicitly, otherwise the
    # value projection is left without an adapter.
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

In [None]:
# Alpaca-style training template. It has four positional `{}` slots, filled in
# this order: database context, natural-language SQL request, target SQL,
# explanation of the SQL.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Company database: {}

### Input:
SQL Prompt: {}

### Response:
SQL: {}

Explanation: {}
"""

# End-of-sequence marker taken from the tokenizer; the formatter below appends it to every example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Render a batch of dataset rows into complete training prompts.

    Reads the "sql_context", "sql_prompt", "sql" and "sql_explanation" columns
    of ``examples``, fills ``alpaca_prompt`` with each row's four values in that
    order, and appends ``EOS_TOKEN`` to every rendered prompt.

    Returns:
        A dict with a single key, "text", holding the rendered strings in row order.
    """
    columns = (
        examples["sql_context"],
        examples["sql_prompt"],
        examples["sql"],
        examples["sql_explanation"],
    )
    # The EOS token must terminate every example; otherwise generation would go on forever.
    rendered = [alpaca_prompt.format(*row) + EOS_TOKEN for row in zip(*columns)]
    return {"text": rendered}

In [None]:
from datasets import load_dataset
# Fetch the training split, then run the prompt formatter over it in batches
# so every row gains the "text" field the trainer reads.
dataset = load_dataset("gretelai/synthetic_text_to_sql", split="train").map(
    formatting_prompts_func,
    batched=True,
)

In [None]:
# Preview a few formatted prompts instead of rendering the entire "text" column,
# which would flood the cell output with the whole training split.
dataset[:3]["text"]

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

In [None]:
# Mixed precision: use bf16 when the hardware reports support, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation and logging settings for this short run.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,
)

# Supervised fine-tuning over the pre-rendered "text" field of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)


In [None]:
# Run the fine-tuning loop configured above; as the cell's last expression, the return value is displayed.
trainer.train()

In [None]:
# Export the fine-tuned model together with the tokenizer in GGUF form using the "f16"
# quantization method. The first argument, "model", is presumably the output directory -- confirm
# against the Unsloth docs.
model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")