In [1]:
%%capture
%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U wandb
%pip install -U torch
%pip install -U predibase

In [2]:
%pip install -U predibase

Collecting predibase
  Downloading predibase-2024.8.5.tar.gz (87 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting dataclasses-json==0.5.7 (from predibase)
  Downloading dataclasses_json-0.5.7-py3-none-any.whl.metadata (22 kB)
Collecting deprecation (from predibase)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting ipyplot (from predibase)
  Downloading ipyplot-1.1.2-py3-none-any.whl.metadata (7.2 kB)
Collecting predibase-api==2024.8.5 (from predibase)
  Downloading predibase-api-2024.8.5.tar.gz (6.3 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requ

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 4.42.0 requires urllib3~=2.0, but you have urllib3 1.26.12 which is incompatible.


In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from huggingface_hub import login
from predibase import Predibase

# Pass api_token directly, or get it from the environment variable.
pb = Predibase(api_token="pb_P8XqjAqLXTuM-e6j7Se_Sw")

hf_token = "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"

login(token = hf_token)

wb_token = "094590d0aa8813c0cc044d53c48dbf393da80d96"

wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune Llama 3 8B on SQL dataset',
    job_type="training",
    anonymous="allow"
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\matti\.cache\huggingface\token
Login successful


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmaatvo[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\matti\.netrc


In [3]:
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"
dataset_name = "gretelai/synthetic_text_to_sql"
new_model = "llama-3-8b-sql-chatbot"

In [None]:
torch_dtype = torch.float16
attn_implementation = "eager"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)


In [6]:
import datasets

df_dataset = datasets.load_dataset(
    "gretelai/synthetic_text_to_sql"
).get("train").to_pandas()

In [7]:
base_model_template = "<|im_start|>user\n {prompt} <|im_end|>\n<|im_start|>assistant\n"

In [8]:
text_to_sql_training_prompt_template = """\
    You are a database management system expert, proficient in Structured Query Language (SQL).
    
    Your job is to write an SQL query that answers the following question, based on the given \
database schema and any additional information provided.  Use SQLite syntax.
    
    Please output only SQL (without any explanations).


    ### Schema: {sql_context}


    ### Knowledge: This "{sql_task_type}" type task is commonly used for {sql_task_type_description} \
in the domain of {domain}, which involves {domain_description}.


    ### Question: {sql_prompt}


    ### Completion:
"""


In [10]:
df_dataset.to_csv(
    "data/fine_tuning/synthetic_text_to_sql_llama-3-8b-instruct.csv",
    index=False
)

In [5]:
dataset = pb.datasets.from_file(
    "data/fine_tuning/train.csv",
    name="gretel_ai_synthetic_text_to_sql_llama-3-8b_train"
)

In [11]:
repo = pb.repos.create(
    name="gretel_ai_synthetic_text_to_sql_llama-3-8b-instruct", 
    description="Fine-tuning on GretelAI text-to-SQL synthetic dataset with Predibase."
)

In [14]:
adapter: FinetuningJob = pb.finetuning.jobs.create(
    config={
        "base_model": "meta-llama/Meta-Llama-3-8B-Instruct",
        "epochs": 5,
        "learning_rate": 0.0002,
    },
    dataset=dataset,
    repo="gretel_ai_synthetic_text_to_sql_llama-3-8b-instruct",
    description='fine-tune "llama-3-8b-instruct" with GretelAI text-to-SQL synthetic dataset (no JSON)',
)


RuntimeError: Bad request. Response status code 400. Error: {'message': 'Selected dataset gretel_ai_synthetic_text_to_sql_llama-3-8b_train is missing at least one required column: [prompt completion]'}