In [1]:
# Install required packages
!pip install -q -U transformers peft accelerate bitsandbytes trl huggingface_hub
!pip install -q datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.1/362.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.0/348.0 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import load_dataset
import torch
from google.colab import drive

# Make Google Drive visible inside the runtime; the dataset file is read from it
drive.mount('/content/drive')

# Read the instruction dataset (a JSON file) as a single "train" split
dataset_path = "/content/drive/MyDrive/databricks_docs_instructions.json"
dataset = load_dataset("json", split="train", data_files=dataset_path)

# Show the first record as a quick sanity check of the loaded schema
print("Sample dataset entry:", dataset[0], sep="\n")

# Base checkpoint that will be fine-tuned
model_name = "mistralai/Mistral-7B-v0.1"

# 4-bit loading options: NF4 quantization with double quantization,
# computing in bfloat16 to keep memory usage low
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# Tokenizer for the base checkpoint: pad on the right and reuse the EOS token
# as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Base model, loaded with the 4-bit quantization config and automatic
# device placement
model_load_kwargs = dict(
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

# Get the quantized model ready for k-bit (PEFT) training
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank 16, alpha 32, 5% dropout, no bias terms,
# attached to the q/k/v/o projection modules
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the LoRA adapters, then report how many parameters
# will actually be trained
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Training arguments
# The model is loaded in bfloat16 with a bfloat16 4-bit compute dtype, so
# mixed-precision training should use bf16 as well. GPUs without bf16 support
# fall back to fp16, which is what this notebook used unconditionally before.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,  # Reduce if OOM errors occur
    gradient_accumulation_steps=4,  # Effective batch size per device: 4 * 4 = 16
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=3,
    max_steps=-1,  # -1 leaves the run length to num_train_epochs
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,  # Keep only the two most recent checkpoints
    bf16=use_bf16,
    fp16=not use_bf16,
    push_to_hub=False,  # Set to True if you want to push to HF Hub
    report_to="none",
)

Mounted at /content/drive


Generating train split: 0 examples [00:00, ? examples/s]

Sample dataset entry:
{'instruction': '"How can I use the `stack` function to generate a set of rows with specified values in Databricks SQL Databricks Runtime?"', 'input': '', 'output': "You can use the `stack` function in Databricks SQL Databricks Runtime to generate a set of rows with specified values by providing the number of rows and the values as arguments. The function separates the values into the specified number of rows. Here's an example:\n\n```sql\nSELECT 'hello', s.*, 'world' FROM stack(2, 1, 2, 3) AS s(first, second);\n```\n\nIn this example, the `stack` function generates two rows with the values 1, 2 and 3. The first column is named 'first' and the second column is named 'second'. The result of this query would be:\n\n```\nhello | first | second | world\n------+-------+--------+-------\nhello |     1 |      2 | world\nhello |     3 |   NULL | world\n```\n\nRemember that in Databricks SQL Databricks Runtime 12.2 LTS and above, you should invoke the `stack` function as a

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.1879


In [3]:
# First, let's properly preprocess the dataset
def format_dataset(example):
    """Render one record as a single instruction/input/response training string.

    Args:
        example: Mapping with 'instruction', 'input' and 'output' entries.

    Returns:
        A dict with one key, "text", holding the three sections in that
        order. Each section is a "### ..." header line followed by its
        content, and sections are separated by a blank line.
    """
    sections = (
        ("### Instruction:", example['instruction']),
        ("### Input:", example['input']),
        ("### Response:", example['output']),
    )
    rendered = "\n\n".join(f"{header}\n{body}" for header, body in sections)
    return {"text": rendered}

# Run format_dataset over every record to produce the "text" field used for training
formatted_dataset = dataset.map(function=format_dataset)

Map:   0%|          | 0/5169 [00:00<?, ? examples/s]

In [None]:
# Create the trainer with the formatted dataset.
# The tokenizer configured earlier (EOS as pad token, right padding) is passed
# explicitly; without it the trainer loads its own tokenizer for the model and
# those settings are not used.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,  # Use the formatted dataset
    processing_class=tokenizer,
)

# Start training
print("Starting training...")
trainer.train()

Converting train dataset to ChatML:   0%|          | 0/5169 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/5169 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5169 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/5169 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting training...


  return fn(*args, **kwargs)


Step,Training Loss
10,1.0112
20,0.959
30,0.855
40,0.9362
50,0.8752
60,0.938
70,0.8557
80,0.9259
90,0.8622
100,0.8774


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=969, training_loss=0.7049081908660039, metrics={'train_runtime': 4313.5495, 'train_samples_per_second': 3.595, 'train_steps_per_second': 0.225, 'total_flos': 3.6498184641193574e+17, 'train_loss': 0.7049081908660039})

In [None]:
# Write the trained model and the tokenizer into the same Google Drive folder
output_dir = "/content/drive/MyDrive/databricks_docs_model"
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(output_dir)

print(f"Training complete! Model saved to {output_dir}")

Training complete! Model saved to /content/drive/MyDrive/databricks_docs_model
