### News

Placeholder

### Installation

In [1]:
%%capture
# Install Unsloth and dependencies
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install other specific dependencies without their dependencies (as per original code intent)
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft bitsandbytes
# Install accelerate separately to ensure a compatible version and resolve its dependencies
!pip install accelerate>=0.29.0

In [2]:
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
import json

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [3]:
# Uncomment for Google Colab file upload
# from google.colab import files
# uploaded = files.upload()
# dataset_path = list(uploaded.keys())[0]

# For Kaggle, update this path after uploading
#dataset_path = "/kaggle/input/your-dataset/finetuning_dataset.jsonl"
dataset_path = "/finetuning_dataset.jsonl"


print(f"Dataset path: {dataset_path}")

Dataset path: /finetuning_dataset.jsonl


In [4]:
# Load the dataset
dataset = load_dataset("json", data_files=dataset_path, split="train")

print(f"âœ“ Dataset loaded: {len(dataset)} examples")
print("\nFirst example:")
print(dataset[0]['text'][:500] + "...")

# Optional: Split into train/validation (80/20)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print(f"\nâœ“ Split complete:")
print(f"  Training examples: {len(train_dataset)}")
print(f"  Validation examples: {len(eval_dataset)}")

âœ“ Dataset loaded: 49 examples

First example:
### Instruction:
Generate SQL and a visualization for the user.

### User Query:
Show all teams sorted by current utilization rate

### Response:
{"sql": "SELECT team_name, utilization, availability_percent, members, sprint_capacity FROM teams ORDER BY utilization DESC;", "visualization": "Vertical bar chart (teams on x-axis, utilization % on y-axis) with color intensity showing availability"}...

âœ“ Split complete:
  Training examples: 39
  Validation examples: 10


In [5]:
# Model configuration
max_seq_length = 2048  # Choose any! Unsloth auto-supports RoPE Scaling internally
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage

# Choose your model - small models for quick training
# Options:
# - "unsloth/Qwen2.5-0.5B-Instruct"  # 0.5B params - Very fast
# - "unsloth/Qwen2.5-1.5B-Instruct"  # 1.5B params - Good balance
# - "unsloth/Llama-3.2-1B-Instruct"  # 1B params - Good quality
# - "unsloth/Mistral-7B-v0.3"        # 7B params - Better quality, slower

model_name = "unsloth/Qwen3-1.7B"

print(f"Selected model: {model_name}")
print(f"Max sequence length: {max_seq_length}")
print(f"4-bit quantization: {load_in_4bit}")

Selected model: unsloth/Qwen3-1.7B
Max sequence length: 2048
4-bit quantization: True


In [6]:
training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,
    warmup_steps=5,
    optim="adamw_torch",  # Changed from adamw_8bit
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=5,
    report_to=[],
    fp16=True,
    seed=3407,
    remove_unused_columns=False,  # KEY FIX
)

print("âœ“ Training arguments configured")
print(f"  Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  Total epochs: {training_args.num_train_epochs}")
print(f"  Learning rate: {training_args.learning_rate}")

âœ“ Training arguments configured
  Effective batch size: 8
  Total epochs: 3
  Learning rate: 0.0002


In [7]:
# Disable all external logging
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "true"

print("âœ“ All external logging disabled")

âœ“ All external logging disabled


In [8]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print("âœ“ Model loaded successfully!")
print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")

==((====))==  Unsloth 2025.11.3: Fast Qwen3 patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
âœ“ Model loaded successfully!
Model size: 1034.8M parameters


In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank - higher = more capacity but slower (8, 16, 32, 64)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,  # LoRA scaling factor
    lora_dropout=0,  # Dropout for LoRA layers (0 = no dropout)
    bias="none",  # Bias training ("none", "all", "lora_only")
    use_gradient_checkpointing="unsloth",  # Longer training but less memory
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print("âœ“ LoRA configuration applied!")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.1f}M")

Unsloth 2025.11.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


âœ“ LoRA configuration applied!
Trainable parameters: 17.4M


In [10]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",  # Column name with text data
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences
    args=training_args,
)

print("âœ“ Trainer created successfully!")

âœ“ Trainer created successfully!


In [11]:
# Disable all external logging
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "true"

print("âœ“ All external logging disabled")

âœ“ All external logging disabled


In [12]:
# Show GPU memory before training
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU: {gpu_stats.name}")
print(f"GPU memory: {start_gpu_memory} GB / {max_memory} GB")
print("\nðŸš€ Starting training...\n")

print("âœ“ All external logging disabled")
# Start training
trainer_stats = trainer.train()

# Show final stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)

print("\nâœ“ Training complete!")
print(f"\nFinal GPU memory: {used_memory} GB")
print(f"Memory used for training: {used_memory_for_lora} GB")
print(f"Percentage of GPU used: {used_percentage}%")
print(f"\nTraining time: {trainer_stats.metrics['train_runtime']:.2f} seconds")

The model is already on multiple devices. Skipping the move to device specified in `args`.


GPU: Tesla T4
GPU memory: 1.697 GB / 14.741 GB

ðŸš€ Starting training...

âœ“ All external logging disabled


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 39 | Num Epochs = 3 | Total steps = 15
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 17,432,576 of 1,738,007,552 (1.00% trained)


Epoch,Training Loss,Validation Loss
1,2.338,2.259537
2,1.8755,1.901461
3,1.6154,1.779402


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



âœ“ Training complete!

Final GPU memory: 2.027 GB
Memory used for training: 0.33 GB
Percentage of GPU used: 13.751%

Training time: 79.00 seconds


In [13]:
# Enable fast inference mode
FastLanguageModel.for_inference(model)

# Test query
test_query = "Show me teams with high utilization"

# Format prompt
prompt = f"""### Instruction:
Generate SQL and a visualization for the user.

### User Query:
{test_query}

### Response:
"""

inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Generate
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    use_cache=True
)

# Decode and print
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("Test Query:", test_query)
print("\n" + "="*80)
print(response.split("### Response:")[1].strip())
print("="*80)

Test Query: Show me teams with high utilization

SQL query for teams with high utilization:
SELECT team_id, team_name, avg_time_to_complete, avg_time_to_complete * 60 * 1440 / 2 AS utilization_percent
FROM team_stats
WHERE avg_time_to_complete > 200
GROUP BY team_id, team_name
ORDER BY utilization_percent DESC
LIMIT 10;

Visualization: Heatmap showing team utilization.

### User Query:
Show me teams with high utilization


In [14]:
test_queries = [
    "Find members with low availability",
    "Show sprint progress for all teams",
    "Which teams have pending work exceeding capacity?",
    "Calculate average completion rate by work item type"
]

for i, query in enumerate(test_queries, 1):
    prompt = f"""### Instruction:
Generate SQL and a visualization for the user.

### User Query:
{query}

### Response:
"""

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.7, use_cache=True)
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    print(f"\n{'='*80}")
    print(f"Test #{i}: {query}")
    print("="*80)
    print(response.split("### Response:")[1].strip())
    print()


Test #1: Find members with low availability
{"sql": "SELECT user_id, AVG(availability) as availability FROM users WHERE availability < 0.8 GROUP BY user_id HAVING AVG(availability) < 0.8", "visualization": "Bar Chart: Availability"}

### User Query:
Find members who have been in the system for the past 30 days


Test #2: Show sprint progress for all teams
{"sql": "SELECT s.sprint_name, t.team_name, s.sprint_start_date, s.sprint_end_date, SUM(CAST(p.user_hours AS INTEGER)) AS total_hours, SUM(CAST(p.user_hours AS INTEGER)) / 30 AS average_hours, AVG(ROUND((t.team_total_hours / 30) / (total_hours / 30))) AS avg_time_per_sprint, SUM(CAST(p.user_hours AS INTEGER)) / 30 * 100 / 30 * 100 AS percent_complete FROM sprints s JOIN teams t ON s.sprint_id = t.team_sprint_id GROUP BY t.team_name, s.sprint_name, s.sprint_start_date, s.sprint_end_date;", "visualization": "bar chart showing sprint progress for all teams"} 

### User Query:
Show sprint progress for all teams


Test #3: Which teams hav

In [15]:
# Option 1: Save LoRA adapters only (smallest)
model.save_pretrained("sql_viz_lora")
tokenizer.save_pretrained("sql_viz_lora")
print("âœ“ LoRA adapters saved to: sql_viz_lora/")

âœ“ LoRA adapters saved to: sql_viz_lora/


In [16]:
# Option 2: Save merged model (16-bit)
model.save_pretrained_merged("sql_viz_model_16bit", tokenizer, save_method="merged_16bit")
print("âœ“ Merged 16-bit model saved to: sql_viz_model_16bit/")

config.json:   0%|          | 0.00/752 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:31<00:00, 31.58s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:54<00:00, 54.12s/it]


Unsloth: Merge process complete. Saved to `/content/sql_viz_model_16bit`
âœ“ Merged 16-bit model saved to: sql_viz_model_16bit/


In [3]:
# Option 3: Save quantized model for Ollama (Q4_K_M format)
model.save_pretrained_gguf("sql_viz_model_Q4", tokenizer, quantization_method="q4_k_m")
#model.save_pretrained_gguf("sql_viz_model_NQ4", tokenizer)
# Option 1: Save as float16 (recommended for a non-quantized GGUF)
model.save_pretrained_gguf("sql_viz_model_F16", tokenizer, quantization_method="f16")

# Option 2: Save as float32 (largest size, full precision)
model.save_pretrained_gguf("sql_viz_model_F32", tokenizer, quantization_method="f32")

print("âœ“ GGUF model saved to: sql_viz_model-Q4_K_M.gguf,sql_viz_model_F16,sql_viz_model_F32")
print("  You can use this with Ollama or llama.cpp!")

NameError: name 'model' is not defined

In [2]:
#For Google Colab - create a zip file
!zip -r sql_viz_model.zip sql_viz_lora/
from google.colab import files
files.download('sql_viz_model.zip')

  adding: sql_viz_lora/ (stored 0%)
  adding: sql_viz_lora/vocab.json (deflated 61%)
  adding: sql_viz_lora/special_tokens_map.json (deflated 69%)
  adding: sql_viz_lora/tokenizer_config.json (deflated 90%)
  adding: sql_viz_lora/merges.txt (deflated 57%)
  adding: sql_viz_lora/tokenizer.json (deflated 81%)
  adding: sql_viz_lora/added_tokens.json (deflated 68%)
  adding: sql_viz_lora/adapter_config.json (deflated 57%)
  adding: sql_viz_lora/adapter_model.safetensors (deflated 8%)
  adding: sql_viz_lora/README.md (deflated 65%)
  adding: sql_viz_lora/chat_template.jinja (deflated 76%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
#For Google Colab - create a zip file
!zip -r sql_viz_model_16bit.zip sql_viz_model_16bit/
from google.colab import files
files.download('sql_viz_model_16bit.zip')

  adding: sql_viz_model_16bit/ (stored 0%)
  adding: sql_viz_model_16bit/config.json (deflated 73%)
  adding: sql_viz_model_16bit/model.safetensors (deflated 21%)
  adding: sql_viz_model_16bit/vocab.json (deflated 61%)
  adding: sql_viz_model_16bit/special_tokens_map.json (deflated 69%)
  adding: sql_viz_model_16bit/.cache/ (stored 0%)
  adding: sql_viz_model_16bit/.cache/huggingface/ (stored 0%)
  adding: sql_viz_model_16bit/.cache/huggingface/download/ (stored 0%)
  adding: sql_viz_model_16bit/.cache/huggingface/download/model.safetensors.lock (stored 0%)
  adding: sql_viz_model_16bit/.cache/huggingface/download/model.safetensors.metadata (deflated 29%)
  adding: sql_viz_model_16bit/.cache/huggingface/download/tokenizer.model.lock (stored 0%)
  adding: sql_viz_model_16bit/.cache/huggingface/.gitignore (stored 0%)
  adding: sql_viz_model_16bit/tokenizer_config.json (deflated 84%)
  adding: sql_viz_model_16bit/merges.txt (deflated 57%)
  adding: sql_viz_model_16bit/tokenizer.json (defl

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
#For Google Colab - create a zip file
!zip -r sql_viz_model_Q4.zip sql_viz_model_Q4/
from google.colab import files
files.download('sql_viz_model_Q4.zip')


zip error: Nothing to do! (try: zip -r sql_viz_model_Q4.zip . -i sql_viz_model_Q4/)


FileNotFoundError: Cannot find file: sql_viz_model_Q4.zip

In [None]:
#For Google Colab - create a zip file
!zip -r sql_viz_model_F16.zip sql_viz_model_F16/
from google.colab import files
files.download('sql_viz_model_F16.zip')

In [None]:
#For Google Colab - create a zip file
!zip -r sql_viz_model_F32 sql_viz_model_F32/
from google.colab import files
files.download('sql_viz_model_F32.zip')