In [4]:
# Code step
import torch

# Detect GPU capability and set precision flags
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
else:
    major, minor = (0, 0)

USE_BF16 = major >= 8     # True for Ampere+ (A100, L4, 30xx, 40xx)
USE_FP16 = not USE_BF16   # True for T4, V100, etc.

print(f"GPU capability: {major}.{minor} → bf16={USE_BF16}, fp16={USE_FP16}")


GPU capability: 8.0 → bf16=True, fp16=False


In [9]:
# Step 1: Load the JSONL dataset created from the resume into a Hugging Face Dataset
# ------------------------------------------------------------
# Unsloth → GGUF → Ollama end-to-end (single-GPU, local)
# Dataset: JSONL with {"text": "..."} per line
# ------------------------------------------------------------
import os, json, subprocess, sys, shutil
from pathlib import Path

from datasets import load_dataset
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, CHAT_TEMPLATES
from transformers import TrainingArguments
from trl import SFTTrainer

# -----------------------
# 0) CONFIG — EDIT THESE
# -----------------------
DATA_JSONL = "/content/Viswanath_Chirravuri_Resume.jsonl"
BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
# Other good open-weight options:
#   "mistralai/Mistral-7B-Instruct-v0.3"
#   "Qwen/Qwen2.5-7B-Instruct"
#   "meta-llama/Llama-3.2-3B-Instruct" (gated; needs HF login)

OUT_DIR          = Path("runs/unsloth_run")
LORA_DIR         = OUT_DIR / "lora"
MERGED_DIR       = OUT_DIR / "merged_fp16"
GGUF_DIR         = OUT_DIR / "gguf"
OLLAMA_MODELNAME = "my-unsloth-model"
QUANT_METHOD     = "q4_k_m"
MAX_SEQ_LEN      = 1024
LOAD_IN_4BIT     = False
BATCH_PER_DEV    = 1
GRAD_ACC_STEPS   = 8
NUM_EPOCHS       = 1
LEARNING_RATE    = 2e-4
RANK_R           = 16
LORA_ALPHA       = 16
USE_GC           = "unsloth"    # gradient checkpointing

# -----------------------------
# 1) Load dataset ({"text":..})
# -----------------------------
print("Loading dataset...")
if not Path(DATA_JSONL).exists():
    raise FileNotFoundError(f"Cannot find {DATA_JSONL}")

ds = load_dataset("json", data_files=DATA_JSONL, split="train")
cols = set(ds.column_names)
if "text" not in cols:
    raise ValueError(
        f"Expected a 'text' column, found {cols}. "
        "Each JSONL line should look like: {\"text\": \"...\"}"
    )
print(f"Dataset rows: {len(ds)}  | columns: {cols}")

# --------------------------------------------
# 2) Load base model + tokenizer (Unsloth QLoRA)
# --------------------------------------------
print("Loading base model with Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = BASE_MODEL,
    max_seq_length = MAX_SEQ_LEN,
    dtype = None,
    load_in_4bit = LOAD_IN_4BIT,
)

# Choose a chat template that matches the base model *for inference time only*.
# Training below uses plain causal LM on raw text — no chat formatting applied.
def choose_template(model_id: str) -> str:
    low = model_id.lower()
    if "llama-3" in low or "llama-3.1" in low or "llama-3.2" in low or "llama-3.3" in low:
        return "llama-3"
    if "mistral" in low:
        return "mistral"
    if "qwen" in low:
        return "qwen-2.5"
    if "gemma" in low:
        return "gemma"
    return "chatml"

CHAT_TEMPLATE = choose_template(BASE_MODEL)
tokenizer = get_chat_template(tokenizer, chat_template=CHAT_TEMPLATE)

# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r = RANK_R,
    target_modules = [
        "q_proj","k_proj","v_proj","o_proj",
        "gate_proj","up_proj","down_proj",
    ],
    lora_alpha = LORA_ALPHA,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = USE_GC,
)

# -------------------------------------------------
# 3) Train: plain causal LM on your resume text
#     - We keep dataset_text_field="text"
#     - packing=True to fill the context with many short lines
# -------------------------------------------------
print("Starting fine-tune...")
training_args = TrainingArguments(
    output_dir = str(OUT_DIR / "hf_trainer"),
    per_device_train_batch_size = BATCH_PER_DEV,
    gradient_accumulation_steps = GRAD_ACC_STEPS,
    learning_rate = LEARNING_RATE,
    num_train_epochs = NUM_EPOCHS,
    logging_steps = 5,
    bf16 = USE_BF16,
    fp16 = USE_FP16,                 # <-- add this
    optim = "adamw_torch",           # <-- avoid fused on Colab T4
    report_to = "none",
)


trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = ds,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LEN,
    packing = True,
    args = training_args,
)

trainer.train()
print("Training done.")

# -------------------------------------------------
# 4) Save LoRA and merged FP16 (for GGUF export)
# -------------------------------------------------
OUT_DIR.mkdir(parents=True, exist_ok=True)
LORA_DIR.mkdir(parents=True, exist_ok=True)
MERGED_DIR.mkdir(parents=True, exist_ok=True)
GGUF_DIR.mkdir(parents=True, exist_ok=True)

print("Saving LoRA adapter...")
model.save_pretrained_merged(str(LORA_DIR), tokenizer, save_method="lora")

print("Merging LoRA into a full 16-bit model...")
model.save_pretrained_merged(str(MERGED_DIR), tokenizer, save_method="merged_16bit")

# -------------------------------------------------
# 5) Export GGUF for Ollama (quantized)
# -------------------------------------------------
print(f"Exporting GGUF → {GGUF_DIR} with quant={QUANT_METHOD} ...")
model.save_pretrained_gguf(str(GGUF_DIR), tokenizer, quantization_method=QUANT_METHOD)

ggufs = sorted(GGUF_DIR.glob("*.gguf"))
if not ggufs:
    raise FileNotFoundError("No .gguf produced.")
gguf_path = ggufs[0]
print("GGUF created:", gguf_path.name)

Loading dataset...
Dataset rows: 63  | columns: {'text'}
Loading base model with Unsloth...
==((====))==  Unsloth 2025.8.4: Fast Llama patching. Transformers: 4.55.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Starting fine-tune...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 63 | Num Epochs = 1 | Total steps = 8
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss
5,4.2228


Training done.
Saving LoRA adapter...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Successfully copied all 2 files from cache to runs/unsloth_run/lora.
Downloading safetensors index for unsloth/Llama-3.2-3B-Instruct...


Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [00:19<00:00,  9.57s/it]


Merging LoRA into a full 16-bit model...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Successfully copied all 2 files from cache to runs/unsloth_run/merged_fp16.
Downloading safetensors index for unsloth/Llama-3.2-3B-Instruct...


Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [00:18<00:00,  9.12s/it]
Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Exporting GGUF → runs/unsloth_run/gguf with quant=q4_k_m ...
Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 54.77 out of 83.48 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 28/28 [00:00<00:00, 119.56it/s]


Unsloth: Saving tokenizer... Done.
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at runs/unsloth_run/gguf into bf16 GGUF format.
The output location will be /content/runs/unsloth_run/gguf/unsloth.BF16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: gguf
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-g

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Conversion completed! Output location: /content/runs/unsloth_run/gguf/unsloth.Q4_K_M.gguf
Unsloth: Saved Ollama Modelfile to runs/unsloth_run/gguf/Modelfile
GGUF created: unsloth.BF16.gguf
