In [1]:
# Run this cell to install/update all necessary libraries
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes
!pip install datasets

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-eqls6zmx/unsloth_e8354818a9904286bce05af8942f0d5a
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-eqls6zmx/unsloth_e8354818a9904286bce05af8942f0d5a
  Resolved https://github.com/unslothai/unsloth.git to commit 5c22f06a9a24ea88d8d318ea892b8bee5bcfffd7
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [1]:
import torch
from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer

# Confirm that the GPU is available
if not torch.cuda.is_available():
    raise SystemExit("GPU is not available. This notebook requires a GPU for fine-tuning.")
else:
    print("GPU is available!")

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
GPU is available!


In [2]:
# We're loading it in 4-bit precision (load_in_4bit=True) to save a lot of memory.
max_seq_length = 2048  # The maximum number of tokens to handle

# Load the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3.1-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=None,      # Let Unsloth decide the best dtype
    load_in_4bit=True,
)

==((====))==  Unsloth 2025.8.10: Fast Llama patching. Transformers: 4.56.0.
   \\   /|    NVIDIA GeForce RTX 5060 Laptop GPU. Num GPUs = 1. Max memory: 7.526 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 12.0. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
# Add LoRA adapters to the model
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.8.10 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [6]:
# The prompt format for our instruction dataset
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Function to format the dataset examples
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["context"]
    outputs = examples["response"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output)
        texts.append(text)
    return { "text" : texts, }

# Load and format the dataset
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)

# Let's look at one formatted example
print("Example of a formatted prompt:")
print(dataset[5]['text'])

Generating train split: 100%|██████████| 15011/15011 [00:00<00:00, 280482.27 examples/s]
Map: 100%|██████████| 15011/15011 [00:00<00:00, 150180.32 examples/s]

Example of a formatted prompt:
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
If I have more pieces at the time of stalemate, have I won?

### Input:
Stalemate is a situation in chess where the player whose turn it is to move is not in check and has no legal move. Stalemate results in a draw. During the endgame, stalemate is a resource that can enable the player with the inferior position to draw the game rather than lose. In more complex positions, stalemate is much rarer, usually taking the form of a swindle that succeeds only if the superior side is inattentive.[citation needed] Stalemate is also a common theme in endgame studies and other chess problems.

The outcome of a stalemate was standardized as a draw in the 19th century. Before this standardization, its treatment varied widely, including being deemed a win for the stalemating player, a half-win fo




In [7]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,  # A short training run for demonstration. Increase for better results.
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=42,
        output_dir="outputs",
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=36): 100%|██████████| 15011/15011 [00:06<00:00, 2184.94 examples/s]


### A Better Approach: Training for "Epochs" Instead of "Steps"

#### Instead of guessing the number of steps, it's often better to train for a certain number of epochs.

   * An Epoch: One epoch is one complete pass through the entire training dataset.

   * Why it's better: It automatically scales with your dataset size. A common and solid baseline for fine-tuning is to train for 1 to 3 epochs. The Dolly dataset has about 15,000 examples, so one epoch is much more than 60 steps.

##### Steps per Epoch = Total Examples / Batch Size

##### Total Training Steps = (Steps per Epoch) * (Number of Epochs)



In [8]:
print("Starting the fine-tuning process...")

# This command starts the training
trainer_stats = trainer.train()

print("Fine-tuning completed!")

Starting the fine-tuning process...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 15,011 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,entropy
1,2.2109,0
2,2.0151,No Log
3,1.9883,No Log
4,2.2887,No Log
5,2.181,No Log
6,1.6724,No Log
7,1.5509,No Log
8,1.3551,No Log
9,1.4841,No Log
10,1.6368,No Log


Fine-tuning completed!


In [9]:
print("Saving LoRA adapters...")
model.save_pretrained("lora_model")
print("Model saved to 'lora_model' directory.")

Saving LoRA adapters...
Model saved to 'lora_model' directory.


In [11]:
from transformers import TextStreamer

# Make sure the base model is loaded (it should still be in memory, but this is how you'd do it in a new notebook)
# model, tokenizer = FastLanguageModel.from_pretrained(...) # Already loaded, no need to re-run

# Load the saved LoRA adapters, providing both the path and a name
print("Loading saved adapters...")
model.load_adapter("lora_model", adapter_name="finetuned")
print("Adapters loaded successfully.")

# You can optionally set the adapter as the active one for inference
# model.set_adapter("finetuned")

Loading saved adapters...
Adapters loaded successfully.


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# 1. Base model (same you fine-tuned on)
base_model = "unsloth/llama-3.1-8b-bnb-4bit"
model = AutoModelForCausalLM.from_pretrained(base_model, device_map="auto")

# 2. Load your fine-tuned LoRA adapter
model = PeftModel.from_pretrained(model, "lora_model")

# 3. Load tokenizer from base model
tokenizer = AutoTokenizer.from_pretrained(base_model)

# 4. Run a test
from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

prompt = "Explain quantum computing in simple terms."
# output = generator(prompt, max_length=100, do_sample=True, temperature=0.7)

output = generator(
    prompt,
    max_length=200,
    do_sample=True,
    temperature=0.7,       # controls randomness (lower = more focused)
    top_p=0.9,             # nucleus sampling (keeps diversity)
    repetition_penalty=1.2 # discourages repeating phrases
)
print(output[0]["generated_text"])




  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Explain quantum computing in simple terms. How does it differ from classical computing?
Quantum Computing is a type of computer that uses the principles and properties of Quantum Mechanics to perform calculations, as opposed to traditional computers which use transistors or other electronic components.
The difference between these two types of computers comes down primarily to how they store information – while conventional computers store their data using bits (which can be either 0s or 1s), qubits are able to exist simultaneously at both states without collapsing into one state like normal matter would do when observed by humans; this allows for faster processing times since multiple operations can occur within each “bit” instead just being limited per operation like regular processors today!
How did you get started with Qiskit? What drew your attention towards developing an open source library for quantum programming languages such as Python and JavaScript
I got involved because I w

In [11]:
prompt = "when first laser was built?"
# output = generator(prompt, max_length=100, do_sample=True, temperature=0.7)

output = generator(
    prompt,
    max_length=200,
    do_sample=True,
    temperature=0.7,       # controls randomness (lower = more focused)
    top_p=0.9,             # nucleus sampling (keeps diversity)
    repetition_penalty=1.2 # discourages repeating phrases
)
print(output[0]["generated_text"])

when first laser was built? what is the use of it?
The first working Laser device (LASER stands for Light Amplification by Stimulated Emission of Radiation) was made in 1960. The basic principle of LASER operation has been known since Einstein published his theory on stimulated emission, but it wasn't until this time that a practical method to produce and amplify coherent light sources were invented.
Inventor: Gordon Gould
Type: Solid-state Laser
Laser type: Nd3+:YAG
Power output: Up to several kilowatts
Frequency range: Infrared (~1 µm)
Gordon Gould developed an early prototype for an operating laser at Columbia University around 1957-58. He called it "the maser" after its microwave equivalent - the Masers which had already demonstrated how a beam could be amplified from one point to another via a process called stimulated emissions. His invention led him into patent disputes over who really came up with it first;


In [12]:


# The alpaca prompt format
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

eos_token_id = tokenizer.eos_token_id

# 2. Run inference as usual
instruction = "when first laser was built?"
input_text = "A laser is a device that emits light through a process of optical amplification based on the stimulated emission of electromagnetic radiation. The word laser is an anacronym that originated as an acronym for light amplification by stimulated emission of radiation The first laser was built in 1960 by Theodore Maiman at Hughes Research Laboratories, based on theoretical work by Charles H. Townes and Arthur Leonard Schawlow. A laser differs from other sources of light in that it emits light that is coherent. Spatial coherence allows a laser to be focused to a tight spot, enabling applications such as laser cutting and lithography. Spatial coherence also allows a laser beam to stay narrow over great distances (collimation), enabling applications such as laser pointers and lidar (light detection and ranging). Lasers can also have high temporal coherence, which allows them to emit light with a very narrow spectrum. Alternatively, temporal coherence can be used to produce ultrashort pulses of light with a broad spectrum but durations as short as a femtosecond. Lasers are used in optical disc drives, laser printers, barcode scanners, DNA sequencing instruments, fiber-optic, and free-space optical communication, semiconducting chip manufacturing (photolithography), laser surgery and skin treatments, cutting and welding materials, military and law enforcement devices for marking targets and measuring range and speed, and in laser lighting displays for entertainment. Semiconductor lasers in the blue to near-UV have also been used in place of light-emitting diodes (LEDs) to excite fluorescence as a white light source. This permits a much smaller emitting area due to the much greater radiance of a laser and avoids the droop suffered by LEDs; such devices are already used in some car headlamps."

prompt = alpaca_prompt.format(instruction, input_text, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, eos_token_id=eos_token_id)
decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Parse and print the response
response_text = decoded_output[0].split("### Response:")[1].strip()
print("\nModel Response:")
print(response_text)
# The correct administrative capital is Sri Jayawardenepura Kotte.


Model Response:
1960


In [4]:
from datasets import load_dataset

# This command will check the cache first.
# If it finds the "dolly-15k" dataset, it will load it from your disk.
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# You can now use the dataset as before
print(dataset)

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 15011
})


In [5]:
# Save the dataset to a CSV file
print("Saving dataset to CSV...")
dataset.to_csv("dolly_dataset.csv")
print("File 'dolly_dataset.csv' saved successfully!")

Saving dataset to CSV...


Creating CSV from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 144.09ba/s]

File 'dolly_dataset.csv' saved successfully!



