In [41]:
# Install required packages
!pip install -q unsloth wandb transformers torch accelerate datasets bitsandbytes sentencepiece
!pip install -q packaging ninja
!pip install -q ollama-python

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ollama 0.4.7 requires httpx<0.29,>=0.27, but you have httpx 0.26.0 which is incompatible.
google-genai 1.10.0 requires httpx<1.0.0,>=0.28.1, but you have httpx 0.26.0 which is incompatible.[0m[31m
[0m

In [42]:
!pip install ollama

Collecting httpx<0.29,>=0.27 (from ollama)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Using cached httpx-0.28.1-py3-none-any.whl (73 kB)
Installing collected packages: httpx
  Attempting uninstall: httpx
    Found existing installation: httpx 0.26.0
    Uninstalling httpx-0.26.0:
      Successfully uninstalled httpx-0.26.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ollama-python 0.1.2 requires httpx<0.27.0,>=0.26.0, but you have httpx 0.28.1 which is incompatible.[0m[31m
[0mSuccessfully installed httpx-0.28.1


In [43]:
import os
import torch
import wandb
from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import TrainingArguments
import ollama
import json
import shutil

In [57]:
# Disable Triton to avoid BF16 errors on T4
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["PYTORCH_TRITON"] = "0"

In [58]:
def prepare_dataset():
    # Load a sample dataset
    dataset = load_dataset("tatsu-lab/alpaca", split="train")
    dataset = dataset.select(range(100))  # Small subset for demo

    def format_instruction(example):
        system_prompt = "You are a helpful assistant."
        instruction = example["instruction"]
        input_text = example["input"] if example["input"] else ""
        output = example["output"]

        if input_text:
            formatted_text = f"<|system|>\n{system_prompt}\n<|user|>\n{instruction}\n{input_text}\n<|assistant|>\n{output}"
        else:
            formatted_text = f"<|system|>\n{system_prompt}\n<|user|>\n{instruction}\n<|assistant|>\n{output}"

        return {"text": formatted_text}

    formatted_dataset = dataset.map(format_instruction)
    return formatted_dataset

In [59]:
def finetune_model():
    print("Loading model...")

    # Use DistilGPT2 instead - much smaller and should work on T4
    model_name = "distilgpt2"  # Only 82M parameters

    # Load model using HuggingFace directly instead of Unsloth for this case
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Make sure tokenizer has padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Avoid float16 or bf16 entirely
    )

    # Add LoRA using PEFT directly instead of Unsloth's wrapper
    from peft import get_peft_model, LoraConfig

    peft_config = LoraConfig(
        r=4,
        lora_alpha=16,
        target_modules=["c_attn", "c_proj"],  # GPT-2 style attention modules
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, peft_config)

    # Prepare dataset
    train_dataset = prepare_dataset()

    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

    tokenized_dataset = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=train_dataset.column_names
    )

    # Add labels for causal LM training
    def add_labels(examples):
        examples["labels"] = examples["input_ids"].copy()
        return examples

    tokenized_dataset = tokenized_dataset.map(add_labels)

    # Set up training arguments - avoiding any fp16/bf16 usage
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=4,  # Can use larger batch size with smaller model
        gradient_accumulation_steps=4,
        optim="adamw_8bit",  # Use 8-bit optimizer to save memory
        learning_rate=5e-4,
        weight_decay=0.01,
        fp16=False,  # Disable fp16
        bf16=False,  # Disable bf16
        max_grad_norm=0.3,
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="cosine",
        logging_steps=5,
        save_strategy="epoch",
        report_to="wandb",
        remove_unused_columns=False,
    )

    # Set up trainer
    from transformers import Trainer

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )

    # Start training
    print("Starting fine-tuning...")
    trainer.train()

    # Save the model
    output_dir = "./fine_tuned_model"
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Model saved to {output_dir}")
    return output_dir, model, tokenizer

In [60]:
# Export the model to Ollama format
def export_to_ollama(model_dir, model, tokenizer):
    print("Preparing model for Ollama export...")

    # Create Ollama directory
    ollama_dir = "./ollama_model"
    os.makedirs(ollama_dir, exist_ok=True)

    # Create Modelfile for Ollama - using GPT-2 base
    modelfile_content = """
    FROM gpt2
    PARAMETER temperature 0.7
    PARAMETER top_p 0.9
    PARAMETER top_k 40
    """

    with open(f"{ollama_dir}/Modelfile", "w") as f:
        f.write(modelfile_content)

    # Create manifest
    manifest = {
        "name": "fine-tuned-gpt2",
        "version": "1.0",
        "license": "custom",
        "architecture": "gpt2",
        "parameters": {
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 40
        }
    }

    with open(f"{ollama_dir}/manifest.json", "w") as f:
        json.dump(manifest, f, indent=2)

    # Copy model files
    shutil.copytree(model_dir, f"{ollama_dir}/model", dirs_exist_ok=True)

    print(f"Model prepared for Ollama at {ollama_dir}")
    return ollama_dir

In [61]:
def setup_ollama_and_inference(ollama_model_dir):
    print("Setting up Ollama for inference...")

    print("Command to create Ollama model:")
    print(f"!ollama create fine-tuned-gpt2 -f {ollama_model_dir}/Modelfile")

    print("\nDemo inference (would run on actual Ollama installation):")
    print("Example prompt: 'Write a short poem about machine learning'")

    print("\n=== USAGE INSTRUCTIONS ===")
    print("1. Install Ollama on your machine: https://ollama.ai/download")
    print("2. Copy the model files to your Ollama directory")
    print("3. Run: ollama create fine-tuned-gpt2 -f Modelfile")
    print("4. Start inferencing: ollama run fine-tuned-gpt2")



In [62]:
def main():
    print("Starting fine-tuning and Ollama export process...")
    model_dir, model, tokenizer = finetune_model()
    ollama_model_dir = export_to_ollama(model_dir, model, tokenizer)
    setup_ollama_and_inference(ollama_model_dir)

    print("\nProcess complete! To use this model with Ollama:")
    print("1. Install Ollama: https://ollama.ai/download")
    print("2. Follow the usage instructions above to import and run the model")



In [63]:
if __name__ == "__main__":
    main()

Starting fine-tuning and Ollama export process...
Loading model...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Starting fine-tuning...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 1 | Total steps = 6
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 202,752/82,115,328 (0.25% trained)


Step,Training Loss
5,9.4035


Model saved to ./fine_tuned_model
Preparing model for Ollama export...
Model prepared for Ollama at ./ollama_model
Setting up Ollama for inference...
Command to create Ollama model:
!ollama create fine-tuned-gpt2 -f ./ollama_model/Modelfile

Demo inference (would run on actual Ollama installation):
Example prompt: 'Write a short poem about machine learning'

=== USAGE INSTRUCTIONS ===
1. Install Ollama on your machine: https://ollama.ai/download
2. Copy the model files to your Ollama directory
3. Run: ollama create fine-tuned-gpt2 -f Modelfile
4. Start inferencing: ollama run fine-tuned-gpt2

Process complete! To use this model with Ollama:
1. Install Ollama: https://ollama.ai/download
2. Follow the usage instructions above to import and run the model
