<a href="https://colab.research.google.com/github/vi14m/finetune-LLM/blob/main/finetuned_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastModel
import torch
from datasets import load_dataset
from transformers import TextStreamer
from unsloth.chat_templates import get_chat_template, standardize_data_formats, train_on_responses_only
from trl import SFTTrainer, SFTConfig

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-4b-it",
    max_seq_length = 2048,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False,
)

==((====))==  Unsloth 2025.8.4: Fast Gemma3 patching. Transformers: 4.55.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.




model.safetensors:   0%|          | 0.00/4.56G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

In [4]:
# Prepare LoRA model
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers = False,
    finetune_language_layers = True,
    finetune_attention_modules = True,
    finetune_mlp_modules = True,
    r = 8,
    lora_alpha = 8,
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

Unsloth: Making `model.base_model.model.model.language_model` require gradients


In [5]:
# Set chat template
tokenizer = get_chat_template(tokenizer, chat_template = "gemma-3")

In [6]:
# Load train, validation, and test datasets
train_dataset = load_dataset("mbpp", split = "train")
val_dataset = load_dataset("mbpp", split = "validation")
test_dataset = load_dataset("mbpp", split = "test")

README.md: 0.00B [00:00, ?B/s]

full/train-00000-of-00001.parquet:   0%|          | 0.00/87.2k [00:00<?, ?B/s]

full/test-00000-of-00001.parquet:   0%|          | 0.00/116k [00:00<?, ?B/s]

full/validation-00000-of-00001.parquet:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

full/prompt-00000-of-00001.parquet:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/374 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/90 [00:00<?, ? examples/s]

Generating prompt split:   0%|          | 0/10 [00:00<?, ? examples/s]

In [7]:
def formatting_prompts_func(examples):
    # Format MBPP dataset into conversation format suitable for Gemma-3
    convos = []
    for prompt_text, code, test_cases in zip(examples["text"], examples["code"], examples["test_list"]):
        assistant_response = code + "\n\nTest Cases:\n" + "\n".join(test_cases)
        convo = [
            {"role": "user", "content": prompt_text},
            {"role": "assistant", "content": assistant_response}
        ]
        convos.append(convo)

    # Apply the chat template and remove the leading <bos> token
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False).removeprefix('<bos>') for convo in convos]
    return { "text" : texts }

# Format all datasets
train_dataset = train_dataset.map(formatting_prompts_func, batched = True)
val_dataset = val_dataset.map(formatting_prompts_func, batched = True)
test_dataset = test_dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [8]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,  # Using validation dataset for evaluation
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 40,
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none",
        eval_steps = 5,  # Evaluate every 10 steps
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/374 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/90 [00:00<?, ? examples/s]

In [9]:
# Train only on responses
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

Map (num_proc=2):   0%|          | 0/374 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/90 [00:00<?, ? examples/s]

In [10]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 374 | Num Epochs = 1 | Total steps = 40
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 14,901,248 of 4,314,980,720 (0.35% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.8239
2,2.2321
3,2.0386
4,1.652
5,1.7726
6,1.1846
7,1.0633
8,0.9665
9,1.0325
10,0.7846


In [11]:
def generate_code(row, model, tokenizer):
    """Generates code for a given dataset row using the fine-tuned model."""
    prompt_text = row["text"]
    messages = [{"role": "user", "content": [{"type": "text", "text": prompt_text}]}]

    # Apply chat template and add generation prompt
    text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    # Generate response from the model
    outputs = model.generate(
        **tokenizer([text], return_tensors="pt").to("cuda"),
        max_new_tokens=256,
        temperature=1.0,
        top_p=0.95,
        top_k=64,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode and extract the generated content
    decoded_output = tokenizer.batch_decode(outputs)[0]

    try:
        start_model_token = "<start_of_turn>model\n"
        end_model_token = "<end_of_turn>\n"
        start_index = decoded_output.find(start_model_token) + len(start_model_token)
        end_index = decoded_output.rfind(end_model_token)
        generated_content = decoded_output[start_index:end_index]

        # Split to get code part before "Test Cases:"
        test_cases_marker = "\n\nTest Cases:\n"
        code_end_index = generated_content.find(test_cases_marker)
        if code_end_index != -1:
            generated_code = generated_content[:code_end_index].strip()
        else:
            generated_code = generated_content.strip()

    except Exception as e:
        print(f"Error extracting generated code: {e}")
        generated_code = ""

    return generated_code

def evaluate_code(generated_code, test_cases, test_setup_code):
    """Evaluates the generated code against the provided test cases."""
    exec_globals = {}
    try:
        # Execute test setup code
        if test_setup_code:
            exec(test_setup_code, exec_globals)

        # Execute the generated code
        exec(generated_code, exec_globals)

        # Execute test cases
        all_tests_passed = True
        for test in test_cases:
            try:
                exec(test, exec_globals)
            except AssertionError:
                all_tests_passed = False
                break
            except Exception as e:
                print(f"Error executing test case '{test}': {e}")
                all_tests_passed = False
                break

    except Exception as e:
        print(f"Error during code execution: {e}")
        all_tests_passed = False

    return all_tests_passed

In [12]:
# Load original test data for evaluation
test_dataset_original = load_dataset("mbpp", split = "test")

# Evaluate on a subset of the test dataset
passed_count = 0
total_problems = 10  # Evaluate on 10 problems for demonstration

for i in range(total_problems):
    print(f"\nEvaluating problem {i+1}/{total_problems}")

    # Use the formatted text for inference
    formatted_row = test_dataset[i]
    generated_code = generate_code(formatted_row, model, tokenizer)
    print(f"Generated code:\n{generated_code}")

    # Use the original data for evaluation
    original_row = test_dataset_original[i]
    test_cases = original_row["test_list"]
    test_setup_code = original_row["test_setup_code"]
    if evaluate_code(generated_code, test_cases, test_setup_code):
        passed_count += 1
        print(f"✅ Problem {i+1} passed all tests.")
    else:
        print(f"❌ Problem {i+1} failed some tests.")

# Calculate accuracy
accuracy = (passed_count / total_problems) * 100
print(f"\nEvaluation Results:")
print(f"Total problems: {total_problems}")
print(f"Problems with all tests passed: {passed_count}")
print(f"Accuracy: {accuracy:.2f}%")


Evaluating problem 1/10
Generated code:
def remove_Occ(s,ch): 
    for i in range(len(s)): 
        if (s[i] == ch): 
            s = s[0 : i] + s[i + 1:] 
            break
    for i in range(len(s) - 1,-1,-1):  
        if (s[i] == ch): 
            s = s[0 : i] + s[i + 1:] 
            break
    return s
✅ Problem 1 passed all tests.

Evaluating problem 2/10
Generated code:
def sort_matrix(M):
    result = sorted(M, key=sum)
    return result
✅ Problem 2 passed all tests.

Evaluating problem 3/10
Generated code:
from collections import Counter
def count_common(words):
  word_counts = Counter(words)
  top_four = word_counts.most_common(4)
  return (top_four)
✅ Problem 3 passed all tests.

Evaluating problem 4/10
Generated code:
def find_Volume(l,b,h) : 
    return ((l * b * h) / 2)
✅ Problem 4 passed all tests.

Evaluating problem 5/10
Generated code:
import re
def split_lowerstring(text):
 return (re.findall('[a-z][^a-z]*', text))
✅ Problem 5 passed all tests.

Evaluating problem 6

In [13]:
# Save the fine-tuned model
model.save_pretrained("gemma-3-mbpp-finetuned")
tokenizer.save_pretrained("gemma-3-mbpp-finetuned")

# Optionally save to GGUF format
if False:  # Change to True to enable
    model.save_pretrained_gguf(
        "gemma-3-mbpp-finetuned-gguf",
        quantization_type = "Q8_0",
    )

In [14]:

def chatbot():
    print("Gemma-3 Code Assistant (type 'quit' to exit)")
    print("-------------------------------------------")
    print("Note: I will provide code solutions with explanations only.")

    while True:
        user_input = input("\nUser: ")
        if user_input.lower() == 'quit':
            break

        # Format as a message with instruction to only provide code and explanation
        messages = [{
            "role": "user",
            "content": [{
                "type": "text",
                "text": f"{user_input}\n\nPlease provide only the Python code solution with a brief explanation. Do not include any test cases."
            }]
        }]

        # Apply chat template
        text = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
        )

        print("\nAssistant:")
        streamer = TextStreamer(tokenizer, skip_prompt=True)

        # Generate response
        _ = model.generate(
            **tokenizer([text], return_tensors="pt").to("cuda"),
            max_new_tokens=256,  # Reduced since we don't need test cases
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            streamer=streamer,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            repetition_penalty=1.1,  # Helps avoid repeating test cases
        )

chatbot()

Gemma-3 Code Assistant (type 'quit' to exit)
-------------------------------------------
Note: I will provide code solutions with explanations only.

User: write code to reverse string

Assistant:
def reverse_string(s):
  return s[len(s)-1::-1]

Explanation: 
The function receives an input string (s). The function returns reversed string by slicing from the last character till the beginning of the string and reversing the order.
Test Cases
assert reverse_string("hello") == "olleh"
assert reverse_string("python") == "nothyp"
assert reverse_string('world') == 'dlrow'<end_of_turn>

User: quit


### Optional

In [15]:
!pip install Flask



In [16]:
from flask import Flask

app = Flask(__name__)

In [23]:
from flask import request, jsonify

@app.route('/generate', methods=['POST'])
def generate_code_api():
    data = request.get_json()
    prompt = data.get('prompt')

    if not prompt:
        return jsonify({"error": "No prompt provided"}), 400

    # Format as a message with instruction to only provide code and explanation
    messages = [{
        "role": "user",
        "content": [{
            "type": "text",
            "text": f"{prompt}\n\nPlease provide only the Python code solution with a brief explanation. Do not include any test cases."
        }]
    }]

    # Apply chat template
    text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
    )

    # Generate response
    outputs = model.generate(
        **tokenizer([text], return_tensors="pt").to("cuda"),
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        repetition_penalty=1.1,
    )

    # Decode and extract the generated content
    decoded_output = tokenizer.batch_decode(outputs)[0]

    try:
        # Attempt to find the start of the model's response
        start_model_token = "<start_of_turn>model\n"
        start_index = decoded_output.find(start_model_token)
        if start_index != -1:
            start_index += len(start_model_token)
            generated_content = decoded_output[start_index:].strip()

            # Split to get code part before "Test Cases:" or other markers
            test_cases_marker = "\n\nTest Cases:\n"
            explanation_marker = "\n\nExplanation:\n" # Also look for Explanation marker
            code_end_index_test = generated_content.find(test_cases_marker)
            code_end_index_explanation = generated_content.find(explanation_marker)

            if code_end_index_test != -1 and (code_end_index_explanation == -1 or code_end_index_test < code_end_index_explanation):
                generated_code = generated_content[:code_end_index_test].strip()
            elif code_end_index_explanation != -1:
                 generated_code = generated_content[:code_end_index_explanation].strip()
            else:
                generated_code = generated_content.strip() # Take everything if no markers found

        else:
            generated_code = "" # No model response found

    except Exception as e:
        print(f"Error extracting generated code: {e}")
        generated_code = ""

    return jsonify({"generated_code": generated_code})

AssertionError: The setup method 'route' can no longer be called on the application. It has already handled its first request, any changes will not be applied consistently.
Make sure all imports, decorators, functions, etc. needed to set up the application are done before running it.

In [18]:
app.run(host='127.0.0.1', port=5000, debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [21]:
import threading
import time

def run_flask():
    app.run(host='127.0.0.1', port=5000, debug=True, use_reloader=False)

# Run Flask in a separate thread so the notebook is not blocked
thread = threading.Thread(target=run_flask)
thread.start()

# Give the server a moment to start
time.sleep(3)
print("Flask server is running.")

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


Flask server is running.


In [22]:
import requests
import json

url = 'http://127.0.0.1:5000/generate'
prompt_data = {"prompt": "write a function to calculate the factorial of a number"}

try:
    response = requests.post(url, json=prompt_data)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    generated_code = response.json().get('generated_code')

    if generated_code:
        print("Generated Code:")
        print(generated_code)
    else:
        print("No code generated.")

except requests.exceptions.RequestException as e:
    print(f"Error sending request: {e}")
    print("Please ensure the Flask server is running in the background.")

INFO:werkzeug:127.0.0.1 - - [10/Aug/2025 07:41:00] "POST /generate HTTP/1.1" 200 -


No code generated.


In [24]:
# Attempt to kill the Flask process
!pkill -f 'flask run'
!pkill -f 'python run_flask.py' # In case it was run from a file

In [27]:
# Regenerate and run the Flask app initialization and route definition
from flask import Flask, request, jsonify
import threading
import time

app = Flask(__name__)

@app.route('/generate', methods=['POST'])
def generate_code_api():
    data = request.get_json()
    prompt = data.get('prompt')

    if not prompt:
        return jsonify({"error": "No prompt provided"}), 400

    messages = [{
        "role": "user",
        "content": [{
            "type": "text",
            "text": f"{prompt}\n\nPlease provide only the Python code solution with a brief explanation. Do not include any test cases."
        }]
    }]

    text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
    )

    outputs = model.generate(
        **tokenizer([text], return_tensors="pt").to("cuda"),
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        repetition_penalty=1.1,
    )

    decoded_output = tokenizer.batch_decode(outputs)[0]

    try:
        start_model_token = "<start_of_turn>model\n"
        start_index = decoded_output.find(start_model_token)
        if start_index != -1:
            start_index += len(start_model_token)
            generated_content = decoded_output[start_index:].strip()

            test_cases_marker = "\n\nTest Cases:\n"
            explanation_marker = "\n\nExplanation:\n"
            code_end_index_test = generated_content.find(test_cases_marker)
            code_end_index_explanation = generated_content.find(explanation_marker)

            if code_end_index_test != -1 and (code_end_index_explanation == -1 or code_end_index_test < code_end_index_explanation):
                generated_code = generated_content[:code_end_index_test].strip()
            elif code_end_index_explanation != -1:
                 generated_code = generated_content[:code_end_index_explanation].strip()
            else:
                generated_code = generated_content.strip()

        else:
            generated_code = ""

    except Exception as e:
        print(f"Error extracting generated code: {e}")
        generated_code = ""

    return jsonify({"generated_code": generated_code})

def run_flask():
    # Use a different port
    app.run(host='127.0.0.1', port=5001, debug=True, use_reloader=False)

# Run Flask in a separate thread
thread = threading.Thread(target=run_flask)
thread.start()

# Give the server a moment to start
time.sleep(3)
print("Flask server is running on port 5001.")

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5001
INFO:werkzeug:[33mPress CTRL+C to quit[0m


Flask server is running on port 5001.


In [28]:
import requests
import json

# Update the URL to the new port
url = 'http://127.0.0.1:5001/generate'
prompt_data = {"prompt": "write a function to calculate the factorial of a number"}

try:
    response = requests.post(url, json=prompt_data)
    response.raise_for_status()
    generated_code = response.json().get('generated_code')

    if generated_code:
        print("Generated Code:")
        print(generated_code)
    else:
        print("No code generated.")

except requests.exceptions.RequestException as e:
    print(f"Error sending request: {e}")
    print("Please ensure the Flask server is running in the background on port 5001.")

INFO:werkzeug:127.0.0.1 - - [10/Aug/2025 07:44:42] "POST /generate HTTP/1.1" 200 -


Generated Code:
def factorial(n):
  if n == 0:
    return 1;
  else:
      return (n*factorial(n-1))


In [None]:
!curl -X POST -H "Content-Type: application/json" -d '{"prompt": "write a python function to check if a number is prime"}' http://127.0.0.1:5001/generate