# Check NVIDIA driver version and Python version

In [None]:
# Show the NVIDIA driver / GPU status report, then the Python interpreter version.
!nvidia-smi
!python --version

# Install apt packages

In [None]:
# Refresh the apt package index first so the install below resolves current versions.
!apt-get update
# Install the system packages non-interactively (-y).
# NOTE(review): build-essential/cmake are presumably needed for the llama.cpp build
# used by the GGUF export further down — confirm which of the others are still required.
!apt-get install -y build-essential cmake curl libssl-dev libcurl4-openssl-dev unzip pciutils libgl1

# Install unsloth

In [None]:
# %pip installs into the environment of the running kernel.
# NOTE(review): the version is not pinned, so results may differ between runs.
%pip install unsloth

# Load Model using unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings. The adapter-reload cell further down reuses these same names.
max_seq_length = 2048  # Free to change; RoPE scaling is handled automatically.
dtype = None           # None = auto-detect. float16 for Tesla T4 / V100, bfloat16 for Ampere and newer.
load_in_4bit = True    # 4-bit quantization lowers memory usage; may be set to False.

# Pre-quantized 4-bit checkpoints (about 4x faster download, no OOMs) that can
# be used as model_name below. More models at https://huggingface.co/unsloth
#   "unsloth/Meta-Llama-3.1-8B-bnb-4bit"            Llama-3.1, 2x faster
#   "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
#   "unsloth/Meta-Llama-3.1-70B-bnb-4bit"
#   "unsloth/Meta-Llama-3.1-405B-bnb-4bit"          4-bit for 405b
#   "unsloth/Mistral-Small-Instruct-2409"           Mistral 22b, 2x faster
#   "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
#   "unsloth/Phi-3.5-mini-instruct"                 Phi-3.5, 2x faster
#   "unsloth/Phi-3-medium-4k-instruct"
#   "unsloth/gemma-2-9b-bnb-4bit"
#   "unsloth/gemma-2-27b-bnb-4bit"                  Gemma, 2x faster
#   "unsloth/Llama-3.2-1B-bnb-4bit"                 Llama 3.2 models
#   "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
#   "unsloth/Llama-3.2-3B-bnb-4bit"
#   "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
#   "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"       Llama 3.3 70B

# Load the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",  # alternative: "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # only needed for gated models such as meta-llama/Llama-2-7b-hf
)

# Add LoRA

In [None]:
# Wrap the loaded model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank: any number > 0; suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, but 0 is the optimized setting
    bias="none",     # any value is supported, but "none" is the optimized setting
    # "unsloth" checkpointing uses 30% less VRAM and fits 2x larger batch sizes.
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,   # rank-stabilized LoRA is supported
    loftq_config=None,  # so is LoftQ
)

# Load dataset

In [None]:
from unsloth.chat_templates import get_chat_template

# Configure the tokenizer with the "llama-3.1" chat template before formatting the data.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def formatting_prompts_func(examples):
    """Render every conversation of a batch as one chat-template string.

    Reads the "messages" entry of ``examples`` and returns a dict whose "text"
    entry holds one rendered string per conversation. Relies on the
    notebook-level ``tokenizer``; no generation prompt is appended.
    """
    rendered = []
    for convo in examples["messages"]:
        rendered.append(
            tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        )
    return {"text": rendered}

from datasets import load_dataset

# Read the JSON-lines file as the "train" split.
# NOTE(review): the path is absolute ("/dataset.jsonl"); confirm it matches where the file is uploaded.
dataset = load_dataset(
    "json",
    data_files="/dataset.jsonl",
    split="train",
)

# Sanity check: print the first entry.
print(dataset[0])

# Map to llama chat format

In [None]:
# Add a "text" column containing each conversation rendered with the chat template.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Raw messages of one sample. A notebook cell only auto-displays its LAST
# expression, so this one has to be printed explicitly; as a bare expression
# its value was silently discarded.
print(dataset[5]["messages"])

# Rendered Llama chat-format text of the same sample (displayed as the cell result).
dataset[5]["text"]

# Define trainer

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Supervised fine-tuning trainer over the rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,  # Packing can make training 5x faster for short sequences.
    args=TrainingArguments(
        # 2 samples per step x 4 accumulation steps per optimizer update
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # num_train_epochs=1,  # use this instead of max_steps for one full pass
        max_steps=60,
        learning_rate=2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",  # change to enable WandB etc.
    ),
)

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Restrict the training signal to assistant responses. The two markers are the
# user and assistant header strings of the chat template.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# Verify masking

In [None]:
# Decoded input of sample 5. Printed explicitly because a cell only auto-displays
# its last expression; as a bare expression this result was never shown.
print(tokenizer.decode(trainer.train_dataset[5]["input_ids"]))

# Labels of the same sample with every masked position (-100) replaced by the
# token ID of a single space, so only the part that contributes to the loss
# remains readable. Displayed as the cell result.
space = tokenizer(" ", add_special_tokens=False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

# Start training process

In [None]:
# Run the fine-tuning. The result is kept because the GPU-stats cell reads trainer_stats.metrics.
trainer_stats = trainer.train()

# GPU stats

In [None]:
import torch

# Get the total GPU memory
# (device 0, bytes converted to GB by dividing by 1024 three times)
max_memory = torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024  # GB

# Calculate memory usage
# Peak reserved memory so far, in GB, rounded to 3 decimals.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# NOTE(review): this is the memory allocated at the time this cell runs, i.e. after
# trainer.train() in the previous cell, so it is not a pre-training baseline.
# The "for training" figures below are therefore peak-reserved minus
# currently-allocated — confirm that this is the intended meaning, or capture
# the baseline before training starts.
start_gpu_memory = torch.cuda.memory_allocated() / 1024 / 1024 / 1024  # Convert to GB
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures expressed as a percentage of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Print stats
# train_runtime is read from the metrics returned by trainer.train().
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


# Test trained model

In [None]:
from unsloth.chat_templates import get_chat_template
import re

# Make sure the tokenizer uses the same chat template as during training.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# Enable native 2x faster inference.
FastLanguageModel.for_inference(model)

# Single-turn test prompt: a BibTeX entry to convert.
messages = [
    {
        "role": "user",
        "content": "Convert to latex bibliography @article{Low_complexity_QSM,\n  serialno={25},\n  author={Z. {Yigit} and E. {Basar}},\n  year={2016},\n  title={Low-complexity detection of quadrature spatial modulation},\n  journal={\\rvtEleLett},\n  volume={52},\n  number={20},\n  pages={1729--1731},\n  doi={10.1049/el.2016.1583},\n}",
    },
]

# Tokenize with the generation prompt appended (required for generation) and move to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

# Mask out padding positions.
attention_mask = inputs != tokenizer.pad_token_id

outputs = model.generate(
    input_ids=inputs,
    attention_mask=attention_mask,
    max_new_tokens=256,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)

# Alternative: tokenizer.batch_decode(outputs)

# Decode prompt + completion, keeping special tokens so the reply can be
# located by its header markers afterwards.
text = tokenizer.decode(outputs[0], skip_special_tokens=False)

def extract_assistant_response(text):
    """Return the assistant's reply from a decoded Llama-3.1 chat transcript.

    The reply is the text following the first assistant header, up to the next
    ``<|eot_id|>``. When generation stopped before ``<|eot_id|>`` was emitted
    (for example because ``max_new_tokens`` was reached), everything up to the
    end of ``text`` is returned instead of discarding the partial reply.

    Args:
        text: Decoded transcript that still contains the special tokens.

    Returns:
        The reply with surrounding whitespace stripped, or None when ``text``
        contains no assistant header.
    """
    match = re.search(
        # \Z lets a truncated reply (no closing <|eot_id|>) still match.
        r"<\|start_header_id\|>assistant<\|end_header_id\|>(.*?)(?:<\|eot_id\|>|\Z)",
        text,
        re.DOTALL,
    )
    return match.group(1).strip() if match else None

# Pull the assistant's reply out of the decoded transcript and print it
# (prints None when the helper finds no match).
content = extract_assistant_response(text)
print(content)

# Test trained model streamed output

In [None]:
from transformers import TextStreamer

# Get the token ID for <|eot_id|>; it is passed to generate() as eos_token_id below
# so that generation stops at the end-of-turn marker.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

# Custom streamer to skip <|eot_id|>
class CustomStreamer(TextStreamer):
    """TextStreamer variant that hides the ``<|eot_id|>`` marker.

    When a finalized chunk ends with ``<|eot_id|>`` (ignoring surrounding
    whitespace), the marker is removed, the chunk is stripped, and the stream
    is reported as ended. Every other chunk is forwarded unchanged.
    """

    def on_finalized_text(self, text: str, stream_end: bool = False):
        saw_eot = text.strip().endswith("<|eot_id|>")
        if saw_eot:
            # Drop the marker and force the end of the stream.
            text = text.replace("<|eot_id|>", "").strip()
        super().on_finalized_text(text, stream_end=True if saw_eot else stream_end)

# Put the model into inference mode.
FastLanguageModel.for_inference(model)

# Single-turn test prompt: a BibTeX entry to convert.
messages = [
    {
        "role": "user",
        "content": "Convert to latex bibliography @article{Low_complexity_QSM,\n  serialno={25},\n  author={Z. {Yigit} and E. {Basar}},\n  year={2016},\n  title={Low-complexity detection of quadrature spatial modulation},\n  journal={\\rvtEleLett},\n  volume={52},\n  number={20},\n  pages={1729--1731},\n  doi={10.1049/el.2016.1583},\n}",
    },
]

# Tokenize with the generation prompt appended and move the IDs to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

# Mask out padding positions.
attention_mask = inputs != tokenizer.pad_token_id

# Print tokens as they are produced, without echoing the prompt.
text_streamer = CustomStreamer(tokenizer, skip_prompt=True)

# Stream the completion; the return value is not needed.
_ = model.generate(
    input_ids=inputs,
    attention_mask=attention_mask,
    streamer=text_streamer,
    max_new_tokens=256,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
    eos_token_id=eot_token_id,  # stop at <|eot_id|>
    pad_token_id=tokenizer.pad_token_id,
)

# Save model as LoRA adapter

In [None]:
# Write the model (per the section title, the LoRA adapter) and the tokenizer
# to the local directory "bibtex_lora_model".
model.save_pretrained("bibtex_lora_model")  # Local saving
tokenizer.save_pretrained("bibtex_lora_model")
# To upload to the Hugging Face Hub instead, uncomment and fill in repo name and token:
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

# Test inference using saved lora models

In [None]:
# Get the token ID for <|eot_id|>; it is passed to generate() as eos_token_id below.
# NOTE(review): this runs before the optional reload further down, so it uses the
# tokenizer that is already in memory.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

# Custom streamer to skip <|eot_id|>
class CustomStreamer(TextStreamer):
    """TextStreamer variant that hides the ``<|eot_id|>`` marker.

    When a finalized chunk ends with ``<|eot_id|>`` (ignoring surrounding
    whitespace), the marker is removed, the chunk is stripped, and the stream
    is reported as ended. Every other chunk is forwarded unchanged.
    """

    def on_finalized_text(self, text: str, stream_end: bool = False):
        saw_eot = text.strip().endswith("<|eot_id|>")
        if saw_eot:
            # Drop the marker and force the end of the stream.
            text = text.replace("<|eot_id|>", "").strip()
        super().on_finalized_text(text, stream_end=True if saw_eot else stream_end)
            
# Disabled by default. Change the condition to True to reload the model from the
# "bibtex_lora_model" directory instead of using the model already in memory.
if False:
    from unsloth import FastLanguageModel

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="bibtex_lora_model",  # the model saved after training
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    # Enable native 2x faster inference.
    FastLanguageModel.for_inference(model)

# Single-turn test prompt: a BibTeX entry to convert.
messages = [
    {"role": "user", "content": "Convert to latex bibliography @article{Low_complexity_QSM,\n  serialno={25},\n  author={Z. {Yigit} and E. {Basar}},\n  year={2016},\n  title={Low-complexity detection of quadrature spatial modulation},\n  journal={\\rvtEleLett},\n  volume={52},\n  number={20},\n  pages={1729--1731},\n  doi={10.1049/el.2016.1583},\n}"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

# Build the mask from THIS cell's inputs. The cell used to pass whatever
# `attention_mask` an earlier cell had left in the kernel, which only worked
# because the prompt happened to be identical; a different prompt (or a fresh
# kernel with a reloaded model) would give a mismatched or undefined mask.
attention_mask = inputs != tokenizer.pad_token_id

# Print tokens as they are produced, without echoing the prompt.
text_streamer = CustomStreamer(tokenizer, skip_prompt=True)

# Generate with custom stopping and cleanup
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=256,
    attention_mask=attention_mask,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
    eos_token_id=eot_token_id,  # Stop generation at <|eot_id|>
    pad_token_id=tokenizer.pad_token_id
)

# Save as GGUF / llama.cpp model

In [None]:
# Export the model as GGUF into the directory "bibtex_model".
# No quantization_method is passed, so the library default is used; the original
# note on this line says that is 8-bit Q8_0 (TODO confirm against the Unsloth version in use).
model.save_pretrained_gguf("bibtex_model", tokenizer,)
# To upload instead: create a token at https://huggingface.co/settings/tokens
# and replace "hf" with your username.
# model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Alternative: 16-bit GGUF
# model.save_pretrained_gguf("bibtex_model", tokenizer, quantization_method = "f16")
# model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Alternative: q4_k_m GGUF
# model.save_pretrained_gguf("bibtex_model", tokenizer, quantization_method = "q4_k_m")
# model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Alternative: several GGUF quantizations in one call — much faster if you want multiple.
# model.push_to_hub_gguf(
#     "hf/model", # Change hf to your username!
#     tokenizer,
#     quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
#     token = "", # Get a token at https://huggingface.co/settings/tokens
# )

# Troubleshooting for quantization

In [None]:
# Run this cell only if the GGUF export fails with
#   RuntimeError: Unsloth: The file 'llama.cpp/llama-quantize' or 'llama.cpp/quantize' does not exist
# It builds llama.cpp with CMake. Afterwards copy llama.cpp/build/bin/llama-quantize
# to llama.cpp/ and run the export cell again.
# The steps are chained with `&&` so the chain stops at the first failure:
# cmake never runs outside llama.cpp if `cd` fails, and the build is skipped
# if the configure step fails.
!(cd llama.cpp && cmake -B build && cmake --build build --config Release)

# Install ollama

In [None]:
# Download the Ollama install script and pipe it directly into sh.
# curl flags: -f fail on HTTP errors, -sS silent but still show errors, -L follow redirects.
!curl -fsSL https://ollama.com/install.sh | sh
!echo "-= Done. =-"

# Install Open WebUI

In [None]:
# Install Open WebUI from PyPI.
!pip install open-webui
# NOTE(review): this installs the PyPI package named "ffmpeg". Presumably the
# ffmpeg command-line tool was intended, which is normally installed as a
# system package (e.g. via apt) — confirm what Open WebUI actually needs.
!pip install ffmpeg
!echo "-= Done. =-"

# Start ollama

In [72]:
import socket
import subprocess
import time

log_file = "ollama.log"

# Start `ollama serve` in the background with stdout and stderr written to the
# log file. The child process holds its own copy of the file descriptor, so
# leaving the `with` block does not interrupt logging. The process handle is
# kept so the server can be stopped later with ollama_proc.terminate().
with open(log_file, "w") as f:
    ollama_proc = subprocess.Popen(["ollama", "serve"], stdout=f, stderr=f)

# Wait until the server accepts connections so that the following cells
# (`ollama create`, `ollama list`) do not race the server start-up.
# 127.0.0.1:11434 is Ollama's default address; assumes OLLAMA_HOST is not overridden.
deadline = time.monotonic() + 60
while True:
    try:
        with socket.create_connection(("127.0.0.1", 11434), timeout=1):
            print("Ollama is accepting connections.")
            break
    except OSError:
        pass  # not listening yet; keep polling
    if ollama_proc.poll() is not None:
        print(f"ollama serve exited with code {ollama_proc.returncode}; see {log_file}.")
        break
    if time.monotonic() > deadline:
        print(f"Ollama did not start listening within 60 s; see {log_file}.")
        break
    time.sleep(0.5)

# Show ollama logs

In [None]:
# Print what the background `ollama serve` process has written to its log file so far.
!cat ollama.log

# Add our finetuned model

In [None]:
# Register the exported model with Ollama under the name "bibtexmodel".
# The Modelfile path is relative, like the "bibtex_model" directory the GGUF
# export cell writes to. The previous absolute path /bibtex_model/Modelfile
# only resolved when the notebook's working directory was "/".
!ollama create bibtexmodel -f bibtex_model/Modelfile
# List the models known to Ollama to confirm that the new entry exists.
!ollama list

# Install cloudflare tunnel

In [None]:
# @title Install cloudflare
# Download the latest cloudflared release as a Debian package (linux-amd64 build).
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb
# Install it, then delete the downloaded package file.
!dpkg -i cloudflared-linux-amd64.deb
!rm -rf cloudflared-linux-amd64.deb
!echo "-= Done. =-"

# Start Cloudflare tunnel and Open WebUI

In [None]:
# @title Start cloudflare tunnel and openwebui
import subprocess
import threading
import time
import socket
import urllib.request

# Local port Open WebUI is served on; the tunnel watcher started below polls the same port.
OPENWEBUI_PORT = 9999

def iframe_thread(port):
  """Wait for the local web server, then expose it through a cloudflared tunnel.

  Polls 127.0.0.1:`port` every 0.5 s until a TCP connection succeeds, then
  starts `cloudflared tunnel --url http://127.0.0.1:<port>` and prints the
  public URL as soon as a line containing "trycloudflare.com " shows up in
  cloudflared's stderr output. Meant to be run in a background thread; it
  returns only when cloudflared closes its stderr.

  Args:
    port: Local TCP port the web server listens on.
  """
  while True:
    time.sleep(0.5)
    # The context manager closes the probe socket on every path, including
    # the successful one (which previously left the socket open).
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
      if sock.connect_ex(('127.0.0.1', port)) == 0:
        break
  print("\nOpen WebUI finished loading, trying to launch cloudflared (if it gets stuck here cloudflared is having issues)\n")

  p = subprocess.Popen(
    ["cloudflared", "tunnel", "--url", "http://127.0.0.1:{}".format(port)],
    # stdout is never read here; an unread PIPE could fill up and stall cloudflared.
    stdout=subprocess.DEVNULL,
    stderr=subprocess.PIPE,
  )
  for line in p.stderr:
    log_line = line.decode()
    if "trycloudflare.com " in log_line:
      # Everything from the first "http" onwards is the public tunnel URL.
      cf_url = log_line[log_line.find("http"):]
      print("This is the URL to access Open WebUI:", cf_url, end='')

# Start the tunnel watcher in a daemon thread so it runs alongside the server command below.
threading.Thread(target=iframe_thread, daemon=True, args=(OPENWEBUI_PORT,)).start()

# IPython substitutes the Python variable OPENWEBUI_PORT into the shell command.
# NOTE(review): presumably this call blocks until the server is stopped, so the
# echo below only runs afterwards — confirm.
!open-webui serve --port $OPENWEBUI_PORT

!echo "-= Done. =-"