In [None]:
import json

# Load the training and validation records. Each record is later read through
# its 'input' and 'label' keys. The `with` blocks close each file handle as
# soon as its JSON payload has been parsed, and the explicit UTF-8 encoding
# keeps non-ASCII text from depending on the platform's default locale.
with open("company_cleaning_minroot_1000.json", "r", encoding="utf-8") as train_fh:
    file = json.load(train_fh)
with open("company_cleaning_minroot_val_disjoint1000.json", "r", encoding="utf-8") as val_fh:
    validation_file = json.load(val_fh)

In [None]:
# Install the fine-tuning stack. `%pip` installs into the environment of the
# running kernel, whereas `!pip` uses whichever pip is first on the shell PATH.
# NOTE(review): versions are unpinned, so a later run may resolve different releases.
%pip install unsloth trl peft accelerate bitsandbytes

In [None]:
# Report whether a CUDA device is visible and, if so, the name of device 0.
import torch

has_cuda = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_cuda else 'None'
print(f"CUDA available: {has_cuda}")
print(f"GPU: {gpu_name}")

In [None]:
from unsloth import FastLanguageModel
import torch

# Checkpoint to load and the options it is loaded with.
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
max_seq_length = 64  # also passed to the trainer as the per-sample token limit
dtype = None  # None leaves the precision choice to Unsloth

# Fetch the 4-bit model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name, max_seq_length=max_seq_length, dtype=dtype, load_in_4bit=True
)

In [None]:
from datasets import Dataset

def format_prompt(example, eos_token="<|endoftext|>"):
    """Render one record as a single training string.

    Args:
        example: Mapping with an ``'input'`` entry (the raw text) and a
            ``'label'`` entry (the target). The label is JSON-encoded, so a
            string label appears wrapped in double quotes.
        eos_token: End-of-sequence marker appended after the label. The
            default is the literal this notebook has always used; pass
            ``tokenizer.eos_token`` to follow the loaded tokenizer instead.

    Returns:
        A string made of ``### Input: <input>``, a newline, then
        ``### Output: <json label>`` immediately followed by ``eos_token``.

    Raises:
        KeyError: If ``example`` lacks ``'input'`` or ``'label'``.
    """
    encoded_label = json.dumps(example['label'])
    return f"### Input: {example['input']}\n### Output: {encoded_label}{eos_token}"

# Render every training record, then wrap the strings in a one-column Dataset.
formatted_data = list(map(format_prompt, file))
dataset = Dataset.from_dict({"text": formatted_data})

In [None]:
# Preview the first five formatted training strings.
formatted_data[:5]

In [None]:
# Wrap the base model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    # LoRA rank; the original author notes the LoRA paper found this sufficient.
    r=8,
    target_modules=[
        # The author cites the LoRA paper for adapters on these two projections.
        "q_proj",
        "v_proj",
        # Added because, per the author, newer research recommends applying
        # LoRA to all layers.
        "o_proj",
        "k_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # LoRA scaling factor, here 2x the rank; controls how strongly the
    # fine-tuned adjustments are applied.
    lora_alpha=16,
    # Dropout on the LoRA activations is a regularizer against overfitting;
    # it is switched off here because the dataset is small and clean.
    lora_dropout=0,
    # Unsloth's own gradient-checkpointing variant; the author cites roughly
    # 30% extra memory savings and support for very long contexts.
    use_gradient_checkpointing="unsloth",
    random_state=12,
    # Keep the standard effective scaling of lora_alpha / r.
    use_rslora=False,
    # LoftQ (an advanced technique) is not used.
    loftq_config=None,
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Decide the mixed-precision mode once: bf16 when torch reports support for
# it on the current GPU, fp16 otherwise.
bf16_supported = torch.cuda.is_bf16_supported()

# Hyperparameters for the run, grouped by concern.
training_args = TrainingArguments(
    # --- Output and checkpoints ---
    output_dir="outputs",
    save_strategy="epoch",  # checkpoint at the end of every epoch
    save_total_limit=2,

    # --- Schedule ---
    num_train_epochs=3,  # full passes over the training set
    per_device_train_batch_size=2,
    # 2 samples per step x 4 accumulated steps = 8 samples per weight update
    # on each device.
    gradient_accumulation_steps=4,
    warmup_steps=10,  # ramp the LR up over the first 10 steps to stabilize training

    # --- Optimizer ---
    optim="adamw_8bit",  # memory-efficient 8-bit AdamW
    learning_rate=2e-4,  # relatively high; only the small adapter weights are trained
    weight_decay=0.01,  # light regularization against overfitting
    lr_scheduler_type="linear",  # LR decays linearly towards 0 over training

    # --- Precision ---
    bf16=bf16_supported,
    fp16=not bf16_supported,

    # --- Logging and miscellany ---
    logging_steps=25,  # report loss and metrics every 25 steps
    report_to="none",  # no experiment-tracking integration (e.g. Weights & Biases)
    dataloader_pin_memory=False,
    seed=12,
)

# Supervised fine-tuning trainer over the "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # column holding the full text samples
    max_seq_length=max_seq_length,  # token limit per sample
    dataset_num_proc=2,  # CPU processes used for tokenization
    args=training_args,
)

In [None]:
# Run the fine-tuning loop; whatever trainer.train() returns is kept in
# trainer_stats for later inspection.
trainer_stats = trainer.train()

In [None]:
# Smoke-test the fine-tuned model on one hand-written input.
# Switch Unsloth to inference mode before generating (the call is safe to
# repeat later in the notebook).
FastLanguageModel.for_inference(model)

prompt = "### Input: hz ro\n### Output:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Greedy decoding: with do_sample=False the sampling knobs (temperature,
# top_p) are not used, so they are left out instead of being passed with
# values that contradict the decoding mode.
outputs = model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=False,
)

# The decoded text contains the prompt followed by the generated continuation.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


In [None]:
# Preview up to the first ten validation inputs. Slicing keeps this safe when
# the validation set holds fewer than ten records.
for record in validation_file[:10]:
    print(record['input'])

In [None]:
# Run the fine-tuned model over every validation record and collect the text
# it produces after the "### Output:" marker.
FastLanguageModel.for_inference(model)  # switch Unsloth to inference mode

all_answers = []
for record in validation_file:
    original_name = record['input']
    prompt = f"### Input: {original_name}\n### Output:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Greedy decoding: temperature/top_p only apply when sampling, so they are
    # left out instead of being passed alongside do_sample=False.
    outputs = model.generate(
        **inputs,
        max_new_tokens=20,  # small limit, a short answer is expected
        use_cache=True,
        do_sample=False,
    )

    # The decoded sequence still contains the prompt; keep only what follows
    # the last "### Output:" marker.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = response.split("### Output:")[-1].strip()
    all_answers.append(answer)

In [None]:
# Split the validation records into parallel lists of inputs and gold labels.
list_inputs = [record['input'] for record in validation_file]
list_labels = [record['label'] for record in validation_file]

In [None]:
# Sanity check: print the three lengths (inputs, labels, generated answers),
# which should all agree.
for column_values in (list_inputs, list_labels, all_answers):
    print(len(column_values))


In [None]:
import pandas as pd
# Put inputs, gold labels and model outputs side by side, one row per record.
result_columns = {
    'Original Name': list_inputs,
    'Label': list_labels,
    'Generated Name': all_answers,
}
result_df = pd.DataFrame(result_columns)

In [None]:
def _decode_prediction(raw_answer):
    """Turn one generated answer back into the value it encodes.

    Training prompts encode each label with ``json.dumps``, so the matching
    inverse is ``json.loads``; that also undoes escapes such as ``\\"`` or
    ``\\uXXXX`` which plain quote-stripping would leave in place.

    Args:
        raw_answer: Text the model produced after the output marker.

    Returns:
        The decoded JSON value when ``raw_answer`` is valid JSON. Otherwise
        the string with its first and last character removed (the previous
        quote-stripping behavior), or ``raw_answer`` unchanged if it is not
        a string.
    """
    try:
        return json.loads(raw_answer)
    except (TypeError, ValueError):
        return raw_answer[1:-1] if isinstance(raw_answer, str) else raw_answer


# Exact-match accuracy (in percent) of decoded predictions against the labels.
count = 0
for correct_name, raw_answer in zip(result_df['Label'], result_df['Generated Name']):
    if correct_name == _decode_prediction(raw_answer):
        count += 1

# Guard against an empty results table instead of dividing by zero.
accuracy = count / len(result_df) * 100 if len(result_df) else 0.0
print(f"Accuracy: {accuracy}")

In [None]:
# Write the comparison table to results.csv in the working directory,
# leaving out the index column.
result_df.to_csv(path_or_buf='results.csv', index=False)

In [None]:
# Show the results table as the cell's output.
result_df

In [None]:
# Pin protobuf and refresh sentencepiece/packaging. `%pip` installs into the
# environment of the running kernel, whereas `!pip` uses whichever pip is
# first on the shell PATH.
# NOTE(review): presumably prerequisites for the GGUF export later in the notebook — confirm.
%pip install -U "protobuf==3.20.3"
%pip install -U sentencepiece packaging

In [None]:
# Start from a clean checkout: delete any existing llama.cpp directory, clone
# the repository, configure a Release build with CMake and build it in parallel.
# NOTE(review): presumably required by the GGUF export that follows — confirm.
!rm -rf llama.cpp && git clone https://github.com/ggerganov/llama.cpp.git && cd llama.cpp && cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j

In [None]:
# Export the model and tokenizer in GGUF format under "gguf_model", using the
# q4_k_m quantization method.
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method="q4_k_m")

In [None]:
from google.colab import files
import os

# Collect the exported GGUF files in a stable, sorted order; os.listdir itself
# returns entries in arbitrary order. A missing directory is treated like an
# empty one so the cell reports the problem instead of raising.
gguf_dir = "gguf_model"
gguf_files = sorted(
    f for f in (os.listdir(gguf_dir) if os.path.isdir(gguf_dir) else []) if f.endswith(".gguf")
)
if gguf_files:
    # Prefer a file whose name mentions the q4_k_m quantization requested at
    # export time; otherwise fall back to the first file in sorted order.
    quantized_files = [f for f in gguf_files if "q4_k_m" in f.lower()]
    gguf_file = os.path.join(gguf_dir, (quantized_files or gguf_files)[0])
    print(f"Downloading: {gguf_file}")
    files.download(gguf_file)
else:
    print(f"No .gguf file found in {gguf_dir}; nothing to download.")