In [1]:
import json

# Load the training and validation records. Later cells read the 'input' and
# 'label' entries of each record. The context managers close both file
# handles as soon as parsing finishes instead of leaking them.
with open("company_cleaning_minroot_1000.json", "r") as train_fh:
    file = json.load(train_fh)
with open("company_cleaning_minroot_val_disjoint1000.json", "r") as val_fh:
    validation_file = json.load(val_fh)

In [2]:
!pip install unsloth trl peft accelerate bitsandbytes

Collecting unsloth
  Downloading unsloth-2025.10.4-py3-none-any.whl.metadata (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting unsloth_zoo>=2025.10.4 (from unsloth)
  Downloading unsloth_zoo-2025.10.4-py3-none-any.whl.metadata (31 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting datasets!=4.0.*,!=4.1.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Collecting transforme

In [3]:
# For GPU check
import torch
# Query CUDA availability once, then report it together with the name of device 0.
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else 'None'
print(f"CUDA available: {cuda_ok}")
print(f"GPU: {gpu_name}")

CUDA available: True
GPU: Tesla T4


In [4]:
from unsloth import FastLanguageModel
import torch

# Checkpoint to fine-tune (a 4-bit build, going by its name and load_in_4bit below).
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"

max_seq_length = 64  # per-sample token budget; reused when the trainer is built
dtype = None  # let Unsloth automatically detect the best precision

# Collect the loader options in one place, then load model and tokenizer together.
load_kwargs = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.4: Fast Mistral patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [5]:
from datasets import Dataset

def format_prompt(example, eos_token="<|endoftext|>"):
    """Render one record as a single training string.

    Args:
        example: Mapping with an ``'input'`` entry (the raw name) and a
            ``'label'`` entry (the target value).
        eos_token: Marker appended right after the label so the model learns
            where the answer ends. Defaults to the marker this notebook has
            always used; pass another string (e.g. ``tokenizer.eos_token``)
            when a tokenizer uses a different end-of-sequence token.

    Returns:
        The text ``### Input: <input>``, a newline, then
        ``### Output: <label as JSON><eos_token>``.
    """
    # The label is JSON-encoded, so string labels appear wrapped in double quotes.
    encoded_label = json.dumps(example['label'])
    return f"### Input: {example['input']}\n### Output: {encoded_label}{eos_token}"

# One prompt string per training record, wrapped as a single-column ("text") dataset.
formatted_data = list(map(format_prompt, file))
dataset = Dataset.from_dict({"text": formatted_data})

In [6]:
# Show the first five formatted training strings to eyeball the prompt layout.
formatted_data[:5]

['### Input: bremen logistik GmbH\n### Output: "bremen"<|endoftext|>',
 '### Input: A-B-C-transport Schweiz\n### Output: "abc"<|endoftext|>',
 '### Input: vogele logistik Norge\n### Output: "vogele"<|endoftext|>',
 '### Input: leipzig logistik Ltd\n### Output: "leipzig"<|endoftext|>',
 '### Input: stockholm logistik GmbH & Co. KG\n### Output: "stockholm"<|endoftext|>']

In [7]:
# Add LoRA adapters
# Projection layers that receive an adapter.
lora_target_modules = [
    "q_proj", "v_proj",  # the projections the LoRA paper suggests perform best
    "o_proj", "k_proj", "gate_proj", "up_proj", "down_proj",  # newer research recommends applying LoRA to all layers
]

model = FastLanguageModel.get_peft_model(
    model,
    r=8,  # LoRA rank; the LoRA paper suggests a low rank like this is sufficient
    target_modules=lora_target_modules,
    lora_alpha=16,  # LoRA scaling factor (usually 2x rank); controls the strength of the fine-tuned adjustments
    lora_dropout=0,  # dropout would zero a random fraction of LoRA activations each step as regularization; disabled because the dataset is small and clean
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized variant; described as cutting memory use by an extra 30% and supporting extremely long contexts
    random_state=12,
    use_rslora=False,  # keep the standard effective scaling of lora_alpha / r
    loftq_config=None,  # LoftQ (an advanced initialization technique) is not used
)

Unsloth 2025.10.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Training arguments optimized for Unsloth

# Decide the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise.
# The Tesla T4 used for the recorded run reports "Bfloat16 = FALSE" in the load log,
# so it takes the fp16 path; GPUs such as the A100 or L4 take the bf16 path.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # Optimization settings
    learning_rate=2e-4,  # relatively high LR; works well for LoRA because only the small adapter weights train
    optim="adamw_8bit",  # memory-efficient 8-bit AdamW optimizer
    weight_decay=0.01,  # light L2 regularization against overfitting
    lr_scheduler_type="linear",  # LR decays linearly from its peak to 0 over training

    # Warmup
    warmup_steps=10,  # ramp the LR up from 0 over the first 10 steps to stabilize early training

    # Batch
    per_device_train_batch_size=2,  # samples per device (GPU) per step
    gradient_accumulation_steps=4,  # accumulate 4 steps of gradients before each weight update
                                    # -> effective batch size = 2 * 4 = 8

    # Epochs and precision
    num_train_epochs=3,  # full passes over the training set
    fp16=not use_bf16,  # fall back to fp16 when bf16 is unavailable
    bf16=use_bf16,  # prefer bfloat16 when the GPU supports it

    # Logging
    logging_steps=25,  # log loss and metrics every 25 steps
    seed=12,

    # Saving
    output_dir="outputs",
    save_strategy="epoch",  # checkpoint at the end of every epoch
    save_total_limit=2,

    # Disable pinned memory and experiment tracking
    dataloader_pin_memory=False,
    report_to="none",  # no Weights & Biases (or other tracker) logging
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",  # dataset column holding the full text samples
    max_seq_length=max_seq_length,  # maximum number of tokens per input sample
    dataset_num_proc=2,  # CPU processes requested for parallel tokenization
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [9]:
# Train the model
# Runs the fine-tuning configured on `trainer`; whatever train() returns is kept
# in `trainer_stats` for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 3 | Total steps = 375
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 14,942,208 of 3,836,021,760 (0.39% trained)


Step,Training Loss
25,2.8314
50,1.0027
75,0.7491
100,0.6944
125,0.6529
150,0.5702
175,0.5716
200,0.5669
225,0.5694
250,0.5379


In [10]:
# Smoke test: push one hand-written name through the fine-tuned model.
prompt = "### Input: hz ro\n### Output:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Greedy decoding. temperature and top_p only apply when sampling, so with
# do_sample=False they were ignored (and only produced warnings); they are
# left out here. The generated text is unchanged.
outputs = model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=False,
)

# The decoded sequence contains the prompt followed by the model's continuation.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


### Input: hz ro
### Output: "hz"


In [11]:
# Preview up to ten validation inputs. Slicing (instead of indexing 0..9)
# also works when the validation set holds fewer than ten records.
for record in validation_file[:10]:
    print(record['input'])

ripple logistics S.A.R.L.
strata transport
pendulum logistics Schweiz
ambergris transport
andes transportes
fjordline logistik S.A.R.L.
meridian logistics Italia B.V.
jurassic-logistics
vivid-logistik KGaA
xplore logistics Sverige GmbH & Co. KG


In [21]:
# Test the fine-tuned model on every record of the validation file
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Raw model answers, in the same order as validation_file.
all_answers = []
for record in validation_file:
    original_name = record['input']

    # Same layout as the training prompts, stopping right where the label would start.
    prompt = f"### Input: {original_name}\n### Output:"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Greedy decoding. temperature/top_p are sampling-only settings that are
    # ignored when do_sample=False, so they are not passed.
    outputs = model.generate(
        **inputs,
        max_new_tokens=20,   # small limit, we expect just one word
        use_cache=True,
        do_sample=False,
    )

    # Decode only the tokens generated after the prompt, so the answer never
    # depends on splitting the echoed prompt text on a marker string.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    all_answers.append(answer)

In [22]:
# Split the validation records into two parallel lists: raw inputs and expected labels.
list_inputs = [record['input'] for record in validation_file]
list_labels = [record['label'] for record in validation_file]

In [23]:
# Sanity check: inputs, labels and model answers should all have the same length.
for column in (list_inputs, list_labels, all_answers):
    print(len(column))


1000
1000
1000


In [24]:
import pandas as pd
# Put inputs, expected labels and raw model outputs side by side, one row per record.
result_columns = {
    'Original Name': list_inputs,
    'Label': list_labels,
    'Generated Name': all_answers,
}
result_df = pd.DataFrame(result_columns)

In [26]:
def _parse_prediction(raw):
    """Turn one raw model answer into the string to compare against the label.

    Training targets were written with json.dumps, so the proper inverse is
    json.loads: it removes the surrounding quotes and also undoes any escaping.
    If the answer is not a valid JSON string, fall back to dropping the first
    and last character, which is what this cell did before.
    """
    try:
        parsed = json.loads(raw)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        parsed = None
    if isinstance(parsed, str):
        return parsed
    return raw[1:-1]


# Exact-match accuracy (in percent) of the parsed predictions against the labels.
predictions = [_parse_prediction(raw) for raw in result_df['Generated Name']]
count = sum(pred == gold for pred, gold in zip(predictions, result_df['Label']))
# Guard against an empty result frame instead of dividing by zero.
accuracy = count / len(result_df) * 100 if len(result_df) else 0.0
print(f"Accuracy: {accuracy}")

Accuracy: 97.6


In [27]:
# Persist the evaluation table to results.csv; index=False leaves out the row-index column.
result_df.to_csv('results.csv', index=False)

In [28]:
# Display the evaluation table: original name, expected label, raw generated answer.
result_df

Unnamed: 0,Original Name,Label,Generated Name
0,ripple logistics S.A.R.L.,ripple,"""ripple"""
1,strata transport,strata,"""strata"""
2,pendulum logistics Schweiz,pendulum,"""pendulum"""
3,ambergris transport,ambergris,"""ambergris"""
4,andes transportes,andes,"""andas"""
...,...,...,...
995,jasper-logistics Spain KG,jasper,"""jasper"""
996,delta freight S.A.,delta,"""delta"""
997,xanthe logistics GmbH,xanthe,"""xanthe"""
998,glacier logistik Deutschland GmbH,glacier,"""glacier"""


In [29]:
!pip install -U "protobuf==3.20.3"
!pip install -U sentencepiece packaging

Collecting protobuf==3.20.3
  Downloading protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Downloading protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/162.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.29.5
    Uninstalling protobuf-5.29.5:
      Successfully uninstalled protobuf-5.29.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-metadata 1.17.2 requires protobuf>=4.25.2; python_version >= "3.11", but you have protobuf 3.20.3 which is incompatible.
grpcio-status 1.71.2 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 3.20.3 wh



In [30]:
# Remove any existing ./llama.cpp checkout, clone the repository afresh, then
# configure and build it in Release mode with CMake.
# NOTE(review): the recorded GGUF export log still says "llama.cpp folder exists but
# binaries not found - will rebuild", so this manual build may not be what the
# export step ends up using — confirm whether this cell is needed.
!rm -rf llama.cpp && git clone https://github.com/ggerganov/llama.cpp.git && cd llama.cpp && cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -j

Cloning into 'llama.cpp'...
remote: Enumerating objects: 65036, done.[K
remote: Counting objects: 100% (156/156), done.[K
remote: Compressing objects: 100% (122/122), done.[K
remote: Total 65036 (delta 89), reused 34 (delta 34), pack-reused 64880 (from 3)[K
Receiving objects: 100% (65036/65036), 177.41 MiB | 8.87 MiB/s, done.
Resolving deltas: 100% (47260/47260), done.
Updating files: 100% (1678/1678), done.
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
[0mCMAKE_BUILD_TYPE=Release[0m
-- Found Git: /usr/bin/git (found version "2.3

In [31]:
# Export the fine-tuned model as GGUF into ./gguf_model using q4_k_m quantization.
# Per the recorded log, this merges the weights to 16-bit, converts them to an
# f16 GGUF file, and then quantizes that file.
model.save_pretrained_gguf(
    "gguf_model",
    tokenizer,
    quantization_method="q4_k_m",
)

Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 1/2 [01:25<01:25, 85.94s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [02:20<00:00, 70.14s/it]
Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [03:14<00:00, 97.38s/it]


Unsloth: Merge process complete. Saved to `/content/gguf_model`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: llama.cpp folder exists but binaries not found - will rebuild
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['phi-3-m

{'save_directory': 'gguf_model',
 'gguf_files': ['phi-3-mini-4k-instruct.Q4_K_M.gguf'],
 'modelfile_location': '/content/Modelfile',
 'want_full_precision': False,
 'is_vlm': False,
 'fix_bos_token': False}

In [32]:
from google.colab import files
import os

# Find the exported .gguf file(s) and trigger a browser download of the first one.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the choice deterministic when more than one file is present.
gguf_files = sorted(f for f in os.listdir("gguf_model") if f.endswith(".gguf"))
if gguf_files:
    gguf_file = os.path.join("gguf_model", gguf_files[0])
    print(f"Downloading: {gguf_file}")
    files.download(gguf_file)
else:
    # Report the miss instead of finishing silently with nothing downloaded.
    print("No .gguf file found in gguf_model")