In [1]:
from unsloth import FastLanguageModel
from transformers import TextStreamer
from unsloth.chat_templates import get_chat_template, train_on_responses_only

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!


In [6]:
import pandas as pd
from datasets import Dataset


In [3]:

#Load in model 
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "data/qwen3-4b-lora", # YOUR MODEL YOU USED FOR TRAINING
    #max_seq_length = max_seq_length,
    #dtype = dtype,
    #load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
text_streamer = TextStreamer(tokenizer)
#_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)

==((====))==  Unsloth 2025.10.8: Fast Qwen3 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.034 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
# Data Preparation
raw_data = pd.read_csv("./data/camden_output_data.csv", index_col=None, header=1, encoding="latin1")
raw_data.dropna(inplace=True)

# we drop first column and change column names to prepare the data in the prompt-completion style
df = raw_data[["ON-LINE APPLICATION", "DECIDED-FINAL"]].rename(
    columns={"ON-LINE APPLICATION": "prompt", "DECIDED-FINAL": "completion"}
)
# immediately load the raw data into a Dataset object in the prompt-completion style
ds = Dataset.from_pandas(df)

tokenizer = get_chat_template(
    tokenizer,
    chat_template="qwen3-instruct",
)


def format_with_chat_template(example):
    # Create the message structure that the chat template expects
    dct = [
        {
            "role": "system",
            "content": "You are a trained planning officer at a local council in the UK. Change the following piece of text, which is a description of a planning application, to give it the best chance of success.",
        },
        {"role": "user", "content": example["prompt"]},
        {"role": "assistant", "content": example["completion"]},
    ]
    # Apply the chat template, returning a single formatted string
    # `tokenize=False` means we get a string, not token IDs. SFTTrainer will tokenize it.
    templated = tokenizer.apply_chat_template(
        conversation=dct,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": templated}


# Use .map() to apply the function to the entire dataset
formatted_dataset = ds.map(format_with_chat_template)

# TODO: further split out an eval ds
result = formatted_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_ds, test_ds = result["train"], result["test"]

Map: 100%|██████████| 17637/17637 [00:02<00:00, 6637.13 examples/s]


In [8]:
test_ds[6]

{'prompt': 'Alterations to existing shopfront to include installation of retractable awning, placing of tables and chairs within a 1m high timber enclosure and introduction of a ramped access to the front entrance.',
 'completion': 'Installation of retractable awning, erection of a timber enclosure and planters to forecourt and introduction of a ramped access to the front entrance (retrospective)',
 '__index_level_0__': 2262,
 'text': '<|im_start|>system\nYou are a trained planning officer at a local council in the UK. Change the following piece of text, which is a description of a planning application, to give it the best chance of success.<|im_end|>\n<|im_start|>user\nAlterations to existing shopfront to include installation of retractable awning, placing of tables and chairs within a 1m high timber enclosure and introduction of a ramped access to the front entrance.<|im_end|>\n<|im_start|>assistant\nInstallation of retractable awning, erection of a timber enclosure and planters to f

In [25]:
#6
i = 10
original_desc = test_ds[i]["prompt"]
approved_desc = test_ds[i]["completion"]
#"Alterations to existing shopfront to include installation of retractable awning, placing of tables and chairs within a 1m high timber enclosure and introduction of a ramped access to the front entrance."

print("Original description:\n", original_desc)

messages = [
    {"role" : "user", "content" : "You are a trained planning officer at a local council in the UK. Change the following piece of text, which is a description of a planning application, to give it the best chance of success." + original_desc}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
)
print("\nGenerated description:\n")
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 1000, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)


Original description:
 Single storey rear extension with basement below, and replacement outbuilding for use ancillary to main building.

Generated description:

Erection of a 

single-storey rear extension with basement below and replacement outbuilding in rear garden ancillary to main building.<|im_end|>


In [24]:
print("\nApproved description:\n", approved_desc)


Approved description:
 Installation of retractable awning, erection of a timber enclosure and planters to forecourt and introduction of a ramped access to the front entrance (retrospective)
