In [None]:
# based on https://colab.research.google.com/github/dvgodoy/FineTuningLLMs/blob/main/Chapter0.ipynb#scrollTo=3edc24ed

# !pip install transformers==4.46.2 peft==0.13.2 accelerate==1.1.1 trl==0.12.1 bitsandbytes==0.45.2 datasets==3.1.0 huggingface-hub==0.26.2 safetensors==0.4.5 pandas==2.2.2 matplotlib==3.8.0 numpy==1.26.4
# !pip install datasets bitsandbytes trl


import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer


# Hugging Face Hub id of the base checkpoint that will be fine-tuned.
repo_id = 'CYFRAGOVPL/Llama-PLLuM-8B-instruct'

# 4-bit NF4 quantization with double quantization enabled; float32 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float32,
)

# Load the checkpoint with the quantization config, placed on the first CUDA device.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    device_map="cuda:0",
    quantization_config=bnb_config,
)

# Memory footprint of the loaded model, scaled down by 1e6 for readability.
print(model.get_memory_footprint()/1e6)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

5591.580928


In [None]:
# Prepare the quantized base model for adapter training (PEFT helper).
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,                   # the rank of the adapter, the lower the fewer parameters you'll need to train
    lora_alpha=16,         # multiplier, usually 2*r
    bias="none",           # BEWARE: training biases *modifies* base model's behavior
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    # Llama-architecture checkpoints name their linear layers
    # q_proj / k_proj / v_proj / o_proj (attention) and
    # gate_proj / up_proj / down_proj (MLP).
    # The fused 'qkv_proj' and 'gate_up_proj' names are Phi-3 specific and match
    # nothing in this model, so using them left only o_proj and down_proj adapted.
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj',
                    'gate_proj', 'up_proj', 'down_proj'],
)

# Wrap the base model with the LoRA adapters described by `config`.
model = get_peft_model(model, config)

# Memory footprint after wrapping, scaled down by 1e6 for readability.
print(model.get_memory_footprint()/1e6)

7720.755456


In [None]:
# Report how many parameters are trainable versus the total parameter count.
trainable_parms, tot_parms = model.get_nb_trainable_parameters()
trainable_pct = 100*trainable_parms/tot_parms
print(
    f'Trainable parameters:             {trainable_parms/1e6:.2f}M',
    f'Total parameters:                 {tot_parms/1e6:.2f}M',
    f'Fraction of trainable parameters: {trainable_pct:.2f}%',
    sep='\n',
)

Trainable parameters:             6.82M
Total parameters:                 8037.09M
Fraction of trainable parameters: 0.08%


In [None]:
# Read the informal/formal sentence pairs from CSV and keep only the first 5 rows.
dataset = load_dataset(
    'csv',
    data_files='sample_data/informal_formal_v1.csv',
    split='train',
).select(range(5))

In [None]:
# Inspect the first raw record to see the columns loaded from the CSV.
dataset[0]

{'zdanie_nieformalne': 'Siema, co tam słychać?',
 'zdanie_formalne': 'Dzień dobry, jak się Pan/Pani miewa?',
 'model': 'gemini-2.5',
 'generator': 'seba'}

In [None]:
# Convert to the "instruction" format: keep only `prompt` and `completion` columns.
dataset = (
    dataset
    .rename_column("zdanie_nieformalne", "prompt")
    .rename_column("zdanie_formalne", "completion")
    .remove_columns(["model", "generator"])
)
dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 5
})

In [None]:
# Inspect the first record again to check the prompt/completion columns.
dataset[0]

{'prompt': 'Siema, co tam słychać?',
 'completion': 'Dzień dobry, jak się Pan/Pani miewa?'}

In [None]:
# Build a two-turn sample conversation (user prompt, assistant completion)
# from the first dataset row.
first_example = dataset[0]
messages = [
    {"role": "user", "content": first_example['prompt']},
    {"role": "assistant", "content": first_example['completion']},
]
messages

[{'role': 'user', 'content': 'Siema, co tam słychać?'},
 {'role': 'assistant', 'content': 'Dzień dobry, jak się Pan/Pani miewa?'}]

IMPORTANT UPDATE: unfortunately, in more recent versions of the trl library, the "instruction" format is not properly supported anymore, thus leading to the chat template not being applied to the dataset. In order to avoid this issue, we can convert the dataset to the "conversational" format.

In [None]:
# Instruction (in Polish) prepended to every informal sentence in the user turn.
command = 'Dokonaj konwersji poniższego tekstu nieformalnego na tekst w stylu formalnym z zachowaniem wszystkich informacji'

def format_dataset(examples):
    """Convert prompt/completion data into the conversational ``messages`` format.

    Args:
        examples: mapping with ``"prompt"`` and ``"completion"`` entries. Each
            entry is either a single string (one example) or a list of strings
            (a batch, as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        ``{'messages': ...}`` where the value is one conversation (a list with a
        user turn and an assistant turn) for a single example, or a list of
        such conversations for a batch.
    """
    def to_conversation(prompt, completion):
        # One user turn carrying the instruction plus the sentence,
        # followed by the expected assistant answer.
        return [
            {"role": "user", "content": f"{command}: {prompt}"},
            {"role": "assistant", "content": completion},
        ]

    prompts = examples["prompt"]
    completions = examples["completion"]

    if isinstance(prompts, list):
        # Batched input: pair each prompt with its own completion. Interpolating
        # the whole prompt list into every user message would be wrong.
        return {'messages': [to_conversation(p, c)
                             for p, c in zip(prompts, completions)]}

    return {'messages': to_conversation(prompts, completions)}

In [None]:
# Add the conversational `messages` column (one example per call),
# then drop the source columns so only `messages` remains.
dataset = dataset.map(format_dataset)
dataset = dataset.remove_columns(['prompt', 'completion'])

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [None]:
# Load the tokenizer that belongs to the base checkpoint and display its chat template.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
tokenizer.chat_template

"{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- '[INST]' + system_message + '\\n\\n' + message['content'] + '[/INST]' }}\n        {%- else %}\n            {{- '[INST]' + message['content'] + '[/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- message['content'] + eos_token}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an

In [None]:
# Render the sample conversation through the chat template as plain text (no tokenization).
print(tokenizer.apply_chat_template(messages, tokenize=False))

<|begin_of_text|>[INST]Siema, co tam słychać?[/INST]Dzień dobry, jak się Pan/Pani miewa?<|end_of_text|>


In [None]:
# Alternative kept for reference: reuse the tokenizer's unknown token as padding.
# tokenizer.pad_token = tokenizer.unk_token
# tokenizer.pad_token_id = tokenizer.unk_token_id

# Register a dedicated '[PAD]' special token on the tokenizer, then resize the
# model's embeddings to the tokenizer's new length so the new id has a row.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))  # IMPORTANT: resize model's embeddings

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(128259, 4096)

In [None]:
sft_config = SFTConfig(
    ## GROUP 1: Memory usage
    # These arguments will squeeze the most out of your GPU's RAM
    # Checkpointing
    gradient_checkpointing=True,
    # this saves a LOT of memory
    # Set this to avoid exceptions in newer versions of PyTorch
    gradient_checkpointing_kwargs={'use_reentrant': False},
    # Gradient Accumulation / Batch size
    # Actual batch (for updating) is same (1x) as micro-batch size
    gradient_accumulation_steps=1,
    # The initial (micro) batch size to start off with
    per_device_train_batch_size=16,
    # If batch size would cause OOM, halves its size until it works
    auto_find_batch_size=True,

    ## GROUP 2: Dataset-related
    # The instruction prefix alone takes roughly 50 tokens, so a 64-token limit
    # truncated the assistant answer (and its EOS token) out of the training
    # examples. Leave enough room for prompt + completion.
    max_seq_length=256,
    # Dataset
    # Packing would remove the need for padding, but it is disabled for now.
    # Switched from True to test something. TODO: switch back and fix.
    packing=False,

    ## GROUP 3: These are typical training parameters
    num_train_epochs=10,
    learning_rate=3e-4,
    # Optimizer
    # 8-bit Adam optimizer - doesn't help much if you're using LoRA!
    optim='paged_adamw_8bit',

    ## GROUP 4: Logging parameters
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./informal-to-formal-text-converter',
    report_to='none'
)

In [None]:
# Assemble the supervised fine-tuning trainer from the model, the training
# configuration, the conversational dataset and the tokenizer.
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset,
    processing_class=tokenizer,
)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [None]:
# Pull one batch from the trainer's dataloader to inspect what the model will be fed.
dl = trainer.get_train_dataloader()
batch = next(iter(dl))

In [None]:
# Show the token ids and the labels of the first sequence in the batch side by side.
batch['input_ids'][0], batch['labels'][0]

(tensor([128000, 128256,     35,    564,    263,   1662,  16947,  24584,   7910,
            281,  21446,   6077,  14694,  13546,  43185,  61782,  11568,    630,
            278,  53199,   4415,  73678,    289,  49304,     84,  16287,  49221,
           1167,  97912,    363,  97408,  45927,  71876,   6179,  34478,     25,
            423,   1662,   1167,   3458,   7886,     11,  19958,   1080,   7545,
          41908,     89,  50906,    289,   1142,  70217,     13, 128257,   1360,
          14694,   5267,    297,   6179,    582,  36410,     11,  77034,  53144,
          45576], device='cuda:0'),
 tensor([128000, 128256,     35,    564,    263,   1662,  16947,  24584,   7910,
            281,  21446,   6077,  14694,  13546,  43185,  61782,  11568,    630,
            278,  53199,   4415,  73678,    289,  49304,     84,  16287,  49221,
           1167,  97912,    363,  97408,  45927,  71876,   6179,  34478,     25,
            423,   1662,   1167,   3458,   7886,     11,  19958,   1080, 

In [None]:
# Start fine-tuning; the returned value (step count, loss, metrics) is displayed as the cell output.
trainer.train()

Step,Training Loss
10,1.2343




TrainOutput(global_step=10, training_loss=1.2343281745910644, metrics={'train_runtime': 183.0227, 'train_samples_per_second': 0.273, 'train_steps_per_second': 0.055, 'total_flos': 144225651916800.0, 'train_loss': 1.2343281745910644, 'epoch': 10.0})

In [None]:
def gen_prompt(tokenizer, sentence):
    """Build the text prompt for converting ``sentence`` to a formal style.

    The sentence is prefixed with the module-level ``command`` instruction,
    placed in a single user turn and rendered with the tokenizer's chat
    template (as text, with the generation prompt requested).

    Args:
        tokenizer: tokenizer exposing ``apply_chat_template``.
        sentence: the informal sentence to convert.

    Returns:
        The rendered prompt string.
    """
    chat = [{"role": "user", "content": f"{command}: {sentence}"}]
    return tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

In [None]:
# Sample informal (slang) sentence used to try out the fine-tuned model.
sentence = 'Nie chce mi się iść do pracy w chuj dzisiaj. Mógłbyś zrobić moje taski za mnie? pls ziom, poratuj'
# Wrap it in the instruction + chat template and show the resulting prompt text.
prompt = gen_prompt(tokenizer, sentence)
print(prompt)

<|begin_of_text|>[INST]Dokonaj konwersji poniższego tekstu nieformalnego na tekst w stylu formalnym z zachowaniem wszystkich informacji: Nie chce mi się iść do pracy w chuj dzisiaj. Mógłbyś zrobić moje taski za mnie? pls ziom, poratuj[/INST]


In [None]:
def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):
    """Generate a continuation of ``prompt`` and return it as decoded text.

    Args:
        model: causal LM exposing ``device``, ``eval()`` and ``generate()``.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        prompt: already-templated prompt text; no special tokens are added
            during encoding.
        max_new_tokens: upper bound on the number of generated tokens.
        skip_special_tokens: whether special tokens are dropped when decoding.

    Returns:
        The decoded text of the first (only) generated sequence.
    """
    tokenized_input = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)

    # Pass the padding id explicitly so generate() does not have to guess it
    # (and warn about it); fall back to EOS when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    model.eval()
    generation_output = model.generate(**tokenized_input,
                                       eos_token_id=tokenizer.eos_token_id,
                                       pad_token_id=pad_token_id,
                                       max_new_tokens=max_new_tokens)

    output = tokenizer.batch_decode(generation_output,
                                    skip_special_tokens=skip_special_tokens)
    return output[0]

In [None]:
# Run the fine-tuned model on the sample prompt and print the full decoded sequence.
print(generate(model, tokenizer, prompt))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<|begin_of_text|>[INST]Dokonaj konwersji poniższego tekstu nieformalnego na tekst w stylu formalnym z zachowaniem wszystkich informacji: Nie chce mi się iść do pracy w chuj dzisiaj. Mógłbyś zrobić moje taski za mnie? pls ziom, poratuj[/INST]Nie mam dziś siły iść do pracy. Mógłbyś zrobić moje zadania za mnie? Proszę, pomóż mi.<|end_of_text|>


In [None]:
# Save the trained model to the local 'informal-to-formal-text-converter' directory.
trainer.save_model('informal-to-formal-text-converter')

