In [1]:
# based on https://colab.research.google.com/github/dvgodoy/FineTuningLLMs/blob/main/Chapter0.ipynb#scrollTo=3edc24ed

# !pip install transformers==4.46.2 peft==0.13.2 accelerate==1.1.1 trl==0.12.1 huggingface-hub==0.26.2 numpy==1.26.4
# !pip install datasets bitsandbytes trl dagshub pandas evaluate rouge_score bert_score mlflow


import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer


# We quantize the base model so it requires less GPU's RAM
bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.float32
)
repo_id = 'CYFRAGOVPL/Llama-PLLuM-8B-instruct'
# Load the model and pass quantization config
model = AutoModelForCausalLM.from_pretrained(repo_id,
                                             device_map="cuda:0",
                                             quantization_config=bnb_config
)

# Linear4bit in the quantized model can be used for inference but not for training
# We need to find a way to go around this
# We will use Low-Rank Adapters (LoRA)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# improves numerical stability during training
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,                   # the rank of the adapter, the lower the fewer parameters you'll need to train
    lora_alpha=16,         # multiplier, usually 2*r
    bias="none",           # BEWARE: training biases *modifies* base model's behavior
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    # Newer models, such as Phi-3 at time of writing, may require
    # manually setting target modules
    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
)

# apply the configuration to the quantized base model
model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128258, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
              (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features

In [4]:
trainable_parms, tot_parms = model.get_nb_trainable_parameters()
print(f'Trainable parameters:             {trainable_parms/1e6:.2f}M')
print(f'Total parameters:                 {tot_parms/1e6:.2f}M')
print(f'Fraction of trainable parameters: {100*trainable_parms/tot_parms:.2f}%')

Trainable parameters:             6.82M
Total parameters:                 8037.09M
Fraction of trainable parameters: 0.08%


In [5]:
from dagshub.data_engine import datasources
import pandas as pd

ds = datasources.get("informal2formal/mlflow", "synthetic_data_source")
ds.head().dataframe

Output()

Unnamed: 0,path,datapoint_id,dagshub_download_url,media type,size
0,informal_formal_synthetic_v1.csv,86087877,https://dagshub.com/api/v1/repos/informal2form...,text/plain,1221781


In [6]:
dataset_uri = ds.head().dataframe["dagshub_download_url"].values[0]
df = pd.read_csv(dataset_uri)
df.head()

Output()

Unnamed: 0,zdanie_nieformalne,zdanie_formalne,model,generator,split
0,"Nie cierpię, jak wchodzę do przymierzalni w sk...","Odczuwam dyskomfort, gdy wchodząc do przymierz...",gemini-2.5,seba,train
1,"Trzeba zakasać rękawy i zapierdalać, jeśli chc...",Konieczne jest wzmożenie wysiłków i intensyfik...,gemini-2.5,jedrek,train
2,Muszę się w końcu zabrać za naukę obsługi jaki...,Postanowiłem rozpocząć naukę obsługi jednego z...,gemini-2.5,seba,train
3,Weź się w końcu ogarnij z tym swoim ciągłym kr...,Apeluję o zmianę Pana/Pani postawy i zaprzesta...,gemini-2.5,jedrek,train
4,"Wpadnij do mnie na chwilę po pracy, musimy prz...",Zapraszam do siebie na chwilę po zakończeniu p...,gemini-2.5,jedrek,train


In [7]:
df_train = df[df['split'] == 'train'][['zdanie_nieformalne', 'zdanie_formalne']]
df_val = df[df['split'] == 'val'][['zdanie_nieformalne', 'zdanie_formalne']]
df_test = df[df['split'] == 'test'][['zdanie_nieformalne', 'zdanie_formalne']]

In [8]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)
test_dataset = Dataset.from_pandas(df_test)

train_dataset = train_dataset.select(range(750))

In [9]:
# Promp which will be used for the training and inference
# It serves the purpose of instructing the model what to do
system_message = 'Dokonaj konwersji poniższego tekstu nieformalnego na tekst w stylu formalnym z zachowaniem wszystkich informacji'

# this format is required by the training framework
def format_dataset(examples):
    def format_single(prompt, completion):
        messages = []
        messages.append({"role": "system", "content": system_message})
        messages.append({"role": "user", "content": prompt})
        messages.append({"role": "assistant", "content": completion})
        return messages

    if isinstance(examples["zdanie_nieformalne"], list):
        return {"messages": [format_single(p, c) for p, c in zip(examples["zdanie_nieformalne"], examples["zdanie_formalne"])]}

    return {"messages": format_single(examples["zdanie_nieformalne"], examples["zdanie_formalne"])}

In [10]:
train_dataset = train_dataset.map(format_dataset).remove_columns(['zdanie_nieformalne', 'zdanie_formalne'])
val_dataset = val_dataset.map(format_dataset).remove_columns(['zdanie_nieformalne', 'zdanie_formalne'])
test_dataset = test_dataset.map(format_dataset).remove_columns(['zdanie_nieformalne', 'zdanie_formalne'])

train_dataset

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Dataset({
    features: ['__index_level_0__', 'messages'],
    num_rows: 750
})

In [11]:
# load tokenizer from the base model
tokenizer = AutoTokenizer.from_pretrained(repo_id)
tokenizer.chat_template

"{%- if messages[0]['role'] == 'system' %}\n    {%- set system_message = messages[0]['content'] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n    {%- endif %}\n    {%- if message['role'] == 'user' %}\n        {%- if loop.first and system_message is defined %}\n            {{- '[INST]' + system_message + '\\n\\n' + message['content'] + '[/INST]' }}\n        {%- else %}\n            {{- '[INST]' + message['content'] + '[/INST]' }}\n        {%- endif %}\n    {%- elif message['role'] == 'assistant' %}\n        {{- message['content'] + eos_token}}\n    {%- else %}\n        {{- raise_exception('Only user and assistant roles are supported, with the exception of an

In [12]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))  # IMPORTANT: resize model's embeddings

Embedding(128258, 4096)

In [13]:
sft_config = SFTConfig(
    ## GROUP 1: Memory usage
    # These arguments will squeeze the most out of your GPU's RAM
    # Checkpointing
    gradient_checkpointing=True,
    # this saves a LOT of memory
    # Set this to avoid exceptions in newer versions of PyTorch
    gradient_checkpointing_kwargs={'use_reentrant': False},
    # Gradient Accumulation / Batch size
    # Actual batch (for updating) is same (1x) as micro-batch size
    gradient_accumulation_steps=4,
    # The initial (micro) batch size to start off with
    per_device_train_batch_size=2,
    # If batch size would cause OOM, halves its size until it works
    auto_find_batch_size=True,

    ## dataset-related
    max_seq_length=128,

    # packing a dataset means no padding is needed
    packing=False,

    # training parameters
    num_train_epochs=2,
    learning_rate=3e-4,

    # 8-bit Adam optimizer
    optim='paged_adamw_8bit',

    # Logging parameters
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./informal-to-formal-text-converter',
    report_to='mlflow'
)

In [14]:
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=train_dataset,
    eval_dataset = val_dataset,
)

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

In [15]:
dl = trainer.get_train_dataloader()
batch = next(iter(dl))

In [16]:
batch['input_ids'][0], batch['labels'][0]

(tensor([128000, 128256,     35,    564,    263,   1662,  16947,  24584,   7910,
            281,  21446,   6077,  14694,  13546,  43185,  61782,  11568,    630,
            278,  53199,   4415,  73678,    289,  49304,     84,  16287,  49221,
           1167,  97912,    363,  97408,  45927,  71876,   6179,  34478,    271,
             50,   4697,   1412,   1662,    357,    661,     11,  11568,    293,
          62151,  40611,   3625,     89,  14088,    336,    602,   7019,   1142,
             89,    289,  15593,  19699,  20811,  34227,  77910,  90941,   1910,
            648,     11,  24230,    523,   1634,     89,  12951,  39852,  21127,
           7886,     11,   1167,  10830,    561,    503,   5985,  25398,  76411,
           7886,    602,   2709,     89,  22227,  10196,   7886,     11,    293,
          62151,  40611,  45607,    336,    602,  15036,   4697,    266,     86,
            311,   1167, 111321,   5985,     13, 128257,  10835,    301,   9832,
           5267,    297,    

In [17]:
trainer.train()

Step,Training Loss
10,1.4292
20,1.0953
30,1.0104
40,0.9718
50,0.9794
60,0.9383
70,0.9465
80,0.9019
90,0.9008
100,0.8987


Step,Training Loss
10,1.4292
20,1.0953
30,1.0104
40,0.9718
50,0.9794
60,0.9383
70,0.9465
80,0.9019
90,0.9008
100,0.8987


TrainOutput(global_step=186, training_loss=0.9105605566373436, metrics={'train_runtime': 3455.2207, 'train_samples_per_second': 0.434, 'train_steps_per_second': 0.054, 'total_flos': 8229330929025024.0, 'train_loss': 0.9105605566373436, 'epoch': 1.9866666666666668})

In [18]:
def gen_prompt(tokenizer, sentence):
    messages = []
    messages.append({"role": "system", "content": system_message})
    messages.append({"role": "user", "content": sentence})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return prompt


In [19]:
def generate(model, tokenizer, prompt, max_new_tokens=128, skip_special_tokens=False):
    tokenized_input = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)

    model.eval()
    generation_output = model.generate(**tokenized_input,
                                       eos_token_id=tokenizer.eos_token_id,
                                       max_new_tokens=max_new_tokens)

    output = tokenizer.batch_decode(generation_output,
                                    skip_special_tokens=skip_special_tokens)
    return output[0].split("[/INST]")[-1].replace("<|end_of_text|>", "")

def formalize(text):
  prompt = gen_prompt(tokenizer, text)
  return generate(model, tokenizer, prompt)

In [20]:
trainer.save_model('informal-to-formal-text-converter-model')

In [21]:
evaluation_texts = [
    "Mógłbyś mi przypomnieć, jaki był deadline na oddanie tego raportu? Coś mi świta, że to było w tym tygodniu.",
    "Trzeba opierdolić tych z supportu, bo czekam na odpowiedź już trzeci dzień, a problem dalej nierozwiązany.",
    "Nie chce mi się iść do pracy w chuj dzisiaj. Mógłbyś zrobić moje taski za mnie? pls ziom, poratuj"
]

In [22]:
for text in evaluation_texts:
  print(formalize(text))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Czy mógłby Pan/Pani przypomnieć mi datę graniczną (deadline) oddania raportu? Wydaje mi się, że termin ten przypadał w tym tygodniu.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Należy skontaktować się z działem obsługi klienta i wyrazić swoje niezadowolenie z braku reakcji na zgłoszenie problemu, który pozostaje nierozwiązany od trzech dni.
Niestety, dzisiaj nie mam motywacji do pracy. Czy mógłbyś/mogłabyś zająć się moimi zadaniami za mnie? Bardzo proszę o pomoc.


In [23]:
import pandas as pd
from tqdm.notebook import tqdm
from informal_to_formal.evaluation.evaluator import Evaluator

In [24]:
def evaluate_language_model(
        pred: list[str], target: list[str]
    ) -> tuple[pd.DataFrame, dict]:
    evaluator = Evaluator(pd.DataFrame({
        "pred": pred,
        "target": target,
    }))
    evaluate_df_metrics, avg_metrics = evaluator.evaluate()
    return evaluate_df_metrics, avg_metrics

In [26]:
# limit due to Collab resources & time restrictions
TEST_DATA_LIMIT=10
test_list_pred = df_test['zdanie_nieformalne'].tolist()[:TEST_DATA_LIMIT]
test_list_target = df_test['zdanie_formalne'].tolist()[:TEST_DATA_LIMIT]

# OOM for batch predictions, insufficent Collab resources
# base_model_predictions = generate_language_model_batch_output(test_list, model, tokenizer, alpaca_prompt_pl, batch_size=4)

base_model_predictions = [
    formalize(input)
    for input in tqdm(test_list_pred)
]

  0%|          | 0/10 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [27]:
from informal_to_formal.evaluation import Evaluator


def evaluate_language_model(
        pred: list[str], target: list[str]
    ) -> tuple[pd.DataFrame, dict]:
    evaluator = Evaluator(pd.DataFrame({
        "pred": pred,
        "target": target,
    }))
    evaluate_df_metrics, avg_metrics = evaluator.evaluate()
    return evaluate_df_metrics, avg_metrics

evaluate_df_metrics, avg_metrics = evaluate_language_model(
    base_model_predictions, test_list_target
)
avg_metrics


Computing ROUGE scores:   0%|          | 0/10 [00:00<?, ?it/s][A
Computing ROUGE scores:  20%|██        | 2/10 [00:00<00:00, 11.39it/s][A
Computing ROUGE scores:  40%|████      | 4/10 [00:00<00:00,  8.34it/s][A
Computing ROUGE scores:  50%|█████     | 5/10 [00:00<00:00,  6.58it/s][A
Computing ROUGE scores:  60%|██████    | 6/10 [00:00<00:00,  6.62it/s][A
Computing ROUGE scores:  70%|███████   | 7/10 [00:01<00:00,  6.40it/s][A
Computing ROUGE scores:  80%|████████  | 8/10 [00:01<00:00,  6.54it/s][A
Computing ROUGE scores:  90%|█████████ | 9/10 [00:01<00:00,  6.36it/s][A
Computing ROUGE scores: 100%|██████████| 10/10 [00:01<00:00,  6.05it/s]

Computing BERT scores:   0%|          | 0/10 [00:00<?, ?it/s][A
Computing BERT scores:  20%|██        | 2/10 [00:04<00:19,  2.49s/it][A
Computing BERT scores: 100%|██████████| 10/10 [00:05<00:00,  1.92it/s]


{'rouge1': 0.41000718192139907,
 'rouge2': 0.23872723579235725,
 'rougeL': 0.35398275752130354,
 'bert_precision': 0.7782713532447815,
 'bert_recall': 0.7836599528789521,
 'bert_f1': 0.7807435810565948}

In [28]:
evaluate_df_metrics.sort_values(by='bert_f1')

Unnamed: 0,pred,target,rouge1,rouge2,rougeL,bert_precision,bert_recall,bert_f1
6,Ta pogoda jest niezwykle uciążliwa.,Ta pogoda powoduje u mnie ból głowy.,0.266667,0.153846,0.266667,0.72375,0.721123,0.722434
7,"Nie ukrywam, że jestem zirytowany częstymi tel...",Nachalne telefony od telemarketerów oferującyc...,0.272727,0.069767,0.25,0.720239,0.770377,0.744465
4,"Nowy program telewizyjny, w którym celebryci u...","Nowy program telewizyjny, w którym celebryci p...",0.368932,0.217822,0.291262,0.745188,0.767177,0.756023
8,"Nowa roślina owadożerna, którą posiadam, jest ...",Moja nowa roślina owadożerna okazała się fascy...,0.302326,0.142857,0.255814,0.745088,0.770327,0.757497
0,Kontaktujące się pukanie do drzwi przymierzaln...,Pukanie do drzwi przymierzalni i zadawanie pyt...,0.523077,0.253968,0.4,0.787436,0.741684,0.763876
2,Nowy serial political fiction charakteryzuje s...,Nowy serial z gatunku political fiction charak...,0.311688,0.186667,0.311688,0.75698,0.77384,0.765317
5,"Wyzwania publikowane na Facebooku, polegające ...","Wyzwania internetowe krążące na Facebooku, pol...",0.333333,0.113636,0.266667,0.767407,0.763859,0.765629
3,"Nowo zakupiony masażer do stóp, wyposażony w f...",Nowy masażer do stóp wyposażony w funkcje podg...,0.505051,0.412371,0.484848,0.806274,0.817078,0.81164
9,"Powinniśmy rozważyć zakup nowych, ergonomiczny...",Powinniśmy rozważyć inwestycję w bardziej ergo...,0.576271,0.315789,0.372881,0.821048,0.827028,0.824027
1,"Nowa funkcja w nawigacji samochodowej, informu...",Nowa funkcja w systemie nawigacji samochodowej...,0.64,0.520548,0.64,0.909303,0.884107,0.896528


In [39]:
experiment_data = tqdm(df_test['zdanie_nieformalne'].tolist()[:150])
model_predictions = [
        formalize(input)
        for input in experiment_data
    ]

  0%|          | 0/150 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

In [40]:
import mlflow
import dagshub

dagshub.init(repo_owner="informal2formal", repo_name="mlflow", mlflow=True)

run_name = "PLLuM Huggingface Trained"

mlflow.set_experiment(run_name)

with mlflow.start_run(run_name=run_name):
    mlflow.log_param("base_model_name", repo_id)
    mlflow.log_param("prompt_template", system_message)
    mlflow.log_param("dataset_uri", dataset_uri)

    # Test dataset evaluation using `Evaluator`
    evaluate_df_metrics, avg_metrics = evaluate_language_model(
        model_predictions, experiment_data
    )
    mlflow.log_metrics(avg_metrics)

mlflow.end_run()



Computing ROUGE scores:   0%|          | 0/150 [00:00<?, ?it/s][A[A

Computing ROUGE scores:   1%|▏         | 2/150 [00:00<00:12, 12.18it/s][A[A

Computing ROUGE scores:   3%|▎         | 4/150 [00:00<00:17,  8.43it/s][A[A

Computing ROUGE scores:   3%|▎         | 5/150 [00:00<00:18,  7.96it/s][A[A

Computing ROUGE scores:   4%|▍         | 6/150 [00:00<00:19,  7.55it/s][A[A

Computing ROUGE scores:   5%|▍         | 7/150 [00:00<00:23,  6.17it/s][A[A

Computing ROUGE scores:   5%|▌         | 8/150 [00:01<00:22,  6.39it/s][A[A

Computing ROUGE scores:   6%|▌         | 9/150 [00:01<00:21,  6.51it/s][A[A

Computing ROUGE scores:   7%|▋         | 10/150 [00:01<00:21,  6.60it/s][A[A

Computing ROUGE scores:   7%|▋         | 11/150 [00:01<00:20,  6.65it/s][A[A

Computing ROUGE scores:   8%|▊         | 12/150 [00:01<00:20,  6.59it/s][A[A

Computing ROUGE scores:   9%|▊         | 13/150 [00:01<00:20,  6.62it/s][A[A

Computing ROUGE scores:   9%|▉         | 14/150 [00:02

🏃 View run PLLuM Huggingface Trained at: https://dagshub.com/informal2formal/mlflow.mlflow/#/experiments/9/runs/4d7b6c9fd06a4d3f88f38de31277e6e1
🧪 View experiment at: https://dagshub.com/informal2formal/mlflow.mlflow/#/experiments/9


In [30]:
trainer.save_model('informal-to-formal-text-converter-final')

In [31]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
!cp -r informal-to-formal-text-converter-model/ drive/MyDrive/PLLuM-trained/

In [33]:
model.save_pretrained(run_name)
tokenizer.save_pretrained(run_name)

('PLLuM Huggingface Trained/tokenizer_config.json',
 'PLLuM Huggingface Trained/special_tokens_map.json',
 'PLLuM Huggingface Trained/tokenizer.json')