In [44]:
import torch
import os
import ast
import warnings
import wandb
import pandas as pd
import gc

from datasets import load_dataset, Dataset
from peft import LoraConfig, PeftModel, AutoPeftModelForCausalLM, get_peft_model, TaskType, PeftModel
from transformers import AutoModelForCausalLM,AutoTokenizer,BitsAndBytesConfig,TrainingArguments,pipeline,DefaultDataCollator,TextStreamer,DataCollatorForSeq2Seq
from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM
from huggingface_hub import HfApi, HfFolder
from unsloth import FastLanguageModel, unsloth_train, UnslothTrainer, UnslothTrainingArguments, is_bfloat16_supported
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss

warnings.filterwarnings('ignore')
torch.cuda.empty_cache()
gc.collect() 

SEED = 42

In [45]:
HF_TOKEN = ...

In [46]:
HfFolder.save_token(HF_TOKEN)
api = HfApi(token=HF_TOKEN)

In [47]:
!wandb login ...

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/patryk/.netrc


## Data

In [48]:
data_df = pd.read_csv("stories_summaries_keywords.csv")
data_df.rename(columns={"text": "story"}, inplace=True)
data_df['story'] = data_df['story'].str.replace('\n', ' ')
data_df['story_length'] = data_df['story'].apply(len)
data_df['story_length'].describe() # decyzja: packing=True vs uzywanie data_collator

count    20000.000000
mean       894.545300
std        399.782168
min        205.000000
25%        669.000000
50%        785.000000
75%        952.250000
max       4540.000000
Name: story_length, dtype: float64

In [49]:
# data_df.head()
first_row = data_df.iloc[0]
for column, value in first_row.items():
    print(f"{column}: {value}")

story: One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.  Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."  Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.
summary: Lily, a little girl, and her mother successfully sewed a button onto Lily's shirt by sharing a sharp needle, demonstrating the value of collaboration and mutual support.
keywords_counter_top5: ['lily', 'needle', 'mom', 'shirt', 'share']
keywords_counter_top3: ['lily', 'needle', 'mo

In [50]:
INPUT_COLUMNS = ['keywords_counter_top5', 'keywords_counter_top3', 'keywords_td_idf_top5', 'keywords_td_idf_top3', 'keywords_td_idf_only_nouns_top5', 'keywords_td_idf_only_nouns_top3']

## Loading Model - ( just for constants setting )

https://docs.unsloth.ai/ \
for 2x acceleration

In [51]:
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
DTYPE = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
LOAD_IN_4BIT = True # Use 4bit quantization to reduce memory usage. Can be False.
MAX_SEQ_LENGTH = int(data_df['story_length'].max()) # !!! sus długosć ale ok !!! Supports automatic RoPE Scaling, so choose any number

In [52]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype = DTYPE,
    load_in_4bit = LOAD_IN_4BIT,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf (nwm c oto znaczy na razie)
)

==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Data Prep

In [53]:
EOS_TOKEN = tokenizer.eos_token
INSTRUCTION_TEMPLATE = "### Keywords:\n"
RESPONCE_TEMPLATE = "### Story:\n"
INPUT_COLUMN = "" # it will be set latter

In [54]:
def formatting_prompts_func(examples):
    texts = [
        f"{INSTRUCTION_TEMPLATE}{', '.join(eval(keywords))}\n{RESPONCE_TEMPLATE}{story}{EOS_TOKEN}"
        for keywords, story in zip(examples[INPUT_COLUMN], examples["story"])
    ]
    return {"text": texts}

In [55]:
# split_data will be created in train fucntion couse every input_column needs different set

## Contiuned Prepering ( training loop for variable input_column)

In [56]:
response_template_ids = tokenizer.encode(RESPONCE_TEMPLATE, add_special_tokens=False)[2:]
data_collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

# checking how collactor works

# examples = [dataset[0]['text']]
# encodings = [tokenizer(e) for e in examples]
# dataloader = DataLoader(encodings, collate_fn=data_collator, batch_size=1)
# batch = next(iter(dataloader))
# batch.keys(), batch['labels']

In [57]:
def train_and_save(input_column):

    global INPUT_COLUMN
    INPUT_COLUMN = input_column # wplywa na formatowanie promptow w formatting_prompts_func

    dataset = Dataset.from_pandas(data_df)
    dataset = dataset.map(formatting_prompts_func, batched=True)
    dataset[0]['text']
    split_dataset = dataset.train_test_split(test_size=0.2, seed=SEED)

    torch.cuda.empty_cache()
    gc.collect() 

    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    os.environ["WANDB_PROJECT"] = f"{INPUT_COLUMN}"
    os.environ["WANDB_LOG_MODEL"] = f"{INPUT_COLUMN}"

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LENGTH,
        dtype = DTYPE,
        load_in_4bit = LOAD_IN_4BIT,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf (nwm c oto znaczy na razie)
    )

    gc.collect() 

    # Do model patching and add fast LoRA weights
    model = FastLanguageModel.get_peft_model(
        model,
        r=100, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",

                        "embed_tokens", "lm_head",], # Add for continual pretraining
        lora_alpha = 32,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        use_rslora = True,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
        random_state=SEED
    )

    OUTPUTDIR = f"output_{INPUT_COLUMN}"

    training_args = UnslothTrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8,

        warmup_ratio = 0.1,
        num_train_epochs = 1,

        learning_rate = 5e-5,
        embedding_learning_rate = 5e-6,

        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.00,
        lr_scheduler_type = "cosine",
        seed = SEED,
        output_dir = OUTPUTDIR,
        report_to = "wandb",
        logging_steps = 1, # Change if needed
        # save_steps = 25 # Change if needed - to chyba do zapisywania modelu w trakcie treningu - chekpoints
        run_name = f"{INPUT_COLUMN}" # (Optional)
    )

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = split_dataset['train'],
        eval_dataset = split_dataset['test'],
        dataset_text_field = "text",
        max_seq_length = MAX_SEQ_LENGTH,
        dataset_num_proc = 8,
        args = training_args,
        data_collator = data_collator
    )

    trainer_stats = unsloth_train(trainer)

    #@title Show final memory and time stats
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory         /max_memory*100, 3)
    lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    model.save_pretrained(f"./saved_model_{INPUT_COLUMN}", safe_serialization=True)
    tokenizer.save_pretrained(f"./saved_model_{INPUT_COLUMN}")

    print("\nModel saved successfully!\n")
    wandb.finish()

In [58]:
for input_column in INPUT_COLUMNS:
    train_and_save(input_column)
    # print(input_column)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.66 GB.
3.416 GB of memory reserved.
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM
Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


Map (num_proc=8):   0%|          | 0/16000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4000 [00:00<?, ? examples/s]

Received unrecognized `WANDB_LOG_MODEL` setting value=keywords_counter_top5; so disabling `WANDB_LOG_MODEL`
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 209,920,000


Step,Training Loss
1,2.1332
2,2.0422
3,2.9526
4,1.9818
5,1.9219
6,1.6831
7,1.7656
8,1.6845
9,1.6385
10,1.7251


3295.3964 seconds used for training.
54.92 minutes used for training.
Peak reserved memory = 4.969 GB.
Peak reserved memory for training = 1.553 GB.
Peak reserved memory % of max memory = 42.616 %.
Peak reserved memory for training % of max memory = 13.319 %.

Model saved successfully!



0,1
train/epoch,▁▁▂▂▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇▁▁▁▂▂▂▃▄▄▄▄▄▆▆▆▆▇▇▇█
train/global_step,▂▂▂▃▃▃▃▄▅▅▆▆▆▆▆▆▇▇▇▇███▁▁▁▂▂▂▂▃▄▄▄▅▅▆▆▆▇
train/grad_norm,▃█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▃▄████▇▇▇▆▆▅▅▅▅▄▃▃▃▁▁▄▂▂█████▇▅▅▅▃▃▂▂▂▁
train/loss,▇▇▅█▆▆▅▆▅▄▃▃▂▃▆▃▃▂▄▂▁▃▄▄█▅▄▃▄▄▄▆▄▄▄▃▃▄▃▃

0,1
total_flos,3.5214842179977216e+16
train/epoch,1.0
train/global_step,1000.0
train/grad_norm,1.4655
train/learning_rate,0.0
train/loss,1.1287
train_loss,1.14419
train_runtime,3295.3964
train_samples_per_second,4.855
train_steps_per_second,0.303


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.66 GB.
4.969 GB of memory reserved.
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM
Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


Map (num_proc=8):   0%|          | 0/16000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4000 [00:00<?, ? examples/s]

Received unrecognized `WANDB_LOG_MODEL` setting value=keywords_counter_top3; so disabling `WANDB_LOG_MODEL`
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 209,920,000


Step,Training Loss
1,2.1459
2,2.0676
3,2.9179
4,1.9846
5,1.928
6,1.6975
7,1.7842
8,1.7016
9,1.6638
10,1.7292


3274.1879 seconds used for training.
54.57 minutes used for training.
Peak reserved memory = 4.969 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 42.616 %.
Peak reserved memory for training % of max memory = 0.0 %.

Model saved successfully!



0,1
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█
train/global_step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
train/grad_norm,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▃▄▅▆██████▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▃▃▃▃▃▃▂▁▁▁▁▁
train/loss,█▆▅▆▆▄▃▄▂▃▄▅▄▄▄▄▃▃▃▅▃▁▄▁▂▂▃▄▂▃▄▄▂▄▃▂▂▄▂▄

0,1
total_flos,3.4710241518526464e+16
train/epoch,1.0
train/global_step,1000.0
train/grad_norm,1.45055
train/learning_rate,0.0
train/loss,1.1532
train_loss,1.16634
train_runtime,3274.1879
train_samples_per_second,4.887
train_steps_per_second,0.305


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.66 GB.
4.969 GB of memory reserved.
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM
Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


Map (num_proc=8):   0%|          | 0/16000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4000 [00:00<?, ? examples/s]

Received unrecognized `WANDB_LOG_MODEL` setting value=keywords_td_idf_top5; so disabling `WANDB_LOG_MODEL`
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 209,920,000


Step,Training Loss
1,2.141
2,2.063
3,2.9516
4,1.9938
5,1.9492
6,1.7004
7,1.7819
8,1.7004
9,1.6299
10,1.7107


3308.1144 seconds used for training.
55.14 minutes used for training.
Peak reserved memory = 4.969 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 42.616 %.
Peak reserved memory for training % of max memory = 0.0 %.

Model saved successfully!



0,1
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇█
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇█████
train/grad_norm,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▄▅▇███████▇▇▇▇▆▅▅▅▅▅▅▄▄▄▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
train/loss,█▇▄▅▄▅▄▃▄▃▃▃▄▄▃▃▂▃▂▂▂▁▂▃▁▂▃▂▂▂▂▂▂▂▃▂▁▂▂▂

0,1
total_flos,3.524537088073728e+16
train/epoch,1.0
train/global_step,1000.0
train/grad_norm,1.49687
train/learning_rate,0.0
train/loss,1.1219
train_loss,1.13305
train_runtime,3308.1144
train_samples_per_second,4.837
train_steps_per_second,0.302


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.66 GB.
4.969 GB of memory reserved.
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM
Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


Map (num_proc=8):   0%|          | 0/16000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4000 [00:00<?, ? examples/s]

Received unrecognized `WANDB_LOG_MODEL` setting value=keywords_td_idf_top3; so disabling `WANDB_LOG_MODEL`
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 209,920,000


Step,Training Loss
1,2.1602
2,2.0878
3,2.9112
4,1.9932
5,1.9423
6,1.7164
7,1.8014
8,1.7121
9,1.6637
10,1.7439


3319.1542 seconds used for training.
55.32 minutes used for training.
Peak reserved memory = 4.969 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 42.616 %.
Peak reserved memory for training % of max memory = 0.0 %.

Model saved successfully!



0,1
train/epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇███
train/global_step,▁▁▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇███
train/grad_norm,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▃▃████████▇▇▇▇▇▇▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁
train/loss,█▆▄▅▅▅▄▃▂▄▅▃▄▃▃▃▃▂▃▂▄▃▃▂▂▃▃▃▁▃▂▃▃▂▃▃▄▃▃▃

0,1
total_flos,3.471656511873024e+16
train/epoch,1.0
train/global_step,1000.0
train/grad_norm,1.47019
train/learning_rate,0.0
train/loss,1.1419
train_loss,1.15861
train_runtime,3319.1542
train_samples_per_second,4.821
train_steps_per_second,0.301


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.66 GB.
4.969 GB of memory reserved.
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM
Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


Map (num_proc=8):   0%|          | 0/16000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4000 [00:00<?, ? examples/s]

Received unrecognized `WANDB_LOG_MODEL` setting value=keywords_td_idf_only_nouns_top5; so disabling `WANDB_LOG_MODEL`
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 209,920,000


Step,Training Loss
1,2.1404
2,2.0665
3,2.9697
4,1.9958
5,1.9392
6,1.7098
7,1.7942
8,1.7063
9,1.628
10,1.7108


3344.0191 seconds used for training.
55.73 minutes used for training.
Peak reserved memory = 4.969 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 42.616 %.
Peak reserved memory for training % of max memory = 0.0 %.

Model saved successfully!



0,1
train/epoch,▁▁▁▁▁▂▂▂▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇████
train/grad_norm,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▅█████▇▇▇▇▇▇▇▇▇▇▆▆▆▅▅▅▅▅▄▃▃▃▃▂▂▂▂▁▁▁▁▁▁
train/loss,▆██▅▂▆▄▂▃▅▃▄▃▄▃▃▂▂▁▅▄▆▄▃▅▁▄▂▁▁▃▆▂▃▃▃▁▃▂▄

0,1
total_flos,3.524358838940467e+16
train/epoch,1.0
train/global_step,1000.0
train/grad_norm,1.4724
train/learning_rate,0.0
train/loss,1.1296
train_loss,1.14994
train_runtime,3344.0191
train_samples_per_second,4.785
train_steps_per_second,0.299


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.66 GB.
4.969 GB of memory reserved.
==((====))==  Unsloth 2025.1.5: Fast Llama patching. Transformers: 4.49.0.dev0.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.66 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM
Unsloth: Training embed_tokens in mixed precision to save VRAM
Unsloth: Training lm_head in mixed precision to save VRAM


Map (num_proc=8):   0%|          | 0/16000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/4000 [00:00<?, ? examples/s]

Received unrecognized `WANDB_LOG_MODEL` setting value=keywords_td_idf_only_nouns_top3; so disabling `WANDB_LOG_MODEL`
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 16,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 8
\        /    Total batch size = 16 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 209,920,000


Step,Training Loss
1,2.1606
2,2.0845
3,2.9473
4,1.9976
5,1.9454
6,1.715
7,1.8
8,1.7113
9,1.6505
10,1.7286


3378.8681 seconds used for training.
56.31 minutes used for training.
Peak reserved memory = 4.969 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 42.616 %.
Peak reserved memory for training % of max memory = 0.0 %.

Model saved successfully!



0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇██
train/grad_norm,█▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▃▇█████▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁
train/loss,█▅▅▅▄▃▅▄▃▃▂▅▄▄▅▄▂▁▃▄▁▂▂▂▃▂▂▅▃▂▂▃▃▁▃▃▃▃▃▃

0,1
total_flos,3.471253329309696e+16
train/epoch,1.0
train/global_step,1000.0
train/grad_norm,1.46018
train/learning_rate,0.0
train/loss,1.1467
train_loss,1.16838
train_runtime,3378.8681
train_samples_per_second,4.735
train_steps_per_second,0.296


### SimCTG - Contrastive Training Loss - to avoid repetition in generations

encourage to learn discriminative and isotropic token representations \
https://arxiv.org/pdf/2202.06417 \
NOT USED, mayby in the future