In [1]:
%%capture
# Environment setup. %%capture (which has to stay on the first line of the cell)
# silences the pip output produced below.
!pip install unsloth "xformers==0.0.28.post2"
# Also get the latest nightly Unsloth!
# (removes the release installed above and reinstalls straight from the GitHub repo)
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install Flash Attention 2 for softcapping support
# Only attempted when the GPU reports a compute-capability major version >= 8.
# In the recorded run the GPU is a Tesla T4 (capability 7.5), so this branch is
# skipped and the Unsloth banner reports FA2 = False.
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

In [2]:
from unsloth import FastLanguageModel
import torch
# Model-loading settings. `max_seq_length` is reused later when the trainer is built.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the `unsloth/gemma-2-2b-bnb-4bit` checkpoint and its tokenizer using the
# settings above. Both returned objects are used by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-2b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.11.7: Fast Gemma2 patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu124. CUDA = 7.5. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [3]:
# Wrap the loaded model for LoRA fine-tuning on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
# NOTE: this rebinds `model`, so the cell is not idempotent -- re-run the
# model-loading cell first if you need to execute it again.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # fixed seed; the trainer below uses the same value
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.7 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


The file `book_summaries.csv` must be present in the notebook's working directory before running the cells below.


In [4]:
# Quietly install the `datasets` package, which the next cell imports `load_dataset` and `Dataset` from.
! pip install -q datasets

In [6]:
from datasets import load_dataset, Dataset
import pandas as pd

# Load the first 100 texts of the dataset and the first 100 rows of the
# summaries CSV, each as a DataFrame with a fresh 0..n-1 index.
ds = load_dataset("xguman/hw5_text_dataset", split = "train[:100]")
ds_df = ds.to_pandas().reset_index(drop=True)
summaries_df = pd.read_csv("book_summaries.csv", nrows=100).reset_index(drop=True)

# The two frames are joined purely by row position, so a row-count mismatch
# would silently pad the shorter side with NaN and produce training examples
# whose summary/facts/characters are "nan". Fail loudly instead.
# Assumes row i of book_summaries.csv describes row i of the dataset -- TODO confirm.
assert len(ds_df) == len(summaries_df), (
    f"row count mismatch: dataset {ds_df.shape} vs summaries {summaries_df.shape}"
)

# Side-by-side join: dataset columns first, summary columns after.
combined_df = pd.concat([ds_df, summaries_df], axis=1)

combined_df.head()

Unnamed: 0,text,file_name,summary,important_facts,main_characters
0,"ÚvodSlovensko, ty posvätné dedictvo našich dra...",Urbanek_Kysuca.html.txt,The text reflects on the historical significa...,['The history of Slovakia is marked by the bra...,"['Saints Cyril', 'Saints Methodius', 'The hist..."
1,Krpčeky sv. FloriánaVeselohra v troch dejstvác...,Nadasi-Jege_Krpceky-sv-Floriana.html.txt,"""Krpčeky sv. Floriána"" is a comedic play in t...","[""The play is structured in three acts, highli...","['Andrej Jedlinský - the landowner', 'Cilka - ..."
2,Hlava I. Škola v Jelšave(Caput I. De Schola Al...,Rezik_Gymnaziologia-3-diel-Skoly-malomestske.h...,"The book ""Hlava I. Škola v Jelšave"" discusses...",['The inhabitants of Jelšava were committed to...,['Juraj Fabricius - A prominent rector who sig...
3,Nášmu poetovi![1]Pavol Országh-Hviezdoslav svä...,Vajansky_Z-dejin-literatury-4.html.txt,The text celebrates the 50th birthday of the ...,['Pavol Országh Hviezdoslav has significantly ...,"['Pavol Országh Hviezdoslav (the poet)', 'The ..."
4,Ohlas srbskej piesne[1][2]Pozerá sa Belhrad zá...,Botto_Ohlas-srbskej-piesne.html.txt,"""Ohlas srbskej piesne"" is a poetic narrative ...",['The poem references the historical figure Du...,"['The Young Serbian Man (Šuhaj Srbín)', 'Water..."


The prompt takes one book text as input; the expected response consists of a Summary, Important Facts, and Main Characters.

In [11]:
# Instruction-style template with two positional `{}` slots: the first receives
# the book text (### Input), the second the target response (### Response).
# At inference time the second slot is filled with an empty string so that the
# model generates the response itself.
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the following text and create summary, extract the important facts, and extract main characters.

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the tokenizer; format_prompt appends it to
# every training example (presumably so the model learns where a response ends).
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKENS

def format_prompt(combined_df, max_chars=10000, template=None, eos_token=None):
    """Build one training string per row of ``combined_df``.

    Each string is the prompt template filled with the (character-truncated)
    book text and a response made of the row's summary, important facts and
    main characters, followed by the end-of-sequence token.

    Args:
        combined_df: DataFrame with the columns ``text``, ``summary``,
            ``important_facts`` and ``main_characters``.
        max_chars: Maximum number of characters kept from each book text.
        template: Format string with two positional ``{}`` slots (input,
            response). Defaults to the notebook-level ``prompt``.
        eos_token: String appended to every example. Defaults to the
            notebook-level ``EOS_TOKEN``.

    Returns:
        ``{"text": [...]}`` with one formatted string per input row, ready for
        ``Dataset.from_dict``.
    """
    if template is None:
        template = prompt
    if eos_token is None:
        eos_token = EOS_TOKEN

    # `.str[:max_chars]` truncates every individual text. Slicing the Series
    # itself (`combined_df['text'][:max_chars]`) would select the first
    # `max_chars` ROWS instead and leave the texts at full length.
    # NOTE(review): the trainer additionally truncates to `max_seq_length`
    # tokens; if `max_chars` characters exceed that budget the response part
    # may be cut off -- consider lowering `max_chars`.
    books = combined_df['text'].astype(str).str[:max_chars]

    texts = []
    for book, summary, important_fact, main_character in zip(
        books,
        combined_df['summary'],
        combined_df['important_facts'],
        combined_df['main_characters'],
    ):
        response = f"Summary:\n{summary}\n\nImportant Facts:\n{important_fact}\n\nMain Characters:\n{main_character}"
        texts.append(template.format(book, response) + eos_token)
    return {"text": texts}
# Format every row of `combined_df` and wrap the resulting {"text": [...]} dict
# in a `Dataset`; the trainer later reads the "text" column.
formatted_data = format_prompt(combined_df)
training_dataset = Dataset.from_dict(formatted_data)

In [12]:
training_dataset[1]

{'text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nSummarize the following text and create summary, extract the important facts, and extract main characters.\n\n### Input:\nKrpčeky sv. FloriánaVeselohra v troch dejstváchMotto:Väčšine ľudí sa mozog vyvíja len do pätnásteho roku; títo potom celý život hrajú komédiu vážnych, dospelých ľudí s rozumom pätnásťročných detí.Anglický prírodovedecOsobyAndrej Jedlinský, statkárCilka, jeho ženaOľga, jeho dcéraElena TomčováDoktor Čvok, lekárDoktor Malinka, advokátDoktor Jancko, advokátsky koncipientMartényi, bývalý statkár, c. kr. komorníkTrnovec, bankový úradníkWinkelmesser, bankový úradníkNiedermeyer Blau Lily, cestujúca so šnurovačkamiUľa, kuchárka\n\n### Response:\nSummary:\n "Krpčeky sv. Floriána" is a comedic play in three acts that explores the absurdity of adult life through the lens of its characters, wh

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). Here we train for one full epoch (`num_train_epochs = 1`) over the 100 examples; for a quicker run, you can instead cap the number of optimizer steps with `max_steps` (e.g. `max_steps = 60`). We also support TRL's `DPOTrainer`!

In [14]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the "text" column of `training_dataset`, using the
# same `max_seq_length` the model was loaded with.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = training_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # 2 examples per device x 4 accumulation steps = total batch size 8
        # (matches "Total batch size = 8" in the training banner).
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        # Uncomment and set `max_steps` to cap training at a fixed number of steps instead.
        # max_steps = None,
        learning_rate = 2e-4,
        # Exactly one of fp16/bf16 is enabled: bf16 where supported, otherwise fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1, # log the training loss after every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

In [15]:
# Run the fine-tuning loop; the value returned by `train()` is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 12
 "-____-"     Number of trainable parameters = 20,766,720


Step,Training Loss
1,3.1761
2,3.584
3,3.4816
4,3.3121
5,3.3131
6,3.4058
7,3.3683
8,3.1906
9,3.2302
10,3.4514


In [None]:
from google.colab import userdata

# Upload the model and tokenizer to the Hub repo `xguman/hw07_finetuned_slovak_books`.
# The access token is read from the Colab secret `HF_TOKEN`, not hard-coded.
# NOTE(review): `save_method` is "merged_4bit", yet the recorded log below says
# "Merging 4bit and LoRA weights to 16bit" -- confirm which format was actually uploaded.
model.push_to_hub_merged("xguman/hw07_finetuned_slovak_books", tokenizer, save_method = "merged_4bit", token = userdata.get('HF_TOKEN'))

Unsloth: You are pushing to hub, but you passed your HF username = xguman.
We shall truncate xguman/hw07_finetuned_slovak_books to hw07_finetuned_slovak_books
Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.35 out of 12.67 RAM for saving.


100%|██████████| 26/26 [00:01<00:00, 15.35it/s]


Unsloth: Saving tokenizer...

No files have been modified since last commit. Skipping to prevent empty commit.


 Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving hw07_finetuned_slovak_books/pytorch_model-00001-of-00002.bin...
Unsloth: Saving hw07_finetuned_slovak_books/pytorch_model-00002-of-00002.bin...


README.md:   0%|          | 0.00/581 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Done.
Saved merged model to https://huggingface.co/xguman/hw07_finetuned_slovak_books


## Testing on unseen text

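As the first 100 texts from the original dataset were used for training, the model is tested here on a text taken from the rest of the dataset, which it has not seen during fine-tuning.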

In [20]:
# Define the held-out sample in this cell so that it runs on a fresh kernel:
# `book` was previously only assigned in a cell further down, which made a
# top-to-bottom run fail with a NameError on the prompt below.
# Training used "train[:100]", so every record of this split is unseen.
heldout_ds = load_dataset("xguman/hw5_text_dataset", split = "train[101:]")
book = heldout_ds[102]['text'][:100] # keep only the first 100 characters as input

FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    prompt.format(
        book,
        "", # response slot left blank so the model generates it
    )
], return_tensors = "pt").to("cuda")

# Stream the generated tokens to the cell output as they are produced.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2048)

<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the following text and create summary, extract the important facts, and extract main characters.

### Input:
Rozbor znelkyBlaze, kdo si jeden čistý, smělýÚčel místo mnohých představí,V němž by jako v centru mě

### Response:
Summary:
The poem "Rozbor znelky" by the Czech poet Jan Neruda describes the beauty of a clear, pure, and simple goal, which can be achieved by a person who has a pure and simple intention. The poem emphasizes the importance of having a clear and pure intention in order to achieve a goal, as it is the key to success.

Important Facts:
- The poem describes the beauty of a clear, pure, and simple goal.
- The poem emphasizes the importance of having a clear and pure intention in order to achieve a goal.
- The poem describes the journey of a person who is trying to achieve a goal.


In [18]:
import random
# Pick a text the model was not trained on (training used "train[:100]").
# NOTE: this rebinds `ds`, which earlier held the training slice.
# NOTE(review): index 102 is relative to the "train[101:]" slice, i.e. presumably
# record 203 of the full train split -- confirm this is the intended sample.
ds = load_dataset("xguman/hw5_text_dataset", split = "train[101:]")
book = ds[102]['text'][:100] # only the first 100 characters are used as input

FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    prompt.format(
        book,
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Generate in one go (no streaming) and decode the full sequences; the decoded
# text shown below includes the prompt as well as the generated response.
outputs = model.generate(**inputs, max_new_tokens = 2048, use_cache = True)
tokenizer.batch_decode(outputs)

['<bos>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nSummarize the following text and create summary, extract the important facts, and extract main characters.\n\n### Input:\nRozbor znelkyBlaze, kdo si jeden čistý, smělýÚčel místo mnohých představí,V němž by jako v centru mě\n\n### Response:\nSummary:\nThe poem "Rozbor znelky" by the Czech poet Jan Neruda describes the beauty of a clear, pure, and simple goal, which can be achieved by a person who has a pure and simple intention. The poem emphasizes the importance of having a clear and pure intention in order to achieve a goal, as it is the key to success.\n\nImportant Facts:\n- The poem describes the beauty of a clear, pure, and simple goal.\n- The poem emphasizes the importance of having a clear and pure intention in order to achieve a goal.\n- The poem describes the journey of a person who is trying to