# HW06

## Part 1 - Training a small GPT2 model

In [11]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer
import torch
import pandas as pd

## Prepare data

In [6]:
# Replace with your own dataset
dataset = load_dataset("xguman/hw5_text_dataset")

# Make validation split
dataset['train'] = dataset['train'].select(range(20))
dataset = dataset['train'].train_test_split(test_size=0.01)

# Lower the training dataset to 500 rows

train_dataset = dataset['train']
first_sentence = train_dataset['text'][0]
print(first_sentence)

Hlava I. Škola v Jelšave(Caput I. De Schola Alnoviensi)Jelšava (Alnovia), mesto Gemerskej župy, sa už od dávnych čias preslávilo vzdelanosťou a
         vzornou usporiadanosťou života tamojšieho občianstva.„Príkladný poriadok je viac hoden ako svetoborné činy.“ V zmysle tohto významného výroku
         sa jelšavskí obyvatelia usilovali nielen slovami, ale aj skutkami a svojbytnou pomocou
         pozdvihnúť slabú školu v obci, kolísajúcu podoprieť, klesajúcu dvíhať, postaviť na vlastné
         nohy, upevniť, a upevnenú zveľadiť. Nečudo, že svojím chvályhodným úsilím vzbudzovali až
         závisť iných obcí. Pri spĺňaní tohto ušľachtilého cieľa pomáhali jelšavskej obci títo
         rektori:Juraj Fabricius,[1]pochádzal z Jelšavy. Štyri roky študoval v Levoči počas pôsobenia rektora M.
         Gašpara Kramera, v štúdiu pokračoval v Schweidnitzi vo Sliezsku a rok v Prahe. Po návrate
         pôsobil v Sliezsku a rok v Prahe. V r. 1576 sa stal rektorom v Jelšave. Bol jedným z
         n

In [22]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device = "mps" if torch.backends.mps.is_available()  else device
device

'cuda'

In [23]:
# load the gpt-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token

In [24]:
# tokenize the dataset
def tokenize_function(example):
    return tokenizer(text=example["text"])
tokenized_ds = dataset.map(tokenize_function, batched=True, remove_columns='text')
tokenized_ds

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2201 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['file_name', 'input_ids', 'attention_mask'],
        num_rows: 19
    })
    test: Dataset({
        features: ['file_name', 'input_ids', 'attention_mask'],
        num_rows: 1
    })
})

In [25]:
from itertools import chain
from datasets import Dataset, DatasetDict

def concatenate_and_chunk(dataset, chunk_size=512):
    # Flatten all `input_ids` into a single list
    all_input_ids = list(chain(*dataset["input_ids"]))
    
    # Create chunks of `chunk_size`
    chunks = [all_input_ids[i:i + chunk_size] for i in range(0, len(all_input_ids), chunk_size)]
    
    # Only keep chunks that are exactly of length `chunk_size`
    chunks = [chunk for chunk in chunks if len(chunk) == chunk_size]
    
    # Create a new dataset with only the `input_ids` chunks
    return Dataset.from_dict({"input_ids": chunks})

# Apply this function to each split (train and test) in the DatasetDict
chunked_ds = DatasetDict({
    split: concatenate_and_chunk(split_ds, chunk_size=512)
    for split, split_ds in tokenized_ds.items()
})


In [26]:
# data collator joins chunks into batches
# see https://huggingface.co/docs/transformers/en/main_classes/data_collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

## Model

In [27]:
# Define the model configuration for the smallest GPT-2
config = GPT2Config(
    vocab_size=len(tokenizer),      # Standard GPT-2 vocab size 50257
    n_positions=512,                # Context size (512 is enough for small-scale models)
    n_embd=768,                     # Embedding size
    n_layer=12,                     # Number of transformer layers
    n_head=12,                      # Number of attention heads
)

# Initialize the model and tokenizer
model = GPT2LMHeadModel(config).to(device)

In [28]:
import torch
import math
import numpy as np

# Define the perplexity metric
def compute_metrics(eval_pred):
    # `eval_pred` is a tuple of (logits, labels)
    logits, labels = eval_pred

    # Convert logits and labels to PyTorch tensors if they are NumPy arrays
    if isinstance(logits, np.ndarray):
        logits = torch.tensor(logits)
    if isinstance(labels, np.ndarray):
        labels = torch.tensor(labels)

    # Shift labels so that tokens align for calculating loss
    shift_labels = labels[:, 1:].reshape(-1)
    shift_logits = logits[:, :-1, :].reshape(-1, logits.shape[-1])

    # Calculate the cross-entropy loss
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)  # Ignore padding tokens
    loss = loss_fct(shift_logits, shift_labels)

    # Calculate perplexity
    perplexity = math.exp(loss.item())
    return {"perplexity": perplexity}


## Training

In [29]:
# Set this according to size of your dataset
# You should train for at least 15 mins on A10 GPU to get something reasonable
TRAIN_EPOCHS = 200

SAVE_STEPS = 1000
EVAL_STEPS = SAVE_STEPS // 2

# training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-training",  # Directory to save the model checkpoints and other outputs
    eval_strategy="steps",  # Evaluation strategy to use during training ('steps' or 'epochs')
    eval_steps=EVAL_STEPS,  # Perform evaluation every EVAL_STEPS steps
    num_train_epochs=TRAIN_EPOCHS,  # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size for training on each device
    per_device_eval_batch_size=16,  # Batch size for evaluation on each device
    learning_rate=2.5e-4,  # Initial learning rate for the optimizer
    lr_scheduler_type='cosine',  # Learning rate scheduler type. 'cosine' provides a cosine decay schedule.
    warmup_ratio=0.05,  # Proportion of training to perform linear learning rate warmup for
    adam_beta1=0.9,  # Beta1 parameter for the Adam optimizer (first moment decay)
    adam_beta2=0.999,  # Beta2 parameter for the Adam optimizer (second moment decay)
    weight_decay=0.01,  # Weight decay to apply (L2 regularization)
    logging_strategy="steps",  # Logging strategy to use. 'steps' logs at specified steps.
    logging_steps=EVAL_STEPS,  # Log training metrics every EVAL_STEPS steps
    save_steps=SAVE_STEPS,  # Save a checkpoint every SAVE_STEPS steps
    save_total_limit=10,  # Maximum number of checkpoints to keep. Older checkpoints are deleted.
    # report_to='wandb',  # Uncomment to report metrics to Weights and Biases (optional)
)

trainer = Trainer(model=model,
                 args = training_args,
                 tokenizer=tokenizer,
                 train_dataset=chunked_ds["train"],
                 eval_dataset=chunked_ds["test"],
                 compute_metrics=compute_metrics,
                 data_collator = data_collator)

  trainer = Trainer(model=model,


In [30]:
trainer.train()

Step,Training Loss,Validation Loss,Perplexity
500,3.1071,5.697019,297.972577
1000,0.1867,6.59979,734.927239
1500,0.0509,6.939453,1032.187362
2000,0.0152,7.223747,1371.59374
2500,0.0056,7.536597,1875.410686
3000,0.0033,7.608286,2014.799442
3500,0.0027,7.654192,2109.445776


TrainOutput(global_step=3600, training_loss=0.4683267268869612, metrics={'train_runtime': 1234.3527, 'train_samples_per_second': 45.368, 'train_steps_per_second': 2.917, 'total_flos': 1.4632353792e+16, 'train_loss': 0.4683267268869612, 'epoch': 200.0})

In [31]:
trainer.save_model("./gpt2-small-final") 

In [None]:
YOUR_MODEL_NAME = "my_small_gpt2_zlatyfond" # change this
HF_TOKEN = "" #todo change this

model.push_to_hub(YOUR_MODEL_NAME, token=HF_TOKEN)
tokenizer.push_to_hub(YOUR_MODEL_NAME, token=HF_TOKEN)

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/xguman/my_small_gpt2_zlatyfond/commit/662bfc530658b1cf51663fbb905f664d4b8727a6', commit_message='Upload tokenizer', commit_description='', oid='662bfc530658b1cf51663fbb905f664d4b8727a6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/xguman/my_small_gpt2_zlatyfond', endpoint='https://huggingface.co', repo_type='model', repo_id='xguman/my_small_gpt2_zlatyfond'), pr_revision=None, pr_num=None)

## Evaluation

Now you can switch from GPU to CPU. Try to complete some prompt specific to your dataset.

Does it make sense? Is it at least in Czech/Slovak?

In [33]:
from transformers import  GPT2LMHeadModel, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token

In [34]:
model =  GPT2LMHeadModel.from_pretrained("./gpt2-small-final").to("cpu")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [42]:
PROMPT = ["Hrad", "Treba pomáhať", "prepustili"] # Set starting prompt, something specific for your dataset

generator(
    PROMPT,
    max_length=50,       # Maximum length of the generated text
    do_sample=True,
    temperature=0.7,        
    repetition_penalty=1,  
)

[[{'generated_text': 'Hradné\n         okamženie, od roku 1868, keď zazneli prvé, neisté zvuky jeho lýry,['}],
 [{'generated_text': 'Treba pomáhať sa zašady n prascuračene, svojho pobr zať sa k                '}],
 [{'generated_text': 'prepustili, zám na nad svojúmí, čočnej pieseobodaj                      '}]]

# Part 2: ROBERTA genre classifier

In [7]:
tokenizer = AutoTokenizer.from_pretrained("classla/xlm-roberta-base-multilingual-text-genre-classifier")
model = AutoModelForSequenceClassification.from_pretrained(
    "classla/xlm-roberta-base-multilingual-text-genre-classifier"
)

model.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [8]:
id2label = model.config.id2label

In [9]:
def predict_genre(batch):
    inputs = tokenizer(batch["text"], return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).tolist()
    batch["predicted_genre"] = predicted_class
    return batch


In [12]:
ds_with_predictions = dataset.map(predict_genre, batched=True, batch_size=16)

df = pd.DataFrame(ds_with_predictions["train"])

df["predicted_genre_label"] = df["predicted_genre"].apply(lambda x: id2label[x])

print(df[["text", "predicted_genre", "predicted_genre_label"]].head(10))

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

                                                text  predicted_genre  \
0  Hlava I. Škola v Jelšave(Caput I. De Schola Al...                1   
1  Trinásta pieseňZ denníka priateľovhoBolo to ro...                4   
2  OsobyDr. PLACHÝ, lekárMILICA, jeho bratanicaJÁ...                4   
3  Prvý záprah1898Ale hô! Hejk hore! — Abyže vás ...                6   
4  PredmluvaZasvätenie storočia Kuzmányho naroden...                4   
5  Nášmu poetovi![1]Pavol Országh-Hviezdoslav svä...                4   
6  Ohlas srbskej piesne[1][2]Pozerá sa Belhrad zá...                6   
7  I. PodludníciDantes nebol ešte ani deň na palu...                4   
8  Hry, zábavy, obyčaje a obrady so spevom spojen...                1   
9  Riadenie v životeSynu! keď raz vstúpiš\n   do ...                6   

     predicted_genre_label  
0  Information/Explanation  
1    Opinion/Argumentation  
2    Opinion/Argumentation  
3            Prose/Lyrical  
4    Opinion/Argumentation  
5    Opinion/Argumenta