In [None]:
!pip install transformers accelerate bitsandbytes peft datasets trl
!pip install flash-attn --no-build-isolation

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import (
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
)
from trl import SFTTrainer

try:
    # `load_metric` no longer exists in datasets >= 3.0; import it only where available
    # so this cell does not fail on a fresh, unpinned install.
    from datasets import load_metric
except ImportError:
    load_metric = None

In [None]:
# Location of the processed dataset, relative to the notebook's working directory.
path = "../data/processed"
# Load the dataset from `path`; later cells read its 'train' and 'test' splits.
dataset = load_dataset(path)

In [None]:
# Checkpoint identifier used for the tokenizer, the model and the training output directory.
MODEL_NAME = "distilbert-base-uncased"

In [None]:
# Quantization settings handed to `from_pretrained` when the model is loaded.
bitsandbytes_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",
    bnb_4bit_compute_dtype=torch.float16,  # half precision as the 4-bit compute dtype
    # NOTE(review): the two llm_int8_* options below look like 8-bit settings —
    # confirm whether they have any effect together with load_in_4bit=True.
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=True,
)

In [None]:
# Materialise the training split as a DataFrame to inspect the label column.
df = dataset['train'].to_pandas()
# Number of distinct values in 'encoded_text'; used later as the classifier's num_labels.
# NOTE(review): this matches the required label count only if the ids are contiguous 0..n-1 — confirm.
count_of_categories = df['encoded_text'].nunique()
print(count_of_categories)

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# DistilBERT is an encoder-only model: it needs no EOS token and its vocabulary
# already contains a dedicated [PAD] token. Registering a brand-new token and
# padding with it would emit ids beyond the model's embedding matrix (the
# embeddings are never resized in this notebook), so the native special tokens
# are kept and a pad token is only added if the checkpoint really lacks one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Pad on the right so real tokens keep their original positions.
tokenizer.padding_side = "right"

In [None]:
def tokenize_and_get_lengths(examples):
    """Return the token count of every text in the batch, without truncation or padding."""
    encoded = tokenizer(examples['text'], truncation=False, padding=False)
    lengths = list(map(len, encoded['input_ids']))
    return {'length': lengths}

# Apply the length function to the whole dataset, dropping the raw text column
measure = dataset.map(tokenize_and_get_lengths, batched=True, remove_columns=['text'])

# Find the maximum token count (only the 'train' split is inspected)
max_seq_length = max(measure['train']['length'])
print(f'Max lenght of sequence: {max_seq_length}')

In [None]:
# Fixed truncation length used when tokenizing; this overrides the value measured in the previous cell.
# NOTE(review): 512 is presumably the model's maximum input length — confirm.
max_seq_length = 512

In [None]:
# Pull the two splits used for fine-tuning and evaluation out of the loaded dataset.
train_encoded_text, test_encoded_text = dataset['train'], dataset['test']

In [None]:
# Size of the tokenizer's base vocabulary; ids at or above it are reported below.
vocab_size = tokenizer.vocab_size


def preprocess_function(examples):
    """Tokenize a batch of texts and attach the class ids for classification.

    Args:
        examples: Batch dict with a 'text' list and an 'encoded_text' list of
            class ids, as supplied by `Dataset.map(..., batched=True)`.

    Returns:
        Dict with 'input_ids', 'attention_mask', the original 'encoded_text'
        ids and the same ids under 'labels', which is the argument name a
        sequence-classification forward pass consumes.
    """
    # No padding here: sequences are truncated to `max_seq_length` and left at
    # their natural length so the batch collator can pad each batch dynamically.
    tokenized_examples = tokenizer(
        examples['text'],
        max_length=max_seq_length,
        truncation=True,
        padding=False,
    )

    # Sanity check: report any sequence containing an id outside the base vocabulary.
    for input_ids in tokenized_examples['input_ids']:
        if input_ids and max(input_ids) >= vocab_size:
            print(f"Input ID out of range: {input_ids}")

    class_ids = examples['encoded_text']
    return {
        'input_ids': tokenized_examples['input_ids'],
        'attention_mask': tokenized_examples['attention_mask'],
        'encoded_text': class_ids,
        'labels': class_ids,
    }

In [None]:
# Tokenize both splits with identical settings; the raw 'text' column is dropped from the result.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True, remove_columns=['text'])
    for split in (train_encoded_text, test_encoded_text)
)

In [None]:
# Collator that pads the examples of each batch using `tokenizer`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def _f1_for_class(predictions, labels, positive_class):
    """Return the F1 score obtained when `positive_class` is treated as the positive label."""
    predicted_positive = predictions == positive_class
    actual_positive = labels == positive_class
    true_positive = int(np.sum(predicted_positive & actual_positive))
    false_positive = int(np.sum(predicted_positive & ~actual_positive))
    false_negative = int(np.sum(~predicted_positive & actual_positive))
    denominator = 2 * true_positive + false_positive + false_negative
    # No predicted and no actual positives: define the score as 0 instead of dividing by zero.
    return 2 * true_positive / denominator if denominator else 0.0


def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a `(logits, labels)` evaluation pair.

    The metrics are computed directly with numpy, so nothing is downloaded or
    re-loaded on every evaluation call.

    Args:
        eval_pred: Pair `(logits, labels)`. `logits` has one score per class in
            its last axis (a tuple of model outputs is accepted; its first
            element is used). `labels` holds the integer class ids.

    Returns:
        Dict with float entries "accuracy" and "f1". With two-class logits,
        "f1" is the F1 of class 1; with more classes it is the unweighted
        (macro) mean of the per-class F1 over the classes that occur in the
        labels or the predictions.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs next to the logits.
        logits = logits[0]

    logits = np.asarray(logits)
    labels = np.asarray(labels).reshape(-1)
    predictions = np.argmax(logits, axis=-1).reshape(-1)

    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    if logits.shape[-1] <= 2:
        scored_classes = [1]
    else:
        scored_classes = np.union1d(labels, predictions)
    per_class_f1 = [_f1_for_class(predictions, labels, cls) for cls in scored_classes]
    f1 = float(np.mean(per_class_f1)) if len(per_class_f1) else 0.0

    return {"accuracy": accuracy, "f1": f1}

In [None]:
# Load the classifier with the quantization config and one output per label value.
# NOTE(review): flash_attention_2 relies on the flash-attn package installed in the first cell and,
# presumably, on a supported GPU with half-precision inputs — confirm for the target hardware.
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    quantization_config=bitsandbytes_config,
    attn_implementation="flash_attention_2",
    num_labels=count_of_categories
)

In [None]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    # Attach LoRA adapters to the query/key/value/output projections of the attention blocks.
    target_modules=["attention.q_lin", "attention.k_lin", "attention.v_lin", "attention.out_lin"],
    # The wrapped model is a sequence classifier, not a causal LM: "SEQ_CLS" makes PEFT
    # build its sequence-classification wrapper and keep the classifier head trainable.
    # NOTE(review): DistilBERT also has a `pre_classifier` layer — check whether it
    # should be trained as well.
    task_type="SEQ_CLS",
)

In [None]:
# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters from `peft_config`.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [None]:
args = TrainingArguments(
    # Write checkpoints to a directory that is NOT named exactly like the hub
    # checkpoint: a local "distilbert-base-uncased" folder would shadow the hub
    # id in `from_pretrained(MODEL_NAME)` the next time the notebook is run.
    output_dir=f"{MODEL_NAME}-finetuned",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # 0.03 is a fraction of the training steps, so it belongs in `warmup_ratio`;
    # `warmup_steps` expects an absolute number of steps.
    warmup_ratio=0.03,
    logging_steps=10,
    weight_decay=0.01,
    save_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=2.5e-5,
    bf16=False,
    # Name the label input explicitly; it cannot always be inferred once the
    # model is wrapped by PEFT, and evaluation metrics need it.
    label_names=["labels"],
)

In [None]:
def _ensure_labels_column(split):
    """Return `split` with the class ids available under the 'labels' column name."""
    if "labels" not in split.column_names and "encoded_text" in split.column_names:
        return split.rename_column("encoded_text", "labels")
    return split


# Sequence classification is trained with the generic `Trainer`. `SFTTrainer`
# targets causal-LM fine-tuning on raw text: it expects a text column (already
# removed during tokenization) and its packing mode concatenates examples,
# which would destroy the one-label-per-example structure. Truncation to
# `max_seq_length` already happened during tokenization, and the model is
# already wrapped with the LoRA adapters, so neither is passed again here.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=_ensure_labels_column(tokenized_train),
    eval_dataset=_ensure_labels_column(tokenized_test),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configured trainer.
trainer.train()

In [None]:
# Evaluate the trained model on the trainer's eval dataset.
trainer.evaluate()