In [2]:
from datasets import load_dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

ds = load_dataset("SajjadAyoubi/persian_qa")

model_name = "HooshvareLab/bert-base-parsbert-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # بررسی وجود اطلاعات لازم در پاسخ
        if len(answer["answer_start"]) == 0 or len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = inputs.sequence_ids(i)

        # پیدا کردن محدوده‌ی متن زمینه در ورودی‌ها
        context_start = 0
        while context_start < len(sequence_ids) and sequence_ids[context_start] != 1:
            context_start += 1

        context_end = len(sequence_ids) - 1
        while context_end >= 0 and sequence_ids[context_end] != 1:
            context_end -= 1

        if context_start >= len(sequence_ids) or context_end < 0 or offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_position = None
            end_position = None
            for j, (start, end) in enumerate(offset):
                if start <= start_char < end:
                    start_position = j
                if start < end_char <= end:
                    end_position = j
                    break
            start_positions.append(start_position if start_position is not None else 0)
            end_positions.append(end_position if end_position is not None else 0)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_ds = ds.map(preprocess_function, batched=True, remove_columns=ds["train"].column_names)


lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none"
)

lora_model = get_peft_model(model, lora_config)


training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/temp/result",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
)

trainer = Trainer(
   model=lora_model,
   args=training_args,
   train_dataset=tokenized_ds["train"],
   eval_dataset=tokenized_ds["validation"],
)

trainer.train()



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,5.2999,No log
2,4.1998,No log


TrainOutput(global_step=1126, training_loss=4.684278362809447, metrics={'train_runtime': 1459.5067, 'train_samples_per_second': 12.344, 'train_steps_per_second': 0.771, 'total_flos': 4740166268289024.0, 'train_loss': 4.684278362809447, 'epoch': 2.0})

In [3]:
model.save_pretrained("/content/drive/MyDrive/temp/model")
tokenizer.save_pretrained("/content/drive/MyDrive/temp/tokenizer")

# ذخیره مدل LoRA
lora_model.save_pretrained("/content/drive/MyDrive/temp/model")

In [4]:

from google.colab import files
import shutil

# فشرده‌سازی پوشه مدل
shutil.make_archive('/content/model', 'zip', '/content/drive/MyDrive/temp/model')
shutil.make_archive('/content/tokenizer', 'zip', '/content/drive/MyDrive/temp/tokenizer')

# دانلود فایل‌های فشرده شده
files.download('/content/model.zip')
files.download('/content/tokenizer.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>