In [None]:
# Use the %pip magic so the packages are installed into the environment of the running kernel.
%pip install transformers datasets evaluate

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer

# model_name = "nur-dev/roberta-kaz-large"
# model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# Hub checkpoint name; reused further down when the question-answering model is loaded.
model_checkpoint = "nur-dev/roberta-kaz-large"
# Module-level tokenizer that the preprocessing functions below read.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
from datasets import load_dataset

# Training data: sKQuAD (the cells below read its `question`, `context` and `answer` columns).
train_ds = load_dataset("Kyrmasch/sKQuAD")

# Evaluation data: KazQAD; only the first 1000 rows of its `test` split are kept.
test_ds = load_dataset("issai/kazqad")
test_ds['test'] = test_ds['test'].select(range(1000))

In [None]:
def has_answer(example):
    """Tell whether the example's answer can be located inside its context.

    The comparison is case-insensitive. Examples with a missing, empty or
    whitespace-only answer (or a missing/empty context) are rejected:
    ``str.find("")`` reports a match at offset 0, which would otherwise let
    such rows through the filter and produce a meaningless answer span.

    Args:
        example: mapping with string fields ``'answer'`` and ``'context'``.

    Returns:
        ``True`` when the answer text occurs in the context, else ``False``.
    """
    answer = example['answer']
    context = example['context']
    # Nothing to look for (or nothing to look in): treat as "no answer".
    if not answer or not answer.strip() or not context:
        return False
    return answer.lower() in context.lower()

# Keep only the training rows for which `has_answer` returns True.
train_ds['train'] = train_ds['train'].filter(has_answer)

In [None]:
def add_answer_start(example):
    """Attach a SQuAD-style ``answers`` field to ``example`` and return it.

    The answer is searched for in the context case-insensitively. The stored
    ``text`` is the answer exactly as given; ``answer_start`` is the character
    offset of the first match, or -1 when the answer does not occur.

    Args:
        example: mapping with string fields ``'answer'`` and ``'context'``;
            it is updated in place.

    Returns:
        The same ``example`` object, now carrying ``'answers'``.
    """
    answer_text = example['answer']
    position = example['context'].lower().find(answer_text.lower())
    example['answers'] = {'text': [answer_text], 'answer_start': [position]}
    return example

# Add the `answers` column (text + character offset) to every remaining training row.
train_ds['train'] = train_ds['train'].map(add_answer_start)

In [None]:
# Passed to the tokenizer as `max_length` by both preprocessing functions below.
max_length = 384 # hyperparameter
# Passed to the tokenizer as `stride` (used together with `return_overflowing_tokens=True`).
stride = 128 # hyperparameter


def preprocess_training_examples(examples):
    """Tokenize a batch of QA examples and label each feature with its answer span.

    Every (question, context) pair is encoded with the module-level
    ``tokenizer`` using ``max_length`` / ``stride``; overflowing contexts
    produce several features per example. For each feature the answer's
    character span is converted to token indices. A feature whose context
    window does not fully contain the answer is labelled ``(0, 0)``.

    Args:
        examples: batch with ``"question"``, ``"context"`` and ``"answers"``
            (each answer holds ``"text"`` and ``"answer_start"`` lists; only
            the first entry of each is used).

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions`` (one entry per feature).
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(offset_mapping):
        # Several features can originate from the same example.
        answer = answers[sample_map[feature_idx]]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(feature_idx)

        # Locate the first and last token that belong to the context (sequence id 1).
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = context_start
        while sequence_ids[context_end] == 1:
            context_end += 1
        context_end -= 1

        # The answer is cut off by this window: label the feature (0, 0).
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk right past every token starting at or before the answer start;
        # the token just before the stopping point holds the first answer character.
        cursor = context_start
        while cursor <= context_end and offsets[cursor][0] <= start_char:
            cursor += 1
        start_positions.append(cursor - 1)

        # Walk left past every token ending at or after the answer end;
        # the token just after the stopping point holds the last answer character.
        cursor = context_end
        while cursor >= context_start and offsets[cursor][1] >= end_char:
            cursor -= 1
        end_positions.append(cursor + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Turn the training rows into model-ready features; the original columns are removed so
# only the fields returned by `preprocess_training_examples` remain.
train_dataset = train_ds['train'].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_ds["train"].column_names,
)
# Cell output: (number of examples, number of features).
len(train_ds['train']), len(train_dataset)

In [None]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of evaluation examples for answer extraction.

    Uses the same tokenizer settings as the training preprocessing. Each
    resulting feature records the id of the example it came from, and its
    offset mapping is masked so that only context tokens keep their
    ``(start, end)`` character span; every other position becomes ``None``.

    Args:
        examples: batch with ``"question"``, ``"context"`` and ``"id"``.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping``, with the
        masked ``offset_mapping`` and an added ``example_id`` list.
    """
    inputs = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    feature_count = len(inputs["input_ids"])

    # Source example id for every feature (one example may yield several features).
    example_ids = [examples["id"][sample_map[k]] for k in range(feature_count)]

    for k in range(feature_count):
        seq_ids = inputs.sequence_ids(k)
        # Keep character spans only for context tokens (sequence id 1).
        inputs["offset_mapping"][k] = [
            span if seq_ids[pos] == 1 else None
            for pos, span in enumerate(inputs["offset_mapping"][k])
        ]

    inputs["example_id"] = example_ids
    return inputs

In [None]:
# Turn the evaluation rows into features; the original columns are removed so only the
# fields returned by `preprocess_validation_examples` remain.
validation_dataset = test_ds['test'].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=test_ds['test'].column_names
)
# Cell output: (number of examples, number of features).
len(test_ds['test']), len(validation_dataset)

In [None]:
# Load the question-answering model from the same checkpoint name as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
from transformers import TrainingArguments
from transformers import DefaultDataCollator

# NOTE(review): the Trainer cell below leaves `data_collator=` commented out,
# so this collator is currently not passed to the Trainer.
data_collator = DefaultDataCollator()

# Training configuration: evaluate and checkpoint once per epoch, log every 20 steps.
# NOTE(review): `save_strategy="epoch"` with 50 epochs writes one checkpoint per epoch;
# consider `save_total_limit` if disk space is a concern.
args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # keyword and is what the later TrainingArguments cell in this notebook uses.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=20,
#     label_names=["start_positions", "end_positions"],
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
    learning_rate=1e-5, # hyperparameter
    num_train_epochs=50, # hyperparameter
    weight_decay=0.01, # hyperparameter
    fp16=True, # hyperparameter
    push_to_hub=False,
    report_to="none"
)

In [None]:
from transformers import Trainer

# Fine-tune the QA model on the tokenized training features.
# NOTE(review): `validation_dataset` is built without start/end position labels —
# confirm what the per-epoch evaluation reports for it.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
#     data_collator=data_collator,
)
# trainer.can_return_loss = True
trainer.train()

In [None]:
import collections

import evaluate
import numpy as np
from tqdm.auto import tqdm

def compute_metrics(start_logits, end_logits, features, examples, n_best=20, max_answer_length=30):
    """Decode answer spans from QA logits and score them with the ``squad`` metric.

    For every example, all features derived from it are inspected. The
    ``n_best`` highest start and end logits of each feature are paired; pairs
    that fall outside the context, are reversed, or span more than
    ``max_answer_length`` tokens are discarded. The surviving pair with the
    highest summed logit gives the predicted text. An example with no valid
    pair gets the empty string.

    Args:
        start_logits: per-feature start logits, indexable by feature index.
        end_logits: per-feature end logits, indexable by feature index.
        features: tokenized features carrying ``"example_id"`` and an
            ``"offset_mapping"`` whose non-context positions are ``None``.
        examples: original examples with ``"id"``, ``"context"``, ``"answers"``.
        n_best: number of top start/end candidates considered per feature.
        max_answer_length: longest accepted answer, counted in tokens.

    Returns:
        Whatever ``metric.compute`` returns for the ``squad`` metric.
    """
    metric = evaluate.load("squad")

    # Group feature indices by the example they were cut from.
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Indices of the n_best largest logits, best first.
            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": start_logit[start_index] + end_logit[end_index],
                        }
                    )

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [None]:
import collections
import numpy as np

# Run the trained model over the validation features, then decode and score the answers.
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, test_ds["test"])

In [None]:
# end of the notebook

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

# NOTE(review): `eval_set` and `trained_checkpoint` are not defined in any cell visible here;
# this cell presumably relies on variables from another session — confirm before running.
# Remove the bookkeeping columns, then expose the remaining columns as torch tensors.
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

# Prefer the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# One batch built from every column of the evaluation set, moved to the chosen device.
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(
    device
)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = trained_model(**batch)

In [None]:
# Scratch cell: recompute start/end token labels for the first 1000 features.
# NOTE(review): `inputs` (a tokenizer encoding with offsets and an overflow mapping) is not
# defined in any cell visible here, and `train_ds` must be a single split for the two column
# reads below to work — confirm where both come from.
answers = train_ds["answer"]
contexts = train_ds["context"]
start_positions = []
end_positions = []

count = 0  # features whose answer could not be found in the context

for i, offset in enumerate(inputs["offset_mapping"]):
    if i > 999:
        break
    # A long context yields several features, so answers and contexts must be
    # looked up by the originating sample, not by the feature index.
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    answer = answers[sample_idx].lower()
    start_char = contexts[sample_idx].lower().find(answer)
    if start_char == -1:
        count += 1
    end_char = start_char + len(answer)
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label is (0, 0).
    # A missing answer (start_char == -1) also lands here whenever the first
    # context offset is non-negative.
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

start_positions, end_positions, count

In [None]:
# Spot-check one feature: decode the labelled token span and compare it with the reference answer.
idx = 8
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]

start = start_positions[idx]
end = end_positions[idx]
# Decode the tokens from the labelled start to the labelled end, inclusive.
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer

# Reload the base checkpoint for this experiment; this rebinds the module-level
# `model` and `tokenizer` names used by the cells that follow.
model_name = "nur-dev/roberta-kaz-large"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_train_data(example):
    inputs = tokenizer(
        example['question'],
        example['context'],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    start_positions = []
    end_positions = []
    for i in range(len(example['answer'])):
        answer = example['answer'][i].lower()
        start_position = example['context'][i].lower().find(answer)
        if start_position == -1:
            end_position = -1
        else:
            end_position = start_position + len(answer)
        start_positions.append(start_position)
        end_positions.append(end_position)
    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

def preprocess_test_data(example):
    inputs = tokenizer(
        example['question'],
        example['context'],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    start_positions = []
    end_positions = []
    for i in range(len(example['answers'])):
        answer = example['answers'][i]['text'][0]
        start_position = example['answers'][i]['answer_start'][0]
        end_position = start_position + len(answer)
        start_positions.append(start_position)
        end_positions.append(end_position)
    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs

# Apply preprocessing to the entire dataset.
# Both names are rebound to the mapped result, so re-running this cell maps the
# already-mapped data again.
train_ds = train_ds.map(preprocess_train_data, batched=True)
test_ds = test_ds.map(preprocess_test_data, batched=True)

# ds = ds.map(preprocess_function, batched=True, remove_columns=ds["train"].column_names)

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    eval_strategy="epoch",
    learning_rate=2e-5, # increase to 2e-4, 2e-3
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3, # increase up to 5, 10, 15, 20
    weight_decay=0.01, # if overfitting is detected, increase to 0.03, 0.05, 0.07.
    push_to_hub=False, # if there is a dropout parameter, use it (in case of overfitting)
    report_to="none"
)

# `load_dataset(...)` without `split=` returns a dict of splits, and `.map` on it
# keeps that shape. Trainer needs a single split, so pick one when a dict is
# found; a plain dataset is passed through unchanged.
train_split = train_ds["train"] if isinstance(train_ds, dict) else train_ds
eval_split = test_ds["test"] if isinstance(test_ds, dict) else test_ds

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Use the %pip magic so the package is installed into the environment of the running kernel.
%pip install evaluate

In [None]:
import torch
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from evaluate import load
from tqdm import tqdm

# Load the trained model and tokenizer
model_name = "my_awesome_qa_model/checkpoint-750/"  # Adjust this to your model's save path
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# The tokenizer is loaded from the base checkpoint name, not from the fine-tuned directory.
tokenizer = AutoTokenizer.from_pretrained("nur-dev/roberta-kaz-large")

# Load the test dataset (ensure it is in the correct format)
# NOTE(review): `load_dataset` is not imported in this cell; it relies on the earlier
# `from datasets import load_dataset` having been executed.
test_ds = load_dataset("issai/kazqad", "kazqad", split="test[100:150]")  # Adjust this as needed

# Load metrics
# NOTE(review): confirm that the hub metric named "f1" accepts answer strings; it looks
# like a classification metric rather than a text-overlap F1.
f1_metric = load("f1")
em_metric = load("exact_match")

def evaluate_model(dataset):
    """Run the module-level QA ``model`` over ``dataset`` and gather its logits.

    Examples are processed one at a time: each question/context pair is
    tokenized (padded or truncated to 512 tokens), moved to the model's
    device and passed through the model with gradients disabled.

    Args:
        dataset: iterable of examples with ``"question"`` and ``"context"``.

    Returns:
        ``(start_logits, end_logits)``: two NumPy arrays obtained by
        concatenating the per-example logits along the first axis.
    """
    model.eval()  # Set the model to evaluation mode
    collected = []  # one (start, end) pair of NumPy arrays per example

    with torch.no_grad():
        for example in tqdm(dataset):
            encoded = tokenizer(
                example["question"],
                example["context"],
                return_tensors="pt",
                max_length=512,
                truncation=True,
                padding="max_length"
            )

            # Inputs must live on the same device as the model.
            on_device = {name: tensor.to(model.device) for name, tensor in encoded.items()}

            outputs = model(**on_device)
            collected.append(
                (outputs.start_logits.cpu().numpy(), outputs.end_logits.cpu().numpy())
            )

    start_chunks = [pair[0] for pair in collected]
    end_chunks = [pair[1] for pair in collected]
    return np.concatenate(start_chunks), np.concatenate(end_chunks)

# Get model predictions: start and end logits for every example in `test_ds`.
start_logits, end_logits = evaluate_model(test_ds)



In [None]:

# Function to calculate metrics
def compute_metrics(pred_start_logits, pred_end_logits):
    """Decode arg-max answer spans for ``test_ds`` and score them with EM / F1.

    The highest start and end logit of each row are token positions. They are
    mapped back to characters of the context through the tokenizer's offset
    mapping; the original code sliced the context string with the token
    indices directly. A prediction is the empty string when the span is
    reversed or either end falls outside the context tokens.

    Scores are computed here over normalized text (lower-cased, ASCII
    punctuation removed, whitespace-split), because the original scoring calls
    were commented out and ``.compute()`` ran with no data:
      * ``exact_match``: fraction of predictions equal to the reference.
      * ``f1``: mean token-overlap F1 between prediction and reference.
    Both values lie in ``[0, 1]``. Only the first reference answer is used.

    Args:
        pred_start_logits: array of shape (num_examples, sequence_length).
        pred_end_logits: array of the same shape.

    Returns:
        ``{"f1": float, "exact_match": float}``.
    """
    import string
    from collections import Counter

    def _tokens(text):
        # Lower-case, strip ASCII punctuation, split on whitespace.
        cleaned = "".join(ch for ch in text.lower() if ch not in string.punctuation)
        return cleaned.split()

    pred_start_ids = np.argmax(pred_start_logits, axis=1)
    pred_end_ids = np.argmax(pred_end_logits, axis=1)

    answers = []
    references = []
    for item in test_ds:
        texts = item["answers"]["text"]
        references.append(texts[0] if texts else "")

    for idx, (start, end) in enumerate(zip(pred_start_ids, pred_end_ids)):
        start, end = int(start), int(end)
        item = test_ds[idx]
        # Re-tokenize with the settings used at inference time to recover
        # the token -> character mapping.
        encoding = tokenizer(
            item["question"],
            item["context"],
            max_length=512,
            truncation=True,
            padding="max_length",
            return_offsets_mapping=True,
        )
        offsets = encoding["offset_mapping"]
        sequence_ids = encoding.sequence_ids(0)

        in_context = (
            start < len(sequence_ids)
            and end < len(sequence_ids)
            and sequence_ids[start] == 1
            and sequence_ids[end] == 1
        )
        if in_context and start <= end:  # Ensure valid answer span
            answers.append(item["context"][offsets[start][0]:offsets[end][1]])
        else:
            answers.append("")  # No valid answer

    # Calculate the F1 and EM scores
    exact_scores = []
    f1_scores = []
    for idx, answer in enumerate(answers):
        true_answer = references[idx]
        print(f"({true_answer} - {answer})")

        pred_tokens = _tokens(answer)
        true_tokens = _tokens(true_answer)
        exact_scores.append(float(pred_tokens == true_tokens))

        if not pred_tokens or not true_tokens:
            # With an empty side, F1 is 1 only if both sides are empty.
            f1_scores.append(float(pred_tokens == true_tokens))
            continue
        overlap = sum((Counter(pred_tokens) & Counter(true_tokens)).values())
        if overlap == 0:
            f1_scores.append(0.0)
            continue
        precision = overlap / len(pred_tokens)
        recall = overlap / len(true_tokens)
        f1_scores.append(2 * precision * recall / (precision + recall))

    total = len(answers)
    return {
        "f1": sum(f1_scores) / total if total else 0.0,
        "exact_match": sum(exact_scores) / total if total else 0.0,
    }

# Calculate and print the evaluation metrics (each value is formatted with two decimals).
metrics = compute_metrics(start_logits, end_logits)
print(f"F1 Score: {metrics['f1']:.2f}")
print(f"Exact Match Score: {metrics['exact_match']:.2f}")

# from datasets import load_dataset, DatasetDict
# from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer
# import gc
# import torch

# # Проверяем, доступен ли GPU и используем его, если возможно
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Using device:", device)

# # Загрузка модели и токенизатора
# model_name = "nur-dev/roberta-kaz-large"
# model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Загрузка полного набора данных
# dataset = load_dataset("issai/kazqad")

# # **Выбор первых 100 записей из тренировочного и валидационного наборов**
# small_dataset = DatasetDict({
#     'train': dataset['train'].select(range(100)),
#     'validation': dataset['validation'].select(range(100))
# })

# # Функция предобработки данных с проверкой на корректность позиций
# def preprocess_data(batch):
#     inputs = tokenizer(
#         batch['question'],
#         batch['context'],
#         max_length=256,  # Увеличили максимальную длину для полного контекста
#         truncation=True,
#         padding="max_length",
#         return_offsets_mapping=True  # Добавили для получения смещений
#     )

#     start_positions = []
#     end_positions = []

#     for i in range(len(batch['answers'])):
#         answer = batch['answers'][i]
#         if answer['answer_start'] and answer['text']:
#             # Получаем позиции ответа в виде символов
#             start_char = answer['answer_start'][0]
#             end_char = start_char + len(answer['text'][0])

#             # Получаем offset_mapping для текущего примера
#             offsets = inputs['offset_mapping'][i]

#             # Инициализируем токеновые позиции
#             start_token = None
#             end_token = None

#             for idx, (offset_start, offset_end) in enumerate(offsets):
#                 if offset_start <= start_char < offset_end:
#                     start_token = idx
#                 if offset_start < end_char <= offset_end:
#                     end_token = idx
#                 if start_token is not None and end_token is not None:
#                     break

#             # Если не нашли токен, устанавливаем в 0
#             if start_token is None:
#                 start_token = 0
#             if end_token is None:
#                 end_token = 0
#         else:
#             start_token = 0
#             end_token = 0

#         start_positions.append(start_token)
#         end_positions.append(end_token)

#     # Добавляем позиции в inputs
#     inputs['start_positions'] = start_positions
#     inputs['end_positions'] = end_positions

#     # Удаляем offset_mapping, так как он не нужен модели
#     inputs.pop("offset_mapping")

#     return inputs

# # Применение предобработки к уменьшенному набору данных
# tokenized_dataset = small_dataset.map(
#     preprocess_data,
#     batched=True,
#     remove_columns=small_dataset["train"].column_names,
# )

# # Параметры обучения
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     logging_strategy="steps",
#     logging_steps=10,
#     learning_rate=3e-5,
#     per_device_train_batch_size=2,  # Уменьшенный размер батча из-за малого объема данных
#     per_device_eval_batch_size=2,
#     num_train_epochs=5,
#     weight_decay=0.01,
#     report_to="none"
# )

# # Очистка памяти перед началом обучения
# torch.cuda.empty_cache()
# gc.collect()

# # Настройка Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["validation"]
# )

# # Запуск тренировки
# trainer.train()

# # Сохранение модели в конце тренировки
# trainer.save_model("./results")  # Сохраняем только итоговую модель

In [None]:
def ask_question(question, context):
    """Answer ``question`` from ``context`` with the module-level QA ``model``.

    The pair is tokenized (padded or truncated to 256 tokens), run through
    the model without gradients, and the tokens between the arg-max start
    and end positions (inclusive) are joined back into a string.

    Args:
        question: the question text.
        context: the passage expected to contain the answer.

    Returns:
        The decoded answer string (empty when the predicted end precedes
        the predicted start).
    """
    # Tokenize the question and the context
    encoded = tokenizer(
        question,
        context,
        max_length=256,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Move the inputs to the model's device
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Run the model to get its predictions
    with torch.no_grad():
        prediction = model(**encoded)

    # Best-scoring start and end token positions
    first = torch.argmax(prediction.start_logits)
    last = torch.argmax(prediction.end_logits)

    # Rebuild the answer text from the selected tokens
    tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
    return tokenizer.convert_tokens_to_string(tokens[first:last + 1])

# Example usage
question = "Алиса жасы нешеде?"
context = "Алиса 50 жаста ол робот"

answer = ask_question(question, context)
print("Ответ:", answer)


In [None]:
# NOTE(review): in the visible source `small_dataset` is only assigned inside commented-out
# code, so this cell raises NameError unless that code is restored.
small_dataset

In [None]:
# Print the first 100 records of the training split
# NOTE(review): depends on `small_dataset`, which is only assigned inside commented-out code
# in the visible source.
for idx, data in enumerate(small_dataset['train']):
    print(f"Запись {idx + 1}:")
    print(f"Вопрос: {data['question']}\n")
    print(f"Контекст: {data['context']}\n")
    print(f"Ответы: {data['answers']}\n")
    print("-" * 80)


In [None]:
# Импорт необходимых библиотек
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset

# Step 1: load the sKQuAD dataset and keep the first 1000 rows
dataset = load_dataset("Kyrmasch/sKQuAD")
train_data = dataset["train"].select(range(1000))

# Step 2: prepare the tokenizer and the model
model_name = "nur-dev/roberta-kaz-large"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model = RobertaForQuestionAnswering.from_pretrained(model_name)

# Step 3: data preprocessing
def prepare_train_features(examples):
    """Tokenize a batch of sKQuAD rows and label every feature with its answer span.

    Long contexts are split into overlapping windows (``max_length=384``,
    ``stride=128``), so one row can yield several features. The answer string
    is located in the context case-insensitively, matching the filtering and
    offset cells earlier in this notebook, and its character span is
    converted to token indices. A feature is labelled with the CLS token
    index when the answer is missing or empty, or when its window does not
    fully contain the answer.

    Args:
        examples: batch with ``"question"``, ``"context"`` and ``"answer"`` lists.

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions`` (one entry per feature).
    """
    tokenized_examples = tokenizer(
        examples["question"], examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        context = examples["context"][sample_index]
        answer = examples["answer"][sample_index]

        # Locate the answer in the context. The search ignores case; an empty
        # answer counts as "not found" (``find("")`` would report offset 0).
        answer_start = context.lower().find(answer.lower()) if answer else -1
        answer_end = answer_start + len(answer) if answer else -1

        # Answer not present in the context: point both labels at CLS.
        if answer_start == -1:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            # First and last token of the context part of this feature.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if not (offsets[token_start_index][0] <= answer_start and offsets[token_end_index][1] >= answer_end):
                # The answer is not fully inside this window.
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Move right past every token starting at or before the answer start.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= answer_start:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)

                # Move left past every token ending at or after the answer end.
                while offsets[token_end_index][1] >= answer_end:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Apply the preprocessing function; the original columns are removed from the result.
tokenized_train_data = train_data.map(prepare_train_features, batched=True, remove_columns=train_data.column_names)

# Step 4: configure TrainingArguments and the Trainer with adjusted hyperparameters
training_args = TrainingArguments(
    output_dir="./roberta-kaz-squad-improved",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy` keyword.
    eval_strategy="no",  # evaluation disabled
    learning_rate=2e-5,  # reduced learning rate for more careful training
    per_device_train_batch_size=4,  # smaller batch size to save memory
    gradient_accumulation_steps=4,  # gradient accumulation, effectively enlarging the batch
    num_train_epochs=5,  # more epochs for better training
    weight_decay=0.1  # stronger weight decay to reduce overfitting
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data
)

# Step 5: fine-tune the model
trainer.train()

# Step 6: save the trained model together with its tokenizer
model.save_pretrained("./roberta-kaz-squad-improved")
tokenizer.save_pretrained("./roberta-kaz-squad-improved")
