# Question Answering Pipeline - Вопросно-ответные системы

Пайплайны для:
- Extractive QA (BERT, RoBERTa)
- Open-domain QA
- Closed-book QA

In [None]:
!pip install transformers datasets torch pandas -q

In [None]:
import torch
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    pipeline,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Device: {device}")

## 1. Extractive QA (готовая модель)

In [None]:
# Pre-trained extractive QA checkpoint used throughout the notebook.
# Alternative: "bert-large-uncased-whole-word-masking-finetuned-squad"
MODEL_NAME = "deepset/roberta-base-squad2"

# Build the question-answering pipeline on the first GPU when one is
# available; device=-1 keeps it on the CPU.
qa_pipeline = pipeline(
    task="question-answering",
    model=MODEL_NAME,
    device=-1 if not torch.cuda.is_available() else 0,
)

print("✓ QA модель загружена!")

In [None]:
# === USAGE EXAMPLE ===
# Passage the pipeline extracts its answers from.
context = """
Python is a high-level, interpreted programming language. 
It was created by Guido van Rossum and first released in 1991.
Python is known for its simple syntax and readability.
"""

# Each question is answered independently against the passage above.
questions = [
    "Who created Python?",
    "When was Python released?",
    "What is Python known for?",
]

for question in questions:
    # One pipeline call per question; the result is read via its
    # 'answer' and 'score' keys.
    result = qa_pipeline(context=context, question=question)
    print(f"\nQ: {question}")
    print(f"A: {result['answer']} (score: {result['score']:.4f})")

## 2. Загрузка данных для fine-tuning

In [None]:
# === YOUR DATA ===
# SQuAD format:
# {
#   "context": "text containing the information",
#   "question": "the question",
#   "answers": {
#     "text": ["answer"],
#     "answer_start": [start_position]
#   }
# }
#
# "answer_start" is the 0-based character offset of the answer inside the
# context, i.e. context[start:start + len(text)] must equal the answer text.

# train_df = pd.read_json('train_qa.json')

# Sample data
data = {
    'context': [
        "Python was created by Guido van Rossum in 1991.",
        "Machine Learning is a subset of AI."
    ],
    'question': [
        "Who created Python?",
        "What is Machine Learning?"
    ],
    'answers': [
        {'text': ['Guido van Rossum'], 'answer_start': [22]},
        # "a subset of AI" begins at character 20 ("Machine Learning is " is
        # 20 characters long); 21 would point one character too far.
        {'text': ['a subset of AI'], 'answer_start': [20]}
    ]
}
train_df = pd.DataFrame(data)

print(train_df.head())

## 3. Подготовка данных

In [None]:
# Load the tokenizer that belongs to the checkpoint named by MODEL_NAME.
# NOTE(review): prepare_train_features asks this tokenizer for offset mappings
# and overflowing windows, which presumably requires a "fast" tokenizer — confirm.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def prepare_train_features(examples):
    """Tokenize a batch of SQuAD-style examples and label answer token spans.

    Each (question, context) pair is tokenized with the module-level
    ``tokenizer``. Contexts longer than ``max_length`` are split into
    overlapping windows, so one example may produce several features.

    Args:
        examples: Batch mapping with the keys ``'question'``, ``'context'``
            and ``'answers'``; each answers entry has ``'text'`` and
            ``'answer_start'`` lists (character offsets into the context).

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added (one entry per feature). A feature whose example has no answer,
        or whose window does not fully contain the answer, is labelled with
        the index of the CLS token for both positions.
    """
    # Tokenization: the question comes first, so only the context (second
    # sequence) is truncated and split into overlapping windows.
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Character offsets per token, and the source example of each feature.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sample_index = sample_mapping[i]
        answers = examples['answers'][sample_index]

        if len(answers["answer_start"]) == 0:
            # Unanswerable example: point both labels at the CLS token.
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # Only context tokens (sequence id 1) carry offsets relative to the
        # context string. Question tokens have offsets relative to the
        # question, and special/padding tokens have (0, 0), so scanning them
        # would yield wrong or out-of-range labels.
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_tokens = [
            idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1
        ]

        if not context_tokens:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        context_start = context_tokens[0]
        context_end = context_tokens[-1]

        # The answer may lie outside this overflow window; such features are
        # labelled as "no answer" (CLS) instead of a bogus span.
        answer_in_window = (
            offsets[context_start][0] <= start_char
            and offsets[context_end][1] >= end_char
        )
        if not answer_in_window:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Advance to the first context token starting after the answer start;
        # the token just before it is the one containing the answer start.
        token_start_index = context_start
        while (
            token_start_index <= context_end
            and offsets[token_start_index][0] <= start_char
        ):
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # Walk back from the last context token to the last one ending before
        # the answer end; the token just after it contains the answer end.
        token_end_index = context_end
        while (
            token_end_index >= context_start
            and offsets[token_end_index][1] >= end_char
        ):
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# train_dataset = Dataset.from_pandas(train_df)
# tokenized_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)

## 4. Fine-tuning

In [None]:
# model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

# training_args = TrainingArguments(
#     output_dir="./qa_model",
#     evaluation_strategy="epoch",  # NOTE: needs eval_dataset=... passed to Trainer below; otherwise use "no"
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

# data_collator = DefaultDataCollator()

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
#     data_collator=data_collator,
# )

# trainer.train()
# trainer.save_model("./qa_finetuned")

## 5. Предсказания для соревнования

In [None]:
# === TEST DATA ===
# Expected columns: context, question
test_df = pd.read_csv('test_qa.csv')

# Run the QA pipeline once per row and keep only the extracted answer text.
predictions = [
    qa_pipeline(question=row['question'], context=row['context'])['answer']
    for _, row in test_df.iterrows()
]

# Submission: reuse the file's own 'id' column when it has one, otherwise
# fall back to the DataFrame index.
submission = pd.DataFrame({
    'id': test_df['id'] if 'id' in test_df.columns else test_df.index,
    'answer': predictions,
})

submission.to_csv('qa_submission.csv', index=False)
print("✓ Submission сохранен!")