In [None]:
pip install torch torchvision transformers datasets

In [None]:
import torch
from datasets import load_dataset

# Load the SQuAD dataset
dataset = load_dataset('squad')

# Record how many examples each split contains.
train_size, validation_size = len(dataset['train']), len(dataset['validation'])

print(
    f"Train dataset size: {train_size}",
    f"Validation dataset size: {validation_size}",
    sep="\n",
)

In [None]:

# Subset configuration: named constants instead of inline magic numbers so the
# shuffle seed and sample sizes can be tuned in one place.
SEED = 42
TRAIN_SUBSET_SIZE = 40000
VALIDATION_SUBSET_SIZE = 4000

# Clamp each requested size to the split's length so select() is never handed
# indices past the end of a smaller split.
train_subset = dataset['train'].shuffle(seed=SEED).select(
    range(min(TRAIN_SUBSET_SIZE, len(dataset['train'])))
)
validation_subset = dataset['validation'].shuffle(seed=SEED).select(
    range(min(VALIDATION_SUBSET_SIZE, len(dataset['validation'])))
)

print(f"Train subset size: {len(train_subset)}")
print(f"Validation subset size: {len(validation_subset)}")

In [None]:
from transformers import BertTokenizerFast, BertForQuestionAnswering


# The tokenizer and the model are both loaded from this one checkpoint name.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)



In [None]:
def _locate_answer_tokens(offsets, sequence_ids, start_char, end_char):
    """Map an answer's character span in the context to token indices.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for one example.
        sequence_ids: Per-token sequence id for the same example; context
            tokens are marked with ``1``.
        start_char: Character index in the context where the answer starts.
        end_char: Character index in the context just past the answer's end.

    Returns:
        ``(start_token, end_token)`` indices of the answer, or ``(0, 0)`` when
        the answer is not fully contained in the tokenized context window.
    """
    context_start = sequence_ids.index(1)
    context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

    # The answer must lie entirely inside the (possibly truncated) context;
    # otherwise any token label would point at the wrong text.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Both scans stay inside the context window. Special and padding tokens
    # carry (0, 0) offsets, so an unbounded forward scan would run through the
    # padding whenever the answer starts in the last context token.
    idx_start = context_start
    while idx_start <= context_end and offsets[idx_start][0] <= start_char:
        idx_start += 1

    idx_end = context_end
    while idx_end >= context_start and offsets[idx_end][1] >= end_char:
        idx_end -= 1

    return idx_start - 1, idx_end + 1


def preprocess_function(examples):
    """Tokenize a batch of SQuAD examples and attach answer token labels.

    Each question/context pair is encoded to a fixed length of 512 tokens
    (only the context is truncated). The first gold answer of every example
    is converted from a character span into start/end token indices.

    Args:
        examples: Batch mapping with "question", "context" and "answers"
            columns; each answers entry has "answer_start" and "text" lists.

    Returns:
        The tokenizer output, without "offset_mapping" and with
        "start_positions" and "end_positions" tensors added. Examples whose
        answer is missing or not fully inside the context window are labelled
        (0, 0), i.e. the first token ([CLS] for BERT).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors="pt"
    )

    print(f"Input IDs shape: {inputs['input_ids'].shape}")
    print(f"Attention Mask shape: {inputs['attention_mask'].shape}")
    print(f"Tokenized input IDs: {inputs['input_ids'][0][:10]}")

    # Convert once to nested Python lists so the span search compares plain
    # ints instead of creating a 0-dim tensor for every comparison.
    offset_mapping = inputs.pop("offset_mapping").tolist()
    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        if not answer["answer_start"]:
            # No gold answer for this example: use the "no answer" label.
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        start_token, end_token = _locate_answer_tokens(
            offsets, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    print(f"Start positions: {start_positions[:5]}")
    print(f"End positions: {end_positions[:5]}")

    inputs.update({"start_positions": torch.tensor(start_positions), "end_positions": torch.tensor(end_positions)})
    return inputs


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from tqdm import tqdm

# Tokenize the validation subset in batches; the original columns are removed
# so only the outputs of preprocess_function remain.
tokenized_validation = validation_subset.map(
    preprocess_function,
    batched=True,
    remove_columns=validation_subset.column_names,
)
validation_dataset = tokenized_validation.with_format("torch")

# Iterate the examples in dataset order, 8 per batch.
validation_dataloader = DataLoader(
    validation_dataset,
    sampler=SequentialSampler(validation_dataset),
    batch_size=8,
)


In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to that device and put it in evaluation mode. Both calls
# return the model, so it is still the cell's last expression.
model.to(device).eval()


In [None]:
def get_answer(question, context):
    """Predict the answer span for one question/context pair.

    The pair is tokenized, run through the global ``model`` on ``device``,
    and the tokens between the highest-scoring start and end positions are
    decoded back to text.

    Args:
        question: Question string.
        context: Passage string in which to look for the answer.

    Returns:
        The decoded answer string. It is empty when the predicted end
        position falls before the predicted start position.
    """
    # Same call form as preprocess_function; encode_plus is the legacy
    # spelling of calling the tokenizer directly.
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    input_ids = inputs["input_ids"].tolist()[0]

    print(f"Encoded input IDs: {input_ids[:10]}")
    print(f"Inputs shapes: {[inputs[k].shape for k in inputs]}")

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    print(f"Start logits shape: {answer_start_scores.shape}")
    print(f"End logits shape: {answer_end_scores.shape}")
    print(f"Start logits: {answer_start_scores[0][:10]}")
    print(f"End logits: {answer_end_scores[0][:10]}")

    # Plain ints for slicing; the end index is made exclusive with +1.
    answer_start = int(torch.argmax(answer_start_scores))
    answer_end = int(torch.argmax(answer_end_scores)) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer


In [None]:

# Compare the model's prediction with the first reference answer for the
# first ten examples of the validation subset.
for i in range(10):
    example = validation_subset[i]
    question = example["question"]
    context = example["context"]
    real_answer = example["answers"]["text"][0]
    predicted_answer = get_answer(question, context)

    print(
        f"Example {i+1}:",
        f"Question: {question}",
        f"Context: {context[:200]}...",
        f"Real Answer: {real_answer}",
        f"Predicted Answer: {predicted_answer}",
        "="*50,
        sep="\n",
    )


In [None]:

# Try the model on a hand-written question/context pair.
question = "What is the capital of Germany?"
context = "Germany is a country in Central Europe. Its capital is Berlin, which is also the largest city in the country. Berlin is known for its art scene and modern landmarks."

predicted_answer = get_answer(question, context)

print(
    f"Question: {question}",
    f"Context: {context}",
    f"Predicted Answer: {predicted_answer}",
    sep="\n",
)


In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(preds, labels):
    """Score start and end token predictions separately with accuracy_score.

    Args:
        preds: Pair ``(start_pred, end_pred)`` of predicted token indices.
        labels: Pair ``(start_labels, end_labels)`` of reference token indices.

    Returns:
        Dict with the keys "start_accuracy" and "end_accuracy".
    """
    (predicted_starts, predicted_ends), (gold_starts, gold_ends) = preds, labels
    return {
        "start_accuracy": accuracy_score(gold_starts, predicted_starts),
        "end_accuracy": accuracy_score(gold_ends, predicted_ends),
    }

# Evaluation mode for the scoring pass.
model.eval()

# Per-example predictions and reference labels collected across all batches.
all_start_preds = []
all_end_preds = []
all_start_labels = []
all_end_labels = []

for batch in tqdm(validation_dataloader, desc="Evaluating"):
    with torch.no_grad():
        # Batches are expected to be (batch, seq_len) already, since
        # preprocessing pads every example to one fixed length, so no
        # squeeze is applied.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Segment ids mark which tokens belong to the question and which to
        # the context. get_answer passes them via **inputs; leaving them out
        # here would make the model treat the whole input as one segment.
        token_type_ids = batch['token_type_ids'].to(device)
        # Labels are only compared on the CPU, so they are not moved to the
        # device.
        start_positions = batch['start_positions']
        end_positions = batch['end_positions']

        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Highest-scoring token index per example.
        start_preds = torch.argmax(start_logits, dim=1).cpu().numpy()
        end_preds = torch.argmax(end_logits, dim=1).cpu().numpy()

        all_start_preds.extend(start_preds)
        all_end_preds.extend(end_preds)
        all_start_labels.extend(start_positions.numpy())
        all_end_labels.extend(end_positions.numpy())

metrics = compute_metrics((all_start_preds, all_end_preds), (all_start_labels, all_end_labels))
print(metrics)
