# In [None]:
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW, pipeline
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from transformers import default_data_collator
from tqdm.auto import tqdm
import random

# Step 1: PDF Text Extraction
def get_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_path: Location of the PDF; handed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output, in page order.
        A document without pages yields an empty string.
    """
    # Opening the document as a context manager guarantees it is closed (and
    # its file handle released) even if extraction fails on some page.
    with fitz.open(file_path) as document:
        # Join once instead of growing a string with ``+=`` page by page.
        return "".join(page.get_text() for page in document)

# Step 2: Load the SQuAD dataset
dataset = load_dataset('squad')

# Keep the run short: shuffle each split with a fixed seed (42) and retain
# only the first 1000 training / 200 validation examples.
train_subset, eval_subset = (
    dataset[split_name].shuffle(seed=42).select(range(subset_size))
    for split_name, subset_size in (("train", 1000), ("validation", 200))
)

# Step 3: Load a pre-trained transformer model
# The tokenizer and the question-answering model are both loaded from the
# same checkpoint name.
model_identifier = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_identifier)
model = AutoModelForQuestionAnswering.from_pretrained(model_identifier)

# Step 4: Tokenize the dataset
def preprocess_data(samples):
    """Tokenize a batch of QA examples and label answer token spans.

    Each question/context pair is encoded with the module-level ``tokenizer``
    (max length 384, only the context is truncated, overlapping windows with
    stride 128, padded to the max length). A long context can therefore
    produce several features for one example.

    Args:
        samples: Batch with ``"question"``, ``"context"`` and ``"answers"``
            columns; each answers entry provides ``"answer_start"`` and
            ``"text"`` lists, of which only the first item is used.

    Returns:
        The tokenizer encodings (without the offset and overflow mappings)
        extended with ``"start_positions"`` and ``"end_positions"``. Features
        with no answer, or whose window does not fully contain the answer,
        are labelled with the position of the CLS token.
    """
    stripped_questions = [question.strip() for question in samples["question"]]
    encodings = tokenizer(
        stripped_questions,
        samples["context"],
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # These two helper mappings are only needed for labelling, so they are
    # removed from what gets returned.
    offset_mappings = encodings.pop("offset_mapping")
    feature_to_sample = encodings.pop("overflow_to_sample_mapping")

    span_starts = []
    span_ends = []

    for feature_idx, offsets in enumerate(offset_mappings):
        token_ids = encodings["input_ids"][feature_idx]
        cls_position = token_ids.index(tokenizer.cls_token_id)
        segment_ids = encodings.sequence_ids(feature_idx)
        answer = samples["answers"][feature_to_sample[feature_idx]]

        # No annotated answer: point both labels at the CLS token.
        if len(answer["answer_start"]) == 0:
            span_starts.append(cls_position)
            span_ends.append(cls_position)
            continue

        # Character span of the first annotated answer within the context.
        answer_begin = answer["answer_start"][0]
        answer_finish = answer_begin + len(answer["text"][0])

        # First and last token belonging to the context (sequence id 1).
        context_first = 0
        while segment_ids[context_first] != 1:
            context_first += 1

        context_last = len(token_ids) - 1
        while segment_ids[context_last] != 1:
            context_last -= 1

        # The answer is not fully inside this window: label with CLS as well.
        if offsets[context_first][0] > answer_begin or offsets[context_last][1] < answer_finish:
            span_starts.append(cls_position)
            span_ends.append(cls_position)
            continue

        # Walk right past every token starting at or before the answer start;
        # the token just before the stopping point is the start token.
        left = context_first
        while left < len(offsets) and offsets[left][0] <= answer_begin:
            left += 1
        span_starts.append(left - 1)

        # Walk left past every token ending at or after the answer end;
        # the token just after the stopping point is the end token.
        right = context_last
        while offsets[right][1] >= answer_finish:
            right -= 1
        span_ends.append(right + 1)

    encodings["start_positions"] = span_starts
    encodings["end_positions"] = span_ends
    return encodings

# Run the preprocessing over both subsets in batches. The original columns are
# dropped so that only the tokenizer outputs and span labels remain.
tokenized_train_data = train_subset.map(
    preprocess_data,
    batched=True,
    remove_columns=train_subset.column_names,
)
tokenized_eval_data = eval_subset.map(
    preprocess_data,
    batched=True,
    remove_columns=eval_subset.column_names,
)

# Step 5: Create DataLoaders
# Only the training loader shuffles; both use batches of 16 features.
train_loader = DataLoader(
    dataset=tokenized_train_data,
    batch_size=16,
    shuffle=True,
    collate_fn=default_data_collator,
)
eval_loader = DataLoader(
    dataset=tokenized_eval_data,
    batch_size=16,
    collate_fn=default_data_collator,
)

# Step 6: Set Up Optimizer and Scheduler
# Use PyTorch's own AdamW: the implementation shipped with transformers is
# deprecated in favour of it. `eps` and `weight_decay` are set explicitly to
# the values the transformers optimizer used by default, so the training
# hyper-parameters stay the same.
# NOTE(review): once nothing else uses it, the `AdamW` name can be dropped
# from the transformers import at the top of the file.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 1  # Reduce the number of epochs for quicker training
total_training_steps = num_epochs * len(train_loader)

# Decay the learning rate by 10x roughly every third of training. The step
# size is clamped to at least 1 because StepLR cannot work with a step size
# of 0, which is what the integer division yields for fewer than 3 steps.
lr_decay_interval = max(1, total_training_steps // 3)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_decay_interval, gamma=0.1)

# Step 7: Training Loop
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

progress_bar = tqdm(range(total_training_steps))

for epoch in range(num_epochs):
    # Switch to training mode at the start of *every* epoch: the evaluation
    # pass at the end of the loop body puts the model in eval mode, so doing
    # this only once before the loop would leave later epochs training in
    # eval mode.
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        # The scheduler is stepped per batch, so its step size is counted in
        # optimizer steps rather than epochs.
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation pass: the eval features carry start/end position labels, so
    # the model returns a loss that can be averaged and reported instead of
    # being thrown away.
    model.eval()
    eval_loss_total = 0.0
    with torch.no_grad():
        for batch in eval_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            eval_loss_total += outputs.loss.item()

    # max(..., 1) avoids a division by zero if the eval loader is empty.
    mean_eval_loss = eval_loss_total / max(len(eval_loader), 1)
    print(f"Epoch {epoch + 1}/{num_epochs} - validation loss: {mean_eval_loss:.4f}")

progress_bar.close()

# Save the fine-tuned model and tokenizer
# Both are written to the same directory so it can be loaded back as a unit.
fine_tuned_dir = "./my-fine-tuned-model"
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

# Step 8: Evaluate the Model
# Build a question-answering pipeline from the files that were just saved.
qa_pipeline = pipeline(
    task='question-answering',
    model=fine_tuned_dir,
    tokenizer=fine_tuned_dir,
)

# Example usage
# The text extracted from the PDF is used as the context for the question.
pdf_file_path = 'example.pdf'  # Replace with the path to your PDF file
user_question = "Who won Super Bowl 50?"
pdf_text = get_text_from_pdf(pdf_file_path)
answer = qa_pipeline(question=user_question, context=pdf_text)
print(f"Answer: {answer['answer']}")
