In [None]:
# AnswerResult will hold the bert answers for ease of scalability
class AnswerResult:
    """A candidate answer produced by the QA model, with its confidence score.

    Attributes:
        answer: The answer text (or the sentinel "NO ANSWER FOUND").
        score: Confidence score attached to the answer (defaults to 0.0).
        source_sentence: Optional sentence the answer was taken from, or None.
    """

    # Sentinel text used to mark "the model found nothing usable".
    NO_ANSWER_TEXT = "NO ANSWER FOUND"

    def __init__(self, answer, score=0.0, source_sentence=None):
        self.answer = answer
        self.score = score
        # Previously accepted but silently dropped; store it so the attribute
        # always exists on every instance.
        self.source_sentence = source_sentence

    def __repr__(self):
        # The original format string ended with an unbalanced quote after the score.
        return f"AnswerResult(text='{self.answer}', score={self.score:.4f})"

    def is_valid(self):
        """Return True when the answer is non-empty and is not the no-answer sentinel."""
        # bool(...) guarantees a real boolean even when answer is "" or None.
        return bool(self.answer) and self.answer.upper() != self.NO_ANSWER_TEXT

    def get_answer(self):
        """Return the raw answer text."""
        return self.answer

    @staticmethod
    def no_answer():
        """Build the sentinel result that represents 'no answer found'."""
        return AnswerResult(AnswerResult.NO_ANSWER_TEXT, 0.0, None)


# Import the model

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the model and CSV paths used in later
# cells live under this mount point, so this cell must run first.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Directory on the mounted Drive that holds the saved question-answering checkpoint.
# NOTE(review): the path is hard-coded and the first folder name ends with a
# trailing space ("Intro to NLP /") — presumably this matches the Drive folder; confirm.
model_path = "/content/drive/MyDrive/CISC 688 - Intro to NLP /Project - CISC 688 - Intro to NLP/NLP_BERT_Model"
# Tokenizer and QA model are both loaded from that same directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

# Test Set

In [None]:
# Evaluation questions (general-knowledge, short factual answers). Each entry is
# passed as-is to both the Wikipedia/BERT pipeline and ChatGPT by the export cell.
dataset = [
    "What is the capital of France?",
    "Who wrote the novel '1984'?",
    "What is the largest planet in the Solar System?",
    "In which year did World War II end?",
    "Who painted the Mona Lisa?",
    "What is the chemical symbol for gold?",
    "Who discovered penicillin?",
    "What is the smallest country in the world by area?",
    "What is the currency of Japan?",
    "Who was the first person to walk on the moon?",
    "What is the tallest mountain on Earth?",
    "Which element has the atomic number 1?",
    "What is the largest mammal on Earth?",
    "Who developed the theory of relativity?",
    "What is the capital city of Canada?",
    "What is the longest river in the world?",
    "Who invented the telephone?",
    "What is the official language of Brazil?",
    "Which country hosted the 2016 Summer Olympics?",
    "What is the fastest land animal?",
    "Who is known as the father of modern physics?",
    "Which continent is the Sahara Desert located in?",
    "Who was the first President of the United States?",
    "What is the main ingredient in guacamole?",
    "What is the largest ocean on Earth?",
    "Who directed the movie 'Titanic'?",
    "What is the boiling point of water at sea level in Celsius?",
    "What country is known as the Land of the Rising Sun?",
    "Who wrote 'To Kill a Mockingbird'?",
    "What is the hardest natural substance on Earth?",
    "Who is the author of 'Harry Potter'?",
    "What is the currency used in the United Kingdom?",
    "Who painted the ceiling of the Sistine Chapel?",
    "What is the main gas found in Earth's atmosphere?",
    "Which country is famous for the Great Wall?",
    "Who discovered gravity according to legend?",
    "What is the largest desert in the world?",
    "What is the capital of Australia?",
    "Who is the Greek god of the sea?",
    "What is the square root of 64?",
    "What is the longest bone in the human body?",
    "Who composed the 'Fifth Symphony'?",
    "What is the largest island in the world?",
    "What is the smallest prime number?",
    "Who was the first woman to win a Nobel Prize?",
    "Which planet is known as the Red Planet?",
    "What is the largest continent by area?",
    "Who invented the World Wide Web?",
    "What is the primary language spoken in Egypt?",
    "What is the capital of Italy?",
]

# Run Inference, convert span to text

In [None]:
!pip install wikipedia



# BERT span scoring and chunked inference

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

def find_sentence_for_span(span_text, context):
    """Placeholder for sentence lookup; currently always returns None.

    Args:
        span_text: Answer span text to locate (unused while the lookup is disabled).
        context: Full context string the span came from (unused while disabled).

    Returns:
        None, unconditionally.
    """
    # Disabled implementation, kept for reference. It was meant to return the
    # first spaCy sentence of `context` that contains `span_text`, or None if
    # no sentence contains it:
    # doc = nlp(context)
    # for sent in doc.sents:
    #     if span_text in sent.text:
    #         return sent.text.strip()
    return None

In [None]:
import torch
import torch.nn.functional as F

def score_spans_in_chunk(ids, offsets, sequence_ids, start_probs, end_probs, context, tokenizer,
                         min_score=0.2, max_span_tokens=15):
    """Pick the highest-scoring answer span inside one tokenized chunk.

    Every (start, end) token pair with ``end - start < max_span_tokens`` is
    scored as ``start_probs[start] * end_probs[end] * penalty``. Only tokens
    whose sequence id is 1 are eligible as span boundaries.

    Args:
        ids: Token ids of the chunk; only its length is used here.
        offsets: Per-token ``(start_char, end_char)`` pairs indexing into ``context``.
        sequence_ids: Per-token sequence id; entries other than 1 are skipped.
        start_probs: Per-token probability of being the span start.
        end_probs: Per-token probability of being the span end.
        context: Text the character offsets refer to.
        tokenizer: Unused; kept so existing callers keep working.
        min_score: A span must score strictly above this to be returned
            (default 0.2, the previously hard-coded threshold).
        max_span_tokens: Maximum span length in tokens (default 15, the
            previously hard-coded limit).

    Returns:
        The best AnswerResult, or ``AnswerResult.no_answer()`` if no span
        beats ``min_score``. The answer text is stripped and title-cased.
    """
    num_tokens = len(ids)
    best_score = min_score
    best_result = AnswerResult.no_answer()

    for start_idx in range(num_tokens):
        # Hoisted out of the inner loop: an ineligible start token can never
        # open a span, so there is no point visiting any of its end tokens.
        if sequence_ids[start_idx] != 1:
            continue
        start_offset = offsets[start_idx]
        if start_offset is None or start_offset[0] is None:
            continue
        start_char = start_offset[0]

        for end_idx in range(start_idx, min(start_idx + max_span_tokens, num_tokens)):
            if sequence_ids[end_idx] != 1:
                continue
            end_offset = offsets[end_idx]
            if end_offset is None or end_offset[1] is None:
                continue
            end_char = end_offset[1]

            span_length = end_idx - start_idx + 1
            # Spans of 5+ tokens are down-weighted in proportion to their share
            # of the chunk length; shorter spans are not penalised.
            penalty = 1.0 if span_length < 5 else 1.0 - (span_length / num_tokens)
            score = start_probs[start_idx] * end_probs[end_idx] * penalty

            # Require a non-empty character range as well as a better score.
            if score > best_score and start_char < end_char:
                text = context[start_char:end_char].strip().lower().title()
                best_score = score
                # float() accepts both 0-dim tensors and plain Python numbers.
                best_result = AnswerResult(text, float(score))

    if best_result.is_valid():
        best_result.source_sentence = find_sentence_for_span(best_result.answer, context)
    return best_result


In [None]:
def process_chunk(ids, attention_mask, token_type_ids, sequence_ids, offsets, context, tokenizer, model):
    """Run the QA model on a single chunk and return its best answer span.

    The three 1-D inputs are given a leading batch dimension, the model is run
    without gradient tracking, and the start/end logits are turned into
    probabilities with a softmax before span scoring. A valid result is also
    printed.
    """
    with torch.no_grad():
        # The model is called on a batch of exactly one chunk.
        batch = {
            "input_ids": ids.unsqueeze(0),
            "attention_mask": attention_mask.unsqueeze(0),
            "token_type_ids": token_type_ids.unsqueeze(0),
        }
        prediction = model(**batch)

    # Drop the batch dimension again and normalise over token positions.
    start_distribution = F.softmax(prediction.start_logits[0], dim=0)
    end_distribution = F.softmax(prediction.end_logits[0], dim=0)

    chunk_answer = score_spans_in_chunk(
        ids, offsets, sequence_ids, start_distribution, end_distribution, context, tokenizer
    )

    if chunk_answer.is_valid():
        print(chunk_answer)
    return chunk_answer

In [None]:
def answer_long_context_with_logs(question, context, tokenizer, model, max_length=384, doc_stride=128):
    """
    Split a long context into overlapping chunks and collect each chunk's answer.

    The tokenizer is asked for overflowing chunks of at most ``max_length``
    tokens with a stride of ``doc_stride``; only the context (second sequence)
    is truncated. Every chunk is passed to ``process_chunk`` and the valid
    results are returned as a list, in chunk order (empty if none are valid).
    """
    encoded = tokenizer(
        question,
        context,
        max_length=max_length,
        truncation="only_second",
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors="pt",
        return_token_type_ids=True,
    )

    chunk_ids = encoded["input_ids"]
    chunk_masks = encoded["attention_mask"]
    chunk_offsets = encoded["offset_mapping"]
    chunk_types = encoded["token_type_ids"]

    chunk_count = len(chunk_ids)
    print(f"🔍 Total chunks to evaluate: {chunk_count}")

    valid_answers = []
    for chunk_no in range(chunk_count):
        candidate = process_chunk(
            chunk_ids[chunk_no],
            chunk_masks[chunk_no],
            chunk_types[chunk_no],
            encoded.sequence_ids(chunk_no),
            chunk_offsets[chunk_no],
            context,
            tokenizer,
            model,
        )
        # Chunks that produced no usable answer are dropped.
        if candidate.is_valid():
            valid_answers.append(candidate)

    return valid_answers


# Wikipedia retrieval and answer aggregation

In [None]:
!pip install wikipedia-api



In [None]:
import wikipediaapi

# Setup Wikipedia API client with a proper User-Agent.
# NOTE(review): the contact address in the User-Agent is a placeholder —
# replace it with a real one before running against Wikipedia.
user_agent = "BERT-QA-Pipeline/1.0 (contact: your-email@example.com)"
wiki_api = wikipediaapi.Wikipedia(user_agent=user_agent, language='en')

def get_key_with_max_value(my_dict):
    """Return the key whose value is largest, or None for an empty/missing dict.

    On ties the first such key in iteration order is returned.
    """
    if my_dict:
        top_key, _ = max(my_dict.items(), key=lambda pair: pair[1])
        return top_key
    return None

def ask_question_to_wiki(question: str, tokenizer, model, topN: int = 3):
    """
    Answer a question from the top Wikipedia search hits.

    Searches Wikipedia for ``question`` (at most ``topN`` titles), fetches each
    page through ``wiki_api``, and runs the chunked QA pipeline on every page
    that exists and has at least 50 characters of text. All candidate scores
    are softmax-normalised together and summed per answer string.

    Returns the answer string with the largest summed weight, or None when no
    page produced a valid answer.
    """
    import wikipedia  # Only for search

    candidate_titles = wikipedia.search(question, results=topN)
    print(f"Found the following relevant Wikipedia documents: {candidate_titles}\n")

    collected = []  # every valid AnswerResult across all pages

    for title in candidate_titles:
        print(f"🔍 Attempting to fetch page: {title}")
        page = wiki_api.page(title)

        if page.exists():
            print(f"📄 Page Retrieved: {page.title} (Page ID: {page.pageid})")
            body = page.text.strip()

            if body and len(body) >= 50:
                collected.extend(answer_long_context_with_logs(question, body, tokenizer, model))
            else:
                print(f"⚠️ Skipping '{page.title}' due to insufficient content.")
        else:
            print(f"⚠️ Skipping '{title}' - Page does not exist or is disambiguation.")

    # Normalise all candidate scores jointly so they can be summed per answer.
    raw_scores = torch.tensor([item.score for item in collected])
    weights = torch.nn.functional.softmax(raw_scores, dim=0).tolist()

    totals = {}  # answer string -> accumulated softmax weight
    for item, weight in zip(collected, weights):
        if item.answer in totals:
            totals[item.answer] += weight
        else:
            totals[item.answer] = weight

    if not totals:
        print("❌ No valid answers found in any pages.")
        return None

    winner = get_key_with_max_value(totals)
    print(f"🏆 Best answer: {winner} with score {totals[winner]:.4f}")
    return winner


# Evaluation

In [None]:
!pip install openai



In [None]:
import openai

# API key handed to the OpenAI client. Left as None so that no secret is
# stored in the notebook; fill it in locally or supply it another way.
# NOTE(review): presumably the client falls back to the OPENAI_API_KEY
# environment variable when api_key is None — confirm against the openai docs.
openaiapikey = None  # your_key_here

# Define the OpenAI client first; ask_chatgpt uses this module-level client.
client = openai.OpenAI(api_key=openaiapikey)

def ask_chatgpt(question, model="gpt-3.5-turbo"):
    """Send ``question`` as a single user message and return the stripped reply text."""
    user_message = {"role": "user", "content": question}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
        temperature=0.7,
        max_tokens=500,
    )
    # Only the first returned choice is used.
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()


In [None]:
import csv

# Output file for the question / BERT answer / ChatGPT answer table.
# NOTE(review): hard-coded Drive path; it is only valid after the Drive mount cell has run.
csvpath = "/content/drive/MyDrive/CISC 688 - Intro to NLP /Project - CISC 688 - Intro to NLP/answeredQs.csv"

def export_answers_to_csv(questions, tokenizer, model, csvpath, chatgpt_func):
    """
    Write one pipe-delimited CSV row per question with both systems' answers.

    For each question, ``chatgpt_func(question)`` is called first, then the
    Wikipedia/BERT pipeline. When the pipeline returns None, a fixed fallback
    string is written in its place. The file at ``csvpath`` is overwritten and
    starts with a header row.
    """
    header = ['question', 'BERT_Ans', 'ChatGPT_Ans']

    with open(csvpath, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(header)

        for question in questions:
            print(f"\n❓ Question: {question}")

            # ChatGPT is queried before the Wikipedia/BERT pipeline.
            gpt_reply = chatgpt_func(question)
            wiki_reply = ask_question_to_wiki(question, tokenizer, model)

            # Substitute a readable marker when the pipeline found nothing.
            bert_cell = "No valid Wikipedia answer found" if wiki_reply is None else wiki_reply

            row = [question, bert_cell, gpt_reply]
            print(f"✅ Exporting: {row}")
            writer.writerow(row)

In [None]:
# Put in csv
export_answers_to_csv(dataset, tokenizer, model, csvpath, ask_chatgpt)


❓ Question: What is the capital of France?
Found the following relevant Wikipedia documents: ['Closed-ended question', 'France', 'What Is a Nation?']

🔍 Attempting to fetch page: Closed-ended question
📄 Page Retrieved: Closed-ended question (Page ID: 11517229)
🔍 Total chunks to evaluate: 4
AnswerResult(text='Lyon', score=0.9316')
🔍 Attempting to fetch page: France
📄 Page Retrieved: France (Page ID: 5843419)
🔍 Total chunks to evaluate: 76
AnswerResult(text='Paris', score=0.9906')
AnswerResult(text='Lyon', score=0.4889')
AnswerResult(text='Paris', score=0.2286')
AnswerResult(text='Paris', score=0.9708')
AnswerResult(text='Palace Of Versailles', score=0.3331')
AnswerResult(text='Lisbon', score=0.9907')
AnswerResult(text='Corsica', score=0.2284')
AnswerResult(text='Paris', score=0.9680')
AnswerResult(text='Marseille', score=0.3075')
AnswerResult(text='Canal Du Midi', score=0.4132')
AnswerResult(text='Paris', score=0.6612')
AnswerResult(text='Paris', score=0.5766')
AnswerResult(text='Paris