In [1]:
# AIT 526-001
# 03/26/2025 
# Team 2: Yasser Jahgoori, Andrej Paskalov, Yaseen Trombati
# Programming Assignment #2 Question Answering 

In [2]:
# imports
import wikipediaapi
import wikipedia
from wikipedia.exceptions import DisambiguationError, PageError
import spacy
import en_core_web_sm
from spacy import displacy
from collections import Counter
from pprint import pprint
import re
import sys
import random

import json
import logging
import nbformat

In [3]:
nlp = spacy.load("en_core_web_sm")
# Define a user-agent according to Wikipedia's policy
user_agent = "MyWikipediaScraper/1.0 (Contact: your-email@example.com)"  # Update with your details

# Initialize Wikipedia API with user agent
wiki = wikipediaapi.Wikipedia(user_agent=user_agent, language="en")

# Stores logs
log_file = []
    
def log(question, search, raw_results, answer):
    log_entry = {
        "Question": question,
        "Wikipedia Search": search,
        "Raw Wikipedia Results": raw_results,
        "Answer": answer
    }
    log_file.append(log_entry)

In [4]:
def fetch_wikipedia_content(subject):
    """
    Fetches the Wikipedia page for the given subject using a search-first approach.
    Returns a tuple (summary, full_text, page_count).
    """
    page_count = 0  # Track number of pages processed

    try:
        # Search for Wikipedia articles using the subject
        search_results = wikipedia.search(subject)
        if not search_results:
            print(f"No search results found for: {subject}")
            return None, None, page_count  # No results, return None

        # Get the first relevant page
        for result in search_results:
            try:
                page = wiki.page(result)
                page_count += 1
                full_text = getattr(page, 'content', None) or page.summary
                print(f"Page found: {page.title}")
                return page.summary, full_text, page_count
            except DisambiguationError as e:
                print(f"Disambiguation error on {result}, trying alternatives...")
                continue  # Skip disambiguation pages
            except PageError:
                print(f"PageError: {result} does not exist, trying next...")
                continue  # Skip non-existent pages

    except Exception as ex:
        print(f"An error occurred: {ex}")

    return None, None, page_count  # If all attempts fail, return None

############################################
#         Question Parsing                 #
############################################

def extract_question_main_elements(question):
    # Grammar Check (guardrails)
    question_eval = check_question_grammar(question)

    if question_eval != "VALID QUESTION":
        return None, None, None, None, question_eval

    
    # Clean the question text
    cleaned = re.sub(r"[?!.,]", " ", question).strip()
    words = cleaned.split()

    # Identify question type (Who, What, When, Where)
    question_type = words[0].capitalize()
    if question_type not in ["Who", "What", "When", "Where"]:
        return None, None, None, None

    doc = nlp(question)
    
    keywords = []
    subject_tokens = []

    """ 
    If question has pattern: NOUN+ADP+NOUN or NOUN+ADP+ADJ+NOUN
        then detect NER that is GPE/ORG/MISC/WORK_OF_ART.
        If no NER (GPE/ORG/MISC/WORK_OF_ART) detected then go as usual.
    """  
    # Identify special NERs
    for token in doc:
        if token.pos_ == "NOUN" and token.nbor(1).pos_ == "ADP":
            # Check if it's NOUN + ADP + NOUN or NOUN + ADP + ADJ + NOUN
            if token.nbor(2).pos_ in ["NOUN", "ADJ"]:
                # Look for NER tags: GPE, ORG, MISC, WORK_OF_ART
                for ent in doc.ents:
                    if ent.label_ in ['GPE', 'ORG', 'MISC', 'WORK_OF_ART']:
                        # store as keyword and subject and remove from question
                        keywords.append(ent.text)
                        subject_tokens.append(ent.text)
                        doc_text = doc.text.replace(ent.text, '')  # Remove entity from the text
                        doc = nlp(doc_text)  # Recreate the Doc object
    
    # Get Topic using NER (for SEARCH ENGINE)
    keywords.extend([ent.text for ent in doc.ents])
    topic = " ".join(keywords)
    
    # Get Verbs using POS
    verb_tokens = [token.text for token in doc if token.pos_ in ["AUX", "VERB"]]
    verb = " ".join(verb_tokens).strip()

    # Subject using POS (** currently using for SEARCH **)
    subject_tokens.extend([token.text for token in doc if token.pos_ in ["DET", "NOUN", "PROPN", "ADJ"]])
    subject = " ".join(subject_tokens).strip()

    #print(f"Type: {question_type}, Verb: {verb}, Subject: {subject}, Search: {topic}")

    return question_type, topic, verb, subject, question_eval

############################################
#     BONUS: Check Question Grammar        #
############################################

def check_question_grammar(question):
    # Strip punctuation (use spaCy to handle this better)
    cleaned = re.sub(r"[^\w\s]", " ", question).strip()  # Remove all punctuation

    # Process the cleaned question with spaCy
    doc = nlp(cleaned)

    # Check if the question has more than two words
    if len(doc) < 3:
        return "INVALID QUESTION: The question must have more than two words."
    
    # Check if the first word is one of the valid WH-question words
    if doc[0].text.lower() not in ["what", "when", "where", "who"]:
        return "INVALID QUESTION: The question must start with What, When, Where, or Who."
        
    # Check if the second word is an auxiliary verb (AUX) or main verb (VERB)
    if doc[1].pos_ not in ["AUX", "VERB"]:
        return "INVALID QUESTION: The second word must be an auxiliary verb (AUX) or main verb (VERB)."

    # Check for a capitalized NOUN or PROPN
    has_noun_propn = any(token.pos_ in ["NOUN", "PROPN"] and token.text.istitle() for token in doc)
    
    if not has_noun_propn:
        return "INVALID QUESTION: The question must contain a capitalized NOUN or PROPER NOUN."

    return "VALID QUESTION"

############################################
#         Extract Keywords                 #
############################################

def extract_keywords(question):
    
    # Remove punctuation and tokenize
    question = re.sub(r'[^\w\s]', '', question)  # Remove punctuation
    words = question.split()  # Split into words
    
    # Drop the first word
    words = words[1:]

    # Apply spaCy NLP processing
    doc = nlp(" ".join(words))  # Convert cleaned text to spaCy Doc object
    
    # Filter words by POS (NOUN, PROPN, ADJ, VERB)
    keywords = [token.text for token in doc if token.pos_ in {"NOUN", "PROPN", "VERB"}]

    return keywords

############################################
#         Extract Date Answer              #
############################################

def extract_date(text, topic, question):    
    
    date_patterns = [
        r"\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b",  # Matches 12/25/2024, 25-12-2024
        r"\b(?:\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{2,4})\b",  # Matches 25 December 2024
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{2,4}\b",  # Matches December 25, 2024
        r'\b\d{4}[-–]\d{4}\b',  # Matches 1999-2000, 1990–2000
        r"\b\d{4}\b"  # Matches standalone years like "2024"
    ]

    # Combine into one regex
    date_regex = re.compile("|".join(date_patterns), re.IGNORECASE)

    # Get list of keywords
    keywords = extract_keywords(question)
    
    # Search for keywords in the text and find date nearby
    for keyword in keywords:
        match = re.search(rf"(.{{0,100}}\b{keyword}\b.{{0,100}})", text, re.IGNORECASE)
        if match:
            surrounding_text = match.group(1)
            date_match = date_regex.search(surrounding_text)
            if date_match:
                return date_match.group()

    # If keyword search fails, scan for date
    date_match = date_regex.search(text)
    if date_match:
        return date_match.group()
    
    return None   

############################################
#         Extract Location Answer          #
############################################

def extract_location(text, topic):
    text = re.sub(r'\(.*?\)', '', text) # removes all content inside parenthesis
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ["GPE", "LOC"]:
            # Avoid returning the topic itself as the location.
            if ent.text.lower() != topic.lower():
                return ent.text
    # look for a phrase following 'in' or 'at'
    match = re.search(r'\b(?:in|at)\s+([A-Z][\w\s,]+)', text)
    if match:
        loc = match.group(1).strip()
        loc = re.sub(r'\s+United States.*', '', loc)
        if loc.lower() != topic.lower():
            return loc
    return None

############################################
#         Extract Individual               #
############################################

def extract_individual(text, topic):
    text = re.sub(r'\(.*?\)', '', text) # removes all content inside parenthesis
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ["PERSON"]:
            # Avoid returning the topic itself as the location.
            if ent.text.lower() != topic.lower():
                return ent.text
    # look for a phrase following 'in' or 'at'
    match = re.search(r'\b(?:in|at)\s+([A-Z][\w\s,]+)', text)
    if match:
        loc = match.group(1).strip()
        loc = re.sub(r'\s+United States.*', '', loc)
        if loc.lower() != topic.lower():
            return loc
    return None

############################################
#         Verbage Variations               #
############################################

def check_verb(verb):
    # Parse the verb phrase using spaCy
    doc = nlp(verb)

    # If there is only one VERB and it is past tense (excluding some verbs), then add was before and by after. 
    if len(doc) == 1 and doc[0].tag_ in ["VBD", "VBN"] and not doc[0].text in ["did", "was"] :
        return f"was {doc[0].text} by"

    # Identify the positions of the auxiliary verb (AUX) and the verbs (VERBs)
    new_phrase = []
    prev_token = None
    
    # Iterate through tokens in the parsed document
    for i, token in enumerate(doc):
        # Add the current token to the new phrase list
        new_phrase.append(token.text)
        
        # Check if the previous token is AUX and the current token is VERB
        # and there is another VERB after that
        if prev_token and prev_token.pos_ == "AUX" and token.pos_ == "VERB":
            # If the next token is also a VERB, add "to"
            if i + 1 < len(doc) and doc[i + 1].pos_ == "VERB":
                new_phrase.append("to")
        
        # Update the previous token to the current token
        prev_token = token
    
    # Join the modified tokens into a single string
    return " ".join(new_phrase)

def subject_verb(subject, verb):
    # Parse the subject with spaCy
    doc = nlp(subject)
    
    # Find NERs in the document
    ner_entities = [ent for ent in doc.ents if ent.label_ in ['GPE', 'ORG', 'MISC', 'WORK_OF_ART', 'PERSON']]

    # Check if a PERSON entity is present in the subject and verb phrase
    has_person = any(ent.label_ == "PERSON" for ent in doc.ents)
    
    # Ensure there are exactly two NERs
    if len(ner_entities) == 2:
        # Get the text of the two NERs
        ner_1 = ner_entities[0].text
        ner_2 = ner_entities[1].text
        
        # Switch their positions and insert the verb between them
        #result = f"{ner_2} {verb} {ner_1}"
        result = f"{ner_1} {verb} {ner_2}"
    else:
        # If there are not exactly two NERs, insert the verb after the original subject
        result = f"{subject} {verb}"
        
    return result, has_person

############################################
#         Process the Question             #
############################################

def process_question(question):
    question_type, topic, verb, subject, question_eval = extract_question_main_elements(question)
    if not question_type:
        return question_eval
    
    summary, wiki_text, page_count = fetch_wikipedia_content(subject) # changed from topic
    if not wiki_text:
        return f"I am sorry, I couldn't find relevant information about {topic}."
    
    # Use the full summary as the extracted answer.
    answer = summary
    response = generate_response(question, topic, verb, subject, answer, question_type, summary, wiki_text)

    log(question, subject, wiki_text, response)
    
    return response

############################################
#         Response Generation              #
############################################

def generate_response(question, topic, verb, subject, answer, question_type, summary, wiki_text):
    if not answer:
        return "I am sorry, I don't know the answer."

    # check and adjust verb grammar
    verb = check_verb(verb)

    # merge subject and verb, and check if subject contains PERSON
    sub_verb, has_person = subject_verb(subject, verb)

    # Capitalize the first letter of the answer
    sub_verb = sub_verb[0].upper() + sub_verb[1:] if sub_verb else sub_verb

    # Extract elements to be used for each type of Wh- question
    location = extract_location(wiki_text, topic)
    date = extract_date(wiki_text, topic, question)
    individual = extract_individual(wiki_text, topic)
    
    if question_type == "Where":
        if location:
            return f"{sub_verb} in {location}."
    elif question_type == "When":
        if date:
            return f"{sub_verb} on {date}." 
    # Who question that does not contain a PERSON's name
    elif question_type == "Who" and individual and not has_person:
        return f"{sub_verb} {individual}."
    else:
        # Else returns the first sentence of the summary. for What question or Who question that contains a PERSON's name
        doc = nlp(summary)
        first_sentence = list(doc.sents)[0].text.strip()

        # Remove any content inside parentheses
        first_sentence = re.sub(r'\([^)]*\)', '', first_sentence)
        # Remove extra spaces that might have been introduced
        first_sentence = re.sub(r'\s+', ' ', first_sentence)
        # Capitalize the first letter without lowercasing the rest
        first_sentence = first_sentence[0].upper() + first_sentence[1:]
        
        return first_sentence

In [5]:
############################################
#         Question-Answer                  #
############################################

if __name__ == "__main__":
    print("*** This is a QA system. It will try to answer Who, What, When, and Where questions. ***")
    print("Enter 'exit' to leave the program.")
    while True:
        user_question = input("=?> ").strip()
        if user_question.lower() == "exit":
            print("Thank you! Goodbye.")
            break
        print("=>", process_question(user_question))

*** This is a QA system. It will try to answer Who, What, When, and Where questions. ***
Enter 'exit' to leave the program.


=?>  What is Uranium?


Page found: Uranium
=> Uranium is a chemical element with the symbol U and atomic number 92.


=?>  What is the Elephant's Foot?


Page found: Elephant's Foot (Chernobyl)
=> The Elephant's Foot is the nickname given to the large mass of corium beneath Reactor 4 of the Chernobyl Nuclear Power Plant, near Pripyat, Ukraine.


=?>  Where was the Eiffel Tower built?


Page found: Eiffel Tower
=> The Eiffel Tower was built in Paris.


=?>  Where is the Charles Bridge located?


Page found: Charles Bridge
=> The Charles Bridge is located in Prague.


=?>  When was the Mona Lisa painted?


Page found: Mona Lisa
=> The Mona Lisa was painted on 1503.


=?>  When was the Battle of Kursk?


Page found: Battle of Kursk
=> The Battle Kursk was on 1943.


=?>  When was Neptune discovered?


Page found: Neptune
=> Neptune was discovered on 23 September 1846.


=?>  Who painted the Mona Lisa?


Page found: Mona Lisa
=> The Mona Lisa was painted by Leonardo da Vinci.


=?>  Who is the Nvidia CEO?


Page found: Nvidia GTC
=> The Nvidia CEO is Jensen Huang.


=?>  Who directed Fullmetal Jacket?


Page found: Full Metal Jacket
=> Fullmetal Jacket was directed by Stanley Kubrick.


=?>  What is the largest Planet in the Solar System?


Page found: List of gravitationally rounded objects of the Solar System
=> This is a list of most likely gravitationally rounded objects of the Solar System, which are objects that have a rounded, ellipsoidal shape due to their own gravity .


=?>  What is the currency used in Japan?


Page found: Japanese currency
=> Japanese currency has a history covering the period from the 8th century CE to the present.


=?>  What is the largest ocean on Earth?


Page found: Ocean
=> The ocean is the body of salt water that covers approximately 70.8% of Earth.


=?>  Where is the Sahara Desert located?


Page found: Sahara
=> The Sahara Desert is located in Sahara.


=?>  Where is the tallest Mountain in the world?


Page found: List of highest mountains on Earth
=> The tallest Mountain the world is in Earth.


=?>  Where is the oldest University in the world?


Page found: List of oldest universities in continuous operation
=> The oldest University the world is in Europe.


=?>  When was the Internet invented?


Page found: Internet
=> The Internet was invented on 2006.


=?>  When did the Berlin Wall fall?


Page found: Berlin Wall
=> The Berlin Wall did fall on 22 December 1989.


=?>  Who discovered Pluto?


Page found: Pluto
=> Pluto was discovered by Eris.


=?>  Who invented the Airplane?


Page found: Airplane!
=> The Airplane was invented by Jim Abrahams.


=?>  What is Plutonium?


Page found: Plutonium
=> Plutonium is a chemical element; it has symbol Pu and atomic number 94.


=?>  What is String Theory?


Page found: String theory
=> In physics, string theory is a theoretical framework in which the point-like particles of particle physics are replaced by one-dimensional objects called strings.


=?>  What is a Regolith?


Page found: Regolith
=> Regolith is a blanket of unconsolidated, loose, heterogeneous superficial deposits covering solid rock.


=?>  What is a Hummock?


Page found: Hummock
=> In geology, a hummock is a small knoll or mound above ground.


=?>  What is a Bicycle?


Page found: Bicycle
=> A bicycle, also called a pedal cycle, bike, push-bike or cycle, is a human-powered or motor-assisted, pedal-driven, single-track vehicle, with two wheels attached to a frame, one behind the other.


=?>  Where is the Red Forest located?


Page found: Red Forest
=> The Red Forest is located in the Exclusion Zone.


=?>  Where was the Sistine Chapel painted?


Page found: Sistine Chapel
=> The Sistine Chapel was painted in the Apostolic Palace.


=?>  Where is the Apostolic Palace?


Page found: Apostolic Palace
=> The Apostolic Palace is in Vatican City.


=?>  Where is the Colloseum?


Page found: Colosseum
=> The Colloseum is in Rome.


=?>  Where was the Taj Mahal built?


Page found: Taj Mahal
=> The Taj Mahal was built in Agra.


=?>  When was the Berlin Wall built?


Page found: Berlin Wall
=> The Berlin Wall was built on 1961.


=?>  When did the French Revolution start?


Page found: French Revolution
=> The French Revolution did start on 1789.


=?>  When was the Battle of Hastings?


Page found: Battle of Hastings
=> The Battle Hastings was on 14 October 1066.


=?>  When was the Wheel created?


Page found: Wheel
=> None


=?>  When did the Industrial Revolution start?


Page found: Industrial Revolution
=> The Industrial Revolution did start on 1760.


=?>  Who was the first President of the United States?


Page found: President of the United States
=> The first President the United States was George Washington.


=?>  Who was Thomas Edison?


Page found: Thomas Edison
=> Thomas Alva Edison was an American inventor and businessman.


=?>  Who built the Eiffel Tower?


Page found: Eiffel Tower
=> The Eiffel Tower was built by Gustave Eiffel.


=?>  Who was Jiang Zamin?


Page found: Jiang Zemin
=> Jiang Zemin was a Chinese politician who served as general secretary of the Chinese Communist Party from 1989 to 2002, as chairman of the Central Military Commission from 1989 to 2004, and as president of China from 1993 to 2003.


=?>  Who invented the Toilet Paper?


Page found: Toilet paper
=> The Toilet Paper was invented by China in the 6th century AD, with specifically manufactured toilet paper being mass.


=?>  exit


Thank you! Goodbye.


In [7]:
# Display All the entire Log file (list)
display(log_file)

[{'Question': 'What is Uranium?',
  'Wikipedia Search': 'Uranium',
  'Raw Wikipedia Results': 'Uranium is a chemical element with the symbol U and atomic number 92. It is a silvery-grey metal in the actinide series of the periodic table. A uranium atom has 92 protons and 92 electrons, of which 6 are valence electrons. Uranium radioactively decays, usually by emitting an alpha particle. The half-life of this decay varies between 159,200 and 4.5 billion years for different isotopes, making them useful for dating the age of the Earth. The most common isotopes in natural uranium are uranium-238 (which has 146 neutrons and accounts for over 99% of uranium on Earth) and uranium-235 (which has 143 neutrons). Uranium has the highest atomic weight of the primordially occurring elements. Its density is about 70% higher than that of lead and slightly lower than that of gold or tungsten. It occurs naturally in low concentrations of a few parts per million in soil, rock and water, and is commercial

In [8]:
# FOR TESTING PURPOSES ONLY
"""
def extract_pos_from_question(question):
    # Process the question with spaCy
    doc = nlp(question)
    
    # Extract POS tags for each token
    pos_tags = [(token.text, token.pos_) for token in doc]

    topic = [token.text for token in doc if token.pos_ in ["DET", "NOUN", "PROPN", "ADJ", "ADP"]]
    topic = " ".join(topic).strip()

    verb = [token.text for token in doc if token.pos_ in ["VERB", "AUX"]]
    verb = " ".join(verb).strip()

    # did + verb = verb past tense

    # noun + verb + noun = topic?

    # between the first and last noun
    
    return pos_tags, topic, verb

# Example question
question = "I want to sleep."

# Extract POS tags
pos_tags, topic, verb = extract_pos_from_question(question)

# Output the POS tags
for word, pos in pos_tags:
    print(f"{word}: {pos}")

print(f"{topic} {verb}")
"""

'\ndef extract_pos_from_question(question):\n    # Process the question with spaCy\n    doc = nlp(question)\n    \n    # Extract POS tags for each token\n    pos_tags = [(token.text, token.pos_) for token in doc]\n\n    topic = [token.text for token in doc if token.pos_ in ["DET", "NOUN", "PROPN", "ADJ", "ADP"]]\n    topic = " ".join(topic).strip()\n\n    verb = [token.text for token in doc if token.pos_ in ["VERB", "AUX"]]\n    verb = " ".join(verb).strip()\n\n    # did + verb = verb past tense\n\n    # noun + verb + noun = topic?\n\n    # between the first and last noun\n    \n    return pos_tags, topic, verb\n\n# Example question\nquestion = "I want to sleep."\n\n# Extract POS tags\npos_tags, topic, verb = extract_pos_from_question(question)\n\n# Output the POS tags\nfor word, pos in pos_tags:\n    print(f"{word}: {pos}")\n\nprint(f"{topic} {verb}")\n'

In [9]:
# References:

# [1] Better Stack. (n.d.). JSON logging: The ultimate guide. Better Stack Community. Retrieved March 26, 2025, from https://betterstack.com/community/guides/logging/json-logging/
# [2] Explosion AI. (n.d.). DisplaCy: Named entity recognition visualization. Retrieved March 26, 2025, from https://demos.explosion.ai/displacy-ent
# [3] Explosion AI. (n.d.). Named entity recognition with spaCy. spaCy. Retrieved March 26, 2025, from https://spacy.io/usage/linguistic-features#named-entities
# [4] Explosion AI. (n.d.). Part-of-speech tagging with spaCy. spaCy. Retrieved March 26, 2025, from https://spacy.io/usage/linguistic-features#pos-tagging
# [5] GeeksforGeeks. (2021, January 21). Wikipedia module in Python. GeeksforGeeks. Retrieved March 26, 2025, from https://www.geeksforgeeks.org/wikipedia-module-in-python/
# [6] Wikipedia-API. (n.d.). Wikipedia-API. PyPI. Retrieved March 26, 2025, from https://pypi.org/project/Wikipedia-API/