# Wiki Article Matcher

algorithm:
1. regex replacement operations
2. word tokenize
3. stanford NER and nltk pos
4. Determine pronoun chunking technique
5. if noun/pronoun ambiguous (first result does not match exactly, ask to choose from list)
6. use new nouns/pronouns to find article

https://www.mediawiki.org/wiki/API:Search#GET_request

testing:
- test query preprocessing
- test noun/pronoun/query_list

## I. Import Libraries

In [1]:
# core modules
import os
import re
import json
import string
import requests
import wikipedia
import pandas as pd

# nltk modules
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.tag import StanfordNERTagger

# set environment var with path to stanford-ner.jar
os.environ["CLASSPATH"] = "../models/stanford-ner.jar"

# instantiate stanford ner
stanford_ner = StanfordNERTagger("../models/english.conll.4class.distsim.crf.ser.gz")

## II. Preprocess Query

In [45]:
query = "What religions and idea of thought is heresy cited as being used frequently in?"

In [46]:
def process_query(query, remove_possessive = True, 
                  remove_punctuation = True, 
                  remove_stopwords = True):
    '''
    Peforms string processing such as removing possessive, punctuation, and stopword.
    '''
    if remove_possessive:
        # remove possessive
        query = query.replace("\'s", "")
        query = query.replace("s\'", "")

    if remove_punctuation:
        # remove punctuation
        query = query.translate(str.maketrans("", "", string.punctuation))

    if remove_stopwords:
        # remove stopwords
        eng_stopwords = stopwords.words('english')
        for word in eng_stopwords:
            if re.search(r" {} ".format(word), query):
                query = query.replace(r"{} ".format(word), "")

    # word tokenize query
    query_lst = [word for word in word_tokenize(query)]
    
    return query_lst

In [54]:
query_lst = preprocess_query(query)

## III. Wikipedia API Wrapper

In [48]:
class WikipediaAPI:
    url = "https://en.wikipedia.org/w/api.php"
    
    @staticmethod
    def find_titles_by_text(query, limit = 10):
        params = {
            "srsearch": query,
            "srlimit": limit,
            "srwhat" : "text",
            "action": "query",
            "format": "json",
            "list": "search",
        }
        res = requests.get(WikipediaAPI.url, params)
        return res.json()

    @staticmethod
    def has_near_match(query):
        params = {
            "srsearch": query,
            "srwhat" : "nearmatch",
            "action": "query",
            "format": "json",
            "list": "search",
        }
        res = requests.get(WikipediaAPI.url, params)
        return len(res.json()["query"]["search"]) > 0
    
    @staticmethod
    def find_titles_by_query(query, results = 10):
        return wikipedia.search(query, results)

## IV. Chunking

In [59]:
def get_consec_ner(lst):
    '''
    Groups nouns based on consecutive named-entities and consecutive non-named-entities.
    '''
    consec_ner = []
    temp = []
    is_entity = False
    ner_tags = stanford_ner.tag(lst)
    
    for word, tag in ner_tags:
        if (tag != "O" and is_entity) or (tag == "O" and not is_entity):
            temp.append(word)
        else:
            if temp:
                consec_ner.append(temp)
            temp = [word]
            is_entity = not is_entity
    
    # add remaning words in temp
    if temp:
        consec_ner.append(temp)
        
    return consec_ner

In [50]:
def get_consec_combinations(lst):
    '''
    Gets consecutive combinations in a list where each combination has two or more elements.
    '''
    consec_combinations = []
    for i in range(len(lst)):
        for j in range(i + 1, len(lst)):
            consec_combinations.append(lst[i:j + 1])
            
    return consec_combinations

In [51]:
def chunk_nouns(nouns):
    return chunk_nouns_recursive(nouns, list())

def chunk_nouns_recursive(rem_nouns, chunks):
    if len(rem_nouns) == 0:
        return chunks

    if len(rem_nouns) == 1:
        chunks.append(rem_nouns[0])
        return chunks

    all_combinations = get_consec_combinations(rem_nouns)

    results = []
    for combination in all_combinations:
        combination_str = " ".join(combination)
        new_rem_nouns = rem_nouns.copy()
        new_chunks = chunks.copy()
        if WikipediaAPI.has_near_match(combination_str):
            for noun in combination:
                new_rem_nouns.remove(noun)
            new_chunks.append(combination_str)
            result = chunk_nouns_recursive(new_rem_nouns, new_chunks)
            results.append(result)

    if results:
        max_result = min(results, key=len)
        return max_result
    else:
        chunks.extend(rem_nouns)
        return chunks

In [52]:
def get_chunks(query_lst):
    res = []
    for consecutive_ner in get_consec_ner(query_lst):
        if len(consecutive_ner) > 1:
            res.extend(chunk_nouns(consecutive_ner))
        else:
            res.append(consecutive_ner[0])
    tagged = nltk.pos_tag(res)
    res = [tag[0] for tag in tagged if tag[1][0] == "N"]
    return res

## VI. Get Relevant Searches

In [34]:
def get_relavent_searches(query_lst, chunks, n_searches):
    if n_searches > len(chunks) + 1:
        relavent_searches = []
        n_searches_per_chunk = n_searches // (len(chunks) + 1)

        for chunk in chunks:
            n_chunk_searches = 0
            searches = WikipediaAPI.find_titles_by_query(chunk, n_searches)
            for search in searches:
                # find searches not already found
                if n_chunk_searches >= n_searches_per_chunk:
                    break
                if search not in relavent_searches:
                    relavent_searches.append(search)
                    n_chunk_searches += 1

        searches = WikipediaAPI.find_titles_by_query(" ".join(query_lst), n_searches)
        for search in searches:
            if len(relavent_searches) >= n_searches:
                break
            if search not in relavent_searches:
                relavent_searches.append(search)
    else:
        relavent_searches = WikipediaAPI.find_titles_by_query(" ".join(query_lst), n_searches)
    
    return relavent_searches

## VII. Evaluate

In [35]:
path = "../../data/page_dataset/page_dataset_1.csv"
page_df = pd.read_csv(path)

In [43]:
def get_acc(page_df, n_searches = 10):
    acc = 0
    for index, row in page_df.iterrows():
        if index % 25 == 0:
            print("Evaluating: [{}]/[{}]".format(index, len(page_df)))
        question = row["question"]
        label = row["label"]
        query_lst = preprocess_query(question)
        chunks = get_chunks(query_lst)
        relevant_searches = get_relavent_searches(query_lst, chunks, n_searches)
        if label in relevant_searches:
            acc += 1
        else:
            print("Question: {}".format(question))
            print("Label: {}".format(label))
            print("Chunks: {}".format(chunks))
            print("Searches: {}".format(relevant_searches))
    return acc / len(page_df)

In [44]:
acc = get_acc(page_df)

Evaluating: [0]/[440]
Question: Who is the main character of the story?
Label: The Legend of Zelda: Twilight Princess
Chunks: ['main character', 'story']
Searches: ['Protagonist', 'Oggy and the Cockroaches', 'Main Page', 'Story', 'Story within a story', 'American Horror Story', 'Bo Peep (Toy Story)', 'List of breakout characters', 'Forky', 'Character arc']
Question: On what fault did the earthquake occur?
Label: 2008 Sichuan earthquake
Chunks: ['fault', 'earthquake']
Searches: ['Fault', 'The Fault in Our Stars (film)', 'The Fault in Our Stars', 'Earthquake', '2004 Indian Ocean earthquake and tsunami', 'Lists of earthquakes', 'San Andreas Fault', 'Wasatch Fault', 'Fault (geology)', 'Alpine Fault']
Question: What city in the United States has the highest population?
Label: New York (state)
Chunks: ['city', 'population']
Searches: ['The City & the City', 'City', 'New York City', 'Population', 'World population', 'Population density', 'List of United States cities by population', 'List of 

KeyboardInterrupt: 

In [38]:
acc

0.5522727272727272