# Wiki Article Matcher

Algorithm:
1. Apply regex replacement operations
2. Word-tokenize the query
3. Tag tokens with Stanford NER and NLTK POS tagging
4. Determine the pronoun chunking technique
5. If a noun/pronoun is ambiguous (the first result does not match exactly), ask the user to choose from a list
6. Use the new nouns/pronouns to find the article

https://www.mediawiki.org/wiki/API:Search#GET_request

testing:
- test query preprocessing
- test noun/pronoun/query_list

## I. Import Libraries

In [100]:
# core modules
import os
import re
import json
import string
import requests
import wikipedia
import pandas as pd

# nltk modules
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.tag import StanfordNERTagger

# Set the CLASSPATH environment variable to the (relative) path of
# stanford-ner.jar before the tagger below is created.
os.environ["CLASSPATH"] = "../models/stanford-ner.jar"

# Instantiate the Stanford NER tagger from the english.conll.4class model
# file; this module-level instance is the one get_consecutive_ner calls.
stanford_ner = StanfordNERTagger("../models/english.conll.4class.distsim.crf.ser.gz")

## II. Preprocess Query

In [293]:
# Sample question used to try out preprocess_query in the cells below.
query = "Where is Stephen Curry cell phone Draymond Green World Health Organization?"

In [379]:
def preprocess_query(query):
    """Normalize a question string into a list of word tokens.

    Steps, in order: strip the possessive markers "'s" and "s'", delete every
    character in ``string.punctuation``, drop English stopwords, and
    word-tokenize what is left.

    Stopword matching is exact (case-sensitive) and only applies to words
    that have a space on both sides, so the first and the last word of the
    query are always kept.

    Args:
        query: The raw question string.

    Returns:
        A list of token strings.
    """
    # remove possessive
    query = query.replace("'s", "").replace("s'", "")

    # remove punctuation
    query = query.translate(str.maketrans("", "", string.punctuation))

    # remove stopwords
    # Compare whole space-delimited words instead of deleting the substring
    # "<stopword> " wherever it occurs: substring deletion also chopped the
    # tail off longer words that merely end in a stopword (e.g. "this ").
    eng_stopwords = set(stopwords.words('english'))
    words = query.split(" ")
    last = len(words) - 1
    kept = [
        word
        for position, word in enumerate(words)
        if position in (0, last) or word not in eng_stopwords
    ]
    query = " ".join(kept)

    # word tokenize query
    return list(word_tokenize(query))

In [380]:
# Run the preprocessing on the sample query; the returned token list is the
# cell output shown below.
preprocess_query(query)

['Where',
 'Stephen',
 'Curry',
 'cell',
 'phone',
 'Draymond',
 'Green',
 'World',
 'Health',
 'Organization']

## III. Wikipedia API Wrapper

In [355]:
class WikipediaAPI:
    """Static helpers for searching English Wikipedia.

    Two helpers query the MediaWiki ``list=search`` API directly with
    ``requests``; the third delegates to the ``wikipedia`` package.
    """

    url = "https://en.wikipedia.org/w/api.php"
    # Seconds to wait for the MediaWiki API before giving up; without a
    # timeout a stalled connection would block the caller indefinitely.
    timeout = 10

    @staticmethod
    def find_titles_by_text(query, limit = 10):
        """Run a full-text search (``srwhat=text``) for ``query``.

        Args:
            query: Search string sent as ``srsearch``.
            limit: Maximum number of results requested (``srlimit``).

        Returns:
            The decoded JSON response of the API call (the whole payload,
            not just the titles).
        """
        params = {
            "srsearch": query,
            "srlimit": limit,
            "srwhat" : "text",
            "action": "query",
            "format": "json",
            "list": "search",
        }
        res = requests.get(
            WikipediaAPI.url, params=params, timeout=WikipediaAPI.timeout
        )
        return res.json()

    @staticmethod
    def has_near_match(query):
        """Return True if a ``srwhat=nearmatch`` search yields any result.

        A response without a ``query``/``search`` section (for example an
        API error payload) is treated as "no match" instead of raising
        ``KeyError``.

        Args:
            query: Search string sent as ``srsearch``.

        Returns:
            bool: whether the search result list is non-empty.
        """
        params = {
            "srsearch": query,
            "srwhat" : "nearmatch",
            "action": "query",
            "format": "json",
            "list": "search",
        }
        res = requests.get(
            WikipediaAPI.url, params=params, timeout=WikipediaAPI.timeout
        )
        hits = res.json().get("query", {}).get("search", [])
        return len(hits) > 0

    @staticmethod
    def find_titles_by_query(query, results = 10):
        """Return what ``wikipedia.search(query, results)`` returns.

        Args:
            query: Search string.
            results: Passed through as the second argument of
                ``wikipedia.search``.
        """
        return wikipedia.search(query, results)

## IV. Chunking

In [290]:
def get_consecutive_ner(nouns):
    """Split tokens into runs of consecutive entity / non-entity words.

    Every token is tagged with the module-level ``stanford_ner`` tagger. A
    token tagged "O" counts as a non-entity, any other tag as an entity.
    Adjacent tokens of the same kind are collected into one group; a new
    group starts each time the kind flips.

    Args:
        nouns: List of token strings to tag.

    Returns:
        A list of non-empty lists of tokens, in the original order.
    """
    groups = []
    current = []
    in_entity = False

    for token, label in stanford_ner.tag(nouns):
        token_is_entity = label != "O"
        if token_is_entity != in_entity:
            # Kind changed: close the run collected so far (if any) and
            # start a fresh one for the new kind.
            if current:
                groups.append(current)
            current = []
            in_entity = token_is_entity
        current.append(token)

    if current:
        groups.append(current)

    return groups
                

In [305]:
def get_all_combinations(lst):
    """Return every contiguous slice of ``lst`` with at least two items.

    Slices are ordered by start position first, then by increasing length.

    Args:
        lst: A sliceable sequence.

    Returns:
        A list of slices of ``lst``; empty when ``lst`` has fewer than two
        items.
    """
    size = len(lst)
    return [
        lst[start:stop]
        for start in range(size)
        for stop in range(start + 2, size + 1)
    ]

In [335]:
def chunk_nouns(nouns):
    """Chunk ``nouns`` by starting the recursive search with no chunks yet.

    Args:
        nouns: List of words to group.

    Returns:
        Whatever ``chunk_nouns_recursive`` returns for ``nouns`` and a fresh
        empty chunk list.
    """
    initial_chunks = []
    return chunk_nouns_recursive(nouns, initial_chunks)

def chunk_nouns_recursive(rem_nouns, chunks):
    """Group the remaining words into as few chunks as possible.

    Every contiguous run of two or more remaining words is joined with
    spaces and checked with ``WikipediaAPI.has_near_match``. For each run
    that matches, the run becomes one chunk and the search recurses on the
    words left over. Among all successful branches the one with the fewest
    chunks wins (the earliest such branch on ties). If no run matches, every
    remaining word becomes its own chunk.

    Args:
        rem_nouns: List of words not yet assigned to a chunk.
        chunks: Chunks collected so far. This list is mutated in place when
            a base case is reached.

    Returns:
        A list of chunk strings.
    """
    if len(rem_nouns) == 0:
        return chunks

    if len(rem_nouns) == 1:
        chunks.append(rem_nouns[0])
        return chunks

    results = []
    n_nouns = len(rem_nouns)
    for start in range(n_nouns):
        for stop in range(start + 2, n_nouns + 1):
            phrase = " ".join(rem_nouns[start:stop])
            if not WikipediaAPI.has_near_match(phrase):
                continue
            # Drop the matched run by position. Removing its words by value
            # would take out the first equal word instead, which is the
            # wrong one whenever a word occurs more than once.
            leftover = rem_nouns[:start] + rem_nouns[stop:]
            results.append(chunk_nouns_recursive(leftover, chunks + [phrase]))

    if results:
        # Fewest chunks means the longest phrases were matched.
        return min(results, key=len)

    chunks.extend(rem_nouns)
    return chunks

In [383]:
def get_chunks(query_lst):
    """Turn a tokenized query into noun chunks.

    The tokens are split into runs by ``get_consecutive_ner``. Runs longer
    than one word go through ``chunk_nouns``; single-word runs are kept as
    they are. The resulting candidates are POS-tagged with ``nltk.pos_tag``
    and only those whose tag starts with "N" are returned.

    Args:
        query_lst: List of token strings.

    Returns:
        A list of chunk strings tagged as nouns.
    """
    candidates = []
    for group in get_consecutive_ner(query_lst):
        if len(group) <= 1:
            candidates.append(group[0])
        else:
            candidates.extend(chunk_nouns(group))

    return [
        chunk
        for chunk, pos in nltk.pos_tag(candidates)
        if pos[0] == "N"
    ]

## VI. Get Relevant Searches

In [401]:
def get_relavent_searches(query_lst, chunks, n_searches):
    """Collect up to ``n_searches`` distinct search results for a query.

    Each chunk is searched on its own and contributes at most
    ``n_searches // (len(chunks) + 1)`` new results (but at least one).
    The whole query, joined with spaces, is then searched to fill the
    remaining slots. Results keep the order in which they were found and
    never contain duplicates.

    Args:
        query_lst: List of query tokens; joined with spaces for the final
            search.
        chunks: List of chunk strings, each searched individually.
        n_searches: Maximum number of results to return.

    Returns:
        A list of at most ``n_searches`` search results.

    Raises:
        ValueError: If ``n_searches`` is smaller than 1.
    """
    # Validate explicitly rather than with ``assert``, which disappears
    # under ``python -O``.
    if n_searches < 1:
        raise ValueError("n_searches must be a positive integer")

    relavent_searches = []
    # With many chunks the integer division yields 0; give every chunk at
    # least one slot and rely on the overall cap below instead of failing.
    n_searches_per_chunk = max(1, n_searches // (len(chunks) + 1))

    for chunk in chunks:
        if len(relavent_searches) >= n_searches:
            break
        n_chunk_searches = 0
        searches = WikipediaAPI.find_titles_by_query(chunk, n_searches)
        for search in searches:
            # find searches not already found
            if n_chunk_searches >= n_searches_per_chunk:
                break
            if len(relavent_searches) >= n_searches:
                break
            if search not in relavent_searches:
                relavent_searches.append(search)
                n_chunk_searches += 1

    searches = WikipediaAPI.find_titles_by_query(" ".join(query_lst), n_searches)
    for search in searches:
        if len(relavent_searches) >= n_searches:
            break
        if search not in relavent_searches:
            relavent_searches.append(search)

    return relavent_searches

## VII. Evaluate

In [395]:
# Load the evaluation dataset from CSV; get_acc reads its "question" and
# "label" columns.
path = "../../data/page_dataset/page_dataset_10.csv"
page_df = pd.read_csv(path)

In [406]:
def get_acc(page_df, n_searches = 10):
    """Compute the fraction of rows whose label is among the search results.

    For every row the question is preprocessed, chunked, and turned into a
    list of relevant searches; the row counts as a hit when its label is in
    that list. The running row position is printed every 100 rows.

    Args:
        page_df: DataFrame with "question" and "label" columns.
        n_searches: Number of searches requested per question.

    Returns:
        Hits divided by the number of rows, or 0.0 for an empty frame
        (instead of raising ZeroDivisionError).
    """
    n_rows = len(page_df)
    if n_rows == 0:
        return 0.0

    hits = 0
    # Count positions with enumerate so progress reporting does not depend
    # on the frame having an integer index.
    for position, (_, row) in enumerate(page_df.iterrows()):
        if position % 100 == 0:
            print(position)
        question = row["question"]
        label = row["label"]
        query_lst = preprocess_query(question)
        chunks = get_chunks(query_lst)
        relevant_searches = get_relavent_searches(query_lst, chunks, n_searches)
        if label in relevant_searches:
            hits += 1
    return hits / n_rows

In [None]:
# Evaluate on the loaded dataset with the default n_searches of 10;
# get_acc prints a progress number every 100 rows.
acc = get_acc(page_df)

0
100
