# COMP90042 Project 2018: Question Answering – Team: Xudong & Winn
## Winn Chow <winnchow> (38315) and Xudong Han <xudongh1> (881045)


## Load data from JSON files

In [0]:
import json

# Data in JSON format
training_json_data = None
development_json_data = None
documents_json_data = None
testing_json_data = None

def load_data():
    '''
    Description: Load the four project JSON files from the current working
                 directory into the module-level *_json_data variables
    Input:
    Output: (none - the module-level variables are replaced)
    '''
    global training_json_data, development_json_data, documents_json_data, testing_json_data

    def read_json(path):
        # The data contains non-ASCII characters (e.g. "♠", "×"), so decode
        # explicitly as UTF-8 instead of relying on the platform default.
        with open(path, encoding='utf-8') as json_data:
            return json.load(json_data)

    training_json_data = read_json('training.json')
    development_json_data = read_json('devel.json')
    documents_json_data = read_json('documents.json')
    testing_json_data = read_json('testing.json')

load_data()

# Testing: show the size and the first record of every data set
for heading, records in (
    ("<<<Training data>>>", training_json_data),
    ("<<<Development data>>>", development_json_data),
    ("<<<Document data>>>", documents_json_data),
    ("<<<Testing data>>>", testing_json_data),
):
    print(heading)
    print("Size = " + str(len(records)))
    print("1st data: " + str(records[0]))

print("Done")

<<<Training data>>>
Size = 43379
1st data: {'question': 'A kilogram could be definined as having a Planck constant of what value?', 'text': '6966662606895999999♠6.62606896×10−34 j⋅s', 'answer_paragraph': 23, 'docid': 0}
<<<Development data>>>
Size = 3097
1st data: {'question': 'On what date did the companies that became the Computing-Tabulating-Recording Company get consolidated?', 'text': 'june 16 , 1911', 'answer_paragraph': 5, 'docid': 380}
<<<Document data>>>
Size = 441
1st data: {'docid': 0, 'text': ['First recognized in 1900 by Max Planck, it was originally the proportionality constant between the minimal increment of energy, E, of a hypothetical electrically charged oscillator in a cavity that contained black body radiation, and the frequency, f, of its associated electromagnetic wave. In 1905 the value E, the minimal energy increment of a hypothetical oscillator, was theoretically associated by Einstein with a "quantum" or minimal element of the energy of the electromagnetic wa

<<<Testing data>>>
Size = 3618
1st data: {'question': 'Modern browser support standards-based and defacto what?', 'docid': 410, 'id': 0}
Done


## Load spaCy

In [0]:
import spacy
# You need to download the en model: python -m spacy download en
# `nlp` is the shared module-level spaCy pipeline; functions in later cells
# (normalize, process_question, check_what_x, ...) read it as a global.
# NOTE(review): the 'en' shortcut name looks like a spaCy 2.x convention -
# confirm it still resolves with the installed spaCy version.
nlp = spacy.load('en')

print("Done")

Done


## Load NLTK

In [0]:
import nltk
# Sentence segmenter loaded from the punkt pickle (not referenced in the
# visible part of this notebook).
sent_segmenter = nltk.data.load('tokenizers/punkt/english.pickle')
#nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to a
# plain set of English stop words; the import directly above must run again
# before this line can be re-executed.
stopwords = set(stopwords.words('english'))
# Shared lemmatizer and tokenizer used by lemmatize() and getNumofApperance().
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()

# POS tags accepted as answer candidates by process_passage()
# (presumably Penn Treebank noun and adjective tags - confirm).
ans_types_set = ["NN","NNP","NNS","NNPS","JJ","JJR","JJS"]
# Noun-only subset of the tags above (not referenced in the visible code).
noun_set = ["NN","NNP","NNS","NNPS"]

print("Done")

Done


## Evaluation Functions on Training and Development set

In [0]:
def f1_score_evaluation(prediction_result):
    '''
    Description: Evaluate the prediction of development questions with a
                 token-level F1 score averaged over all questions
    Input: (1) prediction - a sequence aligned with development_answer_data
               whose items hold the predicted answer text at index 1
    Output: (1) average F1 score, or None (after printing a warning) when the
                prediction length does not match the development set
    '''
    global development_answer_data
    # Local imports keep this cell usable before the later cells that import
    # numpy / Counter at notebook level have been executed.
    from collections import Counter
    import numpy as np

    if len(prediction_result) != len(development_answer_data):
        print("Please check the prediction structure")
        return None

    result = []
    for gold, predicted in zip(development_answer_data, prediction_result):
        answer = gold["text"].split()
        prediction = predicted[1].split()
        # Multiset intersection: a repeated answer token only counts as often
        # as it was actually predicted, so precision can never exceed 1.
        tp = sum((Counter(answer) & Counter(prediction)).values())
        if tp == 0:
            # Covers empty answers/predictions and zero overlap without
            # relying on a ZeroDivisionError being swallowed.
            f1 = 0.0
        else:
            precision = tp / len(prediction)
            recall = tp / len(answer)
            f1 = 2 * precision * recall / (precision + recall)
        result.append(f1)

    if not result:
        return 0.0
    return np.average(np.array(result))

def evaluate_top_k(k):
    '''
    Description: check whether the top k results cover the correct paragraph
    Input: the value of k
    Output: the fraction of development questions whose correct paragraph is
            among the top k retrieved paragraphs (0.0 for an empty set)
    '''
    global development_answer_data
    if not development_answer_data:
        return 0.0

    correct_prediction = 0
    for q in development_answer_data:
        # get_top_k_paragraphs returns (paragraph texts, scores). The texts are
        # compared with the stored gold paragraph because no paragraph indices
        # are returned.
        paragraphs, _ = get_top_k_paragraphs(k, q["docid"], q["question"])
        if q["ans_para_content"] in paragraphs:
            correct_prediction += 1
    return correct_prediction / len(development_answer_data)

## Some helper functions

In [0]:
from sklearn.externals import joblib
from nltk.corpus import wordnet
import csv
import time

def nowtime():
    '''
    Description: get the current local time as a compact timestamp
    Input:
    Output: time formatted as YYYYMMDD-HHMM
    '''
    stamp_format = "%Y%m%d-%H%M"
    current = time.localtime()
    return time.strftime(stamp_format, current)

def replace(text):
    '''
    Description: Remove some unidentifiable unicode characters
    Input: (1) text
    Output: (1) text with some unidentifiable unicode characters removed
    '''
    cleaned = text
    for symbol in ("♠",):
        cleaned = cleaned.replace(symbol, "")
    return cleaned

def save_object_to_file(obj, filename):
    '''
    Description: Save object to file with joblib (an existing file is overwritten)
    Input: (1) object
           (2) filename
    Output:
    '''
    with open(filename, 'wb') as sink:
        joblib.dump(obj, sink)

def load_object_from_file(filename):
    '''
    Description: Load object from a file written by save_object_to_file
    Input: (1) filename
    Output: (1) object
    '''
    # NOTE(review): joblib.load unpickles the file - only use it on files
    # produced by this notebook, never on untrusted input.
    with open(filename, 'rb') as source:
        return joblib.load(source)

def normalize(text):
    '''
    Description: Normalize text
    Input: (1) text
    Output: (1) a list of lowercase lemmas with stop words and punctuation removed
            (2) spaCy doc of the text
    '''
    global nlp

    doc = nlp(text)
    # Keep only content tokens, in document order
    normalized = [t.lemma_.lower() for t in doc if not (t.is_stop or t.is_punct)]
    return normalized, doc

def check_answer(answer, pred):
    '''
    Description: Check if the answer and prediction are the same
                 (both are printed first, for inspection)
    Input: (1) answer
           (2) prediction
    Output: (1) True - if they are considered to be the same, otherwise False
    '''
    print("Correct: " + answer)
    print("Predicted: " + pred)
    gold = answer.lower()
    guess = pred.lower()
    # Equal, one contained in the other, or equal once " ," is tightened to ","
    return (
        gold == guess
        or gold in guess
        or guess in gold
        or gold.replace(" ,", ",") == guess
    )
    
def lemmatize(word):
    '''
    Description: Get lemma (tried as a verb first, then as a noun)
    Input: (1) word
    Output: (1) the lemma of the input
    '''
    as_verb = lemmatizer.lemmatize(word, 'v')
    if as_verb != word:
        return as_verb
    # The verb reading left the word unchanged, so fall back to the noun reading
    return lemmatizer.lemmatize(word, 'n')

def getNumofCommonSubstr(str1, str2):
    '''
    Description: Get the longest common substring
    Input: (1) str1
           (2) str2
    Output: (1) the longest common substring (first one found in str1 on ties)
            (2) the length of the common substring
    '''
    # table[i + 1][j + 1] = length of the common run ending at str1[i], str2[j];
    # the extra leading row/column of zeros avoids boundary checks.
    table = [[0] * (len(str2) + 1) for _ in range(len(str1) + 1)]
    best_len = 0   # longest match length so far
    best_end = 0   # end position (exclusive) of that match in str1
    for i, left in enumerate(str1):
        for j, right in enumerate(str2):
            if left != right:
                continue
            # Matching characters extend the diagonal run
            run = table[i][j] + 1
            table[i + 1][j + 1] = run
            if run > best_len:
                best_len = run
                best_end = i + 1
    return str1[best_end - best_len:best_end], best_len

def getNumofApperance(str1,str2):
    '''
    Description: Count how many non-stop-word tokens of str1 also occur in
                 str2 (case-insensitive; repeated tokens of str1 are counted
                 each time)
    Input: (1) str1
           (2) str2
    Output: (1) the number of str1 tokens found in str2
    '''
    str1_token = [tok for tok in word_tokenizer.tokenize(str1.lower()) if tok not in stopwords]
    # A set gives constant-time membership checks for the loop below
    str2_token = {tok for tok in word_tokenizer.tokenize(str2.lower()) if tok not in stopwords}
    return sum(1 for tok in str1_token if tok in str2_token)

def save_prediction_to_csv(result,filename):
    """
    Description: Save prediction result to a CSV file named
                 <filename><timestamp from nowtime()>.csv
    Input: (1) result - rows of (Id, answer)
           (2) filename - path prefix of the output file
    Output:
    """
    headers = ['Id','answer']

    # newline='' lets the csv module control line endings; without it every
    # row is followed by a blank line on Windows.
    with open(filename + str(nowtime()) + ".csv", 'w', encoding = 'utf8', newline = '') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(result)

def check_hypernym(hypernym, hyponym):
    '''
    Description: Check hypernym relationship by walking up the first-hypernym
                 chain of every synset of the hyponym
    Input: (1) hypernym
           (2) hyponym
    Output: (1) True - if they are hypernym-hyponym, otherwise False
    '''
    lower_synsets = wordnet.synsets(hyponym)
    upper_synsets = wordnet.synsets(hypernym)

    if not lower_synsets or not upper_synsets:
        return False

    for target in upper_synsets:
        for node in lower_synsets:
            parents = node.hypernyms()
            # Only the first hypernym is followed at every level
            while parents:
                node = parents[0]
                if node == target:
                    return True
                parents = node.hypernyms()
    return False

def get_verb_subj(question):
    '''
    Description: Get verb and subject from question
    Input: (1) question - an iterable of parsed tokens (e.g. nlp(question))
    Output: (1) verb - head of the last subject token, None if there is none
            (2) subject - the last nsubj/nsubjpass token, None if there is none
            (3) the children node of verb ([] when no subject was found)
    '''
    verb = None
    subj = None
    for token in question:
        if token.dep_ in ('nsubj', 'nsubjpass'):
            # The last subject wins. The head's POS is deliberately not
            # checked: the original `continue` on non-VERB heads ran after the
            # assignment and therefore never filtered anything.
            subj = token
            verb = token.head

    if verb is None:
        # No subject at all: return empty results instead of raising
        # UnboundLocalError on the return statement.
        return None, None, []
    return verb, subj, verb.children

def process_passage(doc, verb, v_children):
    '''
    Description: Get answer - the subtree text of every noun/adjective child
                 of passage tokens sharing the question verb's lemma
    Input: (1) nlp(sent)
           (2) verb
           (3) texts of the children of the question verb (these are skipped)
    Output: (1) answer - words joined by spaces, with a trailing space
    '''
    pieces = []
    for candidate in doc:
        if candidate.lemma_ != verb.lemma_:
            continue
        for child in candidate.children:
            if child.tag_ in ans_types_set and child.text not in v_children:
                pieces.extend(part.text + " " for part in child.subtree)
    return "".join(pieces)

def get_answer(question, passage):
    '''
    Description: Get answer to a question from a passage by matching the
                 question's main verb in the passage
    Input: (1) question text
           (2) passage text
    Output: (1) lowercase answer, or "not found" if extraction fails
    '''
    try:
        # Parsing the question is inside the try block as well, so a question
        # without a usable subject/verb gives "not found" instead of raising.
        verb, _, v_children = get_verb_subj(nlp(question))
        possible_list = [child.text for child in v_children]
        answer = process_passage(nlp(passage), verb, possible_list)
        # process_passage leaves a trailing space; [:-1] drops it
        return answer.lower()[:-1]
    except Exception:
        # Best-effort extraction: any failure means no answer was found
        return "not found"

print("Done")

Done


## Load data into usable format

In [0]:
# Training data as a list of dictionaries
training_answer_data = []

for t in training_json_data:
    q = {
        "question": t["question"],
        "ans_para_id": t["answer_paragraph"],
        "ans_para_content": documents_json_data[t["docid"]]["text"][t["answer_paragraph"]],
        "docid": t["docid"],
        "text": t["text"],
        "labeled_ans_type": "",     # To be used
        "labeled_ans_type_by": "",  # To be used
        "classified_ans_type": "",  # To be used
    }
    training_answer_data.append(q)

# Development data as a list of dictionaries
development_answer_data = []

for t in development_json_data:
    q = {
        "question": t["question"],
        "ans_para_id": t["answer_paragraph"],
        "ans_para_content": documents_json_data[t["docid"]]["text"][t["answer_paragraph"]],
        "docid": t["docid"],
        "text": t["text"],
        "labeled_ans_type": "",     # To be used
        "labeled_ans_type_by": "",  # To be used
        "classified_ans_type": "",  # To be used
    }
    development_answer_data.append(q)

# Testing data as a list of dictionaries (answer fields are still empty)
testing_answer_data = []

for t in testing_json_data:
    q = {
        "id": t["id"],
        "question": t["question"],
        "ans_para_id": "",          # To be used
        "ans_para_content": "",     # To be used
        "docid": t["docid"],
        "text": "",                 # To be used
        "labeled_ans_type": "",     # To be used
        "labeled_ans_type_by": "",  # To be used
        "classified_ans_type": "",  # To be used
    }
    testing_answer_data.append(q)

# Document data as a dictionary: docid -> list of paragraphs
documents_answer_data = {}

for d in documents_json_data:
    documents_answer_data[d["docid"]] = d["text"]

print("Done")

Done


## Build answer type detection classifier

In [0]:
import os.path
from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

def process_question(question):
    '''
    Description: Extract features from a question
    Input: (1) question
    Output: (1) features as a dict mapping feature name to 1
    Based on https://shirishkadam.com/2017/07/03/nlp-question-classification-using-support-vector-machines-spacyscikit-learnpandas/
    '''
    global nlp

    wh_tags = ("WDT", "WP", "WP$", "WRB")
    copulas = ("is", "was", "are", "were")

    features = {}
    doc = nlp(question)
    for token in doc:
        # Wh-word: its tag, its lemma, and the bigram/tag formed with the next token
        if token.tag_ in wh_tags:
            features[token.tag_] = 1
            features[token.lemma_] = 1
            following = token.i + 1
            if following < len(doc):
                features[token.lemma_ + " " + doc[following].lemma_] = 1
                features[doc[following].tag_] = 1

        if token.dep_ != "ROOT":
            continue
        features["root " + token.tag_] = 1

        if token.text not in copulas:
            continue
        # "What is/was/are/were <noun>": record the noun attached to the copula
        saw_what = False
        noun_lemma = None
        for child in token.children:
            if child.text.lower() == "what":
                saw_what = True
            elif child.pos_ == "NOUN":
                noun_lemma = child.lemma_

            if saw_what and noun_lemma is not None:
                features["what be " + noun_lemma] = 1
                break

    return features

# Keep a copy
training_question_features = None
training_answer_type = None

def detect_training_ans_type(load_from_file = True):
    '''
    Description: Detect answer type from training data. The type is the label
                 of the named entity in the answer paragraph that shares the
                 most non-stop-word tokens with the answer text, or "UNKOWN"
    Input: (1) load_from_file - whether to load from file
    Output: (1) a list of question features from the training data
            (2) a list of answer types from the training data
    '''
    global nlp, training_question_features, training_answer_type

    filename = "training_answer_type_final.pkl"

    # In-memory copy from an earlier call
    already_cached = training_question_features is not None and training_answer_type is not None
    if already_cached:
        return (training_question_features, training_answer_type)

    # On-disk copy from an earlier run
    if load_from_file and os.path.isfile(filename):
        training_question_features, training_answer_type = load_object_from_file(filename)
        return (training_question_features, training_answer_type)

    training_question_features = []
    training_answer_type = []

    total = 0
    for i in tqdm(range(len(training_answer_data))):
        record = training_answer_data[i]
        para_doc = nlp(record["ans_para_content"])

        best_match = 0
        best_label = ""
        for ent in para_doc.ents:
            # Tokenize each entity name and match with the answer; ignore stop
            # words and compare in lower case
            match = sum(
                1 for e in nlp(ent.text)
                if not e.is_stop and e.text.lower() in record["text"].lower()
            )
            # If better match is found
            if match > best_match:
                best_match = match
                best_label = ent.label_

        if best_match > 0:
            total += 1
            record["labeled_ans_type_by"] = "NER"
            record["labeled_ans_type"] = best_label
        else:
            record["labeled_ans_type_by"] = "UNKOWN"
            record["labeled_ans_type"] = "UNKOWN"

        # Question features are recorded for labelled and unlabelled questions alike
        record["question_features"] = process_question(record["question"])
        training_question_features.append(record["question_features"])
        training_answer_type.append(record["labeled_ans_type"])

    print("Total = " + str(total))

    save_object_to_file((training_question_features, training_answer_type), filename)

    return (training_question_features, training_answer_type)

# Keep a copy
ml = None

def train_answer_type_detection_classifer(load_from_file = True):
    '''
    Description: Train an answer type detection ML classifer using SVC
                 (the result is cached in memory and in ml_model_final.pkl)
    Input: (1) load_from_file - whether to load from file
    Output: (1) question vectorizer
            (2) machine learning model
    '''
    global ml

    filename = "ml_model_final.pkl"

    if ml is None:
        if load_from_file and os.path.isfile(filename):
            ml = load_object_from_file(filename)
        else:
            features, targets = detect_training_ans_type()
            vectorizer = DictVectorizer()
            feature_matrix = vectorizer.fit_transform(features)

            classifier = SVC(gamma = 0.3, C = 3)
            model = classifier.fit(feature_matrix, targets)

            save_object_to_file((vectorizer, model), filename)
            ml = (vectorizer, model)

    return ml

def predict_answer_type_ml(question):
    '''
    Description: Predict answer type using ML classifier
    Input: (1) question
    Ouput: (1) predicted answer type
           (2) the raw prediction for "other_label_type", otherwise None
    '''
    # Entity label predicted by the classifier -> answer type used by the solver
    label_to_type = {
        "PERSON": "who_type",
        "ORG": "who_type",
        "DATE": "when_type",
        "GPE": "where_type",
        "LOC": "where_type",
        "PERCENT": "what_percentage_type",
        "CARDINAL": "how_quantity_type",
        "UNKOWN": "unkown_type",
    }

    # ML classifer
    vectorizer, model = train_answer_type_detection_classifer()
    pred = model.predict(vectorizer.transform(process_question(question)))

    # A single question gives a single prediction
    known_type = label_to_type.get(str(pred[0]))
    if known_type is None:
        return "other_label_type", pred
    return known_type, None

def testing_answer_type_detection_classifer():
    '''
    Description: Testing answer type detection classifier - prints accuracy and
                 a classification report measured on the training questions
    Input:
    Output:
    '''
    _, expected_types = detect_training_ans_type()
    vectorizer, model = train_answer_type_detection_classifer()

    feature_dicts = [
        process_question(training_answer_data[i]["question"])
        for i in tqdm(range(len(training_answer_data)))
    ]
    predicted_types = model.predict(vectorizer.transform(feature_dicts))

    print("Accuracy: " + str(accuracy_score(expected_types, predicted_types)))
    print(classification_report(expected_types, predicted_types))
    
#testing_answer_type_detection_classifer()

print("Done")

Done


## Check most common question words - for analysis

In [0]:
from collections import defaultdict
from collections import Counter

def check_most_common_question_words():
    '''
    Description: Check most common question words - prints, per answer type,
                 the 20 most frequent word bigrams of its training questions
    '''
    global training_answer_type, nlp

    bigrams_by_type = defaultdict(list)

    for idx, ans in tqdm(enumerate(training_answer_type)):
        doc = nlp(training_answer_data[idx]["question"])
        # Every token paired with its successor (the last token has none)
        bigrams_by_type[ans].extend(
            token.text + " " + doc[token.i + 1].text
            for token in doc
            if len(doc) > token.i + 1
        )

    for ans, bigrams in bigrams_by_type.items():
        print(ans)
        print(Counter(bigrams).most_common(20))

print("Done")

#check_most_common_question_words()

Done


## Answer type detection classifier

In [0]:
# Ordered rule table: (answer type, question prefixes, phrases contained in
# the question, question suffixes). The first rule with any hit wins, so order
# matters.
_ANSWER_TYPE_RULES = (
    ("who_type", ("who",), (
        "what is the name", "name of", "what scientist", "what composer", "what job",
        "what artifact", "what charity", "what university", "which university",
        "what agency", "what school", "which school", "what council",
        "what organization", "what local businessman", "what name",
        "what building", "what museum", "what leader", "which person",
        "what government agency", "which footballer", "which actress", "which actor",
        "which athlete", "which king",
    ), ()),
    ("when_type", ("when",), (
        "what date", "how long", "how many years", "how many hours", "what time",
        "what decade", "what century", "what day", "what month", "which time",
        "which period",
    ), ("when?",)),
    ("where_type", ("where",), (
        "located where", "what country", "what city", "what nation",
        "what is the location", "what empire", "which country",
        "what genre of music", "what location", "what region", "what area",
        "which place", "which area", "which city", "which nation",
    ), ("where?",)),
    ("what_percentage_type", (), ("what percentage",), ()),
    ("what_year_type", (), ("what year", "which year"), ()),
    # These money phrases must come before the quantity rule: they contain (or
    # usually follow) "how much", which used to claim them as quantities and
    # left this part of the money rule unreachable.
    ("money_type", (), ("how much money", "does it cost"), ()),
    ("how_quantity_type", (), (
        "how much", "how many", "what value", "what amount", "what number",
        "what size", "what quantity", "population of", "many square", "many acres",
        "many miles", "what age", "which figure",
    ), ()),
    # "value of" keeps its original position after the quantity rule
    ("money_type", (), ("value of",), ()),
    ("order_type", (), ("what rank",), ()),
    ("event_type", (), ("what event", "what war", "what battle", "which war"), ()),
    ("language_type", (), ("what language", "which language"), ()),
    ("norp_type", (), ("what religion", "what nationality"), ()),
    ("fac_type", (), ("what town",), ()),
)

def classify_answer_type(question):
    '''
    Description: Classify a question into an answer type. Keyword rules are
                 tried first, then the "What X" hypernym check, and finally
                 the ML classifier
    Input: (1) question
    Output: (1) classified answer type
            (2) info about the type if needed
    '''
    qcontent = question.lower()

    # Rule-based
    for answer_type, prefixes, phrases, suffixes in _ANSWER_TYPE_RULES:
        if qcontent.startswith(prefixes) or qcontent.endswith(suffixes) or \
            any(phrase in qcontent for phrase in phrases):
            return answer_type, None

    x, _ = check_what_x(question)
    if x in ["color", "shape", "alloy", "animal", "metal", "weapon"]:
        return "what_hypernym_type", x

    # Try ML classifier
    pred_ans_type, info = predict_answer_type_ml(question)

    return pred_ans_type, info

def _find_of_object(doc, head_lemma):
    '''
    Description: For "X of Y", get the lemma of Y: the first child of the
                 first "of" attached to the first token whose lemma is X
    Input: (1) spaCy doc of the question
           (2) lemma of X
    Output: (1) lemma of Y, or None if there is no such "of" phrase
    '''
    for token in doc:
        if token.lemma_ == head_lemma:
            for child in token.children:
                if child.lemma_ == "of":
                    for grandchild in child.children:
                        return grandchild.lemma_
                    # Only the first "of" child is considered
                    return None
            # Only the first token matching X is considered
            return None
    return None

def check_what_x(question):
    '''
    Description: Check if it is a What X question
    Input: (1) question
    Output: (1) X of What X, None if it is not a What X question
            (2) Y of What X of Y, None if there is no "of Y" part
    '''
    global nlp

    doc = nlp(question)
    for chunk in doc.noun_chunks:
        words = chunk.text.split()
        if len(words) > 1 and words[0].lower() == "what":
            # Turn X into lemma: use the first noun of the chunk if there is one
            head = None
            for token in nlp(" ".join(words[1:])):
                if token.pos_ == "NOUN":
                    head = token.lemma_
                    break
            if head is None:
                # No noun in the chunk: fall back to the word right after
                # "what". A stray `break` used to leave the loop here, so this
                # result was always thrown away and (None, None) was returned.
                head = words[1].lower()

            # get What X of Y
            of_object = _find_of_object(doc, head)
            if of_object is None:
                return (head, None)
            return (head.lower(), of_object)
    return (None, None)
                
print("Done")

Done


## Build vectorized documents - for passage retrieval

In [0]:
import numpy as np
import time
import spacy
import os.path
from tqdm import tqdm
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Keep a copy
vectorized_documents = None

def build_vectorized_documents(load_from_file = True):
    '''
    Description: Build vectorized documents and dump to file, vectorized_documents.pkl
    Input: load_from_file - whether to load from file
    Output: a dict mapping docid to {"tfidf": paragraph tf-idf matrix,
            "vectorizer": fitted DictVectorizer,
            "transformer": fitted TfidfTransformer}
    '''
    global documents_answer_data, nlp, vectorized_documents

    filename = "vectorized_documents.pkl"

    if vectorized_documents is not None:
        return vectorized_documents

    if load_from_file and os.path.isfile(filename):
        # Keep the loaded object so later calls do not read the file again
        vectorized_documents = load_object_from_file(filename)
        return vectorized_documents

    # A plain dict is used on purpose: a defaultdict with a lambda factory
    # cannot be pickled, which made the save below fail on a fresh build.
    doc_tfidf = {}

    # Each document
    for docid in tqdm(documents_answer_data.keys()):

        # Term counts of each paragraph
        paragraph_counts = [Counter(normalize(para)[0]) for para in documents_answer_data[docid]]

        # Vectorize
        vectorizer = DictVectorizer()
        doc_count_feature = vectorizer.fit_transform(paragraph_counts)

        # tf-idf
        transformer = TfidfTransformer(smooth_idf=True, norm="l1", sublinear_tf=True)
        doc_tfidf[docid] = {
            "tfidf": transformer.fit_transform(doc_count_feature),
            "vectorizer": vectorizer,
            "transformer": transformer,
        }

    save_object_to_file(doc_tfidf, filename)

    vectorized_documents = doc_tfidf
    return doc_tfidf

# Prime the module-level cache when this cell runs: once `vectorized_documents`
# is not None, build_vectorized_documents() returns it directly.
vectorized_documents = build_vectorized_documents()

print("Done")

Done


## Top-K paragraphs retrieval

In [0]:
import numpy as np
from collections import Counter

def get_top_k_paragraphs(k, docid, question):
    '''
    Description: Returns the top-k matched paragraphs from the document
    Input: (1) k - top k
           (2) docid
           (3) question
    Output: (1) a list of top k best matched paragraphs in descending matching order
            (2) a list of corresponding similarity scores
    '''
    global documents_answer_data

    question_terms, _ = normalize(question)
    term_counts = Counter(question_terms)
    index = build_vectorized_documents()[docid]

    # Project the question into the document's tf-idf space and score it
    # against every paragraph
    question_counts = index["vectorizer"].transform(term_counts)
    question_tfidf = index["transformer"].transform(question_counts)
    similarity = (index["tfidf"] * question_tfidf.T).T
    scores = similarity[0, :].toarray()[0]

    # Paragraph indices ordered by descending score, cut to the k best
    ranking = np.argsort(scores)[::-1][0:k]

    paragraphs = documents_answer_data[docid]
    return [paragraphs[b] for b in ranking], scores[ranking]

print("Done")

Done


## Solver to solve questions - Answer Processing

In [0]:
import re

def main_solve(question, para_list, para_score_list, ans_type, ans_type_info):
    '''
    Description: A solver to solve questions - dispatches on the answer type
    Input: (1) question
           (2) a list of matched paragraphs
           (3) a list of corresponding similarity scores of the matched paragraphs
           (4) answer type
           (5) answer type info
    Output: (1) prediction ("Not Found" for an unhandled answer type)
    '''
    # Answer type -> named-entity labels accepted as answers
    ner_labels_by_type = {
        "who_type": ["PERSON", "ORG"],
        "when_type": ["DATE", "TIME"],
        "where_type": ["GPE", "LOC"],
        "what_percentage_type": ["PERCENT"],
        "how_quantity_type": ["CARDINAL", "QUANTITY"],
        "money_type": ["MONEY"],
        "order_type": ["ORDINAL"],
        "event_type": ["EVENT"],
        "language_type": ["LANGUAGE"],
        "norp_type": ["NORP"],
        "fac_type": ["FAC"],
    }

    if ans_type == "what_hypernym_type":
        return predict_hypernym(question, para_list, para_score_list, ans_type_info)
    if ans_type == "what_year_type":
        return predict_ner(question, para_list, para_score_list, ["DATE"], year = True)

    if ans_type == "other_label_type":
        # The label to look for is carried in the answer type info
        labels = [ans_type_info]
    else:
        labels = ner_labels_by_type.get(ans_type)

    if labels is None:
        return "Not Found"
    return predict_ner(question, para_list, para_score_list, labels)

def predict_ner(question, para_list, para_score_list, ner_labels, year = False):
    '''
    Description: Answer a question with a named entity taken from the matched paragraphs.
                 Each candidate entity is given a token window of its paragraph (reaching
                 half-way to the neighbouring candidates); the windows are ranked against
                 the question with TF-IDF and the entity of the best window is returned.
    Input: (1) question
           (2) a list of matched paragraphs
           (3) a list of corresponding similarity scores (not used by this solver)
           (4) a list of entity labels that are accepted as answers
           (5) year: when True only 4-character numeric entities are accepted
    Output: (1) text of the chosen entity, or "Not Found" when no candidate exists
    '''
    global nlp

    # Entities that already occur in the question are never candidates
    question_ents = [ent.text for ent in nlp(question).ents]

    all_potential_ans = []
    all_sentences = []
    for index, para in enumerate(para_list):
        doc = nlp(para)

        # Find all potential answers of this paragraph
        potential_ans = []
        for ent in doc.ents:
            if ent.text in question_ents or ent.label_ not in ner_labels:
                continue
            if year and not (len(ent.text) == 4 and ent.text.isnumeric()):
                continue
            potential_ans.append({
                "para_idx": index,
                "start": ent.start,
                "end": ent.end,
                "text": ent.text,
            })

        potential_ans.sort(key = lambda x: x["start"])

        # The window of a candidate starts half-way between the previous candidate's
        # end and its own start; the first window starts at token 0 and the last one
        # runs to the end of the paragraph
        prev_ans = None
        for ans in potential_ans:
            if prev_ans is None:
                ans["doc_start"] = 0
            else:
                ans["doc_start"] = round((prev_ans["end"] + ans["start"]) / 2)
                prev_ans["doc_end"] = ans["doc_start"]
            prev_ans = ans
        if prev_ans is not None:
            prev_ans["doc_end"] = len(doc)

        # Normalize the windows: lowercased lemmas without stop words and punctuation
        for ans in potential_ans:
            window = doc[ans["doc_start"]:ans["doc_end"]]
            all_sentences.append([t.lemma_.lower() for t in window
                                  if not t.is_stop and not t.is_punct])

        all_potential_ans.extend(potential_ans)

    # No potential answers found
    if len(all_potential_ans) == 0:
        return "Not Found"

    counts = [Counter(s) for s in all_sentences]

    vectorizer = DictVectorizer()
    sent_count_feature = vectorizer.fit_transform(counts)

    transformer = TfidfTransformer(smooth_idf=True, norm="l1", sublinear_tf=True)
    sent_tfidf = transformer.fit_transform(sent_count_feature)

    # Only the normalized tokens of the question are needed here
    nq, _ = normalize(question)
    q_count = Counter(nq)

    q_count_feature = vectorizer.transform(q_count)
    q_count_tfidf = transformer.transform(q_count_feature)
    q_score = (sent_tfidf*q_count_tfidf.T).T

    scores = q_score[0,:].toarray()[0]

    # Candidate indices from the highest to the lowest score
    best = np.argsort(scores)[::-1]
    best_ans = best[0]

    # Prefer the best-ranked candidate whose text does not appear in the question;
    # when every candidate appears in the question the top-ranked one is kept
    question_lower = question.lower()
    for b in best:
        if all_potential_ans[b]["text"].lower() not in question_lower:
            best_ans = b
            break

    return all_potential_ans[best_ans]["text"]


'''
Description: A specific solver to solve hypernym type questions
Input: (1) question
       (2) a list of matched paragraphs
       (3) a list of corresponding similarity scores of the matched paragraphs
       (4) a token
Output: (1) prediction or "Not Found"
'''
def predict_hypernym(question, para_list, para_score_list, token):
    '''
    Description: Answer a hypernym type question. The sentences of the matched paragraphs
                 are ranked against the question with TF-IDF; the first normalized token
                 (in ranking order) that is not part of the question and passes
                 check_hypernym(token, t) is the answer.
    Input: (1) question
           (2) a list of matched paragraphs
           (3) a list of corresponding similarity scores (not used by this solver)
           (4) a token that is passed to check_hypernym as its first argument
    Output: (1) prediction or "Not Found"
    '''
    global nlp

    # Collect the sentences of all matched paragraphs
    all_text_sentences = []
    for para in para_list:
        all_text_sentences.extend(sent.text for sent in nlp(para).sents)

    # No potential answers found
    if len(all_text_sentences) == 0:
        return "Not Found"

    all_sentences = [normalize(s)[0] for s in all_text_sentences]
    counts = [Counter(s) for s in all_sentences]

    vectorizer = DictVectorizer()
    sent_count_feature = vectorizer.fit_transform(counts)

    transformer = TfidfTransformer(smooth_idf=True, norm="l1", sublinear_tf=True)
    sent_tfidf = transformer.fit_transform(sent_count_feature)

    nq, _ = normalize(question)
    q_count = Counter(nq)

    q_count_feature = vectorizer.transform(q_count)
    q_count_tfidf = transformer.transform(q_count_feature)
    q_score = (sent_tfidf*q_count_tfidf.T).T

    scores = q_score[0,:].toarray()[0]

    # Sentence indices from the highest to the lowest score
    best = np.argsort(scores)[::-1]

    for b in best:
        best_sentence = all_text_sentences[b]
        tokens, _ = normalize(best_sentence)
        for t in tokens:
            if t not in nq and check_hypernym(token, t):
                # Only the first match is ever used, so stop here instead of
                # normalizing and checking every remaining sentence.
                # Return the surface word of the sentence that contains the
                # normalized token, falling back to the normalized token itself
                for word in re.findall(r"[\w-]+", best_sentence):
                    if t in word:
                        return word
                return t

    # No potential answers found
    return "Not Found"

'''
Description: A solver to solve all the other type questions
Input: (1) question
       (2) a list of matched paragraphs
Output: (1) prediction or "not found"
'''
def _noun_chunk_candidates(doc1, doc2):
    '''Merge the noun chunks of doc1 and return their texts, leaving out chunks that compare equal to a noun chunk of doc2.'''
    question_chunks = [chunk for chunk in doc2.noun_chunks]
    # The chunk list is materialized first because merge() modifies doc1
    answer_chunks = [chunk for chunk in doc1.noun_chunks]
    return [chunk.merge().text for chunk in answer_chunks if chunk not in question_chunks]


def _answer_around_what_x(question, possible_ans):
    '''"What X" / "What X of Y" questions: return the chunks around the first chunk containing X or Y, or None.'''
    x, y = check_what_x(question)
    if x is None:
        return None
    for idx, a in enumerate(possible_ans):
        if x in a or (y is not None and y in a):
            # One chunk before the match (only when the match is at index 2 or later)
            # up to one chunk after it; the slice end is clipped by the list length.
            # NOTE(review): a match at index 1 does not include chunk 0 - this keeps
            # the original behaviour; confirm whether max(0, idx - 1) was intended.
            start = idx - 1 if idx > 1 else idx
            return " ".join(possible_ans[start:idx + 2])
    return None


def _answer_for_stand_refer(doc2, ans_sentence):
    '''"stand for" / "refer to" questions: return the words around the first answer word containing a subject key word, or None.'''
    found_stand_refer = False
    key_word = []
    for token in doc2:
        if token.dep_ == "ROOT" and token.lemma_ in ["refer", "stand"]:
            found_stand_refer = True
        if token.dep_ == "nsubj":
            # The last nsubj token of the question wins
            key_word = [t.text for t in list(token.lefts) if not t.is_stop] + [token.text] + [t.text for t in list(token.rights) if not t.is_stop]
    if not found_stand_refer:
        return None

    aw = word_tokenizer.tokenize(ans_sentence)
    for idx, a in enumerate(aw):
        for kw in key_word:
            if kw in a:
                # Up to 6 words before the match and 5 words after it
                return " ".join(aw[max(0, idx - 6):idx + 6])
    return None


def _answer_near_question_terms(question, doc1, doc2, possible_ans):
    '''"what" questions: return the candidate chunks found in the part of doc1 that mentions the question's root verb or subject, or None.'''
    if "what" not in question:
        return None

    root_token = None
    nsubj = []
    for token in doc2:
        if token.dep_ == "ROOT" and token.lemma_ != "be":
            root_token = token
        if token.dep_ == "nsubj" or token.dep_ == "nsubjpass":
            nsubj = nsubj + [t.lemma_ for t in list(token.lefts) if not t.is_stop] + [token.lemma_] + [t.lemma_ for t in list(token.rights) if not t.is_stop]

    # Remove "the"
    nsubj = [ns for ns in nsubj if ns != "the"]

    # The subject words are only used when a root other than "be" was found
    all_pos_words = []
    if root_token is not None:
        all_pos_words = [root_token.lemma_] + nsubj

    # Token window: from 3 tokens before the first match to 8 tokens after the
    # last match, clipped to the sentence (without a final ".")
    found_it = False
    start = -1
    end = -1
    for token in doc1:
        token_text = token.text.lower()
        matched = any(w in token_text for w in all_pos_words) or token.lemma_ in all_pos_words
        if not matched:
            continue
        if not found_it:
            start = max(0, token.i - 3)
            found_it = True
        end = token.i + 8
        if len(doc1) <= end:
            end = len(doc1)
            if doc1[-1].text == ".":
                end = len(doc1) - 1

    if not found_it:
        return None

    # Widen short windows to the left
    if end - start + 1 < 10:
        start = max(0, end - 9)
    window_text = doc1[start:end].text
    return " ".join([pans for pans in possible_ans if pans in window_text])


def _predict_others_from_chunks(question, ans_sentence):
    '''Noun-chunk based answer for the selected sentence; used when get_answer gives no short answer.'''
    doc1 = nlp(ans_sentence)
    doc2 = nlp(question)
    possible_ans = _noun_chunk_candidates(doc1, doc2)

    # Look for What X and What X of Y
    answer = _answer_around_what_x(question, possible_ans)
    if answer is not None:
        return answer

    # Check for "stand for" and "refer to"
    answer = _answer_for_stand_refer(doc2, ans_sentence)
    if answer is not None:
        return answer

    # Check if "what" appears in the question
    # (an empty string is a valid result here, hence the explicit None test)
    answer = _answer_near_question_terms(question, doc1, doc2, possible_ans)
    if answer is not None:
        return answer

    # Otherwise just join all candidate chunks
    try:
        return str(" ".join(possible_ans))
    except Exception:
        try:
            return str(" ".join([chunk.merge().text for chunk in doc1.noun_chunks]))
        except Exception:
            return "Not Found"


def predict_others(question, para_list):
    '''
    Description: A solver to solve all the other type questions. The sentence of the
                 matched paragraphs that scores best against the question is selected;
                 get_answer is tried on it first and a noun-chunk based heuristic is
                 used when that gives no short answer.
    Input: (1) question
           (2) a list of matched paragraphs
    Output: (1) prediction or "Not Found"
    '''
    ans_para = "".join(para + " " for para in para_list)

    # Select the sentence with the highest score; the score is the number of word
    # tokens in the common substring plus the value of getNumofApperance.
    # When no sentence scores above 0 the answer sentence stays empty.
    max_num = 0
    ans_sentence = ""
    for sen in sent_segmenter.tokenize(ans_para):
        com_str, _ = getNumofCommonSubstr(sen.lower(), question.lower())
        app_num = getNumofApperance(sen, question)
        sen_score = len(word_tokenizer.tokenize(com_str)) + app_num
        if sen_score > max_num:
            max_num = sen_score
            ans_sentence = sen

    try:
        ans = get_answer(question, ans_sentence)
        # Accept only short answers: more than 2 characters and fewer than 5 words
        if len(ans) > 2 and (len(ans.split()) < 5):
            return ans.lower()
    except Exception:
        # Best effort: any failure of get_answer (or an unusable result) falls
        # through to the noun-chunk heuristic below
        pass

    return _predict_others_from_chunks(question, ans_sentence)
    
    
# Marker printed once the solver definitions of this cell have been executed
print("Done")

Done


## Run on training data

In [0]:
'''
Description: Run solver on the training data
Input:
Output:
'''
def run_training():
    '''
    Description: Run the solver on the first 2000 training questions and print the
                 question, its answer type and "Correct"/"Wrong" for each evaluated
                 question, followed by the totals. Only questions classified as
                 "unkown_type" that contain "what" are evaluated.
    Input:
    Output:
    '''
    global training_answer_data

    n_evaluated = 0
    n_correct = 0

    for q_idx, item in enumerate(training_answer_data[0:2000]):
        question = item["question"]

        # (1) classify answer type
        ans_type, ans_type_info = classify_answer_type(question)

        # Skip everything except unknown-type "what" questions
        if not (ans_type == "unkown_type" and "what" in question):
            continue

        print(question)
        print("Q" + str(q_idx) + ": " + ans_type)

        # (2) Find top-k matched paragraphs
        topk_paragraphs, topk_scores = get_top_k_paragraphs(3, item["docid"], question)

        # (3) Call main solver, falling back to the generic solver
        if ans_type != "unkown_type":
            prediction = main_solve(question, topk_paragraphs, topk_scores, ans_type, ans_type_info)
            if prediction == "Not Found":
                prediction = predict_others(question, topk_paragraphs)
        else:
            prediction = predict_others(question, topk_paragraphs)

        # Count
        n_evaluated += 1
        if check_answer(item["text"], prediction):
            n_correct += 1
            verdict = "Correct"
        else:
            verdict = "Wrong"
        print(verdict)

    print("Total Questions = " + str(n_evaluated))
    print("Total Correct = " + str(n_correct))

#run_training()
       

## Run on dev data

In [0]:
from tqdm import tqdm

def run_devl():
    '''
    Description: Run the solver on the development data. Only questions classified as
                 "unkown_type" or "what_hypernym_type" are answered; the printed
                 "Correct"/"Wrong" lines and totals cover the "what_hypernym_type"
                 questions only.
    Input:
    Output: (1) a list of (question index, lowercased prediction) tuples for the
                answered questions
    '''
    global development_answer_data

    total_questions = 0
    total_correct = 0
    result = []

    # Each development question; tqdm wraps the data itself (not the enumerate
    # generator) so that the progress bar knows the total and can show an ETA
    for q, t in enumerate(tqdm(development_answer_data)):
        # (1) classify answer type
        ans_type, ans_type_info = classify_answer_type(t["question"])

        # Other answer types are not answered by this run, so the paragraph
        # retrieval is skipped for them
        if ans_type != "unkown_type" and ans_type != "what_hypernym_type":
            continue

        # (2) Find top-k matched paragraphs
        topk_paragraphs, topk_scores = get_top_k_paragraphs(3, t["docid"], t["question"])

        # (3) Call main solver
        if ans_type == "unkown_type":
            prediction = predict_others(t["question"], topk_paragraphs)
            result.append((q, prediction.lower()))
            continue

        # what_hypernym_type: specific solver first, generic solver as fallback
        prediction = main_solve(t["question"], topk_paragraphs, topk_scores, ans_type, ans_type_info)
        if prediction == "Not Found":
            prediction = predict_others(t["question"], topk_paragraphs)
        result.append((q, prediction.lower()))

        # Count
        total_questions += 1
        if check_answer(t["text"], prediction):
            total_correct += 1
            print("Correct")
        else:
            print("Wrong")

    print("Total Questions = " + str(total_questions))
    print("Total Correct = " + str(total_correct))

    return result
#result_devl = run_devl()


## Evaluation on dev data

In [0]:
def evaluate_dev():
    '''
    Description: Print the number of development predictions that exactly match the
                 gold answer text.
    Input: reads the globals result_devl (list of (question index, prediction)
           entries as returned by run_devl) and development_answer_data
    Output:
    '''
    total_correct = 0
    for entry in result_devl:
        # Each entry carries the index of its question; run_devl may skip
        # questions, so the position in result_devl is not the question index
        q, prediction = entry[0], entry[1]
        if prediction == development_answer_data[q]["text"]:
            total_correct += 1
    print(total_correct)

# evaluate_dev()

## Run on testing data

In [0]:
from tqdm import tqdm

def run_testing():
    '''
    Description: Run the solver on every testing question
    Input:
    Output: (1) a list of (question index, lowercased prediction) tuples
    '''
    global testing_answer_data

    result = []

    # Each testing question; tqdm wraps the data itself (not the enumerate
    # generator) so that the progress bar knows the total and can show an ETA
    for q, t in enumerate(tqdm(testing_answer_data)):
        # (1) classify answer type
        ans_type, ans_type_info = classify_answer_type(t["question"])
        # (2) Find top-k matched paragraphs
        topk_paragraphs, topk_scores = get_top_k_paragraphs(3, t["docid"], t["question"])

        # (3) Call main solver; the generic solver handles unknown types and
        # questions the specific solver could not answer
        if ans_type == "unkown_type":
            prediction = predict_others(t["question"], topk_paragraphs)
        else:
            prediction = main_solve(t["question"], topk_paragraphs, topk_scores, ans_type, ans_type_info)
            if prediction == "Not Found":
                prediction = predict_others(t["question"], topk_paragraphs)

        result.append((q, prediction.lower()))

    return result
# Answer every testing question; result holds (question index, lowercased prediction) tuples
result = run_testing()

3618it [07:24,  8.13it/s]


## Write data to csv file for submission

In [0]:
# Export the testing predictions for submission.
# NOTE(review): save_prediction_to_csv is defined elsewhere in the notebook;
# presumably "winn_xudong" names the output file - confirm.
save_prediction_to_csv(result, "winn_xudong")
print("Done")

Done
