# Initialization

In [1]:
import json
import logging
import math
import requests
import jieba

In [2]:
# Normal
logging.getLogger().setLevel(logging.WARNING)
# Debug
# logging.getLogger().setLevel(logging.INFO)

In [3]:
class Wiki(dict):
    def __init__(self):
        super(Wiki, self).__init__()
    def __getitem__(self, key):
        if key in self:
            return super(Wiki, self).__getitem__(key)
        return ReverseIndex(key, [])
class ReverseIndex(object):
    def __init__(self, key, indexes):
        super(ReverseIndex, self).__init__()
        self.key = key
        self.indexes = indexes
    def __len__(self):
        return len(self.indexes)
    def __getitem__(self, key):
        return self.indexes[key]
    def __iter__(self):
        return iter(self.indexes)
    def __and__(self, rev_ind):
        return set(self.indexes).intersection(set(rev_ind.indexes))
def tf(term, doc):
    """calculate term frequency
    
    Arguments:
        term {str} -- keyword to be evaluate
        doc {list} -- list of words splited from source document
    
    Returns:
        int -- times that term appears in doc
    """
    return doc.count(term)
def df(term, docs):
    """calculate document frequency from a series of documents
    
    This is a modified version which use df directly from wiki reverse index.
    For normal version, invert the comment
    
    Arguments:
        term {str} -- keyword to be evaluate
        docs {list} -- list doc defined by list of words splited from source document
    
    Returns:
        int -- times that term appears in docs
    """
    # count = 0
    # for doc in docs:
    #     count += doc.count(term)
    return len(docs[term])
def idf(df, docs_len):
    """convert document frequency into inverse document frequency
    
    Arguments:
        df {int} -- document frequency
        docs_len {int} -- total number of docs
    
    Returns:
        float -- inverse document frequency
    """
    return math.log10(docs_len/df)
def tf_idf(term, doc, docs):
    """Define how important a term is in a document
    
    higher the value, term is more important to the doc.
    An important term should be:
        * Appears in few docs
        * Appears many times in doc
    
    Arguments:
        term {str} -- keyword to be evaluate
        doc {list} -- document that keyword are to be rated, in this cse: question
        docs {list} -- all documents, as an idf reference, in this case: WIKI
    
    Returns:
        float -- tf-idf value
    """
    term_tf = tf(term, doc)
    term_df = df(term, docs)
    if term_df > 0:
        term_idf = idf(term_df, len(docs))
        return (1 + math.log(term_tf)) * term_idf
    return 0

# Read Wiki

In [4]:
WIKI = Wiki()
with open("Wiki_inverse_index") as file:
    lines = file.readlines()
    for line in lines:
        indexes = line.split()
        key = indexes.pop(0)
        indexes = list(map(int, indexes))
        WIKI[key] = ReverseIndex(key, indexes)
print("{} Keywords loaded".format(len(WIKI)))

2404293 Keywords loaded


# Question Solving

## Method 1: Pure Term Frequency

* Return occurrence percentage of each answer

In [5]:
def solve_method_one(splited_question_dict):
    splited_question_dict = splited_question_dict.copy()
    splited_question_dict.pop("Question")
    splited_question_dict.pop("Answer : ", None)
    scores = {}
    for option, option_term in splited_question_dict.items():
        scores[option] = len(WIKI[option_term])
    percentage_scores = {}
    for option in splited_question_dict:
        percentage_scores[option] = scores[option] / (sum(scores.values()) or 1)
    return percentage_scores

## Method 2: TF-IDF

1. Fetch keywords from question
2. use (keyword, option) pair to calculate relation

In [6]:
QUESTION_KEYWORD_THRESHOLD = 2.5
def find_question_keywords(splited_question):
    keywords = {}
    for each in splited_question:
        score = tf_idf(each, splited_question, WIKI)
        if score > QUESTION_KEYWORD_THRESHOLD:
            keywords[each] = score
    return keywords
def two_term_interaction_freq(option_term, qestion_keyword, keyword_weight):
    interaction_freq = len(WIKI[option_term] & WIKI[qestion_keyword])
    score = interaction_freq * keyword_weight
    logging.info("{1:{0}<7} & {2:{0}<7} : {3:{0}<7} * {4:{0}<7} => {5:{0}<7}".format(
        chr(12288),
        option_term,
        qestion_keyword,
        interaction_freq,
        keyword_weight,
        score
        )
    )
    return score
def solve_method_two(splited_question_dict):
    splited_question_dict = splited_question_dict.copy()
    keywords = find_question_keywords(splited_question_dict.pop("Question"))
    splited_question_dict.pop("Answer : ", None)
    scores = {}
    for option, option_term in splited_question_dict.items():
        scores[option] = 0
        for keyword, keyword_weight in keywords.items():
            scores[option] += two_term_interaction_freq(option_term, keyword, keyword_weight)
    percentage_scores = {}
    for option in splited_question_dict:
        percentage_scores[option] = scores[option] / (sum(scores.values()) or 1)
    return percentage_scores

## Method 3: Human Take Control

* If answer percentage of Method 1 & Method 2 are all below `CONFIDENCE` => Shout out to human
* Method 1 & Method 2 had different answers => Use Method 2's Answer

In [7]:
CONFIDENCE = 0.6
def double_check(result):
    method_1_ans = max(result["1"], key=result["1"].get)
    method_2_ans = max(result["2"], key=result["2"].get)
    error = None
    if method_1_ans != method_2_ans:
        error = Warning("Different Answer")
    if max(result["1"][method_1_ans], result["2"][method_2_ans]) < CONFIDENCE:
        error = Warning("Low Confidence")
    return method_2_ans, error
def print_error(error, result):
    if not isinstance(error, Warning):
        return
    logging.warning("Question {}: {}".format(question_num, str(error)))
    logging.warning("Method 1: {}".format(result["1"]))
    logging.warning("Method 2: {}".format(result["2"]))
def print_question(question):
    print(question["Question"])
    print("https://www.google.com/search?q={}".format(question["Question"]))
    print("A. {}".format(question["A"]))
    print("B. {}".format(question["B"]))
    print("C. {}".format(question["C"]))
def validate_final_results(final_results, length):
    if len(final_results) != length:
        raise AssertionError("Wrong Length")
    ok = True
    for question_num, ans in enumerate(final_results):
        if ans != "A" and ans != "B" and ans != "C":
            ok = False
            logging.warning("Question {}: invalid answer => {}".format(question_num, ans))
    if not ok:
        raise AssertionError("Invalid Answer")
    print("Happy New Year!!!")
    print()
    print(json.dumps(final_results))
    return True

## Integrated Solve Function

In [8]:
def run_solve(question):
    # Create a working copy
    working_copy = question.copy()
    result = {}
    
    # Split Question into list of terms
    working_copy["Question"] = list(jieba.cut(working_copy["Question"], cut_all=False))
    
    # Solve with two different method
    result["1"] = solve_method_one(working_copy)
    result["2"] = solve_method_two(working_copy)
    
    # Check Results
    answer, error = double_check(result)
    
    # Return result
    return answer, error, result

# Game On

In [33]:
question_url = "https://github.com/UDICatNCHU/QA_Robot_Project/raw/master/questions_example.json"
question_url = input("URL: ")
QUESTIONS = requests.get(question_url).json()

URL: https://www.dropbox.com/s/kq678nfffzta5zg/20_Question.json?dl=1


In [34]:
RESULTS = [{} for _ in range(len(QUESTIONS))]
ERRORS = {}
FINAL = []
for question_num, question in enumerate(QUESTIONS):
    answer, error, result = run_solve(question)
    if error:
        ERRORS[question_num] = error
        logging.warning("Question {}: {}".format(question_num, str(error)))
    FINAL.append(answer)
    RESULTS[question_num] = result



In [None]:
len(ERRORS)

## Manual Fix

In [40]:
question_num = int(input("Fix Question: "))
if question_num in ERRORS:
    print_error(ERRORS[question_num], RESULTS[question_num])
print_question(QUESTIONS[question_num])
print()
print("Original Answer: {}".format(FINAL[question_num]))
print()
print("Result of Method 2:")
for option, option_dist in RESULTS[question_num]["2"].items():
    print(option, option_dist)
FINAL[question_num] = input("Answer of {}: ".format(question_num))

Fix Question: 44




下列何者是全球人口最多的國家？
https://www.google.com/search?q=下列何者是全球人口最多的國家？
A. 美國
B. 印度
C. 中國

Original Answer: A

Result of Method 2:
A 0.5094454820479996
B 0.06615475502244281
C 0.42439976292955756
Answer of 44: C


In [43]:
QUESTIONS[0]

{'Question': '什麼事件是指日本嘉永六年（1853年）美國海軍准將馬修·培理率艦隊駛入江戶灣浦賀海面的事件，培理帶著美國總統米勒德·菲爾莫爾的國書向江戶幕府致意，最後雙方於次年（1854年）簽定《神奈川條約》（《日美和親條約》）。',
 'A': '竹橋事件',
 'B': '黑船來航',
 'C': '巨文島事件'}

In [42]:
for each in QUESTIONS:
    print_question(each)

什麼事件是指日本嘉永六年（1853年）美國海軍准將馬修·培理率艦隊駛入江戶灣浦賀海面的事件，培理帶著美國總統米勒德·菲爾莫爾的國書向江戶幕府致意，最後雙方於次年（1854年）簽定《神奈川條約》（《日美和親條約》）。
https://www.google.com/search?q=什麼事件是指日本嘉永六年（1853年）美國海軍准將馬修·培理率艦隊駛入江戶灣浦賀海面的事件，培理帶著美國總統米勒德·菲爾莫爾的國書向江戶幕府致意，最後雙方於次年（1854年）簽定《神奈川條約》（《日美和親條約》）。
A. 竹橋事件
B. 黑船來航
C. 巨文島事件
《Fate/stay night》是由哪家公司於2004年1月30日發售的PC平台十八禁文字冒險遊戲? 也是該家公司商業化後初次亮相的作品。
https://www.google.com/search?q=《Fate/stay night》是由哪家公司於2004年1月30日發售的PC平台十八禁文字冒險遊戲? 也是該家公司商業化後初次亮相的作品。
A. 任天堂
B. 奈須蘑菇
C. TYPE-MOON
任天堂的王牌遊戲設計師是? 曾任任天堂的情報開發本部（Entertainment Analysis and Development）總監兼總經理，在岩田聰於2015年7月11日逝世後與竹田玄洋一同代理任天堂高層事務，被稱為「瑪利歐之父」，主導開發了瑪利歐系列（Mario）、大金剛系列（Donkey Kong）、薩爾達傳說系列（The Legend of Zelda）、銀河戰士系列（Metroid）等一系列任天堂本家鐵板經典遊戲。
https://www.google.com/search?q=任天堂的王牌遊戲設計師是? 曾任任天堂的情報開發本部（Entertainment Analysis and Development）總監兼總經理，在岩田聰於2015年7月11日逝世後與竹田玄洋一同代理任天堂高層事務，被稱為「瑪利歐之父」，主導開發了瑪利歐系列（Mario）、大金剛系列（Donkey Kong）、薩爾達傳說系列（The Legend of Zelda）、銀河戰士系列（Metroid）等一系列任天堂本家鐵板經典遊戲。
A. 宮本茂
B. 岩田聰
C. 三上真司
四季為春天，夏天，冬天。還有一個是?
https://www.google

## Final Check

In [41]:
validate_final_results(FINAL, len(QUESTIONS))

Happy New Year!!!

["B", "A", "A", "A", "A", "A", "A", "A", "B", "C", "B", "B", "A", "C", "A", "A", "B", "A", "A", "A", "B", "C", "B", "B", "A", "A", "A", "C", "B", "A", "B", "B", "B", "A", "B", "C", "B", "B", "C", "A", "A", "A", "C", "A", "C", "B", "A", "C", "B", "A", "B", "A", "B", "B", "A", "B", "C", "B", "A", "C", "B", "A", "B", "B", "A", "B", "C", "A", "A", "A", "A", "B", "A", "A", "C", "B", "B", "B", "C", "A", "B", "A", "C", "C", "C", "C", "B", "C", "A", "A", "A", "B", "A", "B", "B", "A", "A", "A", "A", "B"]


True

In [15]:
backup = json.dumps(FINAL)

In [28]:
print(backup)

["B", "A", "A", "A", "A", "A", "A", "C", "B", "C", "B", "B", "A", "C", "A", "A", "B", "A", "A", "A", "B", "C", "B", "B", "A", "A", "A", "C", "B", "A", "B", "B", "B", "A", "B", "C", "B", "B", "C", "A", "A", "A", "C", "A", "A", "B", "A", "C", "B", "A", "B", "A", "B", "B", "A", "B", "C", "B", "A", "C", "B", "A", "B", "B", "A", "B", "C", "A", "A", "A", "A", "B", "A", "A", "C", "B", "B", "B", "C", "A", "B", "A", "C", "C", "C", "C", "B", "C", "A", "A", "A", "B", "A", "B", "B", "A", "A", "A", "A", "B"]


# Simulate Single Question

In [None]:
test_question = {
    "Question": "中華民國第14任總統，民主進步黨第16屆黨主席，同時也是台灣歷史上首位女性元首，她是:",
    "A":"蔡正元",
    "B":"蔡英文",
    "C":"洪慈庸"
}

In [None]:
answer, error, result = run_solve(test_question)

In [None]:
print_question(test_question)

In [None]:
print_error(error, result)

In [None]:
print(answer)