## Question Classification

In [None]:
from nltk.tag.stanford import StanfordPOSTagger
from nltk.corpus import wordnet as wn
# Relative paths to the Stanford POS tagger jar and the English
# bidirectional model it loads.
pos_jar = './stanford-pos-tagger/stanford-postagger.jar'
modelfile = './stanford-pos-tagger/english-bidirectional-distsim.tagger'

# Tagger instance bound to the global name `st`; classWithOtherWords calls
# st.tag() on the tokenised question.
st = StanfordPOSTagger(model_filename=modelfile, path_to_jar=pos_jar)
# Tag a sample question split on whitespace -- presumably a quick check that
# the tagger is set up correctly.
st.tag("what is in front of the notre dame main building?".split())

In [None]:
def classWithHeadWord(head, headNext):
    """Classify a question from a wh-word and the word that follows it.

    Args:
        head: the candidate question word (e.g. 'who', 'how').
        headNext: the word immediately after `head`.

    Returns:
        'PERSON', 'LOCATION', 'QUANTITY' or 'TIME' when a rule fires,
        otherwise 'OTHER'.
    """
    person_words = ('who', 'whom')
    location_words = ('where', 'whence', 'whither')
    quantity_words = ('few', 'great', 'little', 'many', 'much')

    # A person word in either position takes priority over every other rule.
    if head in person_words or headNext in person_words:
        return 'PERSON'
    if head in location_words:
        return 'LOCATION'
    # "how many", "how much", ... ask for an amount.
    if head == 'how' and headNext in quantity_words:
        return 'QUANTITY'
    if head == 'when':
        # if question type is returned as 'TIME', also check 'DATE' in answer sentence
        return 'TIME'
    return 'OTHER'

In [None]:
def findWordList(word):
    """Collect `word` and WordNet names related to it, without duplicates.

    The result contains, in first-seen order:
      * `word` itself;
      * for every synset of `word`: the name of each direct hypernym,
        followed by the hypernym names of every synset found by looking that
        hypernym name up again (a second level, resolved by name);
      * the synset's own name.
    A "name" here is the part of a synset name before the first '.'.

    Args:
        word: the word to expand (the head noun of a question).

    Returns:
        A list of unique strings. The order is deterministic: matchList
        returns the first entry that matches an entity label, so the
        hash-dependent order of a plain set could change the predicted class
        from one run to the next.
    """
    collected = [word]
    for synset in wn.synsets(word):
        for hypernym in synset.hypernyms():
            hypernym_word = hypernym.name().split('.')[0]
            collected.append(hypernym_word)
            # Second level: hypernyms of every sense of the hypernym's name.
            for parent_synset in wn.synsets(hypernym_word):
                collected.extend(
                    parent.name().split('.')[0]
                    for parent in parent_synset.hypernyms()
                )
        collected.append(synset.name().split('.')[0])

    # dict.fromkeys drops duplicates while keeping insertion order.
    return list(dict.fromkeys(collected))

In [None]:
def matchList(checkList):
    """Map a list of candidate words to the first entity label one contains.

    Each word is upper-cased and searched for every entity label as a
    substring. Words are tried in list order, labels in the order declared
    below; the first hit wins.

    Args:
        checkList: iterable of strings (e.g. the output of findWordList).

    Returns:
        The matching entity label, or 'OTHER' when no word contains one.
    """
    labels = (
        'PERSON', 'NORP', 'FACILITY', 'ORGANIZATION', 'GPE', 'LOCATION',
        'PRODUCT', 'EVENT', 'WORK OF ART', 'LAW', 'LANGUAGE',
        'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL',
    )
    for candidate in checkList:
        shouted = candidate.upper()
        hit = next((label for label in labels if label in shouted), None)
        if hit is not None:
            return hit
    return 'OTHER'

In [None]:
# A pre-defined dict to classify question type based on head noun: each key is
# an entity label and each value lists head nouns that map straight to it.
# Insertion order matters because lookups return the first key whose list
# contains the noun.
#
# FACILITY also lists singular forms: classWithOtherWords only picks head
# nouns tagged 'NN', so the plural-only entries could not be matched there.
pre_define_dict = {
    'NORP': ['nationality', 'religion'],
    'FACILITY': ['building', 'airports', 'highways', 'bridges',
                 'airport', 'highway', 'bridge'],
    'ORGANIZATION': ['company', 'agency', 'institution'],
    'GPE': ['country', 'city', 'state', 'province'],
    'LOCATION': ['mountain', 'lake', 'river'],
    'PRODUCT': ['vehicle', 'weapon', 'food'],
    'EVENT': ['disaster', 'battle', 'war'],
    'WORK OF ART': ['book', 'song', 'music', 'novel'],
    'DATE': ['decade', 'year', 'month', 'day'],
    'PERCENT': ['percent', 'percentage'],
}

In [None]:
def classWithOtherWords(words):
    """Classify a question using any wh-word in it and the head noun after it.

    The words are POS-tagged with the global tagger `st`. For every token
    whose tag is a wh-tag (WDT, WP, WP$, WRB), in order:
      1. classWithHeadWord is tried on that token and the one after it; a
         result other than 'OTHER' is returned immediately.
      2. Otherwise the first following token tagged 'NN' is taken as the head
         noun, skipping a noun that sits directly between two 'IN' tokens.
         With no head noun the function returns 'OTHER'.
      3. A head noun listed in pre_define_dict returns its key.
      4. Otherwise the noun is expanded with findWordList and the scan moves
         on to the next wh-token, if any.
    When the scan ends, the most recent expansion (or an empty list) is
    passed to matchList.

    Args:
        words: list of question tokens.

    Returns:
        An entity label string, or 'OTHER'.
    """
    wh_tags = ('WDT', 'WP', 'WP$', 'WRB')
    tag_pairs = st.tag(words)
    last = len(tag_pairs) - 1
    checkList = []
    for i, (token, tag) in enumerate(tag_pairs):
        if tag not in wh_tags:
            continue

        # Strict bound: the old `i + 1 <= len(...)` test was always true and
        # raised IndexError when the wh-word was the final token.
        second_word = tag_pairs[i + 1][0] if i < last else ''
        head_result = classWithHeadWord(token, second_word)
        if head_result != 'OTHER':
            return head_result

        head_noun = ''
        for j in range(i + 1, len(tag_pairs)):
            if tag_pairs[j][1] != 'NN':
                continue
            # Skip a noun wedged between two 'IN' tokens. j >= 1 always holds
            # here, so only the right-hand bound needs checking.
            if j < last and tag_pairs[j - 1][1] == 'IN' and tag_pairs[j + 1][1] == 'IN':
                continue
            head_noun = tag_pairs[j][0]
            break
        if head_noun == '':
            return 'OTHER'

        # Whitespace tokenisation can leave the question mark glued on.
        head_noun = head_noun.replace('?', '')

        for key, value in pre_define_dict.items():
            if head_noun in value:
                return key
        checkList = findWordList(head_noun)

    return matchList(checkList)

In [None]:
# this part takes about 10 mins to run

# Classify every question: first with the cheap head-word rules on the first
# two words, then (only if those give 'OTHER') with the POS-tagger-based pass.
# question_classes[i] is the label for questions[i]; `times` counts how many
# questions were resolved by the first pass, by the second pass, or not at all.
question_classes = []
times = {'first': 0, 'second': 0, 'others': 0}
num = 0
for num, question in enumerate(questions, start=1):
    question = question.lower()
    # print(question)
    if num % 100 == 0:
        print(num)  # progress indicator
    words = question.split()

    # An empty question has nothing to classify; still append a label so the
    # result list stays aligned with `questions`.
    if not words:
        question_classes.append('OTHER')
        times['others'] += 1
        continue

    # One-word questions have no second word; indexing words[1] used to raise
    # IndexError here.
    second_word = words[1] if len(words) > 1 else ''
    head_word_result = classWithHeadWord(words[0], second_word)
    if head_word_result != 'OTHER':
        question_classes.append(head_word_result)
        times['first'] += 1
        continue

    other_word_result = classWithOtherWords(words)
    if other_word_result == 'OTHER':
        times['others'] += 1
    else:
        times['second'] += 1
    question_classes.append(other_word_result)

print(str(times))

# In answer extraction:
# if question_classes[i] == 'OTHER', it means our question classifier cannot recognize this question, then just return "cannot be processed"
        

In [None]:
# Show the predicted class labels, one per question in input order.
question_classes