<a href="https://colab.research.google.com/github/kullawattana/thesis_2020_spacy_colab/blob/master/10_find_question_svo_WH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
# Load spaCy's small English pipeline; the functions below read the pos_,
# tag_ and dep_ attributes it assigns to each token.
nlp = spacy.load("en_core_web_sm")

# Dependency labels that mark a token as an object or as a subject.
OBJECT_DEPS = {"dobj", "dative", "attr", "oprd"}
SUBJECT_DEPS = {"nsubj", "nsubjpass", "csubj", "agent", "expl"}
# Fine-grained tags that mark a word as a wh-word: wh-pronoun (WP),
# possessive wh-pronoun (WP$) and wh-adverb (WRB).
# NOTE(review): the wh-determiner tag WDT ("which book") is not listed --
# confirm whether that omission is intentional.
WH_WORDS = {"WP", "WP$", "WRB"}

# extract the subject, object and verb from the input
def extract_svo(doc):
    """Collect the subject, verb and object words of a parsed sentence.

    Args:
        doc: A sequence of tokens exposing ``text``, ``pos_``, ``dep_`` and
            ``head`` (for example a spaCy ``Doc``).

    Returns:
        A ``(subject, verb, attribute)`` tuple of strings. Each string holds
        the matching token texts in sentence order, joined by single spaces
        and lower-cased; it is empty when no token matched.
    """
    tokens = list(doc)

    def _has_role(tok, role_deps):
        # A token belongs to a role when it carries one of the role's
        # dependency labels itself, or hangs directly off a token that does
        # (e.g. the determiner "a" attached to the object "book").
        return tok.dep_ in role_deps or tok.head.dep_ in role_deps

    def _phrase(words):
        return " ".join(words).strip().lower()

    subject_words = [tok.text for tok in tokens if _has_role(tok, SUBJECT_DEPS)]
    verb_words = [tok.text for tok in tokens if tok.pos_ == "VERB"]
    object_words = [tok.text for tok in tokens if _has_role(tok, OBJECT_DEPS)]

    return _phrase(subject_words), _phrase(verb_words), _phrase(object_words)

# decide whether the doc is a question, and return the wh-word if there is one
def is_question(doc):
    """Heuristically decide whether a parsed sentence is a question.

    A sentence is treated as a question when its first token is a verb or an
    auxiliary (yes/no questions such as "Is this a book?"), or when any token
    carries one of the wh-word tags listed in ``WH_WORDS``.

    Args:
        doc: A sequence of tokens exposing ``pos_``, ``tag_`` and ``text``
            (for example a spaCy ``Doc``). May be empty.

    Returns:
        A ``(question, wh_word)`` tuple. ``question`` is a bool; ``wh_word``
        is the lower-cased text of the first wh-word found, or ``""`` when the
        sentence is not a question or was recognised by its leading verb.
    """
    # Yes/no questions open with a verb or an auxiliary. Pipelines that tag
    # "is", "do", "can", ... as AUX instead of VERB would otherwise never
    # match here, so both coarse tags are accepted.
    if len(doc) > 0 and doc[0].pos_ in ("VERB", "AUX"):
        return True, ""
    # Otherwise the first wh-word anywhere in the sentence decides.
    for token in doc:
        if token.tag_ in WH_WORDS:
            return True, token.text.lower()
    return False, ""

# read user input in a loop and print the information extracted from it
while True:
    # Parse one line typed by the user.
    doc = nlp(input("> "))

    # Show the part-of-speech and dependency label of every token.
    for token in doc:
        print(f"Token {token.text} POS: {token.pos_}, dep: {token.dep_}")

    # Analyse the sentence and report everything on a single line.
    subject, verb, attribute = extract_svo(doc)
    question, wh_word = is_question(doc)
    print(
        f"svo:, subject: {subject}, verb: {verb}, attribute: {attribute}, "
        f"question: {question}, wh_word: {wh_word}"
    )

    # Example session:
    #
    # > this is a book
    # Token this POS: DET, dep: nsubj
    # Token is POS: AUX, dep: ROOT
    # Token a POS: DET, dep: det
    # Token book POS: NOUN, dep: attr
    # svo:, subject: this, verb: , attribute: a book, question: False, wh_word: