# Query Mapping - Natural Language Understanding Task

## Manually create data for natural language understanding

In order to map natural language queries to formatted queries, we first need to create some labeled data.

For validation purposes, I'll only create data for several simple queries with predefined patterns.

In [None]:
import json

In [None]:
# Location of the dev split that the cells below read line by line.
dev_path = "data/dev.json"

Labels are character-based! This means a number '431' would be separated into 3 tokens, and an English word 'lemon' would become 5 tokens. This is for implementation convenience. Please take this into account when you load texts and tokenize them with other methods.

We'll create a small amount of data from the dev dataset.

In [None]:
import random
from random import sample

# Upper bound on the number of dev-set lines (dialogues) to sample.
num_dial = 5000

# Count the lines of the dev file. The file is matched against Chinese
# predicates later on, so decode it explicitly as UTF-8 instead of relying on
# the platform's default encoding.
with open(dev_path, encoding='utf-8') as f:
    line_cnt = sum(1 for _ in f)

# Fixed seed so that the same lines are drawn on every run.
random.seed(42)
# sample() raises ValueError when asked for more items than the population
# holds, so never request more lines than the file actually has.
# The drawn values are 0-based line offsets in [0, line_cnt).
dial_idxs = sample(range(line_cnt), min(num_dial, line_cnt))
# print("sampled dialogue indexes:", dial_idxs)

For dataset generation, I defined some naive patterns for every task.

In [None]:
# Predicates for which questions are generated; triples with any other
# predicate are ignored.
used_predicate_set = {'毕业院校', '作者', '妻子', '丈夫', '导演', '国籍'}

# Question templates per predicate. "{}" marks where the known entity goes.
#   'ask_subject': the question asks about the subject (the object is known)
#   'ask_object' : the question asks about the object (the subject is known)
# NOTE(review): a few templates are shared between predicates (e.g.
# "{}的配偶是谁", "{}是谁的作品"), so identical wording can map to different
# intents -- confirm this is intended.
question_templates = {
    # graduated-from: subject is a person, object is a school
    '毕业院校': {
        'ask_subject': [
            "有哪些人从{}毕业",
            "{}有哪些知名校友",
            "从{}毕业的名人有哪些",
        ],
        'ask_object': [
            "{}毕业于哪里",
            "{}从哪里毕业",
            "{}以前在哪里读书",
            "{}的毕业院校是哪里",
            "{}的毕业院校是什么",
        ],
    },
    # author: subject is a book, object is a person
    '作者': {
        'ask_subject': [
            "{}有哪些作品",
            "{}写了什么书",
            "{}写了哪些书",
            "{}有什么著作",
        ],
        'ask_object': [
            "{}是谁的作品",
            "{}是谁写的",
            "谁写了{}",
            "{}的作者是谁",
        ],
    },
    # wife
    '妻子': {
        'ask_object': [
            "{}的妻子是谁",
            "{}的老婆是谁",
            "{}的配偶是谁",
            "{}和谁结婚了",
        ],
    },
    # husband
    '丈夫': {
        'ask_object': [
            "{}的丈夫是谁",
            "{}的老公是谁",
            "{}的配偶是谁",
            "{}和谁结婚了",
        ],
    },
    # director: subject is a film/TV work, object is a person
    '导演': {
        'ask_subject': [
            "{}有哪些影视作品",
            "{}导演了哪些电影",
            "{}导演了哪些电视剧",
        ],
        'ask_object': [
            "{}是谁导演的",
            "{}是谁的作品",
            "{}的导演是谁",
        ],
    },
    # nationality: subject is a person, object is a country
    '国籍': {
        'ask_object': [
            "{}来自于哪个国家",
            "{}是哪个国家的人",
            "{}的国籍是什么",
        ],
    },
}

# Slot tags per predicate, each given as a [begin_tag, inside_tag] pair:
# 'subject_label' tags the subject entity, 'object_label' the object entity.
domain_specific_slot_labels = {
    # graduated-from: person -> school
    '毕业院校': {
        'subject_label': ['B_name', 'I_name'],
        'object_label': ['B_school', 'I_school'],
    },
    # author: book -> person
    '作者': {
        'subject_label': ['B_book', 'I_book'],
        'object_label': ['B_name', 'I_name'],
    },
    # wife: both sides are tagged as names
    '妻子': {
        'subject_label': ['B_name', 'I_name'],
        'object_label': ['B_name', 'I_name'],
    },
    # husband: both sides are tagged as names
    '丈夫': {
        'subject_label': ['B_name', 'I_name'],
        'object_label': ['B_name', 'I_name'],
    },
    # director: film/TV work -> person
    '导演': {
        'subject_label': ['B_film', 'I_film'],
        'object_label': ['B_name', 'I_name'],
    },
    # nationality: person -> country
    '国籍': {
        'subject_label': ['B_name', 'I_name'],
        'object_label': ['B_country', 'I_country'],
    },
}

# Intent name per predicate and question direction ('ask_subject' is only
# present for predicates that have such templates).
domain_specific_intentions = {
    '毕业院校': {'ask_subject': 'ask_alumni', 'ask_object': 'ask_school'},
    '作者': {'ask_subject': 'ask_books', 'ask_object': 'ask_author'},
    '妻子': {'ask_object': 'ask_wife'},
    '丈夫': {'ask_object': 'ask_husband'},
    '导演': {'ask_subject': 'ask_films', 'ask_object': 'ask_director'},
    '国籍': {'ask_object': 'ask_nationality'},
}

# Global vocabularies written to the label files. Lists (not sets) are used so
# that the order, and therefore each label's position, is deterministic.
all_slot_labels = ['PAD', 'UNK', 'O', 'B_name', 'I_name']  # name refers to human name
all_intentions = ['UNK']

for predicate_labels in domain_specific_slot_labels.values():
    # Append one label at a time so that a new label shared by the subject and
    # the object of the same predicate is still added only once.
    for label in predicate_labels['subject_label'] + predicate_labels['object_label']:
        if label not in all_slot_labels:
            all_slot_labels.append(label)

for intentions in domain_specific_intentions.values():
    # 'ask_object' is mandatory for every predicate, 'ask_subject' is optional.
    candidates = [intentions['ask_object']]
    if 'ask_subject' in intentions:
        candidates.append(intentions['ask_subject'])
    for intention in candidates:
        if intention not in all_intentions:
            all_intentions.append(intention)

print("slot labels:", all_slot_labels)
print("intentions:", all_intentions)

In [None]:
import linecache
from random import choice, random

def _build_example(template, entity, slot_labels):
    """Fill ``template`` with ``entity`` and tag the result character by character.

    Args:
        template: question pattern, normally containing one ``{}`` placeholder.
        entity: text inserted into the placeholder (an SPO subject or object).
        slot_labels: ``[begin_label, inside_label]`` pair used for the entity.

    Returns:
        ``(question, bio)`` where ``bio`` holds one tag per character of
        ``question``, or ``None`` when the entity is empty or cannot be
        located in the question.

    Raises:
        TypeError: if ``entity`` is not a string.
    """
    if not isinstance(entity, str):
        raise TypeError(f"entity must be a string, got {type(entity).__name__}")
    if not entity:
        # Nothing to tag; a B tag would otherwise land on template text.
        return None
    question = template.format(entity)
    # Locate the entity through the placeholder position: searching the
    # formatted question for the entity text could match an earlier occurrence
    # of the same characters inside the template's literal text.
    prefix, placeholder, _ = template.partition('{}')
    start = len(prefix) if placeholder else question.find(entity)
    if start < 0:
        return None
    begin_label, inside_label = slot_labels
    bio = ['O'] * len(question)
    bio[start] = begin_label
    for i in range(start + 1, start + len(entity)):
        bio[i] = inside_label
    return question, bio


questions = []
question_bios = []  # per question: one B/I/O tag for every character
question_intentions = []

for dial_idx in dial_idxs:
    # dial_idxs holds 0-based offsets while linecache numbers lines from 1;
    # without the shift, offset 0 yields '' and the last line is never read.
    line = linecache.getline(dev_path, dial_idx + 1)
    if not line.strip():
        continue  # blank line: nothing to parse
    for spo in json.loads(line)['spo_list']:
        predicate = spo['predicate']
        if predicate not in used_predicate_set:
            continue
        predicate_templates = question_templates[predicate]
        predicate_labels = domain_specific_slot_labels[predicate]
        predicate_intentions = domain_specific_intentions[predicate]

        # Query the object with a known subject (template chosen at random).
        example = _build_example(
            choice(predicate_templates['ask_object']),
            spo['subject'],
            predicate_labels['subject_label'],
        )
        if example is not None:
            questions.append(example[0])
            question_bios.append(example[1])
            question_intentions.append(predicate_intentions['ask_object'])

        # Querying the subject is less frequent, so only generate such a
        # question about half of the time. random() is drawn before the
        # membership test so the sequence of random draws does not depend on
        # the predicate.
        if random() < 0.5 and 'ask_subject' in predicate_templates:
            # Query the subject with a known object.
            example = _build_example(
                choice(predicate_templates['ask_subject']),
                spo['object'],
                predicate_labels['object_label'],
            )
            if example is not None:
                questions.append(example[0])
                question_bios.append(example[1])
                question_intentions.append(predicate_intentions['ask_subject'])
print(f"generated {len(questions)} questions")

In [None]:
# Preview the first ten generated questions.
questions[:10]

In [None]:
# Preview the character-level tag sequences of the first ten questions.
question_bios[:10]

In [None]:
# Preview the intent labels of the first ten questions.
question_intentions[:10]

### Split into train, dev and test datasets

In [None]:
# Share of the generated questions assigned to each split.
train_fraction, dev_fraction, test_fraction = 0.8, 0.1, 0.1

num_total = len(questions)
# Train and dev sizes are rounded down; the test split takes whatever is left,
# so the three sizes always add up to num_total.
num_train = int(num_total * train_fraction)
num_dev = int(num_total * dev_fraction)
num_test = num_total - (num_train + num_dev)
print(f"Samples for training: {num_train}, for dev: {num_dev}, for test: {num_test}")

In [None]:
# Split boundaries: [0, num_train) is train, [num_train, dev_end) is dev and
# everything from dev_end onwards is test.
dev_end = num_train + num_dev
_columns = (questions, question_bios, question_intentions)

train_questions, train_bios, train_intentions = [col[:num_train] for col in _columns]
dev_questions, dev_bios, dev_intentions = [col[num_train:dev_end] for col in _columns]
test_questions, test_bios, test_intentions = [col[dev_end:] for col in _columns]

# Split name -> (questions, tag sequences, intents).
data_dict = {
    'train': (train_questions, train_bios, train_intentions),
    'dev': (dev_questions, dev_bios, dev_intentions),
    'test': (test_questions, test_bios, test_intentions),
}

In [None]:
# Show the first training tag sequence in the space-separated form used on disk.
' '.join(train_bios[0])

Save the data. We name this dataset `naive`.

In [None]:
%%bash

# One directory per split; -p creates parents and tolerates existing folders.
mkdir -p data/naive/train data/naive/dev data/naive/test

In [None]:
import os

# Make sure the target folders exist even if the shell cell was not run.
for item in ['train', 'dev', 'test']:
    os.makedirs(f"data/naive/{item}", exist_ok=True)

# The questions contain Chinese characters, so every file is written as UTF-8
# explicitly instead of with the platform's default encoding.
# Label vocabularies: one label per line.
with open('data/naive/intent_label.txt', 'w', encoding='utf-8') as f:
    for intention in all_intentions:
        f.write("%s\n" % intention)
with open('data/naive/slot_label.txt', 'w', encoding='utf-8') as f:
    for slot_label in all_slot_labels:
        f.write("%s\n" % slot_label)

# Per split, three line-aligned files: the question (seq.in), its
# space-separated tags (seq.out) and its intent (label).
for item in ['train', 'dev', 'test']:
    split_questions, split_bios, split_intents = data_dict[item]
    with open(f"data/naive/{item}/seq.in", 'w', encoding='utf-8') as f:
        for question in split_questions:
            f.write("%s\n" % question)
    with open(f"data/naive/{item}/seq.out", 'w', encoding='utf-8') as f:
        for bio in split_bios:
            f.write("%s\n" % ' '.join(bio))
    with open(f"data/naive/{item}/label", 'w', encoding='utf-8') as f:
        for intent in split_intents:
            f.write("%s\n" % intent)

## Train JointBERT Model

First, you need to install all required Python packages.

In [None]:
# Install the packages listed in requirements.txt (the leading "!" runs pip in a shell).
!pip install -r requirements.txt

Then run the following script for training. The available options for `--task` are defined in `data_loader.py`. `--model_dir` specifies where to store the trained models.

In [None]:
%%bash

# Train and evaluate on the `naive` task; models are stored in naive_model/.
python3 main.py --task naive \
                  --model_type bert \
                  --model_dir naive_model \
                  --do_train --do_eval

## Evaluation

Run the script `predict.py` to evaluate the trained model.

In [None]:
%%bash

# Make sure the folder for the prediction file exists
# (NOTE(review): harmless if predict.py already creates it -- confirm).
mkdir -p output

# Use the same interpreter as the training cell.
python3 predict.py --input_file data/naive/test/seq.in --output output/naive_test.out --model_dir naive_model