# `convert_text_to_opennmt_format.py`

In [1]:
import json
import copy
from pycorenlp import StanfordCoreNLP
from sys import argv
from pprint import pprint as pprint

# Client for a Stanford CoreNLP server; expects one to be reachable at localhost:9000.
nlp = StanfordCoreNLP('http://localhost:9000')

In [2]:
# Display the NER feature vocabulary file (IPython shell escape; notebook-only syntax).
!cat ./data/ner_features
# Read the whole sample document into memory; it is annotated in the next cell.
with open('data/sample.txt', 'r') as myfile:
    text = myfile.read()
print(text)

DATE	1
DURATION	2
LOCATION	3
MISC	4
MONEY	5
NUMBER	6
O	7
ORDINAL	8
ORGANIZATION	9
PADDING	10
PERCENT	11
PERSON	12
TIME	13
Singapore was declared independent on 9 August 1965. 
National Service lasts two years. 
Singapore University of Technology and Design is located near Changi Business Park.
Joseph Schooling in Singaporean.
The tuition fee for NUS engineering course is $8000 per year.
My GPA at SMU is 3.30.
I finished second in a recent marathon.
I currently work for 2359Media.
70% of the Earth's surface is covered with water.
Heng Swee Keat will be the next prime minister of Singapore.
Christmas Day on 25th December every year.
We will have lunch at noon today.



In [3]:
# Ask CoreNLP to tokenize, sentence-split, POS-tag and NER-tag the text, returning JSON.
output = nlp.annotate(text, properties={
    'annotators': 'tokenize,ssplit,pos,ner',
    'outputFormat': 'json'}
)
# annotate() may hand back the raw response body as a string instead of a parsed
# dict; decode it ourselves in that case.
# NOTE: json.loads() no longer accepts an `encoding` keyword (it was ignored in
# Python 3 and removed in 3.9, where passing it raises TypeError).
# strict=False permits control characters inside JSON string values.
if isinstance(output, str):
    output = json.loads(output, strict=False)
pprint(output)

{'sentences': [{'index': 0,
                'parse': 'SENTENCE_SKIPPED_OR_UNPARSABLE',
                'tokens': [{'after': ' ',
                            'before': '',
                            'characterOffsetBegin': 0,
                            'characterOffsetEnd': 9,
                            'index': 1,
                            'lemma': 'Singapore',
                            'ner': 'LOCATION',
                            'originalText': 'Singapore',
                            'pos': 'NNP',
                            'word': 'Singapore'},
                           {'after': ' ',
                            'before': ' ',
                            'characterOffsetBegin': 10,
                            'characterOffsetEnd': 13,
                            'index': 2,
                            'lemma': 'be',
                            'ner': 'O',
                            'originalText': 'was',
                            'pos': 'VBD',
                        

                            'characterOffsetEnd': 347,
                            'index': 3,
                            'lemma': 'work',
                            'ner': 'O',
                            'originalText': 'work',
                            'pos': 'VBP',
                            'word': 'work'},
                           {'after': ' ',
                            'before': ' ',
                            'characterOffsetBegin': 348,
                            'characterOffsetEnd': 351,
                            'index': 4,
                            'lemma': 'for',
                            'ner': 'O',
                            'originalText': 'for',
                            'pos': 'IN',
                            'word': 'for'},
                           {'after': '',
                            'before': ' ',
                            'characterOffsetBegin': 352,
                            'characterOffsetEnd': 361,
                            

In [4]:
def get_possible_ans_tags(ner_features_path='data/ner_features'):
    """Return the set of tags that mark a token as a candidate answer.

    The feature file holds one entry per line, with the tag name in the first
    tab-separated column. The outside tag 'O' is excluded, and the tag 'CD' is
    always added to the result.

    Args:
        ner_features_path: Path to the tab-separated feature file.

    Returns:
        A set of tag strings.
    """
    with open(ner_features_path, 'r') as f:
        # Strip each label so that a line without a tab does not keep its
        # trailing newline, and drop blank lines instead of storing '' / '\n'.
        labels = (line.split('\t')[0].strip() for line in f)
        possible_ans_tags = {label for label in labels if label and label != 'O'}
    possible_ans_tags.add('CD')
    return possible_ans_tags

# Build the answer-tag set from the default feature file and display it.
possible_ans_tags = get_possible_ans_tags()
pprint(possible_ans_tags)

{'CD',
 'DATE',
 'DURATION',
 'LOCATION',
 'MISC',
 'MONEY',
 'NUMBER',
 'ORDINAL',
 'ORGANIZATION',
 'PADDING',
 'PERCENT',
 'PERSON',
 'TIME'}


In [5]:
def get_featured_sents(corenlp_output):
    """Turn CoreNLP JSON output into per-sentence lists of token feature dicts.

    Args:
        corenlp_output: Dict with a 'sentences' list; every sentence carries a
            'tokens' list whose items provide 'originalText', 'ner' and 'pos'.

    Returns:
        A list of sentences. Each sentence is a list of dicts with the keys
        'token' (lower-cased original text), 'ner', 'case_tag' ('UP' when the
        first character is an upper-case letter, otherwise 'LOW') and
        'pos_tag'.
    """
    sents = []
    for sentence in corenlp_output['sentences']:
        sent = []
        for token in sentence['tokens']:
            word = token['originalText']
            # Slice instead of indexing so an empty token yields '' (-> 'LOW')
            # rather than raising IndexError.
            first_char = word[:1]
            # Digits and punctuation are equal to both their upper() and
            # lower() forms, so the second comparison keeps them 'LOW'.
            if first_char == first_char.upper() and first_char != first_char.lower():
                case_tag = 'UP'
            else:
                case_tag = 'LOW'
            sent.append({
                'token': word.lower(),
                'ner': token['ner'],
                'case_tag': case_tag,
                'pos_tag': token['pos'],
            })
        sents.append(sent)
    return sents
# Extract the per-token features from the CoreNLP annotation and display them.
sents = get_featured_sents(output)
pprint(sents)

[[{'case_tag': 'UP', 'ner': 'LOCATION', 'pos_tag': 'NNP', 'token': 'singapore'},
  {'case_tag': 'LOW', 'ner': 'O', 'pos_tag': 'VBD', 'token': 'was'},
  {'case_tag': 'LOW', 'ner': 'O', 'pos_tag': 'VBN', 'token': 'declared'},
  {'case_tag': 'LOW', 'ner': 'O', 'pos_tag': 'JJ', 'token': 'independent'},
  {'case_tag': 'LOW', 'ner': 'O', 'pos_tag': 'IN', 'token': 'on'},
  {'case_tag': 'LOW', 'ner': 'DATE', 'pos_tag': 'CD', 'token': '9'},
  {'case_tag': 'UP', 'ner': 'DATE', 'pos_tag': 'NNP', 'token': 'august'},
  {'case_tag': 'LOW', 'ner': 'DATE', 'pos_tag': 'CD', 'token': '1965'},
  {'case_tag': 'LOW', 'ner': 'O', 'pos_tag': '.', 'token': '.'}],
 [{'case_tag': 'UP',
   'ner': 'ORGANIZATION',
   'pos_tag': 'NNP',
   'token': 'national'},
  {'case_tag': 'UP',
   'ner': 'ORGANIZATION',
   'pos_tag': 'NNP',
   'token': 'service'},
  {'case_tag': 'LOW', 'ner': 'O', 'pos_tag': 'VBZ', 'token': 'lasts'},
  {'case_tag': 'LOW', 'ner': 'DURATION', 'pos_tag': 'CD', 'token': 'two'},
  {'case_tag': 'LOW',

In [6]:
def add_answers_tag(sents, possible_ans_tags):
    """Label every token in place with a B/I/O answer tag under 'ans_tag'.

    A token is a candidate when its 'ner' or its 'pos_tag' is contained in
    possible_ans_tags. The first candidate of a consecutive run gets 'B',
    the following candidates of that run get 'I', and every other token
    gets 'O'. Runs never continue across sentences.

    Args:
        sents: List of sentences, each a list of token dicts (mutated).
        possible_ans_tags: Container of tags that qualify a token.
    """
    for sentence in sents:
        inside_span = False
        for tok in sentence:
            is_candidate = (tok['ner'] in possible_ans_tags
                            or tok['pos_tag'] in possible_ans_tags)
            if not is_candidate:
                tok['ans_tag'] = 'O'
            elif inside_span:
                tok['ans_tag'] = 'I'
            else:
                tok['ans_tag'] = 'B'
            # The next token continues a span only if this one belonged to it.
            inside_span = is_candidate
# Tag the tokens in place with B/I/O answer labels and display the result.
add_answers_tag(sents, possible_ans_tags)
pprint(sents)

[[{'ans_tag': 'B',
   'case_tag': 'UP',
   'ner': 'LOCATION',
   'pos_tag': 'NNP',
   'token': 'singapore'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'VBD',
   'token': 'was'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'VBN',
   'token': 'declared'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'JJ',
   'token': 'independent'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'IN',
   'token': 'on'},
  {'ans_tag': 'B',
   'case_tag': 'LOW',
   'ner': 'DATE',
   'pos_tag': 'CD',
   'token': '9'},
  {'ans_tag': 'I',
   'case_tag': 'UP',
   'ner': 'DATE',
   'pos_tag': 'NNP',
   'token': 'august'},
  {'ans_tag': 'I',
   'case_tag': 'LOW',
   'ner': 'DATE',
   'pos_tag': 'CD',
   'token': '1965'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': '.',
   'token': '.'}],
 [{'ans_tag': 'B',
   'case_tag': 'UP',
   'ner': 'ORGANIZATION',
   'pos_tag': 'NNP',
   'token': '

In [7]:
def copy_sent_with_one_ans(sent, begin_ind, end_ind):
    """Return a deep copy of sent that keeps answer tags only inside one span.

    Tokens at positions begin_ind..end_ind (inclusive) keep their 'ans_tag';
    every token outside that range has it reset to 'O'. The input sentence
    is left unmodified.
    """
    duplicate = copy.deepcopy(sent)
    for position in range(len(duplicate)):
        if not (begin_ind <= position <= end_ind):
            duplicate[position]['ans_tag'] = 'O'
    return duplicate

def separate_and_duplicate_ans_sents(sents):
    """Emit one copy of a sentence per answer span it contains.

    Each returned sentence is a deep copy (made by copy_sent_with_one_ans) in
    which exactly one B/I span keeps its tags and all other tokens are 'O'.
    Sentences without any span produce no output.

    Args:
        sents: List of sentences whose token dicts carry an 'ans_tag' of
            'B', 'I' or 'O'.

    Returns:
        A new list of sentences, ordered by sentence and then by span start.
    """
    new_sents = []
    for sent in sents:
        begin_ind = None  # start index of the span currently open, if any
        for ind, token in enumerate(sent):
            ans_tag = token['ans_tag']
            if ans_tag == 'I':
                # Still inside the current span (or an orphan 'I'): keep going.
                continue
            # Any tag other than 'I' ends an open span on the previous token.
            # This includes a fresh 'B', which previously discarded the span
            # that was still open.
            if begin_ind is not None:
                new_sents.append(copy_sent_with_one_ans(sent, begin_ind, ind - 1))
            begin_ind = ind if ans_tag == 'B' else None
        # A span that reaches the last token has no following 'O' to close
        # it; flush it here so it is not silently lost.
        if begin_ind is not None:
            new_sents.append(copy_sent_with_one_ans(sent, begin_ind, len(sent) - 1))
    return new_sents
# Replace the tagged sentences with one copy per answer span and display them.
sents = separate_and_duplicate_ans_sents(sents)
pprint(sents)

[[{'ans_tag': 'B',
   'case_tag': 'UP',
   'ner': 'LOCATION',
   'pos_tag': 'NNP',
   'token': 'singapore'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'VBD',
   'token': 'was'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'VBN',
   'token': 'declared'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'JJ',
   'token': 'independent'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'IN',
   'token': 'on'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'DATE',
   'pos_tag': 'CD',
   'token': '9'},
  {'ans_tag': 'O',
   'case_tag': 'UP',
   'ner': 'DATE',
   'pos_tag': 'NNP',
   'token': 'august'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'DATE',
   'pos_tag': 'CD',
   'token': '1965'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': '.',
   'token': '.'}],
 [{'ans_tag': 'O',
   'case_tag': 'UP',
   'ner': 'LOCATION',
   'pos_tag': 'NNP',
   'token': 'sing

   'pos_tag': 'NN',
   'token': 'tuition'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'NN',
   'token': 'fee'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'IN',
   'token': 'for'},
  {'ans_tag': 'O',
   'case_tag': 'UP',
   'ner': 'ORGANIZATION',
   'pos_tag': 'NNP',
   'token': 'nus'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'NN',
   'token': 'engineering'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'NN',
   'token': 'course'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'VBZ',
   'token': 'is'},
  {'ans_tag': 'B',
   'case_tag': 'LOW',
   'ner': 'MONEY',
   'pos_tag': '$',
   'token': '$'},
  {'ans_tag': 'I',
   'case_tag': 'LOW',
   'ner': 'MONEY',
   'pos_tag': 'CD',
   'token': '8000'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'IN',
   'token': 'per'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'DURATION',
 

   'ner': 'O',
   'pos_tag': 'NN',
   'token': 'minister'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'IN',
   'token': 'of'},
  {'ans_tag': 'B',
   'case_tag': 'UP',
   'ner': 'LOCATION',
   'pos_tag': 'NNP',
   'token': 'singapore'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': '.',
   'token': '.'}],
 [{'ans_tag': 'B',
   'case_tag': 'UP',
   'ner': 'DATE',
   'pos_tag': 'NNP',
   'token': 'christmas'},
  {'ans_tag': 'I',
   'case_tag': 'UP',
   'ner': 'DATE',
   'pos_tag': 'NNP',
   'token': 'day'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'O',
   'pos_tag': 'IN',
   'token': 'on'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'DATE',
   'pos_tag': 'JJ',
   'token': '25th'},
  {'ans_tag': 'O',
   'case_tag': 'UP',
   'ner': 'DATE',
   'pos_tag': 'NNP',
   'token': 'december'},
  {'ans_tag': 'O',
   'case_tag': 'LOW',
   'ner': 'SET',
   'pos_tag': 'DT',
   'token': 'every'},
  {'ans_tag': 'O',
   'case_tag': 'LOW'

In [8]:
def convert_sents_to_opennmt(sents):
    """Serialize featured sentences into feature-annotated source lines.

    Every token becomes 'token￨ans_tag￨case_tag￨pos_tag￨ner'; the tokens of a
    sentence are joined with single spaces.

    Args:
        sents: List of sentences, each a list of token feature dicts.

    Returns:
        A list with one formatted string per input sentence.
    """
    template = "{}￨{}￨{}￨{}￨{}"
    # Order of the features inside one serialized token.
    field_order = ('token', 'ans_tag', 'case_tag', 'pos_tag', 'ner')
    return [
        " ".join(
            template.format(*[tok[name] for name in field_order])
            for tok in sentence
        )
        for sentence in sents
    ]
# Serialize the sentences and print them one per line.
opennmt_sents = convert_sents_to_opennmt(sents)
for sent in opennmt_sents:
    print(sent)
    # In script form, this printed output is what gets piped to get_qna.py.

singapore￨B￨UP￨NNP￨LOCATION was￨O￨LOW￨VBD￨O declared￨O￨LOW￨VBN￨O independent￨O￨LOW￨JJ￨O on￨O￨LOW￨IN￨O 9￨O￨LOW￨CD￨DATE august￨O￨UP￨NNP￨DATE 1965￨O￨LOW￨CD￨DATE .￨O￨LOW￨.￨O
singapore￨O￨UP￨NNP￨LOCATION was￨O￨LOW￨VBD￨O declared￨O￨LOW￨VBN￨O independent￨O￨LOW￨JJ￨O on￨O￨LOW￨IN￨O 9￨B￨LOW￨CD￨DATE august￨I￨UP￨NNP￨DATE 1965￨I￨LOW￨CD￨DATE .￨O￨LOW￨.￨O
national￨B￨UP￨NNP￨ORGANIZATION service￨I￨UP￨NNP￨ORGANIZATION lasts￨O￨LOW￨VBZ￨O two￨O￨LOW￨CD￨DURATION years￨O￨LOW￨NNS￨DURATION .￨O￨LOW￨.￨O
national￨O￨UP￨NNP￨ORGANIZATION service￨O￨UP￨NNP￨ORGANIZATION lasts￨O￨LOW￨VBZ￨O two￨B￨LOW￨CD￨DURATION years￨I￨LOW￨NNS￨DURATION .￨O￨LOW￨.￨O
singapore￨B￨UP￨NNP￨ORGANIZATION university￨I￨UP￨NNP￨ORGANIZATION of￨I￨LOW￨IN￨ORGANIZATION technology￨I￨UP￨NNP￨ORGANIZATION and￨I￨LOW￨CC￨ORGANIZATION design￨I￨UP￨NNP￨ORGANIZATION is￨O￨LOW￨VBZ￨O located￨O￨LOW￨JJ￨O near￨O￨LOW￨IN￨O changi￨O￨UP￨NNP￨LOCATION business￨O￨UP￨NNP￨LOCATION park￨O￨UP￨NNP￨LOCATION .￨O￨LOW￨.￨O
singapore￨O￨UP￨NNP￨ORGANIZATION university￨O￨UP￨NNP￨ORGANIZATION of￨O

# `get_qna.py`

In [9]:
# -*- coding: utf-8 -*-
import zmq, sys, json
from signal import signal, SIGPIPE, SIG_DFL

In [10]:
# Wrap every serialized sentence in a {"src": ...} record; this list is sent as the request payload.
data = [{"src": line} for line in opennmt_sents]

In [11]:
class ConnectionHandler:
    """Callable client for a ZeroMQ REQ endpoint at tcp://127.0.0.1:5556.

    Calling an instance sends the given records as JSON, waits for the JSON
    reply, prints it, and returns the post-processed
    (target, answer, score) triples produced by get_with_answers.
    """

    def __init__(self):
        # Restore the default SIGPIPE action for this process.
        signal(SIGPIPE, SIG_DFL)
        self.sock = zmq.Context().socket(zmq.REQ)
        self.sock.connect("tcp://127.0.0.1:5556")

    def __call__(self, data):
        """Send data, then print and post-process the reply.

        Args:
            data: JSON-serializable request payload (the notebook passes a
                list of {"src": ...} records).

        Returns:
            The result of get_with_answers on the (tgt, pred_score, src)
            triples taken from the first entry of every reply row.
        """
        # Imported locally so the class also works when this part of the
        # notebook is used on its own as get_qna.py.
        from pprint import pprint

        self.sock.send_string(json.dumps(data))
        # json.loads() has no `encoding` keyword any more (removed in Python
        # 3.9); the bytes are already decoded explicitly as UTF-8 here.
        recieved = json.loads(str(self.sock.recv(), "utf-8"), strict=False)
        print("Output from openmnt \n")
        pprint(recieved)
        print("\n\n\n\n")
        # Only the first entry of each row is used.
        recieved = [(row[0]['tgt'], row[0]['pred_score'], row[0]['src']) for row in recieved]
        print("Output that is relevant")
        pprint(recieved)
        print("\n\n\n\n")
        return get_with_answers(recieved)

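# Recovers the answer phrase (the first run of B/I-tagged tokens) from each source
# sentence and pairs it with that row's target text and prediction score.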
def get_with_answers(recieved):
    """Attach the answer phrase found in each source sentence to its row.

    The answer is the first consecutive run of tokens whose second
    '￨'-separated feature is 'B' or 'I'; the words of that run are joined
    with single spaces. A source without such a run yields an empty answer.

    Args:
        recieved: Iterable of (target, score, src) triples, where src is a
            space-separated string of '￨'-delimited token features.

    Returns:
        A list of (target, answer, score) tuples in input order.
    """
    results = []
    for target, score, src in recieved:
        answer = []
        for token in src.split(' '):
            features = token.split('￨')
            if len(features) < 2:
                # Empty or malformed token (e.g. from an empty source or
                # consecutive spaces): ignore it instead of raising
                # IndexError on the missing answer-tag field.
                continue
            if features[1] in ('B', 'I'):
                answer.append(features[0])
            elif answer:
                # The first answer span has ended; later spans are ignored.
                break
        results.append((target, ' '.join(answer), score))
    return results

In [12]:
# Open the connection and request predictions for all source sentences;
# the call returns (target, answer, score) triples.
connect = ConnectionHandler()
received = connect(data)

# Print the triples tab-separated, highest score first.
for target, answer, score in sorted(received, key=lambda x: x[2], reverse=True):
    print("{}\t{}\t{}".format(target, answer, score))

Output from openmnt 

[[{'attn': [[0.99102300405502,
             0.0037460965104401,
             0.00092615053290501,
             0.0012628746917471,
             0.0025424077175558,
             5.9968009736622e-05,
             9.4836314019631e-06,
             1.78015743586e-05,
             0.00041221251012757],
            [1.4661324030385e-06,
             0.11344105005264,
             0.88650673627853,
             4.9313908675686e-05,
             3.0605920642301e-07,
             1.1565532531677e-08,
             7.572441007575e-10,
             7.8667972136515e-10,
             1.1150099226143e-06],
            [2.7094037591269e-07,
             3.2391406421084e-05,
             0.99922150373459,
             0.00074580480577424,
             1.8678381508153e-08,
             1.2291422324395e-09,
             3.6530442559801e-10,
             6.4651023423679e-10,
             2.116089881099e-08],
            [2.3581554486896e-09,
             2.4916758434301e-07,
        

            [0.035756379365921,
             0.00032643679878674,
             2.5576127882232e-05,
             9.4586511067973e-07,
             6.763656710973e-05,
             0.006831269711256,
             0.13109746575356,
             0.81207245588303,
             0.013753815554082,
             5.525813321583e-05,
             2.1893114876548e-07,
             1.2533623703348e-05],
            [0.00041075062472373,
             0.00019330058421474,
             0.00041515234624967,
             3.9125061448431e-06,
             7.7592067100341e-06,
             0.00040824635652825,
             0.00028049506363459,
             0.02318930439651,
             0.97507840394974,
             1.1219395673834e-05,
             3.4557705586025e-09,
             1.474168925597e-06],
            [0.15031167864799,
             0.032463669776917,
             0.08526735752821,
             0.38582715392113,
             0.13729481399059,
             0.14811015129089,
             0.0