In [14]:
from pathlib import Path
import sys
# Put the parent of the current working directory on sys.path
# (presumably where the `urbans` and `grammar` packages imported below live — confirm).
root = Path.cwd().parent
sys.path.append(str(root))

import urbans
from grammar.dictionary import dic_en2jp
from grammar.source_grammar import EN_source_transduction_grammar as src_grammar
from grammar.transduction_rule import src_to_target_grammar

from grammar.postproc import post_processing, pref_pattern

In [15]:
# Shared post-processor instance; its .replace() is applied to translated output in later cells.
post_processor = post_processing()

In [16]:
# In the modified urbans, a Translator built from `src_grammar` alone (empty transduction
# grammar and empty dictionary) behaves as a plain parser. It is used here to collect
# tag-word pairs, parse failures, ambiguities, etc.
word_parser = urbans.Translator(
    src_grammar=src_grammar,
    src_to_tgt_grammar={},
    src_to_tgt_dictionary={},
)

In [47]:
# Concatenate all the English question files into Questions.txt in the working directory.
# Assumes the mcwq folder (downloadable from https://github.com/coastalcph/seq2sparql/mcwq)
# is available in the working directory.
#
# This is done in Python instead of a shell pipeline: no shell quoting/escaping is involved,
# the source files are not modified, and files are processed in a stable (sorted) order.
question_files = sorted(
    path for path in Path('mcwq/translations').rglob('*.en.txt') if path.is_file()
)
with open('Questions.txt', 'w', encoding='utf-8') as out:
    for question_file in question_files:
        out.write(question_file.read_text(encoding='utf-8'))
        # Add a newline after each file, otherwise the last sentence of one file is
        # concatenated with the first sentence of the next. This can leave a blank
        # line in the output when a file already ends with a newline.
        out.write('\n')

0

In [48]:
# Load the concatenated questions, one per line (trailing newlines are kept).
with open('Questions.txt', encoding='utf-8') as f:
    lines = f.readlines()

# De-duplicate while keeping first-seen order. list(set(...)) yields an arbitrary order that
# changes between kernel sessions (string hashing is randomised), so anything that later
# samples from or indexes into `lines` would not be repeatable.
lines = list(dict.fromkeys(lines))
# del lines[lines.index('\n')]

In [49]:
len(lines)

105461

## Word Parsing
Iterate between parsing and revising the grammar. `tag_word_set` holds the tag-word pairs for which lexical translations must be provided.

In [50]:
# Parse every question with the grammar-only parser (the recorded run took about 18 minutes).
# Yields the tag-word pairs, the sentences that failed to parse, and the ambiguous sentences.
tag_word_set, failed_sentences, ambiguity_sentences = word_parser.parse_words(sentences=lines)

100%|██████████| 105461/105461 [18:13<00:00, 96.43it/s] 

Word parsing completed! 0 sentences failed. 37280 sentences occurred ambiguity.





In [None]:
# reparsing
# Parse a single sentence again (e.g. after a grammar revision); results are stored in
# separate *_2 variables so the full-corpus results above are not overwritten.
tag_word_set_2, failed_sentences_2, ambiguity_sentences_2 = word_parser.parse_words(sentences=['Was a Chinese actor that M2 was written by and starred a composer\n'])

In [None]:
tag_word_set

In [None]:
# Draw each candidate parse of one ambiguous sentence for manual inspection
import nltk
analyze_sen = "Who was influenced by a composer influenced by M3 and influenced by M4 and M5 and influenced by M1"
candidate_trees = ambiguity_sentences[analyze_sen]
for tree in candidate_trees:
    tree.draw()

## Translation
Initial translation for further analysis of ambiguities, etc.

In [51]:
# Full English-to-Japanese translator: source grammar + transduction rules + dictionary
e2j_translator = urbans.Translator(
    src_grammar=src_grammar,
    src_to_tgt_grammar=src_to_target_grammar,
    src_to_tgt_dictionary=dic_en2jp,
)

In [53]:
# Translate only the ambiguous sentences; len(trans_map) is inspected in the next cell
# as the number of harmful ambiguities.
trans_sentence, trans_map = e2j_translator.translate(ambiguity_sentences, remove_space=True, verbose=True)

100%|██████████| 37280/37280 [08:34<00:00, 72.46it/s] 


In [54]:
len(trans_map)

613

#### Disambiguation
613 out of 37280 ambiguous sentences are harmful

In [55]:
# with open('ambiguous_question.txt', 'w') as f:
#     for line in ambiguity_sentences:
#         f.write(line)

# Reuse the in-memory parse result when it exists; when the name is undefined (fresh kernel)
# or None, fall back to the sentences previously dumped to ambiguous_question.txt.
# NOTE(review): the fallback produces a list of lines, whereas the parser result is
# indexed by sentence elsewhere in this notebook — confirm downstream code accepts both.
if globals().get('ambiguity_sentences') is None:
    with open('ambiguous_question.txt', 'r') as f:
        ambiguity_sentences = f.readlines()


Post-processing (preferred patterns) for ambiguities systematically caused by certain patterns

In [57]:
# Rebuild the translator and translate the ambiguous sentences again, this time
# passing the preferred patterns.
e2j_translator = urbans.Translator(
    src_grammar=src_grammar,
    src_to_tgt_grammar=src_to_target_grammar,
    src_to_tgt_dictionary=dic_en2jp,
)
trans_sentence, trans_map = e2j_translator.translate(
    ambiguity_sentences,
    remove_space=True,
    prefered_pattern=pref_pattern,
    verbose=True,
)

100%|██████████| 37280/37280 [08:15<00:00, 75.27it/s] 


Harmful ambiguities after postprocessing

536 >> 438 >> 322 out of 37280

In [58]:
len(trans_map)

322

In [None]:
# Translate one sentence in isolation to check the effect of the preferred patterns
test_sent = 'Who was influenced by a composer influenced by M3 and influenced by M4 and M5 and influenced by M1'
e2j_translator = urbans.Translator(
    src_grammar=src_grammar,
    src_to_tgt_grammar=src_to_target_grammar,
    src_to_tgt_dictionary=dic_en2jp,
)
trans_sentence, trans_map = e2j_translator.translate(
    test_sent,
    remove_space=True,
    prefered_pattern=pref_pattern,
)

## Test/Evaluation
Select some samples for pre-evaluation before formal assessment

In [59]:
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [60]:
# Draw up to 1000 distinct questions for pre-evaluation. random.choices() samples WITH
# replacement, so the same question could be picked more than once; sample() does not.
# A dedicated seeded generator makes the draw repeatable for a given ordering of `lines`.
vq = random.Random(42).sample(lines, k=min(1000, len(lines)))

In [61]:
# Translate the sampled questions with the preferred patterns applied
e2j_translator = urbans.Translator(
    src_grammar=src_grammar,
    src_to_tgt_grammar=src_to_target_grammar,
    src_to_tgt_dictionary=dic_en2jp,
)
trans_vq, trans_map_vq = e2j_translator.translate(
    vq,
    remove_space=True,
    prefered_pattern=pref_pattern,
)

In [62]:
# (source, translation) pairs for the sampled questions
vq_trans = list(zip(vq, trans_vq))

### Evaluate with GOLD

In [None]:
# Gold annotation table; the columns read from it below are
# 'questionPatternModEntities' and 'questionPatternModEntitiesJP'.
gd_jp = pd.read_csv('CWQ Annotation - GOLD-JP.csv')

In [None]:
# Split the gold data: 66% goes to `test`, the remainder to `dev`; random_state fixes the split.
dev, test = train_test_split(gd_jp, test_size=0.66, random_state=42)

In [None]:
# Plain lists per split: scr_* are the inputs given to the translator,
# gd_* are the gold Japanese references used for scoring.
scr_dev = list(dev['questionPatternModEntities'])
scr_test = list(test['questionPatternModEntities'])
gd_dev = list(dev['questionPatternModEntitiesJP'])
gd_test = list(test['questionPatternModEntitiesJP'])

In [None]:
# Translate the dev split, then run the shared post-processor over the output
e2j_translator = urbans.Translator(
    src_grammar=src_grammar,
    src_to_tgt_grammar=src_to_target_grammar,
    src_to_tgt_dictionary=dic_en2jp,
)
raw_trans_dev, trans_map_dev = e2j_translator.translate(
    scr_dev,
    remove_space=True,
    prefered_pattern=pref_pattern,
)
trans_dev = post_processor.replace(raw_trans_dev)

In [None]:
from sacrebleu.metrics import BLEU
from sacrebleu.tokenizers import tokenizer_ja_mecab

In [None]:
# Corpus-level BLEU of the dev translations against the gold references,
# using sacrebleu's 'ja-mecab' tokenizer.
bleu = BLEU(tokenize='ja-mecab')
bleu.corpus_score(hypotheses=trans_dev, references=[gd_dev])

On dev: BLEU = 97.98 99.3/98.4/97.5/96.9 (BP = 1.000 ratio = 1.000 hyp_len = 1206 ref_len = 1206)

In [None]:
# (gold, translation) pairs on dev where the translation differs from the gold reference
comp = [(gold, hyp) for gold, hyp in zip(gd_dev, trans_dev) if gold != hyp]

In [None]:
# Local redefinition of pref_pattern (shadows the one imported from grammar.postproc).
pref_pattern = [{"S -> was NominalSub Vobl","commonNoun -> F commonNounHead"},
                {"S -> was NominalSub Vobl","commonNounHead -> F commonNounHead"},
                {"caseS -> Name pS"},
                {"NPQ -> WhWNominal"}
                ]
e2j_translator = urbans.Translator(src_grammar=src_grammar, src_to_tgt_grammar=src_to_target_grammar, src_to_tgt_dictionary=dic_en2jp)
trans_test, trans_map_test = e2j_translator.translate(scr_test, remove_space=True, prefered_pattern=pref_pattern)
# Post-process through the shared post_processor instance, exactly as for the dev split.
# The previous call, post_processing(trans_test, pattern_pairs=pattern_pairs), referenced
# an undefined name and re-instantiated the class instead of applying it.
trans_test = post_processor.replace(trans_test)

In [None]:
# Corpus-level BLEU of the test translations against the gold references,
# using sacrebleu's 'ja-mecab' tokenizer.
bleu = BLEU(tokenize='ja-mecab')
bleu.corpus_score(hypotheses=trans_test, references=[gd_test])

On test: BLEU = 97.07 99.2/97.9/96.4/95.0 (BP = 1.000 ratio = 1.000 hyp_len = 2306 ref_len = 2307)

In [None]:
# (gold, translation) pairs on test where the translation differs from the gold reference
comp_test = [(gold, hyp) for gold, hyp in zip(gd_test, trans_test) if gold != hyp]

In [None]:
len(comp_test)

# Dataset translation

In [None]:
# load questions
with open('Questions.txt', encoding='utf-8') as f:
    lines = f.readlines()

# De-duplicate in first-seen order (stable across kernel sessions, unlike list(set(...)))
# and drop the blank line. Filtering does not raise when no blank line is present,
# whereas `del lines[lines.index('\n')]` fails with ValueError in that case.
lines = [line for line in dict.fromkeys(lines) if line != '\n']

In [None]:
# Local redefinition of pref_pattern, passed to the translator for the whole dataset
pref_pattern = [
    {"S -> was NominalSub Vobl", "commonNoun -> F commonNounHead"},
    {"S -> was NominalSub Vobl", "commonNounHead -> F commonNounHead"},
    {"caseS -> Name pS"},
    {"NPQ -> WhWNominal"},
]
e2j_translator = urbans.Translator(
    src_grammar=src_grammar,
    src_to_tgt_grammar=src_to_target_grammar,
    src_to_tgt_dictionary=dic_en2jp,
)
translated_lines, maps_jp = e2j_translator.translate(
    lines,
    remove_space=True,
    prefered_pattern=pref_pattern,
)
# Apply the shared post-processor to the raw translations
lines_jp = post_processor.replace(translated_lines)

In [None]:
# Every source sentence must have exactly one translation. The comparison used to be a bare
# expression whose result was discarded; zip() would then silently truncate on a mismatch.
assert len(lines) == len(lines_jp), (len(lines), len(lines_jp))
# English sentence -> Japanese translation
lines_en2jp = dict(zip(lines, lines_jp))

In [None]:
# import csv

# with open('ambiguity_maps.csv', 'w') as f:
#     for key in maps_jp.keys():
#         f.write("%s, %s\n" % (key, maps_jp[key]))

# Translate samples for manual evaluation

In [None]:
# English samples for manual evaluation; the 'questionWithBrackets' column is read below.
me_en = pd.read_csv('CWQ Annotation - EN.csv')

In [None]:
# Questions with bracketed entities, skipping rows where the column is missing (NaN).
sentences = list(me_en['questionWithBrackets'].dropna())

In [None]:
import re

In [None]:
# replace bracketed entities with M1 as placeholder
# ent_register collects the removed "[...]" entities in order of appearance so they can be
# put back after translation; rep_sentences holds the sentences with placeholders.
# (The loop previously bound `seten` but passed the builtin `set` to re, raising TypeError.)
bracketed_entity = re.compile(r"\[.*?\]")  # raw string: "\[" is an invalid escape otherwise
ent_register = []
rep_sentences = []
for sen in sentences:
    ent_register.extend(bracketed_entity.findall(sen))
    rep_sentences.append(bracketed_entity.sub('M1', sen))

In [None]:
# Translate the placeholder sentences, then post-process them.
e2j_translator = urbans.Translator(src_grammar=src_grammar, src_to_tgt_grammar=src_to_target_grammar, src_to_tgt_dictionary=dic_en2jp)
trans_rep_sen, trans_map_test = e2j_translator.translate(rep_sentences, remove_space=True, prefered_pattern=pref_pattern)
# Use .replace() as in the other translation cells of this notebook; the instance was
# previously called directly (post_processor(...)).
# NOTE(review): confirm post_processing does not define __call__ with different semantics.
trans_sen = post_processor.replace(trans_rep_sen)

In [None]:
# put the bracketed entities back
# Entities are consumed in order through an iterator, so ent_register itself is left
# intact and this cell can be re-run.
pending_ents = iter(ent_register)
me_jp = []
for sen in trans_sen:
    # re.sub with a callable scans the translated text once, so an inserted entity whose
    # own text contains 'M1' is not expanded again (the former while/replace loop did that
    # and consumed extra entities).
    me_jp.append(re.sub('M1', lambda _match: next(pending_ents), sen))


In [None]:
# One CSV row per sentence: the English original and its Japanese translation
dic = [
    {'questionWithBrackets': src, 'questionWithBrackets_jp': tgt}
    for src, tgt in zip(sentences, me_jp)
]

In [None]:
import csv

In [None]:
# Write the EN/JP pairs for manual evaluation.
# newline='' is required by the csv module (otherwise extra blank rows can appear on some
# platforms); the encoding is explicit because the rows contain Japanese text.
with open('CWQ Annotation - JP.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['questionWithBrackets', 'questionWithBrackets_jp'])
    writer.writeheader()
    writer.writerows(dic)

# Statistics

In [64]:
# Distinct lines of the source grammar string (split on newlines, duplicates collapsed).
grammar = set(src_grammar.split('\n'))

In [65]:
# Drop the empty line, if any. discard() is a no-op when it is absent, so re-running this
# cell (or a grammar string without blank lines) no longer raises KeyError.
grammar.discard('')

In [66]:
len(grammar)

252

252 in total, 37 terminals

In [67]:
# Distinct whitespace-separated tokens of the grammar lines, leaving out any token
# that contains a quote character
symb = {
    token
    for rule in grammar
    for token in rule.split()
    if not ("\'" in token or "\"" in token)
}



125 EN monolingual grammar, 22 terminals

dict terminal 75-122 pairs 287

In [68]:
# Dictionary statistics: distinct English entries, distinct Japanese entries,
# and the total number of (English, Japanese) pairs over all tags
list_en, list_jp = set(), set()
ct = 0
for entries in dic_en2jp.values():
    list_en.update(entries.keys())
    list_jp.update(entries.values())
    ct += len(entries)