-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractor.gentile.py
77 lines (64 loc) · 2.08 KB
/
extractor.gentile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
Gentile, sense tree-to-string model.
Rule extractor.
- Raphael 2012.8
"""
import sys, os
from abraham.setting import setting
from gentile.extractor import Extractor
from gentile.sense import SenseTree
#sys.argv.append("config.yaml")
setting.load(["file_source_tree","file_source_dep",
"file_target","file_alignment","rule_table_path"])
if len(sys.argv) == 7:
_,FILE_SOURCE_TREE,FILE_SOURCE_DEP,FILE_TARGET,FILE_ALIGNMENT,PATH_RULETABLES,_ = sys.argv
else:
FILE_SOURCE_TREE = setting.file_source_tree
FILE_SOURCE_DEP = setting.file_source_dep
FILE_TARGET = setting.file_target
FILE_ALIGNMENT = setting.file_alignment
PATH_RULETABLES = setting.rule_table_path
linesTree = open(FILE_SOURCE_TREE).readlines()
linesDep = open(FILE_SOURCE_DEP).read().split("\n\n")
linesTarget = open(FILE_TARGET).readlines()
linesAlignment = open(FILE_ALIGNMENT).readlines()
print "[GENTILE] Extracting ..."
print "Tree: %d lines, Dep: %d blocks" % (len(linesTree), len(linesDep))
assert len(linesTree) <= len(linesDep)
def buildAlignmentMap(textAlignment):
"""
Convert alignment text to a dict.
"""
mapAlignment = {}
pairs = textAlignment.split(" ")
for pair in pairs:
src, tgt = map(int, pair.split("-"))
src += 1
mapAlignment.setdefault(src, []).append(tgt)
return mapAlignment
if not os.path.exists(PATH_RULETABLES):
os.mkdir(PATH_RULETABLES)
fileOutput = open("%s/rules.extracted" % PATH_RULETABLES, "w")
nLine = 0
while nLine < len(linesTree):
# if nLine < 500000:
# nLine += 1
# continue
textTree = linesTree[nLine].strip()
if not textTree:
assert nLine == len(linesTree) - 1
continue
textDep = linesDep[nLine].strip()
textTarget = linesTarget[nLine].strip()
textAlignment =linesAlignment[nLine].strip()
mapAlignment = buildAlignmentMap(textAlignment)
targets = textTarget.split(" ")
senseTree = SenseTree(textTree, textDep)
extractor = Extractor(senseTree, targets, mapAlignment)
fileOutput.write(extractor.exportRuleStrings())
fileOutput.write("\n")
nLine += 1
if nLine % 1000 == 0:
print nLine,
sys.stdout.flush()
fileOutput.close()