-
Notifications
You must be signed in to change notification settings - Fork 0
/
min_cutalign.py
142 lines (135 loc) · 5.28 KB
/
min_cutalign.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
# Copyright 2013 by Hao Wang
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from copy import deepcopy
import math,time
import pprint
__min__=10e-7
import logging
logging.basicConfig( level=logging.INFO)
# A A
# +--------+-------+
# B | AB | _AB |
# +--------+-------+
# _B | A_B | _A_B |
# +--------+-------+
def FmeasureXY(matrix, i1, j1, i2, j2, i3, j3):
Pi1j1, Pi1j2, Pi1j3 = matrix[i1][j1] , matrix[i1][j2], matrix[i1][j3]
Pi2j1, Pi2j2, Pi2j3 = matrix[i2][j1] , matrix[i2][j2], matrix[i2][j3]
Pi3j1, Pi3j2, Pi3j3 = matrix[i3][j1] , matrix[i3][j2], matrix[i3][j3]
AB = Pi2j2 + Pi1j1 - Pi2j1 - Pi1j2
_A_B = Pi3j3 + Pi2j2 - Pi2j3 - Pi3j2
_AB = Pi2j3 + Pi1j2 - Pi1j3 - Pi2j2
A_B = Pi3j2 + Pi2j1 - Pi3j1 - Pi2j2
_ABA_B = A_B + _AB
_A_BAB = AB + _A_B
score = AB/(_ABA_B + AB + AB) + _A_B/(_ABA_B + _A_B + _A_B)
score_ = _AB/(_A_BAB + _AB + _AB) + A_B/(_A_BAB + A_B + A_B)
return (score, score_)
def read_tt(tt_file, reverse):
tt=defaultdict(dict)
for line in tt_file:
if reverse:
tw, sw, _, scores, _ = line.rstrip("\n").split("\t")
else:
sw, tw, _, scores, _ = line.rstrip("\n").split("\t")
pst, pts = map(float, scores.split())
tt[sw][tw] = math.sqrt(pst*pts)
return tt
def read_tt_e2f(e2f_file, f2e_file, reverse):
tt=defaultdict(dict)
for line in e2f_file:
sw, tw, score = line.rstrip("\n").split()
tt[sw][tw] = float(score)
for line in f2e_file:
tw, sw, score = line.rstrip("\n").split()
tt[sw][tw] = math.sqrt(tt[sw][tw]*float(score))
return tt
def alignment_matrix(tt,sws,tws):
ls,lt=map(len,[sws,tws])
matrix = [[0.0] * lt for _ in range(ls)]
for i in range(ls):
for j in range(lt):
matrix[i][j] = tt[sws[i]].get(tws[j], __min__)
return matrix
def accumulated_alignment_matrix(matrix):
lx,ly=len(matrix),len(matrix[0])
accumulated_matrix=[[0.0] * (ly+1) for _ in range(lx+1)]
accumulated_matrix[1][1] = matrix[0][0]
for i in range(0,lx):
for j in range(0,ly):
accumulated_matrix[i+1][j+1] = accumulated_matrix[i][j+1] + accumulated_matrix[i+1][j ]\
- accumulated_matrix[i][j] + matrix[i][j]
return accumulated_matrix
def search_best_partition(matrix, i1, j1, i3, j3,lx,ly):
bestScore = -2.0
bestPartition = None
for i2 in range(i1+1, i3):
for j2 in range(j1+1, j3):
score, score_ = FmeasureXY(matrix, i1, j1, i2, j2, i3, j3)
current_score = max(score, score_)
if current_score > bestScore:
bestScore = current_score
bestPartition = (i2,j2,( current_score == score))
return bestPartition
def partitionize(matrix, i1, j1, i2, j2, links):
# Set a minimal block size.
# If the size of the block is bigger, then call recursively.
lx,ly = len(matrix),len(matrix[0])
maxblocksize = 3
if i2 - i1 <= 1 or j2 - j1 <= 1:
links.update((i, j) for i in range(i1, i2) for j in range(j1, j2))
return { (i1,i2): ((j1,j2), None, None) }
i, j, mainDiag = search_best_partition(matrix, i1, j1, i2, j2,lx,ly)
print (i, j, mainDiag)
if mainDiag:
tree1 = partitionize(matrix, i1, j1, i, j, links)
tree2 = partitionize(matrix, i, j, i2, j2, links)
else:
tree1 = partitionize(matrix, i, j1, i2, j, links)
tree2 = partitionize(matrix, i1, j, i, j2, links)
return {(i1,i2):((j1,j2), tree1, tree2) }
def min_cutnalign(tt, stdin):
for i,line in enumerate(stdin):
if i % 10000 ==0:
logging.info('Line %d'%i)
sws, tws = map(lambda x: x.split(), line.rstrip("\n").split("\t"))
lx , ly = len(sws), len(tws)
matrix = alignment_matrix(tt, sws, tws)
accumulated_matrix = accumulated_alignment_matrix(matrix)
links = set()
tree = partitionize(accumulated_matrix, 0, 0, lx, ly, links)
print (" ".join(str(x)+"-"+str(y) for x,y in links))
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description="""
This is an python script for real-time word alignment if given the translation table in advance.
""")
parser.add_argument('-t', action='store', dest='tt', type=argparse.FileType('r'),
help='translation table')
parser.add_argument('-i', action='store', dest='input', type=argparse.FileType('r'),
help='parallel corpus')
parser.add_argument('-r', action='store', dest='reverse', type=bool, default =False,
help='parallel corpus')
parser.add_argument('--e2f', action='store', dest='e2f', type=argparse.FileType('r'),
help='parallel corpus')
parser.add_argument('--f2e', action='store', dest='f2e', type=argparse.FileType('r'),
help='parallel corpus')
parser.add_argument('--version', action='version', version='%(prog)s 0.1')
args = parser.parse_args()
if args.tt:
tt = read_tt(args.tt,args.reverse)
if args.e2f and args.f2e:
tt = read_tt_e2f(args.e2f,args.f2e,args.reverse)
min_cutnalign(tt, args.input)