-
Notifications
You must be signed in to change notification settings - Fork 0
/
segmentation.bas
211 lines (192 loc) · 5.23 KB
/
segmentation.bas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
B4J=true
Group=Default Group
ModulesStructureVersion=1
Type=StaticCode
Version=6.51
@EndOfDesignText@
'Static code module
'Module-level state shared by the Subs of this static code module.
Sub Process_Globals
	Private fx As JFX
	'sourceRules / targetRules: rule lists cached per side by readRules.
	'rules: the list currently selected by readRules and iterated by getPositions.
	Private sourceRules, targetRules, rules As List
	'Language codes the cached lists were last loaded for; cleared by resetLangs.
	Private previousSourceLang, previousTargetLang As String
	'Not read by any Sub in this module (paragraphInSegments declares its own local of the same name).
	Private previousText As String
	'When True, paragraphInSegments drops every break position that also has a non-break match.
	'When False, the rule indexes recorded for the two matches are compared instead.
	Public cascade As Boolean = False
End Sub
'Makes the segmentation rules for lang the active rule set (module variable rules).
'lang: language code handed to SRX.readRules.
'srxPath: path of an SRX file. When File.Exists(srxPath,"") is False the file
'         segmentationRules.srx from File.DirAssets is read instead.
'isSource: True uses the source-side cache (sourceRules), False the target-side cache (targetRules).
'The rules of a side are re-read only when lang differs from the language recorded for that side.
Sub readRules(lang As String,srxPath As String,isSource As Boolean)
	If sourceRules.IsInitialized = False Then sourceRules.Initialize
	If targetRules.IsInitialized = False Then targetRules.Initialize

	Dim cachedLang As String
	If isSource Then
		cachedLang = previousSourceLang
	Else
		cachedLang = previousTargetLang
	End If

	If cachedLang <> lang Then
		Dim rulesFile As String = srxPath
		If File.Exists(srxPath, "") = False Then
			rulesFile = File.Combine(File.DirAssets, "segmentationRules.srx")
		End If
		Dim loaded As List = SRX.readRules(rulesFile, lang)
		'Record the language only after the load succeeded. If SRX.readRules raises an error,
		'the next call for the same language tries again instead of silently reusing the
		'rules that were cached for the previous language.
		If isSource Then
			sourceRules = loaded
			previousSourceLang = lang
		Else
			targetRules = loaded
			previousTargetLang = lang
		End If
	End If

	If isSource Then
		rules = sourceRules
	Else
		rules = targetRules
	End If
End Sub
'Clears the languages recorded for both rule caches.
'readRules compares the requested language with these values, so after this call
'the next readRules call with a non-empty language reloads the rules for its side.
Public Sub resetLangs
	previousTargetLang = ""
	previousSourceLang = ""
End Sub
'Splits text into segments.
'text: the text to split. Text that is empty after Trim is returned unchanged as the only segment.
'sentenceLevel: True  - every line is split further by paragraphInSegments and line breaks
'                       are re-attached to the end of the preceding segment;
'               False - the lines produced by splitting on CRLF are returned as they are.
'sourceLang: language code passed to readRules (used for the source and the target side alike).
'path: SRX path passed to readRules.
'isSource: passed to readRules to choose the source or target rule cache.
'Returns (through Complete) a List of String segments.
Sub segmentedTxt(text As String,sentenceLevel As Boolean,sourceLang As String,path As String,isSource As Boolean) As ResumableSub
	readRules(sourceLang, path, isSource)
	Dim segments As List
	segments.Initialize
	If text.Trim.Length = 0 Then
		segments.Add(text)
		Return segments
	End If

	Dim paragraphs As List
	paragraphs.Initialize
	paragraphs.AddAll(Regex.Split(CRLF, text))
	If sentenceLevel = False Then
		segments.AddAll(paragraphs)
		Return segments
	End If

	Dim lastIndex As Int = paragraphs.Size - 1
	For i = 0 To lastIndex
		Dim para As String = paragraphs.Get(i)
		Wait For (paragraphInSegments(para)) Complete (resultList As List)
		segments.AddAll(resultList)
		If segments.Size = 0 Then
			'Nothing collected yet: the text starts with line breaks.
			segments.Add(para & CRLF)
		Else
			'Re-attach the line break removed by the split to the newest segment.
			'The final line only gets one when the text itself ends with CRLF.
			Dim tailPos As Int = segments.Size - 1
			Dim tail As String = segments.Get(tailPos)
			If i < lastIndex Or text.EndsWith(CRLF) Then tail = tail & CRLF
			segments.Set(tailPos, tail)
		End If
	Next
	Return segments
End Sub
'Cuts one paragraph into segments at the break positions selected by the active rules.
'text: the paragraph to cut.
'A position reported by the "yes" rules is used as a cut point when
'  - no "no" rule reported the same position, or
'  - cascade is False and the rule index recorded for the "yes" match is lower than
'    the one recorded for the "no" match.
'Returns (through Complete) a List with the pieces of text between consecutive cut points,
'in text order, followed by the remainder after the last cut point when there is one.
Sub paragraphInSegments(text As String) As ResumableSub
	Dim breakRuleAt As Map = getPositions("yes", text)
	Dim noBreakRuleAt As Map = getPositions("no", text)

	Dim cutPoints As List
	cutPoints.Initialize
	For Each candidate As Int In breakRuleAt.Keys
		Dim keep As Boolean = True
		If noBreakRuleAt.ContainsKey(candidate) Then
			If cascade Then
				keep = False
			Else
				Dim breakRule As Int = breakRuleAt.Get(candidate)
				Dim exceptionRule As Int = noBreakRuleAt.Get(candidate)
				keep = breakRule < exceptionRule
			End If
		End If
		If keep Then cutPoints.Add(candidate)
	Next
	cutPoints.Sort(True)

	Dim pieces As List
	pieces.Initialize
	'Offset of the first character that has not been put into a piece yet.
	Dim consumed As Int = 0
	For Each cut As Int In cutPoints
		pieces.Add(text.SubString2(consumed, cut))
		consumed = cut
	Next
	If consumed <> text.Length Then pieces.Add(text.SubString(consumed))
	Return pieces
End Sub
'Removes repeated values from source in place.
'The first occurrence of every value is kept and the original order is preserved.
'source: an initialized List whose items are read as Int.
Sub removeDuplicated(source As List)
	'Map used as a set of the values already kept, so each membership test is a
	'lookup instead of a scan over the list built so far.
	Dim seen As Map
	seen.Initialize
	Dim unique As List
	unique.Initialize
	For Each value As Int In source
		If seen.ContainsKey(value) = False Then
			seen.Put(value, True)
			unique.Add(value)
		End If
	Next
	source.Clear
	source.AddAll(unique)
End Sub
'Collects the positions in text at which the active rules of one kind match.
'break: compared with the "break" entry of every rule; rules with a different value are skipped
'       (paragraphInSegments passes "yes" and "no").
'text: the text to examine.
'Returns a Map of position (Int) -> rule index, filled through addPosition. The rule index is
'the zero-based place of the rule in the module variable rules.
'A rule entry equal to the string "null" is treated as "pattern not present".
'All patterns are compiled with flag value 32 (Pattern.DOTALL in java.util.regex).
Sub getPositions(break As String,text As String) As Map
	Dim breakPositions As Map
	breakPositions.Initialize
	Dim index As Int = -1
	For Each rule As Map In rules
		index = index + 1
		If rule.Get("break") <> break Then Continue

		Dim beforeBreak, afterBreak As String
		beforeBreak = rule.Get("beforebreak")
		afterBreak = rule.Get("afterbreak")

		If beforeBreak <> "null" Then
			'The matcher is only built when the rule really has a beforebreak pattern.
			Dim beforeMatcher As Matcher = Regex.Matcher2(beforeBreak, 32, text)
			'Start offsets of the afterbreak matches. Filled once per rule, on the first
			'beforebreak match, instead of rescanning the whole text for every match.
			Dim afterStarts As Map
			Do While beforeMatcher.Find
				Dim candidate As Int = beforeMatcher.GetEnd(0)
				If afterBreak = "null" Then
					'No afterbreak pattern: the end of every beforebreak match counts.
					addPosition(candidate, breakPositions, index)
				Else
					If afterStarts.IsInitialized = False Then
						afterStarts.Initialize
						Dim afterScan As Matcher = Regex.Matcher2(afterBreak, 32, text)
						Do While afterScan.Find
							afterStarts.Put(afterScan.GetStart(0), True)
						Loop
					End If
					'The position counts only when an afterbreak match found by that scan
					'starts exactly where the beforebreak match ends.
					If afterStarts.ContainsKey(candidate) Then
						addPosition(candidate, breakPositions, index)
					End If
				End If
			Loop
		Else If afterBreak <> "null" Then
			'Only an afterbreak pattern: the start of each of its matches counts.
			Dim afterOnly As Matcher = Regex.Matcher2(afterBreak, 32, text)
			Do While afterOnly.Find
				addPosition(afterOnly.GetStart(0), breakPositions, index)
			Loop
		End If
	Next
	Return breakPositions
End Sub
'Records that the rule with index ruleIndex matched at position pos.
'pos: position in the text.
'breakPositions: Map of position -> rule index that is updated in place.
'ruleIndex: index of the matching rule in the rule list.
'When several rules match the same position the lowest index is kept: paragraphInSegments
'gives priority to the lower of the break / non-break indexes, so each map has to hold the
'earliest rule that matched the position, not the latest one.
Sub addPosition(pos As Int,breakPositions As Map,ruleIndex As Int)
	If breakPositions.ContainsKey(pos) Then
		Dim recorded As Int = breakPositions.Get(pos)
		If ruleIndex < recorded Then breakPositions.Put(pos, ruleIndex)
	Else
		breakPositions.Put(pos, ruleIndex)
	End If
End Sub