-
Notifications
You must be signed in to change notification settings - Fork 5
/
indesign.py
204 lines (170 loc) · 6.97 KB
/
indesign.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
from __future__ import absolute_import
import io
import re
from itertools import count
from lxml import etree
from ucf import UCF
from openformats.handlers import Handler
from openformats.transcribers import Transcriber
from openformats.strings import OpenString
class InDesignHandler(Handler):
    """A handler class that parses and compiles .idml files that are created
    in Adobe's InDesign.

    IDML files contain multiple XML fragments that can be parsed to extract
    strings from.
    """

    name = "InDesign"
    extension = "idml"
    SPECIFIER = None
    PROCESSES_BINARY = True

    # Non-greedy on purpose: with a greedy `.*`, two <Content> elements that
    # share a line would be captured as one string spanning both elements.
    CONTENT_REGEX = re.compile(r'(<Content>)(.*?)(</Content>)')
    SPECIAL_CHARACTERS_REGEX = re.compile(r'<\?ACE \d+\?>|<Br/>;')
    # Matches template placeholders that were left behind after compilation.
    HASH_REGEX = re.compile(r'[a-z0-9]{32}_tr')

    # ---- Parse methods ----

    def __init__(self, *args, **kwargs):
        """Set up the per-document parsing state and initialise the base
        handler with whatever arguments were given.
        """
        self.order = count()
        self.stringset = []
        super(InDesignHandler, self).__init__(*args, **kwargs)

    def parse(self, content, **kwargs):
        """Parse .idml file content and return the resource template and
        stringset.

        * Use UCF to unpack `content` to xml fragments
        * Parse all Story fragments to extract the translatable strings
          and replace them with a replacement hash
        * Pack the fragments back to create the template
        * Return the (template, stringset) tuple

        args:
            content: the binary content of the .idml file
        returns:
            a `(template, stringset)` tuple; `template` is the repacked
            binary document, `stringset` the list of extracted strings.
        """
        # Start from a clean slate so that parsing a second document with
        # the same handler does not return strings of the previous one.
        self.order = count()
        self.stringset = []

        idml = UCF(io.BytesIO(content))

        # Iterate over the stories of the IDML file
        for key in self._get_ordered_stories(idml):
            try:
                story_content = idml[key]
            except KeyError:
                # Listed as a story but not present in the archive
                continue
            story_content = self._find_and_replace(story_content)
            # Update the XML file to contain the template strings
            idml[key] = str(story_content)

        out = io.BytesIO()
        idml.save(out)
        return out.getvalue(), self.stringset

    def _get_ordered_stories(self, idml):
        """Try to find the order the stories appear in the InDesign document.

        * Parse designmap.xml to get the StoryList attribute.
        * Return a list with the idml keys of the stories in the order they
          appear in StoryList

        args:
            idml: the unpacked IDML container
        returns:
            a list of container keys, referenced stories first
        """
        STORY_KEY = 'Stories/Story_{}.xml'
        BACKING_STORY = 'XML/BackingStory.xml'

        designmap = idml.get('designmap.xml')
        designmap_tree = etree.fromstring(designmap)
        story_ids = designmap_tree.attrib.get("StoryList", "").split()
        story_keys = [STORY_KEY.format(s) for s in story_ids]

        # In case there are stories that are not referenced in designmap.xml,
        # append them at the end of the list. They are sorted because
        # `compile` consumes the strings strictly in story order: the order
        # has to be the same on every run, which set iteration order does
        # not guarantee.
        all_stories = {
            k for k in idml.keys()
            if k.startswith('Stories') or k == BACKING_STORY
        }
        story_keys.extend(sorted(all_stories - set(story_keys)))
        return story_keys

    def _can_skip_content(self, string):
        """Check whether a piece of content has nothing to translate.

        Strings that contain only special characters or that are plain
        numbers are skipped.

        args:
            string: the text found between the <Content> tags
        returns:
            True if the string should be left untouched, False otherwise
        """
        if not self.SPECIAL_CHARACTERS_REGEX.sub('', string).strip():
            return True

        stripped = string.strip()
        try:
            float(stripped)
        except ValueError:
            return False
        # `float()` also accepts the words "nan", "inf" and "infinity";
        # those are text, not numbers, so insist on at least one digit.
        return any(char.isdigit() for char in stripped)

    def _find_and_replace(self, story_xml):
        """Find all the translatable content in the given XML string,
        replace it with its template replacement and return the resulting
        template, updating `self.stringset` in the process.

        args:
            story_xml (str): The xml content of a single Story of the IDML
                file
        returns:
            the input string with all translatable content replaced by the
            template replacement of the respective string.
        """
        return self.CONTENT_REGEX.sub(self._replace, story_xml)

    def _replace(self, match):
        """Implement the logic used by `self.CONTENT_REGEX.sub(...)` to
        replace strings with their template replacement and append new
        strings to `self.stringset`.

        args:
            match: a match object produced by `self.CONTENT_REGEX`
        returns:
            the text that should take the place of the matched element
        """
        opening_tag, string, closing_tag = match.groups()
        # Only byte strings need decoding; text that is already decoded is
        # used as it is.
        if isinstance(string, bytes):
            string = string.decode('utf-8')
        if self._can_skip_content(string):
            return match.group()

        order = next(self.order)
        string_object = OpenString(str(order), string, order=order)
        self.stringset.append(string_object)
        return "".join((opening_tag, string_object.template_replacement,
                        closing_tag))

    # ---- Compile methods ----

    def compile(self, template, stringset, **kwargs):
        """Compile a template back to an .idml file.

        args:
            template: the binary IDML template, as returned by `parse`
            stringset: an iterable of the strings to put in the template
        returns:
            the binary content of the compiled .idml file
        """
        # The content is a binary IDML file
        idml = UCF(io.BytesIO(template))
        current_string = None
        stringset = iter(stringset)

        # Iterate over the stories of the IDML file
        for key in self._get_ordered_stories(idml):
            try:
                story_content = idml[key]
            except KeyError:
                # Listed as a story but not present in the archive
                continue
            compiled_story, current_string = self._compile_story(
                story_content, stringset, current_string=current_string
            )
            idml[key] = compiled_story

        out = io.BytesIO()
        idml.save(out)
        return out.getvalue()

    def _compile_story(self, story_content, stringset, current_string=None):
        """Handle the compilation of a single story.

        args:
            story_content: the xml content of the story
            stringset: a stringset iterator
            current_string: the last string that has been left unprocessed
                from the previous story
        returns:
            compiled_story: the compiled story content
            current_string: the last unprocessed string to be passed to the
                next story
        """
        transcriber = Transcriber(story_content)
        # Translations are encoded only when the story itself is a byte
        # string, so that the two kinds of strings are never mixed.
        story_is_bytes = isinstance(story_content, bytes)

        while True:
            try:
                if not current_string:
                    current_string = next(stringset)
                hash_position = story_content.index(
                    current_string.template_replacement
                )
            except ValueError:
                # The pending string is not part of this story; keep it for
                # the next one.
                break
            except StopIteration:
                # No strings left
                break

            translation = current_string.string
            if story_is_bytes:
                translation = translation.encode('utf-8')
            transcriber.copy_until(hash_position)
            transcriber.add(translation)
            transcriber.skip(len(current_string.template_replacement))
            current_string = None

        # Copy whatever follows the last replaced string
        transcriber.copy_until(len(story_content))
        compiled_story = transcriber.get_destination()

        # in case there are any hashes that have not been replaced, replace
        # them with an empty string
        compiled_story = self.HASH_REGEX.sub('', compiled_story)
        return compiled_story, current_string