-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract-tokens.py
46 lines (36 loc) · 1.64 KB
/
extract-tokens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from lxml import etree
from collections import OrderedDict
# Module-level accumulator filled in by parse_xml(): maps each unique
# headword or word form to the frequency recorded the first time it was
# seen, preserving insertion order.
words = OrderedDict()
# Compiled XSD schema, built lazily on the first parse_xml() call so the
# schema file is read and compiled once rather than once per input file.
_SLOLEKS_SCHEMA = None


def parse_xml(xml_file, min_frequency=100):
    """Collect frequent headwords and word forms from one lexicon XML file.

    The file is parsed with a validating parser built from the
    ``morphological_lexicon.xsd`` schema. Every headword lemma, and then
    every orthographic form, whose frequency is strictly greater than
    *min_frequency* is added to the module-level ``words`` mapping, unless
    that word is already present.

    Args:
        xml_file: Path (or file-like object) of the XML file to parse.
        min_frequency: Exclusive lower bound on the frequency a word must
            have to be recorded. Defaults to 100.

    Raises:
        lxml.etree.XMLSyntaxError: If the document is not well-formed or
            fails schema validation.
        OSError: If the input file or the schema file cannot be read.
        ValueError: If a frequency value is present but is not an integer.
    """
    global _SLOLEKS_SCHEMA
    if _SLOLEKS_SCHEMA is None:
        _SLOLEKS_SCHEMA = etree.XMLSchema(
            etree.parse("Sloleks.3.0/xml_schemas/morphological_lexicon.xsd")
        )

    # A fresh parser per file; only the expensive compiled schema is shared.
    parser = etree.XMLParser(schema=_SLOLEKS_SCHEMA)
    tree = etree.parse(xml_file, parser)

    def remember(word, raw_frequency):
        # findtext() returns None when the element is absent; such records
        # carry no usable word or frequency, so skip them instead of
        # crashing in int(None) or storing a None key.
        if word is None or raw_frequency is None:
            return
        frequency = int(raw_frequency)
        # NOTE(review): a word seen more than once keeps the frequency of
        # its first occurrence - confirm this is intended.
        if frequency > min_frequency and word not in words:
            words[word] = frequency

    # Headwords first, for the whole file ...
    for entry in tree.xpath("/lexicon/entry"):
        remember(
            entry.findtext("head/headword/lemma"),
            entry.findtext("head/measureList/measure"),
        )

    # ... then every inflected form, so headwords win on first-seen order.
    for orthography in tree.xpath(
        "/lexicon/entry/body/wordFormList/wordForm"
        "/formRepresentations/orthographyList/orthography"
    ):
        remember(
            orthography.findtext("form"),
            orthography.findtext("measureList/measure"),
        )
if __name__ == "__main__":
    # How many numbered input files (001 .. num_files) the script reads.
    num_files = 102

    # Feed each numbered file to parse_xml(), echoing its number afterwards
    # as a simple progress indicator.
    for i in range(1, num_files + 1):
        parse_xml(f"Sloleks.3.0/sloleks_3.0_{i:03d}.xml")
        print(i)

    # Order the collected words from most to least frequent. The sort is
    # stable, so equally frequent words keep their insertion order.
    by_frequency = sorted(words, key=words.get, reverse=True)

    # One word per line, UTF-8 encoded; any existing file is overwritten.
    with open("output.txt", "w", encoding="utf-8") as output_file:
        output_file.writelines(word + "\n" for word in by_frequency)