#!/usr/bin/env python
#-*- coding:utf-8 -*-
# sonar2folia.py (forked from proycon/pynlpl)
#---------------------------------------------------------------
# PyNLPl - Conversion script for converting SoNaR/D-Coi from D-Coi XML to FoLiA XML
# by Maarten van Gompel, ILK, Tilburg University
# http://ilk.uvt.nl/~mvgompel
# proycon AT anaproy DOT nl
#
# Licensed under GPLv3
#
#----------------------------------------------------------------
# Usage: sonar2folia.py sonar-input-dir output-dir nr-of-threads
#        (nr-of-threads is the number of parallel worker processes)
import sys
import os
if __name__ == "__main__":
    # When executed directly as a script, add the directory two levels above
    # the script's own directory (sys.path[0]) to the module search path, and
    # export the same location through PYTHONPATH.
    _pynlpl_root = sys.path[0] + '/../..'
    sys.path.append(_pynlpl_root)
    os.environ['PYTHONPATH'] = _pynlpl_root
import pynlpl.formats.folia as folia
import pynlpl.formats.sonar as sonar
from multiprocessing import Pool, Process
import datetime
import codecs
def process(data):
    """Convert one input document to FoLiA and write it below ``foliadir``.

    Relies on the module-level globals ``index``, ``sonardir`` and
    ``foliadir`` that the ``__main__`` section defines before the worker
    pool is started.

    Steps:
      1. Print a progress line and load the input file as a FoLiA document.
      2. Derive the output name: the path relative to ``sonardir`` with a
         trailing ``.pos``, ``.tok`` and/or ``.ilk`` suffix removed.
      3. Try to load the file of that name from ``sonardir`` (the
         pre-tokenised version) and copy its paragraph text onto the
         paragraphs with the same id in the loaded document.
      4. Save the document and a ``.tok.txt`` text rendering of it.

    data -- an ``(i, filename)`` pair: the position of the document in
            ``index`` and the path of the input file.

    Returns False when the input document cannot be loaded and True
    otherwise. Failures while saving are only reported on stderr and do not
    change the return value.
    """
    i, filename = data
    progress = round((i + 1) / float(len(index)) * 100, 1)
    print("#" + str(i+1) + " " + filename + ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + ' ' + str(progress) + '%')

    try:
        doc = folia.Document(file=filename)
    except Exception as e:
        sys.stderr.write("ERROR loading " + filename + ":" + str(e) + "\n")
        return False

    # Make the name relative to the input directory. Only a leading prefix is
    # removed: str.replace() would also delete any later occurrence of the
    # same text inside the path and so corrupt the output name.
    if filename.startswith(sonardir):
        filename = filename[len(sonardir):]
    if filename.startswith('/'):
        filename = filename[1:]
    # Suffixes are checked once each, in this order.
    for suffix in ('.pos', '.tok', '.ilk'):
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]

    # Load document prior to tokenisation
    try:
        pretokdoc = folia.Document(file=sonardir + '/' + filename)
    except Exception:
        sys.stderr.write("WARNING unable to load pretokdoc " + filename + "\n")
        pretokdoc = None

    if pretokdoc:
        for p2 in pretokdoc.paragraphs():
            try:
                p = doc[p2.id]
            except Exception:
                sys.stderr.write("ERROR: Paragraph " + p2.id + " not found. Tokenised and pre-tokenised versions out of sync?" + "\n")
                continue
            if p2.text:
                p.text = p2.text

    try:
        # makedirs also copes with output names that are nested more than one
        # directory deep.
        os.makedirs(foliadir + os.path.dirname(filename))
    except OSError:
        # Best effort: normally the directory already exists (possibly created
        # by another worker). A real failure surfaces as a save error below.
        pass

    try:
        doc.save(foliadir + filename)
    except Exception:
        sys.stderr.write("ERROR saving " + foliadir + filename + "\n")

    tokfilename = foliadir + filename.replace('.xml','.tok.txt')
    try:
        # The with-statement closes the file even when write() raises.
        with codecs.open(tokfilename, 'w', 'utf-8') as f:
            # unicode() is the Python 2 builtin; this script targets Python 2.
            f.write(unicode(doc))
    except Exception:
        sys.stderr.write("ERROR saving " + tokfilename + "\n")

    sys.stdout.flush()
    sys.stderr.flush()
    return True
def outputexists(filename, sonardir, foliadir):
    """Tell whether the converted output for an input file already exists.

    The output name is the input path made relative to ``sonardir``, with a
    trailing ``.pos``, ``.tok`` and/or ``.ilk`` suffix removed (each checked
    once, in that order), appended to ``foliadir``.

    filename -- path of the input file
    sonardir -- input directory the path is expected to start with
    foliadir -- output directory; concatenated as-is, so it should end
                in '/'

    Returns the result of os.path.exists() for the derived output path.
    """
    # Remove the input directory only as a leading prefix. str.replace()
    # would also delete later occurrences of the same text in the path
    # (e.g. sonardir 'in' inside 'in/cat/in_doc.xml').
    if filename.startswith(sonardir):
        filename = filename[len(sonardir):]
    # startswith() is safe on an empty string, unlike filename[0].
    if filename.startswith('/'):
        filename = filename[1:]
    for suffix in ('.pos', '.tok', '.ilk'):
        if filename.endswith(suffix):
            filename = filename[:-len(suffix)]
    return os.path.exists(foliadir + filename)
if __name__ == '__main__':
    # Command line: sonar-input-dir output-dir nr-of-threads
    # sonardir, foliadir and index are deliberately module-level globals:
    # process() reads them inside the worker processes.
    if len(sys.argv) < 4 or not sys.argv[2]:
        sys.stderr.write("Usage: sonar2folia.py sonar-input-dir output-dir nr-of-threads\n")
        sys.exit(2)

    sonardir = sys.argv[1]
    foliadir = sys.argv[2]
    threads = int(sys.argv[3])

    # Output paths are built by plain concatenation, so the output directory
    # must end in a slash.
    if not foliadir.endswith('/'):
        foliadir += '/'

    try:
        os.mkdir(foliadir[:-1])
    except OSError:
        # Best effort: the directory normally exists already. Any other
        # problem shows up when the documents are saved.
        pass

    print("Building index...")
    # Numbered list of the input files whose output does not exist yet.
    index = list(enumerate([ x for x in sonar.CorpusFiles(sonardir,'pos', "", lambda x: True, True) if not outputexists(x, sonardir, foliadir) ]))

    print("Processing...")
    p = Pool(threads)
    try:
        p.map(process, index)
    finally:
        # map() has either returned (all work done) or raised; in both cases
        # stop the workers and wait for them so none are left behind.
        p.terminate()
        p.join()