Skip to content

Commit

Permalink
Added class decoder
Browse files Browse the repository at this point in the history
git-svn-id: https://ilk.uvt.nl/svn/trunk/sources/pynlpl@13400 12f355fe-0486-481a-ad91-c297ab22b4e3
  • Loading branch information
proycon committed Oct 8, 2011
1 parent 7dfc347 commit 5f7897b
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 5 deletions.
32 changes: 27 additions & 5 deletions textprocessors.py
Expand Up @@ -14,8 +14,9 @@
import unicodedata
import string
import codecs
import array
from statistics import FrequencyList
from datatypes import intarraytobytearray, containsnullbyte
from datatypes import intarraytobytearray, bytearraytoint, containsnullbyte

try:
from itertools import permutations
Expand Down Expand Up @@ -219,9 +220,9 @@ def __init__(self, f, **kwargs):
elif isinstance(f, str):
f = codecs.open(f,'r','utf-8')
for line in f:
cls, word = line.strip().split('\t')[1]
if self.decoder: self.class2word[cls] = word
if self.encoder: self.word2class[word] = cls
cls, word = line.strip().split('\t',2)
if self.decoder: self.class2word[int(cls)] = word
if self.encoder: self.word2class[word] = int(cls)
f.close()
else:
raise Exception("Expected FrequencyList or filename, got " + str(type(f)))
Expand Down Expand Up @@ -275,12 +276,33 @@ def encodefile(self, fromfile, tofile):
fto = open(tofile,'w')
for line in ffrom:
a = intarraytobytearray( self.encodeseq( line.strip().split(' ') ))
a.append(0) #delimiter
a.append(1) #newline
a.append(0) #delimiter
a.tofile(fto)
fto.close()
ffrom.close()


def decodefile(self, filename):
    """Lazily decode a class-encoded binary file into lines of words.

    The file is consumed one byte at a time. Non-null bytes accumulate in
    a buffer; each null byte ends the current number, which is converted
    with ``bytearraytoint``. The number 1 marks the end of a line, any
    other number is translated with ``self.decode`` and appended to the
    current line.

    Args:
        filename: path of the encoded file to read.

    Yields:
        One list per end-of-line marker, holding the values returned by
        ``self.decode`` for the classes of that line, in file order.

    Note:
        Classes that follow the last end-of-line marker are not yielded.
    """
    # The data is binary, so read it as bytes: text mode may translate
    # newline bytes on some platforms and yields str instead of bytes on
    # Python 3.
    f = open(filename, 'rb')
    try:
        pending = array.array('B')  # bytes of the number being read
        line = []
        while True:
            byte = f.read(1)
            if not byte:
                break  # end of file
            value = ord(byte)  # works for a 1-char str and a 1-byte bytes
            if value == 0:
                # Null byte: the number collected so far is complete
                cls = bytearraytoint(pending)
                if cls == 1:
                    yield line
                    line = []
                else:
                    line.append(self.decode(cls))
                pending = array.array('B')
            else:
                pending.append(value)
    finally:
        # Also runs when the generator is closed early or an error occurs,
        # so the file handle is never leaked.
        f.close()



Expand Down
9 changes: 9 additions & 0 deletions tools/classdecode.py
@@ -0,0 +1,9 @@
#!/usr/bin/env python
#-*- coding:utf-8 -*-

"""Decode a class-encoded file and print it as space-separated lines.

Usage: classdecode.py CLASSFILE ENCODEDFILE

CLASSFILE is handed to ``Classer``; ENCODEDFILE is handed to
``Classer.decodefile``. Every decoded line is written to standard output.
"""

from pynlpl.textprocessors import Classer
import sys


def main(argv):
    """Run the decoder for the given argument vector.

    Args:
        argv: full argument list, program name first (as in ``sys.argv``).

    Returns:
        The process exit status: 0 on success, 2 when arguments are missing.
    """
    if len(argv) < 3:
        # Report the expected arguments instead of failing with an IndexError
        sys.stderr.write("Usage: " + argv[0] + " classfile encodedfile\n")
        return 2

    classer = Classer(argv[1])
    for line in classer.decodefile(argv[2]):
        # Parenthesised form is valid both as a Python 2 print statement
        # and as a Python 3 function call
        print(" ".join(line))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

0 comments on commit 5f7897b

Please sign in to comment.