diff --git a/datatypes.py b/datatypes.py index b68e37fe..a0f95b99 100644 --- a/datatypes.py +++ b/datatypes.py @@ -16,6 +16,7 @@ import bisect +import array class Queue: #from AI: A Modern Appproach : http://aima.cs.berkeley.edu/python/utils.html """Queue is an abstract class/interface. There are three types: @@ -378,8 +379,63 @@ def walk(self, leavesonly=True, maxdepth=None, _depth = 0): else: for results in child.walk(leavesonly, maxdepth, _depth + 1): yield results - + + + +def containsnullbyte(i): + while True: + r = i % 256 + if i % 256 == 0: + return True + if i >= 256: + i = i / 256 + else: + return False + +def inttobytearray(i,bigendian=False, nonullbyte=False): + #convert int to byte array + a = array.array('B') + while True: + r = i % 256 + #print hex(r), bin(r) + if nonullbyte and r == 0: + raise ValueError("Null byte encountered") + a.append(r) + if i >= 256: + i = i / 256 + else: + break + if bigendian: a.reverse() + return a + + +def bytearraytoint(a,bigendian=False): + i = 0 + for n,b in enumerate(a): + if bigendian: n = len(a) - 1 - n + i += b * 256**n + return i + +def intarraytobytearray(intarray,bigendian=False): + """Converts an array of integers (with some value restrictions) to an array of bytes in which elements are NULL-byte delimited""" + a = array.array('B') + l = len(intarray) + for n, i in enumerate(intarray): + a += inttobytearray(i,bigendian,True) + if n < l - 1: + a.append(0) + return a + +def bytearraytointarray(bytearray, bigendian=False): + """Converts a NULL-byte delimited array of bytes into an array of integers""" + a = array.array('I') + begin = 0 + for n, b in enumerate(bytearray): + if b == 0: + a.append( bytearraytoint(b[begin:n]) ) + a.append( bytearraytoint(b[begin:len(bytearray)]) ) + return a #class SuffixTree(object): diff --git a/textprocessors.py b/textprocessors.py index 3438cf0c..6ba77445 100644 --- a/textprocessors.py +++ b/textprocessors.py @@ -15,6 +15,7 @@ import string import codecs from statistics import FrequencyList +from datatypes import intarraytobytearray, containsnullbyte try: from itertools import permutations @@ -170,34 +171,62 @@ def swap(tokens, maxdist=2): class Classer(object): - def __init__(self, f, encoder=True, decoder=True, encoding=None): + def __init__(self, f, **kwargs): """Pass either a filename or a frequency list""" - self.encoder = encoder - self.decoder = decoder + if 'decoder' in kwargs: + self.decoder = bool(kwargs['decoder']) + else: + self.decoder = True + + if 'encoder' in kwargs: + self.encoder = bool(kwargs['encoder']) + else: + self.encoder = True + + self.newestclass = 0 + if self.decoder: - self.class2word = [] - if self.encoder: + self.class2word = {} + if self.encoder: self.word2class = {} - if isinstance(f, FrequencyList): - for word, count in f: - self.class2word.append(word) - if self.encoder: - for cls, word in enumerate(self.class2word): - self.word2class[word] = cls + + if 'encoding' in kwargs and kwargs['encoding']: + self.encoding = kwargs['encoding'] + else: + self.encoding = None + + if 'filesupport' in kwargs: + self.filesupport = bool(kwargs['filesupport']) + else: + self.filesupport = False + + if self.filesupport: + self.newestclass = 1 #0 and 1 are reserved for space and newline + + if isinstance(f, FrequencyList): + for word, _ in f: + self.newestclass += 1 + if self.filesupport: + while containsnullbyte(self.newestclass): + self.newestclass += 1 + print self.newestclass, word + if self.decoder: + self.class2word[self.newestclass] = word + if self.encoder: + self.word2class[word] = self.newestclass if not self.decoder: del self.class2word elif isinstance(f, str): f = codecs.open(f,'r','utf-8') - cls = 0 for line in f: - word = line.strip().split('\t')[1] - if self.decoder: self.class2word.append(word) + cls, word = line.strip().split('\t')[1] + if self.decoder: self.class2word[cls] = word if self.encoder: self.word2class[word] = cls - cls += 1 f.close() else: raise Exception("Expected FrequencyList or filename, got " + str(type(f))) - self.encoding = encoding + + def save(self, filename): if not self.decoder: raise Exception("Decoder not enabled!") @@ -205,8 +234,9 @@ def save(self, filename): f = codecs.open(filename,'w',self.encoding) else: f = open(filename,'w') - for cls, word in enumerate(self.class2word): - f.write( str(cls) + '\t' + word + '\n') + for cls, word in sorted(self.class2word.items()): + if cls: + f.write( str(cls) + '\t' + word + '\n') f.close() def decode(self, x): @@ -240,20 +270,17 @@ def __len__(self): return len(self.word2class) def encodefile(self, fromfile, tofile): + assert self.filesupport ffrom = open(fromfile,'r') fto = open(tofile,'w') for line in ffrom: - seq = self.encodeseq(line.strip().split(' ')) - a = array.array('L') - for i in seq: - a.append(i) - a.tofile(f) + a = intarraytobytearray( self.encodeseq( line.strip().split(' ') )) + a.append(1) #newline + a.tofile(fto) fto.close() ffrom.close() - - diff --git a/tools/classencode.py b/tools/classencode.py old mode 100644 new mode 100755 index 746084c2..2ead253d --- a/tools/classencode.py +++ b/tools/classencode.py @@ -13,14 +13,12 @@ for i, line in enumerate(f): if (i % 10000 == 0): print >>sys.stderr, "\tLine " + str(i+1) - if DOTOKENIZE: - line = crude_tokenizer(line.strip()) line = line.strip().split(' ') freqlist.append(line) f.close() print >>sys.stderr, "Building classer" -classer = Classer(freqlist) +classer = Classer(freqlist, filesupport=True ) classer.save(filename + '.cls') print >>sys.stderr, "Encoding data"