Improved classer, added ability to output corpus in binary compressed format, optimised for later processing (n-gram/skipgram counting).

git-svn-id: https://ilk.uvt.nl/svn/trunk/sources/pynlpl@13399 12f355fe-0486-481a-ad91-c297ab22b4e3
w495 · Oct 7, 2011 · 7dfc347 · 7dfc347
1 parent d17e160
commit 7dfc347
Show file tree

Hide file tree

Showing 3 changed files with 110 additions and 29 deletions.
diff --git a/datatypes.py b/datatypes.py
@@ -16,6 +16,7 @@
 
 
 import bisect
+import array
 
 class Queue: #from AI: A Modern Approach : http://aima.cs.berkeley.edu/python/utils.html
     """Queue is an abstract class/interface. There are three types:
@@ -378,8 +379,63 @@ def walk(self, leavesonly=True, maxdepth=None, _depth = 0):
                     else:
                         for results in child.walk(leavesonly, maxdepth, _depth + 1):
                             yield results
-
+
+
+
+def containsnullbyte(i):
+    """Return True if any base-256 digit (byte) of the integer i is zero.
+
+    The value 0 is a single zero byte, so it yields True. Bytes are examined
+    from least to most significant; the function returns as soon as a zero
+    byte is found, and returns False once the remaining value fits in one
+    non-zero byte.
+    """
+    while True:
+        if i % 256 == 0:
+            return True
+        if i < 256:
+            return False
+        # floor division keeps i integral even when '/' means true division
+        i //= 256
+
 
+def inttobytearray(i,bigendian=False, nonullbyte=False):
+    """Convert an integer into an array.array('B') of its base-256 digits.
+
+    The digits are produced least significant byte first; pass bigendian=True
+    to get the most significant byte first instead. The value 0 is encoded
+    as a single zero byte.
+
+    If nonullbyte is True, a ValueError is raised as soon as a zero byte
+    would be emitted.
+    """
+    a = array.array('B')
+    while True:
+        r = i % 256
+        if nonullbyte and r == 0:
+            raise ValueError("Null byte encountered")
+        a.append(r)
+        if i < 256:
+            break
+        # floor division keeps i integral even when '/' means true division
+        i //= 256
+    if bigendian: a.reverse()
+    return a
+
+
+def bytearraytoint(a,bigendian=False):
+    """Interpret a sequence of byte values as a single base-256 integer.
+
+    The element at position n is weighted by 256**n, i.e. the first element
+    is the least significant. With bigendian=True the weights are mirrored so
+    that the first element is the most significant. An empty sequence gives 0.
+    """
+    if bigendian:
+        top = len(a) - 1
+        return sum(byte * 256 ** (top - pos) for pos, byte in enumerate(a))
+    return sum(byte * 256 ** pos for pos, byte in enumerate(a))
+
+def intarraytobytearray(intarray,bigendian=False):
+    """Serialise a sequence of integers as bytes, separating consecutive integers with a NULL byte.
+
+    Each integer is converted with inttobytearray(..., nonullbyte=True), so a
+    ValueError propagates if any integer's encoding would itself contain a
+    zero byte. No delimiter is written after the last integer.
+    """
+    out = array.array('B')
+    remaining = len(intarray)
+    for value in intarray:
+        out.extend(inttobytearray(value, bigendian, True))
+        remaining -= 1
+        if remaining > 0:
+            out.append(0) #delimiter between two integers
+    return out
+
+def bytearraytointarray(bytearray, bigendian=False):
+    """Converts a NULL-byte delimited array of bytes into an array of integers
+
+    Every run of bytes between two zero bytes (or between a zero byte and
+    either end of the input) is decoded with bytearraytoint, honouring
+    bigendian. The result is an array.array('I'). Empty input gives an empty
+    array.
+
+    The parameter name shadows the builtin of the same name; it is kept
+    as-is so that keyword callers keep working.
+    """
+    a = array.array('I')
+    if len(bytearray) == 0:
+        return a
+    begin = 0
+    for n, b in enumerate(bytearray):
+        if b == 0:
+            a.append( bytearraytoint(bytearray[begin:n], bigendian) )
+            begin = n + 1 #next integer starts right after the delimiter
+    a.append( bytearraytoint(bytearray[begin:], bigendian) )
+    return a
 
 
 #class SuffixTree(object):

diff --git a/textprocessors.py b/textprocessors.py
@@ -15,6 +15,7 @@
 import string
 import codecs
 from statistics import FrequencyList
+from datatypes import intarraytobytearray, containsnullbyte
 
 try:
     from itertools import permutations
@@ -170,43 +171,72 @@ def swap(tokens, maxdist=2):
 
 
 class Classer(object):
-    def __init__(self, f, encoder=True, decoder=True, encoding=None):
+    def __init__(self, f, **kwargs):
-        """Pass either a filename or a frequency list"""
+        """Build the word/class mappings from a FrequencyList or from a class file.
+
+        f is either a FrequencyList (each word gets the next free class
+        number, in iteration order) or the name of a utf-8 file containing
+        one "class<TAB>word" pair per line.
+
+        Recognised keyword arguments:
+          decoder     - keep the class->word mapping (default True)
+          encoder     - keep the word->class mapping (default True)
+          encoding    - encoding used by save(); None or empty means plain open()
+          filesupport - reserve classes 0 and 1 and skip class numbers that
+                        contain a zero byte (default False)
+
+        Raises Exception if f is neither a FrequencyList nor a str.
+        """
-        self.encoder = encoder
-        self.decoder = decoder
+        self.decoder = bool(kwargs.get('decoder', True))
+        self.encoder = bool(kwargs.get('encoder', True))
+        self.encoding = kwargs.get('encoding') or None
+        self.filesupport = bool(kwargs.get('filesupport', False))
+
+        if self.filesupport:
+            self.newestclass = 1 #0 and 1 are reserved for space and newline
+        else:
+            self.newestclass = 0
+
         if self.decoder:
-            self.class2word = []
-        if self.encoder:
+            self.class2word = {}
+        if self.encoder:
             self.word2class = {}
-        if isinstance(f, FrequencyList):            
-            for word, count in f:       
-                self.class2word.append(word)            
-            if self.encoder:
-                for cls, word in enumerate(self.class2word):
-                    self.word2class[word] = cls
+
+        if isinstance(f, FrequencyList):
+            for word, _ in f:
+                self.newestclass += 1
+                if self.filesupport:
+                    #skip class numbers that contain a zero byte
+                    while containsnullbyte(self.newestclass):
+                        self.newestclass += 1
+                if self.decoder:
+                    self.class2word[self.newestclass] = word
+                if self.encoder:
+                    self.word2class[word] = self.newestclass
-            if not self.decoder:
-                del self.class2word
         elif isinstance(f, str):
             f = codecs.open(f,'r','utf-8')      
-            cls = 0
             for line in f:
-                word = line.strip().split('\t')[1]
-                if self.decoder: self.class2word.append(word)
+                cls, word = line.strip().split('\t')
+                cls = int(cls) #keep keys numeric, as in the FrequencyList branch
+                if self.decoder: self.class2word[cls] = word
                 if self.encoder: self.word2class[word] = cls 
-                cls += 1
             f.close()
         else: 
             raise Exception("Expected FrequencyList or filename, got " + str(type(f)))
-        self.encoding = encoding
+
+
 
     def save(self, filename):
+        """Write the class->word mapping to filename, one "class<TAB>word" line per entry.
+
+        Entries are written in ascending class order; an entry whose class
+        is 0 is skipped. The file is opened through codecs with
+        self.encoding when one is set, otherwise with plain open().
+
+        Raises Exception if the decoder mapping is not enabled.
+        """
         if not self.decoder: raise Exception("Decoder not enabled!")
         if self.encoding:
             f = codecs.open(filename,'w',self.encoding)   
         else:
             f = open(filename,'w')
-        for cls, word in enumerate(self.class2word):
-            f.write( str(cls) + '\t' + word + '\n')
-        f.close()
+        try:
+            for cls, word in sorted(self.class2word.items()):
+                if cls:
+                    f.write( str(cls) + '\t' + word + '\n')
+        finally:
+            #close the handle even when a write fails
+            f.close()
 
     def decode(self, x):
@@ -240,20 +270,17 @@ def __len__(self):
             return len(self.word2class)
 
     def encodefile(self, fromfile, tofile):
+        """Encode a space-tokenised text file into a byte stream of class numbers.
+
+        Each input line is stripped, split on single spaces, passed through
+        self.encodeseq and serialised with intarraytobytearray; a byte with
+        value 1 is appended after every line. Requires filesupport to be
+        enabled (checked with an assert).
+        """
+        assert self.filesupport
         ffrom = open(fromfile,'r')  
-        fto = open(tofile,'w')
+        fto = open(tofile,'wb') #binary mode: raw bytes are written, not text
         for line in ffrom:
-            seq = self.encodeseq(line.strip().split(' '))
-            a = array.array('L')
-            for i in seq:
-                a.append(i)
-            a.tofile(f)            
+            a = intarraytobytearray( self.encodeseq( line.strip().split(' ') ))
+            a.append(1) #newline
+            a.tofile(fto)            
         fto.close()
         ffrom.close()
 
 
-
-
 
 
 

diff --git a/tools/classencode.py b/tools/classencode.py
@@ -13,14 +13,12 @@
 for i, line in enumerate(f):            
     if (i % 10000 == 0): 
         print >>sys.stderr, "\tLine " + str(i+1)
-    if DOTOKENIZE: 
-        line = crude_tokenizer(line.strip())
     line = line.strip().split(' ')
     freqlist.append(line)
 f.close()
 
 print >>sys.stderr, "Building classer"
-classer = Classer(freqlist)
+classer = Classer(freqlist, filesupport=True )
 classer.save(filename + '.cls')
 
 print >>sys.stderr, "Encoding data"