Skip to content

Commit

Permalink
Improved classer, added ability to output corpus in binary compressed…
Browse files Browse the repository at this point in the history
… format, optimised for later processing (n-gram/skipgram counting).

git-svn-id: https://ilk.uvt.nl/svn/trunk/sources/pynlpl@13399 12f355fe-0486-481a-ad91-c297ab22b4e3
  • Loading branch information
proycon committed Oct 7, 2011
1 parent d17e160 commit 7dfc347
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 29 deletions.
58 changes: 57 additions & 1 deletion datatypes.py
Expand Up @@ -16,6 +16,7 @@


import bisect
import array

class Queue: #from AI: A Modern Approach : http://aima.cs.berkeley.edu/python/utils.html
"""Queue is an abstract class/interface. There are three types:
Expand Down Expand Up @@ -378,8 +379,63 @@ def walk(self, leavesonly=True, maxdepth=None, _depth = 0):
else:
for results in child.walk(leavesonly, maxdepth, _depth + 1):
yield results




def containsnullbyte(i):
    """Report whether the base-256 representation of ``i`` contains a zero byte.

    The integer is inspected one byte at a time, least significant byte
    first, for as many bytes as are needed to represent it. ``0`` itself
    therefore counts as consisting of a single null byte.

    Returns True as soon as a byte equal to 0 is found, False otherwise.
    """
    while True:
        if i % 256 == 0:
            return True
        if i < 256:
            # Most significant byte reached and it was non-zero.
            return False
        # Floor division keeps ``i`` an integer. True division would turn
        # it into a float, and null bytes in higher positions would then
        # go undetected.
        i //= 256


def inttobytearray(i, bigendian=False, nonullbyte=False):
    """Convert an integer into an ``array.array('B')`` holding its bytes.

    The value is split into base-256 digits. At least one byte is always
    produced, so ``0`` becomes a single zero byte. The input is not
    validated; the function is presumably meant for non-negative integers.

    Parameters:
        i: the integer to convert.
        bigendian: when true, the most significant byte comes first;
            otherwise (the default) the least significant byte comes first.
        nonullbyte: when true, refuse values whose representation contains
            a zero byte.

    Returns:
        An ``array.array`` of typecode ``'B'``.

    Raises:
        ValueError: if ``nonullbyte`` is true and a zero byte is encountered.
    """
    a = array.array('B')
    while True:
        r = i % 256
        if nonullbyte and r == 0:
            raise ValueError("Null byte encountered")
        a.append(r)
        if i < 256:
            break
        # Floor division keeps ``i`` an integer; a float would be rejected
        # by the 'B' array on the next append.
        i //= 256
    if bigendian:
        a.reverse()
    return a


def bytearraytoint(a, bigendian=False):
    """Interpret a sequence of byte values as one base-256 integer.

    With ``bigendian`` false (the default) the first element is the least
    significant byte; with it true the first element is the most
    significant one. An empty sequence yields 0.
    """
    return sum(
        byte * 256 ** (len(a) - 1 - pos if bigendian else pos)
        for pos, byte in enumerate(a)
    )

def intarraytobytearray(intarray, bigendian=False):
    """Serialise a sequence of integers as bytes separated by NULL bytes.

    Each integer is converted with ``inttobytearray`` in no-null-byte mode,
    so a value whose byte representation contains a zero byte raises
    ``ValueError``. A single 0 byte is written between consecutive
    integers; nothing follows the last one.

    Returns an ``array.array`` of typecode ``'B'``.
    """
    encoded = array.array('B')
    final = len(intarray) - 1
    for position, value in enumerate(intarray):
        encoded += inttobytearray(value, bigendian, True)
        if position != final:
            # Delimiter between this integer and the next one.
            encoded.append(0)
    return encoded

def bytearraytointarray(bytearray, bigendian=False):
    """Convert a NULL-byte delimited array of bytes into an array of integers.

    The input is split at every 0 byte and each segment is decoded with
    ``bytearraytoint`` using the requested byte order. The segment after
    the last delimiter (or the whole input when there is no delimiter) is
    always decoded too, so an empty input yields a single 0.

    Parameters:
        bytearray: sliceable sequence of byte values. The name shadows the
            builtin but is kept so keyword callers continue to work.
        bigendian: byte order of each encoded integer, passed through to
            ``bytearraytoint``.

    Returns:
        An ``array.array`` of typecode ``'I'`` with the decoded integers.
    """
    a = array.array('I')
    begin = 0
    for n, b in enumerate(bytearray):
        if b == 0:
            # Slice the input sequence, not the current byte value.
            a.append(bytearraytoint(bytearray[begin:n], bigendian))
            # The next segment starts right after the delimiter.
            begin = n + 1
    a.append(bytearraytoint(bytearray[begin:], bigendian))
    return a


#class SuffixTree(object):
Expand Down
77 changes: 52 additions & 25 deletions textprocessors.py
Expand Up @@ -15,6 +15,7 @@
import string
import codecs
from statistics import FrequencyList
from datatypes import intarraytobytearray, containsnullbyte

try:
from itertools import permutations
Expand Down Expand Up @@ -170,43 +171,72 @@ def swap(tokens, maxdist=2):


class Classer(object):
def __init__(self, f, encoder=True, decoder=True, encoding=None):
def __init__(self, f, **kwargs):
"""Pass either a filename or a frequency list"""
self.encoder = encoder
self.decoder = decoder
if 'decoder' in kwargs:
self.decoder = bool(kwargs['decoder'])
else:
self.decoder = True

if 'encoder' in kwargs:
self.encoder = bool(kwargs['encoder'])
else:
self.encoder = True

self.newestclass = 0

if self.decoder:
self.class2word = []
if self.encoder:
self.class2word = {}
if self.encoder:
self.word2class = {}
if isinstance(f, FrequencyList):
for word, count in f:
self.class2word.append(word)
if self.encoder:
for cls, word in enumerate(self.class2word):
self.word2class[word] = cls

if 'encoding' in kwargs and kwargs['encoding']:
self.encoding = kwargs['encoding']
else:
self.encoding = None

if 'filesupport' in kwargs:
self.filesupport = bool(kwargs['filesupport'])
else:
self.filesupport = False

if self.filesupport:
self.newestclass = 1 #0 and 1 are reserved for space and newline

if isinstance(f, FrequencyList):
for word, _ in f:
self.newestclass += 1
if self.filesupport:
while containsnullbyte(self.newestclass):
self.newestclass += 1
print self.newestclass, word
if self.decoder:
self.class2word[self.newestclass] = word
if self.encoder:
self.word2class[word] = self.newestclass
if not self.decoder:
del self.class2word
elif isinstance(f, str):
f = codecs.open(f,'r','utf-8')
cls = 0
for line in f:
word = line.strip().split('\t')[1]
if self.decoder: self.class2word.append(word)
cls, word = line.strip().split('\t')[1]
if self.decoder: self.class2word[cls] = word
if self.encoder: self.word2class[word] = cls
cls += 1
f.close()
else:
raise Exception("Expected FrequencyList or filename, got " + str(type(f)))
self.encoding = encoding



def save(self, filename):
if not self.decoder: raise Exception("Decoder not enabled!")
if self.encoding:
f = codecs.open(filename,'w',self.encoding)
else:
f = open(filename,'w')
for cls, word in enumerate(self.class2word):
f.write( str(cls) + '\t' + word + '\n')
for cls, word in sorted(self.class2word.items()):
if cls:
f.write( str(cls) + '\t' + word + '\n')
f.close()

def decode(self, x):
Expand Down Expand Up @@ -240,20 +270,17 @@ def __len__(self):
return len(self.word2class)

def encodefile(self, fromfile, tofile):
assert self.filesupport
ffrom = open(fromfile,'r')
fto = open(tofile,'w')
for line in ffrom:
seq = self.encodeseq(line.strip().split(' '))
a = array.array('L')
for i in seq:
a.append(i)
a.tofile(f)
a = intarraytobytearray( self.encodeseq( line.strip().split(' ') ))
a.append(1) #newline
a.tofile(fto)
fto.close()
ffrom.close()







Expand Down
4 changes: 1 addition & 3 deletions tools/classencode.py 100644 → 100755
Expand Up @@ -13,14 +13,12 @@
for i, line in enumerate(f):
if (i % 10000 == 0):
print >>sys.stderr, "\tLine " + str(i+1)
if DOTOKENIZE:
line = crude_tokenizer(line.strip())
line = line.strip().split(' ')
freqlist.append(line)
f.close()

print >>sys.stderr, "Building classer"
classer = Classer(freqlist)
classer = Classer(freqlist, filesupport=True )
classer.save(filename + '.cls')

print >>sys.stderr, "Encoding data"
Expand Down

0 comments on commit 7dfc347

Please sign in to comment.