Skip to content

Commit

Permalink
added
Browse files Browse the repository at this point in the history
git-svn-id: https://ilk.uvt.nl/svn/trunk/sources/pynlpl@13398 12f355fe-0486-481a-ad91-c297ab22b4e3
  • Loading branch information
proycon committed Oct 7, 2011
1 parent 6102cca commit d17e160
Showing 1 changed file with 27 additions and 0 deletions.
27 changes: 27 additions & 0 deletions tools/classencode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python
#-*- coding:utf-8 -*-

from pynlpl.textprocessors import Classer
from pynlpl.statistics import FrequencyList
import sys

# Usage: classencode.py FILENAME
#
# Reads FILENAME line by line, adds the tokens of every line to a
# FrequencyList, builds a Classer from that list, saves the classer to
# FILENAME.cls and asks it to encode FILENAME into FILENAME.clsenc.
# All progress messages go to stderr.

# Set to True to run every line through pynlpl's crude_tokenizer instead of
# splitting it on single spaces. The original script referenced this flag
# without ever defining it, which raised a NameError on the first line read.
DOTOKENIZE = False

if DOTOKENIZE:
    # Imported lazily so the default (pre-tokenised input) path does not
    # depend on the tokenizer being importable.
    from pynlpl.textprocessors import crude_tokenizer

if len(sys.argv) < 2:
    sys.stderr.write("Usage: " + sys.argv[0] + " FILENAME\n")
    sys.exit(2)

filename = sys.argv[1]

sys.stderr.write("Counting tokens\n")
freqlist = FrequencyList()
# The context manager closes the file even if counting fails halfway.
with open(filename) as f:
    for i, line in enumerate(f):
        # Report progress on the first line and every 10000 lines after it.
        if i % 10000 == 0:
            sys.stderr.write("\tLine " + str(i + 1) + "\n")
        if DOTOKENIZE:
            # NOTE(review): crude_tokenizer is presumed to return a list of
            # tokens; the original then called .strip() on that result,
            # which cannot work for a list -- confirm against pynlpl.
            tokens = crude_tokenizer(line.strip())
        else:
            # Input is treated as already tokenised: one space per boundary.
            tokens = line.strip().split(' ')
        freqlist.append(tokens)

sys.stderr.write("Building classer\n")
classer = Classer(freqlist)
classer.save(filename + '.cls')

sys.stderr.write("Encoding data\n")
classer.encodefile(filename, filename + '.clsenc')

0 comments on commit d17e160

Please sign in to comment.