Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Add an almost completely automated data generation script.

  • Loading branch information...
commit 59a68ac9849a32035ee5c22a62a3ba73db2c6894 1 parent 790a003
@CasperVector CasperVector authored
View
19 SConstruct
@@ -136,6 +136,7 @@ bins = [
'src/tslmpack',
'src/genpyt',
'src/getwordfreq',
+ 'src/sunpinyin-dictgen',
]
man1s = [
@@ -200,24 +201,24 @@ def GetOS():
return platform.uname()[0]
def CreateEnvironment():
- tar = 'tar'
make = 'make'
wget = 'wget'
+ tar = 'tar'
if GetOS() == 'Darwin':
wget = 'curl -O'
elif GetOS() == 'FreeBSD':
- wget = 'fetch'
make = 'gmake'
+ wget = 'fetch'
elif GetOS() == 'SunOS':
- tar = 'gtar'
make = 'gmake'
-
+ tar = 'gtar'
libln_builder = Builder(action='cd ${TARGET.dir} && ln -s ${SOURCE.name} ${TARGET.name}')
- env = Environment(ENV=os.environ, CFLAGS=cflags, CXXFLAGS=cflags,
- TAR=tar, MAKE=make, WGET=wget,
- CPPPATH=['.'] + allinc(),
- tools=['default', 'textfile'])
- env.Append(BUILDERS={'InstallAsSymlink': libln_builder})
+ env = Environment(ENV = os.environ, CFLAGS = cflags, CXXFLAGS = cflags,
+ MAKE = make, WGET = wget, TAR = tar,
+ CPPPATH = ['.'] + allinc(),
+ tools = ['default', 'textfile'])
+ env.Append(BUILDERS = {'InstallAsSymlink': libln_builder})
+ env['ENDIANNESS'] = "be" if sys.byteorder == "big" else "le"
return env
def PassVariables(envvar, env):
View
8 doc/README.in
@@ -1,6 +1,14 @@
How to install data files for sunpinyin
=======================================
+For normal users
+----------------
+
+In a temporary directory, run `sunpinyin-dictgen' as root; it downloads
+the latest dictionary and language-model archives, builds the data
+files, and installs them for you.
+
+For developers and expert users
+-------------------------------
+
Get `lm_sc.t3g.arpa.tar.bz2' and `dict.utf8.tar.bz2' from [1],
unpack them into some directory, and issue the following commands in
that directory:
View
3  doc/SConscript
@@ -1,10 +1,9 @@
-import sys
Import('env')
env.Substfile('README.in', SUBST_DICT = {'@PREFIX@': env['PREFIX']})
env.Substfile('SLM-inst.mk.in', SUBST_DICT = {
'@DATADIR@': env['DATADIR'],
- '@ENDIANNESS@': "be" if sys.byteorder == "big" else "le"
+ '@ENDIANNESS@': env['ENDIANNESS']
})
# -*- indent-tabs-mode: nil -*- vim:et:ts=4
View
12 src/SConscript
@@ -42,4 +42,16 @@ env.Program('getwordfreq', ['portability.o', 'slm/slm.o',
env.Program('testvc', ['slm/thread/ValueCompress.o', 'slm/thread/test_vc.o'])
+env.Substfile('sunpinyin-dictgen.mk.in', SUBST_DICT = {
+ '@MAKE@': env['MAKE'],
+ '@TAR@': env['TAR'],
+ '@WGET@': env['WGET'],
+ '@DATADIR@': env['DATADIR'],
+ '@ENDIANNESS@': env['ENDIANNESS'],
+ })
+env.Command('sunpinyin-dictgen', 'sunpinyin-dictgen.mk', [
+ Copy("$TARGET", "$SOURCE"),
+ Chmod("$TARGET", 0755),
+ ])
+
# -*- indent-tabs-mode: nil -*- vim:et:ts=4
View
48 src/sunpinyin-dictgen.mk.in
@@ -0,0 +1,48 @@
+#!/usr/bin/@MAKE@ -f
+# -*- mode: makefile; indent-tabs-mode: t -*- vim:noet:ts=4
+
+WGET = @WGET@
+TAR = @TAR@
+ENDIANNESS = @ENDIANNESS@
+DATA_DIR = '@DATADIR@/sunpinyin'
+
+DL_LIST = 'http://code.google.com/p/open-gram/downloads/list'
+DL_ROOT = 'http://open-gram.googlecode.com/files/'
+DICT_PAT = 'dict\.utf8-[0-9]\+.tar.bz2'
+TSLM_PAT = 'lm_sc\.t3g\.arpa-[0-9]\+.tar.bz2'
+
+DICT_AR = $(shell w3m -dump ${DL_LIST} | grep -o ${DICT_PAT} | sort | tail -n 1)
+TSLM_AR = $(shell w3m -dump ${DL_LIST} | grep -o ${TSLM_PAT} | sort | tail -n 1)
+
+all: install
+
+${DICT_AR}:
+ ${WGET} ${DL_ROOT}/$@
+
+dict.utf8: ${DICT_AR}
+ ${TAR} xmf $^
+
+${TSLM_AR}:
+ ${WGET} ${DL_ROOT}/$@
+
+lm_sc.t3g.arpa: ${TSLM_AR}
+ ${TAR} xmf $^
+
+lm_sc.t3g.orig: dict.utf8 lm_sc.t3g.arpa
+ tslmpack lm_sc.t3g.arpa dict.utf8 $@
+
+lm_sc.t3g: lm_sc.t3g.orig
+ tslmendian -e ${ENDIANNESS} -i $^ -o $@
+
+pydict3_sc.bin: dict.utf8 lm_sc.t3g
+ genpyt -e ${ENDIANNESS} -i dict.utf8 -s lm_sc.t3g \
+ -l lm_sc.t3g.log -o $@
+
+install: lm_sc.t3g pydict3_sc.bin
+ install -d ${DATA_DIR}
+ install -Dm644 $^ ${DATA_DIR}
+
+clean:
+ rm -rf ${DICT_AR} ${TSLM_AR} dict.utf8 lm_sc.t3g.arpa \
+ lm_sc.t3g.orig lm_sc.t3g lm_sc.t3g.log pydict3_sc.bin
+
Please sign in to comment.
Something went wrong with that request. Please try again.