hspell-0.7

synhershko · Jun 11, 2010 · ca0ff28 · ca0ff28
1 parent e05fa29
commit ca0ff28
Show file tree

Hide file tree

Showing 35 changed files with 3,306 additions and 661 deletions.
diff --git a/Makefile b/Makefile
@@ -19,6 +19,9 @@ BIN = $(PREFIX)/bin
 SHARE = $(PREFIX)/share/hspell
 LIBEXEC = $(PREFIX)/lib/hspell
 MAN1 = $(PREFIX)/man/man1
+MAN3 = $(PREFIX)/man/man3
+LIBDIR = $(PREFIX)/lib
+INCLUDEDIR = $(PREFIX)/include
 
 all: cfrontend
 
@@ -118,12 +121,22 @@ clean:
               c/corlist.o c/dict_radix.o c/find_sizes.o c/gimatria.o \
 	      c/hspell.o c/tclHash.o c/hebrew.wgz c/hebrew.wgz.sizes \
 	      c/hebrew.wgz.prefixes c/dout.nouns.shemp.gz c/shemp.dat \
-	      c/dout.nouns.wolig.gz c/dout.verbs.gz c/hspell c/find_sizes
+	      c/dout.nouns.wolig.gz c/dout.verbs.gz c/hspell c/find_sizes \
+	      c/prefixes.c c/libhspell.o c/libhspell.a \
+	      c/hebrew.wgz.desc c/hebrew.wgz.stems
 
 ################################################
 # for the C front-end
 cfrontend:
-	(cd c; $(MAKE) EXTRACFLAGS='-DDICTIONARY_BASE=\"$(DESTDIR)/$(SHARE)/hebrew.wgz\"')
+	(cd c; $(MAKE) EXTRACFLAGS='-DDICTIONARY_BASE=\"$(SHARE)/hebrew.wgz\"')
+
+
+# To include a full morphological analyzer in "hspell -l", run "make linginfo"
+# instead of just "make". But watch out - this slows down the build, and the
+# installed data files will be 4 times as large. But don't worry - this feature
+# has no speed impact on hspell unless the -l option is actually used.
+linginfo:
+	(cd c; $(MAKE) EXTRACFLAGS='-DDICTIONARY_BASE=\"$(SHARE)/hebrew.wgz\" -DUSE_LINGINFO' EXTRAOBJECTS='linginfo.o' dolinginfo)
 
 install: install_cfrontend
 CHSPELL=hspell
@@ -136,29 +149,43 @@ install_cfrontend: cfrontend
 	chmod 755 $(DESTDIR)/$(BIN)/multispell
 	test -d $(DESTDIR)/$(SHARE) || mkdir -m 755 -p $(DESTDIR)/$(SHARE)
 	cp c/hebrew.wgz c/hebrew.wgz.prefixes c/hebrew.wgz.sizes $(DESTDIR)/$(SHARE)/
-	(cd $(DESTDIR)/$(SHARE); chmod 644 hebrew.wgz hebrew.wgz.prefixes hebrew.wgz.sizes)
+	gzip -9 < spellinghints > $(DESTDIR)/$(SHARE)/hebrew.wgz.hints
+	(cd $(DESTDIR)/$(SHARE); chmod 644 hebrew.wgz hebrew.wgz.prefixes hebrew.wgz.sizes hebrew.wgz.hints)
+	test ! -f c/hebrew.wgz.stems || cp c/hebrew.wgz.stems c/hebrew.wgz.desc $(DESTDIR)/$(SHARE)/
+	(cd $(DESTDIR)/$(SHARE); test ! -f hebrew.wgz.stems || chmod 644 hebrew.wgz.stems hebrew.wgz.desc)
 	-rm -f $(DESTDIR)/$(BIN)/hspell-i
 	-ln -s $(CHSPELL) $(DESTDIR)/$(BIN)/hspell-i
 	test -d $(DESTDIR)/$(MAN1) || mkdir -m 755 -p $(DESTDIR)/$(MAN1)
 	cp hspell.1 $(DESTDIR)/$(MAN1)/
 	chmod 644 $(DESTDIR)/$(MAN1)/hspell.1
+	test -d $(DESTDIR)/$(MAN3) || mkdir -m 755 -p $(DESTDIR)/$(MAN3)
+	cp c/hspell.3 $(DESTDIR)/$(MAN3)/
+	chmod 644 $(DESTDIR)/$(MAN3)/hspell.3
+	test -d $(DESTDIR)/$(LIBDIR) || mkdir -m 755 -p $(DESTDIR)/$(LIBDIR)
+	cp c/libhspell.a $(DESTDIR)/$(LIBDIR)/
+	chmod 644 $(DESTDIR)/$(LIBDIR)/libhspell.a
+	test -d $(DESTDIR)/$(INCLUDEDIR) || mkdir -m 755 -p $(DESTDIR)/$(INCLUDEDIR)
+	cp c/hspell.h c/linginfo.h $(DESTDIR)/$(INCLUDEDIR)/
+	chmod 644 $(DESTDIR)/$(INCLUDEDIR)/hspell.h $(DESTDIR)/$(INCLUDEDIR)/linginfo.h
 
 ################################################
 # for creating an hspell distribution tar
 PACKAGE = hspell
-VERSION = 0.6
+VERSION = 0.7
 DISTFILES = COPYING INSTALL LICENSE README WHATSNEW TODO \
 	Makefile stats wunzip.c wzip \
 	hspell.pl hspell.1 \
 	wolig.pl wolig.dat biza-nouns milot extrawords \
 	woo woo.dat biza-verbs \
 	likelyerrors spellinghints \
 	hspell.spec \
-	c/Makefile c/README c/corlist.c c/corlist.h c/dict_radix.c \
-	c/dict_radix.h c/find_sizes.c c/gimatria.c c/gimatria.h c/hspell.c \
+	c/Makefile c/README c/corlist.c c/dict_radix.c \
+	c/dict_radix.h c/find_sizes.c c/gimatria.c c/hspell.c \
+	c/hspell.h c/libhspell.c \
 	c/pmerge c/PrefixBits.pl c/genprefixes.pl \
 	c/hash.h c/tclHash.c c/tclHash.h \
-	multispell
+        c/binarize-desc.pl c/pack-desc.pl c/linginfo.c c/linginfo.h \
+	multispell c/hspell.3
 
 DISTDIR = $(PACKAGE)-$(VERSION)
 

diff --git a/README b/README
@@ -1,4 +1,4 @@
-This is version 0.6 of Hspell, the free Hebrew spellchecker and morphology
+This is version 0.7 of Hspell, the free Hebrew spellchecker and morphology
 engine.
 
 It is a fully working Hebrew spellchecker, not a toy release. On typical
@@ -175,10 +175,14 @@ easy to do and will greatly enhance the Hspell project's usefulness.
   including, for example, Hebrew search engines, Hebrew dictionaries (with
   definitions and/or translation to another language), and machine translation
   software.
-  The "-v" option (in releases 0.5 and earlier) to hspell is a small
-  demonstration of what hspell's morphology engine is capable of. Even the
-  spellchecker itself could benefit from more use of those techniques: e.g.,
-  it could explain how the corrections it suggests were derived.
+  The "-v" option (in releases 0.5 and earlier) to hspell was a small
+  demonstration of what hspell's morphology engine is capable of. The
+  "-l" option in releases 0.7 and later (when compiled with this support
+  on) is already a full-featured morphological analyzer, so this task is
+  basically done.
+  Even the spellchecker itself could benefit from more use of those
+  techniques: e.g., it could explain how the corrections it suggests were
+  derived.
 
 
 Hspell's license

diff --git a/TODO b/TODO
@@ -1,3 +1,8 @@
+- disallow empty prefixes for infinitives in the enumeration. probable reason is
+  a bug in dmask2ps.
+- put desc_sizes.h into runtime-read file.
+- give better names to functions.
+
 Really important stuff to be done:
 * Extend the list of known verbs, nouns, proper nouns, relation words, etc.
   In this one, the general public could really help.

diff --git a/WHATSNEW b/WHATSNEW
@@ -1,3 +1,38 @@
+Release 0.7 (December 22, 2003):
+
+ * Some incorrect words purged or fixed, and many more words added: over
+   1,000 base words added.
+
+ * Portability improvements (thanks to Baruch Even).
+
+ * New run-time option -H causes hspell to accept He Ha-she'ela in the text.
+
+ * The "-n" option (for spelling hints, an explanation of certain kinds of
+   errors) that was lost in version 0.6, is now back. The "likelyerrors"
+   feature is still missing in this version.
+
+ * A new, and somewhat tentative, documented C API (libhspell.a library,
+   hspell.h header, hspell.3 manual). These are not available in the normal
+   RPM - compile the sources yourself if you want them.
+
+ * A full morphological analyzer (explaining all the possible ways to read
+   a given word, how each reading was derived and its syntactic properties)
+   is now available when Hspell is run with the "-l" option.
+
+   Because the data files needed for this feature currently take up as much
+   as 4 times the space needed only for spell-checking, and because the
+   collection and sorting of linguistic information makes the compilation
+   take much longer, this feature is not compiled by default, unless
+   compilation is done with "make linginfo". Also, the normal RPM does not
+   include this feature, but the "fat" RPM does include it.
+   An innovative parade of hash symbols now appears on screen to the benefit
+   of the bored builder :)
+
+ * Vocabulary: 406,629 words (when including kinuyim on verbs)
+               based on 8042 nouns, 1783 adjectives, 5113 verb stems, and
+	       1880 other words
+
+-----------------------------------------------------------------------------
 Release 0.6 (August 5, 2003):
 
  In this release, the Hspell front-end (the hspell program) was rewritten.
@@ -23,6 +58,7 @@ Release 0.6 (August 5, 2003):
    accepted with prefixes. Also, these words are never used as suggested
    corrections.
 
+
  * "hspell -a" was made more compatible with the standard "ispell -a" (thanks
    to Mooffie). Hspell is now known to be used with LyX, KDE, Geresh and Emacs.
 

diff --git a/biza-verbs b/biza-verbs
@@ -152,6 +152,8 @@
 # ����� ������ ��������
 ���
 ���
+���
+���
 -----
 # ���� ������� �� ������
 ����

diff --git a/c/Makefile b/c/Makefile
@@ -5,14 +5,20 @@ PERL=perl
 #all: test.wgz.sizes
 all: hebrew.wgz.sizes hspell
 
-hspell: hspell.o dict_radix.o gimatria.o corlist.o tclHash.o
-	cc -o hspell hspell.o dict_radix.o gimatria.o corlist.o tclHash.o
+libhspell.a: dict_radix.o gimatria.o corlist.o libhspell.o $(EXTRAOBJECTS)
+	-rm -f $@
+	ar cr $@ $^
+	-ranlib $@
 
+hspell: hspell.o tclHash.o libhspell.a
+	cc -o hspell hspell.o tclHash.o libhspell.a
+
+# TODO: update this dependency list:
 hspell.o dict_radix.o: dict_radix.h
-hspell.o gimatria.o: gimatria.h
-hspell.o corlist.o: corlist.h
 hspell.o: prefixes.c hash.h tclHash.h
 tclHash.o: tclHash.c tclHash.h
+corlist.o gimatria.o hspell.o libhspell.o: hspell.h
+linginfo.o: dmask.c
 
 prefixes.c: genprefixes.pl
 	$(PERL) -w ./genprefixes.pl >prefixes.c
@@ -37,6 +43,59 @@ hebrew.wgz hebrew.wgz.prefixes: pmerge $(DICTS) $(GZDICTS)
 hebrew.wgz.sizes: hebrew.wgz find_sizes
 	gzip -dc hebrew.wgz | ./find_sizes >hebrew.wgz.sizes
 
+###################################### optional linginfo stuff ##############
+# See comment on linginfo in ../Makefile.
+EXTRAOBJECTS=
+
+dolinginfo: linginfo_data hspell
+
+# hebrew.wgz contains all the words without any prefix hints like B,L,+ we
+#   previously had. 
+# hebrew.wgz.prefixes is the prefix hints (one byte per word, compressed).
+# hebrew.wgz.sizes contains the memory sizes that reading hebrew.wgz will
+#   require (this makes it easier for hspell to preallocate the needed sizes).
+# dmask.c contains an array of all possible values of the description bitmask.
+#   It is generated by pack-desc.pl. This array is not too long (no more than
+#   300 for the default dictionary).
+#
+# In the following long rule, the complete list of all words with linguistic
+# details is concatenated and sent to binarize-desc.pl, which converts the
+# detail information of each word into a bitmap (called dmask), produces a
+# specifier that tells which prefixes are accepted with the word, and writes its
+# stem. Then the word list is sorted, packed (a-la uniq), and the output files
+# are written.
+#
+# NOTE/TODO:
+# The "linginfo_data:" target line below is ugly and un-make-like. Not only
+# that, it doesn't know when it's necessary to build the files again, and
+# when it is not. The better make targets (hebrew.wgz et al. and
+# hebrew.wgz.sizes) are commented out because the same targets were used above
+# for building the version without linginfo. When building with linginfo
+# becomes the default, we should remove the following line and uncomment the
+# real targets.
+linginfo_data: binarize-desc.pl pack-desc.pl $(DICTS) $(GZDICTS) find_sizes
+#hebrew.wgz hebrew.wgz.prefixes hebrew.wgz.desc hebrew.wgz.stems hebrew.wgz.lingsizes.tmp dmask.c: binarize-desc.pl pack-desc.pl $(DICTS) $(GZDICTS)
+	(gzip -dc $(GZDICTS); cat $(DICTS)) | $(PERL) binarize-desc.pl | \
+		sort -u | $(PERL) pack-desc.pl -p hebrew.wgz.prefixes.tmp \
+		-d hebrew.wgz.desc.tmp -s hebrew.wgz.stems.tmp \
+		-l hebrew.wgz.lingsizes.tmp | \
+		../wzip | gzip -9 > hebrew.wgz
+
+	-rm -f hebrew.wgz.prefixes
+	gzip -9 < hebrew.wgz.prefixes.tmp >hebrew.wgz.prefixes
+	-rm -f hebrew.wgz.prefixes.tmp
+	-rm -f hebrew.wgz.desc
+	gzip -9 < hebrew.wgz.desc.tmp >hebrew.wgz.desc
+	-rm -f hebrew.wgz.desc.tmp
+	-rm -f hebrew.wgz.stems
+	gzip -9 < hebrew.wgz.stems.tmp >hebrew.wgz.stems
+	-rm -f hebrew.wgz.stems.tmp
+#
+#hebrew.wgz.sizes: hebrew.wgz find_sizes hebrew.wgz.lingsizes.tmp
+	gzip -dc hebrew.wgz | ./find_sizes >hebrew.wgz.sizes
+	cat hebrew.wgz.lingsizes.tmp >> hebrew.wgz.sizes
+############################################################################
+
 
 # dout.* are the outputs from the various word-list generators with the
 # -d option (i.e., with an explanation on how each word was derived)
@@ -59,5 +118,5 @@ dout.nouns.shemp.gz:  shemp.dat ../wolig.pl
 #SEDCMD=s/\+//
 SEDCMD=/\+/d
 
-dout.verbs.gz: ../woo ../woo.dat
+dout.verbs.gz shemp.dat: ../woo ../woo.dat
 	$(PERL) -w ../woo -d ../woo.dat | sed "$(SEDCMD)" | gzip -4 > $@