Skip to content

Commit

Permalink
hspell-0.7
Browse files Browse the repository at this point in the history
  • Loading branch information
Dan Kenigsberg committed Jun 11, 2010
1 parent e05fa29 commit ca0ff28
Show file tree
Hide file tree
Showing 35 changed files with 3,306 additions and 661 deletions.
41 changes: 34 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ BIN = $(PREFIX)/bin
SHARE = $(PREFIX)/share/hspell
LIBEXEC = $(PREFIX)/lib/hspell
MAN1 = $(PREFIX)/man/man1
MAN3 = $(PREFIX)/man/man3
LIBDIR = $(PREFIX)/lib
INCLUDEDIR = $(PREFIX)/include

all: cfrontend

Expand Down Expand Up @@ -118,12 +121,22 @@ clean:
c/corlist.o c/dict_radix.o c/find_sizes.o c/gimatria.o \
c/hspell.o c/tclHash.o c/hebrew.wgz c/hebrew.wgz.sizes \
c/hebrew.wgz.prefixes c/dout.nouns.shemp.gz c/shemp.dat \
c/dout.nouns.wolig.gz c/dout.verbs.gz c/hspell c/find_sizes
c/dout.nouns.wolig.gz c/dout.verbs.gz c/hspell c/find_sizes \
c/prefixes.c c/libhspell.o c/libhspell.a \
c/hebrew.wgz.desc c/hebrew.wgz.stems

################################################
# for the C front-end
cfrontend:
(cd c; $(MAKE) EXTRACFLAGS='-DDICTIONARY_BASE=\"$(DESTDIR)/$(SHARE)/hebrew.wgz\"')
(cd c; $(MAKE) EXTRACFLAGS='-DDICTIONARY_BASE=\"$(SHARE)/hebrew.wgz\"')


# To include a full morphological analyzer in "hspell -l", run "make linginfo"
# instead of just "make". But watch out - this slows down the build, and the
# installed data files will be 4 times as large. But don't worry - this feature
# has no speed impact on hspell unless the -l option is actually used.
# "linginfo" names a command, not a file: declare it phony so that a stray
# file called "linginfo" in this directory can never make it look up to date.
.PHONY: linginfo

# Runs the sub-make in c/ for its "dolinginfo" target, passing -DUSE_LINGINFO
# in EXTRACFLAGS and linginfo.o in EXTRAOBJECTS.
# "cd c && ..." is used instead of "(cd c; ...)" so that a failed cd stops
# the recipe instead of running $(MAKE) in the wrong directory.
linginfo:
	cd c && $(MAKE) EXTRACFLAGS='-DDICTIONARY_BASE=\"$(SHARE)/hebrew.wgz\" -DUSE_LINGINFO' EXTRAOBJECTS='linginfo.o' dolinginfo

install: install_cfrontend
CHSPELL=hspell
Expand All @@ -136,29 +149,43 @@ install_cfrontend: cfrontend
chmod 755 $(DESTDIR)/$(BIN)/multispell
test -d $(DESTDIR)/$(SHARE) || mkdir -m 755 -p $(DESTDIR)/$(SHARE)
cp c/hebrew.wgz c/hebrew.wgz.prefixes c/hebrew.wgz.sizes $(DESTDIR)/$(SHARE)/
(cd $(DESTDIR)/$(SHARE); chmod 644 hebrew.wgz hebrew.wgz.prefixes hebrew.wgz.sizes)
gzip -9 < spellinghints > $(DESTDIR)/$(SHARE)/hebrew.wgz.hints
(cd $(DESTDIR)/$(SHARE); chmod 644 hebrew.wgz hebrew.wgz.prefixes hebrew.wgz.sizes hebrew.wgz.hints)
test ! -f c/hebrew.wgz.stems || cp c/hebrew.wgz.stems c/hebrew.wgz.desc $(DESTDIR)/$(SHARE)/
(cd $(DESTDIR)/$(SHARE); test ! -f hebrew.wgz.stems || chmod 644 hebrew.wgz.stems hebrew.wgz.desc)
-rm -f $(DESTDIR)/$(BIN)/hspell-i
-ln -s $(CHSPELL) $(DESTDIR)/$(BIN)/hspell-i
test -d $(DESTDIR)/$(MAN1) || mkdir -m 755 -p $(DESTDIR)/$(MAN1)
cp hspell.1 $(DESTDIR)/$(MAN1)/
chmod 644 $(DESTDIR)/$(MAN1)/hspell.1
test -d $(DESTDIR)/$(MAN3) || mkdir -m 755 -p $(DESTDIR)/$(MAN3)
cp c/hspell.3 $(DESTDIR)/$(MAN3)/
chmod 644 $(DESTDIR)/$(MAN3)/hspell.3
test -d $(DESTDIR)/$(LIBDIR) || mkdir -m 755 -p $(DESTDIR)/$(LIBDIR)
cp c/libhspell.a $(DESTDIR)/$(LIBDIR)/
chmod 644 $(DESTDIR)/$(LIBDIR)/libhspell.a
test -d $(DESTDIR)/$(INCLUDEDIR) || mkdir -m 755 -p $(DESTDIR)/$(INCLUDEDIR)
cp c/hspell.h c/linginfo.h $(DESTDIR)/$(INCLUDEDIR)/
chmod 644 $(DESTDIR)/$(INCLUDEDIR)/hspell.h $(DESTDIR)/$(INCLUDEDIR)/linginfo.h

################################################
# for creating an hspell distribution tar
PACKAGE = hspell
VERSION = 0.6
VERSION = 0.7
DISTFILES = COPYING INSTALL LICENSE README WHATSNEW TODO \
Makefile stats wunzip.c wzip \
hspell.pl hspell.1 \
wolig.pl wolig.dat biza-nouns milot extrawords \
woo woo.dat biza-verbs \
likelyerrors spellinghints \
hspell.spec \
c/Makefile c/README c/corlist.c c/corlist.h c/dict_radix.c \
c/dict_radix.h c/find_sizes.c c/gimatria.c c/gimatria.h c/hspell.c \
c/Makefile c/README c/corlist.c c/dict_radix.c \
c/dict_radix.h c/find_sizes.c c/gimatria.c c/hspell.c \
c/hspell.h c/libhspell.c \
c/pmerge c/PrefixBits.pl c/genprefixes.pl \
c/hash.h c/tclHash.c c/tclHash.h \
multispell
c/binarize-desc.pl c/pack-desc.pl c/linginfo.c c/linginfo.h \
multispell c/hspell.3

DISTDIR = $(PACKAGE)-$(VERSION)

Expand Down
14 changes: 9 additions & 5 deletions README
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
This is version 0.6 of Hspell, the free Hebrew spellchecker and morphology
This is version 0.7 of Hspell, the free Hebrew spellchecker and morphology
engine.

It is a fully working Hebrew spellchecker, not a toy release. On typical
Expand Down Expand Up @@ -175,10 +175,14 @@ easy to do and will greatly enhance the Hspell project's usefulness.
including, for example, Hebrew search engines, Hebrew dictionaries (with
definitions and/or translation to another language), and machine translation
software.
The "-v" option (in releases 0.5 and earlier) to hspell is a small
demonstration of what hspell's morphology engine is capable of. Even the
spellchecker itself could benefit from more use of those techniques: e.g.,
it could explain how the corrections it suggests were derived.
The "-v" option (in releases 0.5 and earlier) to hspell was a small
demonstration of what hspell's morphology engine is capable of. The
"-l" option in releases 0.7 and later (when compiled with this support
on) is already a full-featured morphological analyzer, so this task is
basically done.
Even the spellchecker itself could benefit from more use of those
techniques: e.g., it could explain how the corrections it suggests were
derived.


Hspell's license
Expand Down
5 changes: 5 additions & 0 deletions TODO
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
- disallow empty prefixes for infinitives in the enumeration. probable reason is
a bug in dmask2ps.
- put desc_sizes.h into runtime-read file.
- give better names to functions.

Really important stuff to be done:
* Extend the list of known verbs, nouns, proper nouns, relation words, etc.
In this one, the general public could really help.
Expand Down
36 changes: 36 additions & 0 deletions WHATSNEW
Original file line number Diff line number Diff line change
@@ -1,3 +1,38 @@
Release 0.7 (December 22, 2003):

* Some incorrect words purged or fixed, and many more words added: over
1,000 base words added.

* Portability improvements (thanks to Baruch Even).

* New run-time option -H causes hspell to accept He Ha-she'ela in the text.

* The "-n" option (for spelling hints, an explanation of certain kinds of
errors) that was lost in version 0.6, is now back. The "likelyerrors"
feature is still missing in this version.

* A new, and somewhat tentative, documented C API (libhspell.a library,
hspell.h header, hspell.3 manual). These are not available in the normal
RPM - compile the sources yourself if you want them.

* A full morphological analyzer (explaining all the possible ways to read
a given word, how each reading was derived and its syntactic properties)
is now available when Hspell is run with the "-l" option.

Because the data files needed for this feature currently take up as much
as 4 times the space needed only for spell-checking, and because the
collection and sorting of linguistic information makes the compilation
take much longer, this feature is not compiled by default, unless
compilation is done with "make linginfo". Also, the normal RPM does not
include this feature, but the "fat" RPM does include it.
An innovative parade of hash symbols now appears on screen to the benefit
of the bored builder :)

* Vocabulary: 406,629 words (when including kinuyim on verbs)
based on 8042 nouns, 1783 adjectives, 5113 verb stems, and
1880 other words

-----------------------------------------------------------------------------
Release 0.6 (August 5, 2003):

In this release, the Hspell front-end (the hspell program) was rewritten.
Expand All @@ -23,6 +58,7 @@ Release 0.6 (August 5, 2003):
accepted with prefixes. Also, these words are never used as suggested
corrections.


* "hspell -a" was made more compatible with the standard "ispell -a" (thanks
to Mooffie). Hspell is now known to be used with LyX, KDE, Geresh and Emacs.

Expand Down
2 changes: 2 additions & 0 deletions biza-verbs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@
# ����� ������ ��������
���
���
���
���
-----
# ���� ������� �� ������
����
Expand Down
69 changes: 64 additions & 5 deletions c/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,20 @@ PERL=perl
#all: test.wgz.sizes
all: hebrew.wgz.sizes hspell

hspell: hspell.o dict_radix.o gimatria.o corlist.o tclHash.o
cc -o hspell hspell.o dict_radix.o gimatria.o corlist.o tclHash.o
# Static library holding the spell-checking objects, plus whatever
# $(EXTRAOBJECTS) names for this build.
# The old archive is removed first because "ar cr" only adds or replaces
# members: objects left over from an earlier build with a different
# EXTRAOBJECTS setting would otherwise stay in the library.
# $(AR) and $(RM) default to "ar" and "rm -f", so the default build is
# unchanged, but the archiver can now be overridden (e.g. for cross builds).
libhspell.a: dict_radix.o gimatria.o corlist.o libhspell.o $(EXTRAOBJECTS)
	$(RM) $@
	$(AR) cr $@ $^
	# ranlib's exit status is ignored on purpose (presumably for systems
	# where ranlib is missing or unnecessary - confirm before tightening).
	-ranlib $@

# Link the hspell command-line program from its own objects and the
# libhspell.a static library.
# $(CC) defaults to "cc", so the default build is unchanged, but a compiler
# chosen with "make CC=..." (and any LDFLAGS) is now honoured at link time.
hspell: hspell.o tclHash.o libhspell.a
	$(CC) $(LDFLAGS) -o $@ $^

# TODO: update this dependency list:
hspell.o dict_radix.o: dict_radix.h
hspell.o gimatria.o: gimatria.h
hspell.o corlist.o: corlist.h
hspell.o: prefixes.c hash.h tclHash.h
tclHash.o: tclHash.c tclHash.h
corlist.o gimatria.o hspell.o libhspell.o: hspell.h
linginfo.o: dmask.c

prefixes.c: genprefixes.pl
$(PERL) -w ./genprefixes.pl >prefixes.c
Expand All @@ -37,6 +43,59 @@ hebrew.wgz hebrew.wgz.prefixes: pmerge $(DICTS) $(GZDICTS)
hebrew.wgz.sizes: hebrew.wgz find_sizes
gzip -dc hebrew.wgz | ./find_sizes >hebrew.wgz.sizes

###################################### optional linginfo stuff ##############
# See comment on linginfo in ../Makefile.
EXTRAOBJECTS=

dolinginfo: linginfo_data hspell

# hebrew.wgz contains all the words without any prefix hints like B,L,+ we
# previously had.
# hebrew.wgz.prefixes is the prefix hints (one byte per word, compressed).
# hebrew.wgz.sizes contains the memory sizes that reading hebrew.wgz will
# require (this makes it easier for hspell to preallocate the needed sizes).
# dmask.c contains an array of all possible values of the description bitmask.
# It is generated by pack-desc.pl. This array is not too long (no more than
# 300 for the default dictionary).
#
# In the following long rule, the complete list of all words with linguistic
# details is concatenated and sent to binarize-desc.pl, which converts the
# detail information of each word into a bitmap (called dmask), produces a
# specifier that tells which prefixes are accepted with the word, and writes its
# stem. Then the word list is sorted, packed (a-la uniq), and the output files
# are written.
#
# NOTE/TODO:
# The "linginfo_data:" target line below is ugly and un-make-like. Not only
# that, it doesn't know when it's necessary to build the files again, and
# when it is not. The better make targets (hebrew.wgz et al. and
# hebrew.wgz.sizes) are commented out because the same targets were used above
# for building the version without linginfo. When building with linginfo
# becomes the default, we should remove the following line and uncomment the
# real targets.
linginfo_data: binarize-desc.pl pack-desc.pl $(DICTS) $(GZDICTS) find_sizes
#hebrew.wgz hebrew.wgz.prefixes hebrew.wgz.desc hebrew.wgz.stems hebrew.wgz.lingsizes.tmp dmask.c: binarize-desc.pl pack-desc.pl $(DICTS) $(GZDICTS)
(gzip -dc $(GZDICTS); cat $(DICTS)) | $(PERL) binarize-desc.pl | \
sort -u | $(PERL) pack-desc.pl -p hebrew.wgz.prefixes.tmp \
-d hebrew.wgz.desc.tmp -s hebrew.wgz.stems.tmp \
-l hebrew.wgz.lingsizes.tmp | \
../wzip | gzip -9 > hebrew.wgz

-rm -f hebrew.wgz.prefixes
gzip -9 < hebrew.wgz.prefixes.tmp >hebrew.wgz.prefixes
-rm -f hebrew.wgz.prefixes.tmp
-rm -f hebrew.wgz.desc
gzip -9 < hebrew.wgz.desc.tmp >hebrew.wgz.desc
-rm -f hebrew.wgz.desc.tmp
-rm -f hebrew.wgz.stems
gzip -9 < hebrew.wgz.stems.tmp >hebrew.wgz.stems
-rm -f hebrew.wgz.stems.tmp
#
#hebrew.wgz.sizes: hebrew.wgz find_sizes hebrew.wgz.lingsizes.tmp
gzip -dc hebrew.wgz | ./find_sizes >hebrew.wgz.sizes
cat hebrew.wgz.lingsizes.tmp >> hebrew.wgz.sizes
############################################################################


# dout.* are the outputs from the various word-list generators with the
# -d option (i.e., with an explanation on how each word was derived)
Expand All @@ -59,5 +118,5 @@ dout.nouns.shemp.gz: shemp.dat ../wolig.pl
#SEDCMD=s/\+//
SEDCMD=/\+/d

dout.verbs.gz: ../woo ../woo.dat
dout.verbs.gz shemp.dat: ../woo ../woo.dat
$(PERL) -w ../woo -d ../woo.dat | sed "$(SEDCMD)" | gzip -4 > $@
Loading

0 comments on commit ca0ff28

Please sign in to comment.