Permalink
Fetching contributors…
Cannot retrieve contributors at this time
244 lines (219 sloc) 8.26 KB
/* Copyright 2004-2012 Nadav Har'El and Dan Kenigsberg */
/* This little program creates hunspell or aspell dictionaries for Hebrew
* from the hebrew.wgz* files.
* We create a single rule for each of hspell's "word specifiers". Each rule
* expands to all the prefixes that provide that specifier. The null prefix
* is implied; for each word where it is not appropriate, NEEDAFFIX is
* specified.
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "prefixes.c"
#include "hspell.h"
#define PREFIXFILE_COMMAND "gzip -dc hebrew.wgz.prefixes | ./specfilter"
/* Convert a number in the range 0..51 (currently) to a readable character
that can be used as the rule (prefix set) name. To facilitate merging our
word list with an English one (for spell-checking mixed text in software
that does not support multiple word lists), we do not use the upper-case
Latin characters. Currently we use the lower-case letters, in addition to
Hebrew characters (aspell and myspell have no problems with non-ascii
characters) - but almost any symbols can be used too.
A note for future expansion: Aspell has problems with a backslash, while
Myspell works with them - so we will have to skip the backslash character
if we use symbols. But with the digits and other symbols, there's plenty
of room for future expansion.
*/
static inline char num_to_char(int i)
{
  /* 0..25 map to the lower-case Latin letters 'a'..'z'. */
  if (i >= 0 && i < 26)
    return (char)('a' + i);

  /* 26..51 map to consecutive bytes starting at 0xE0. The byte is written
   * as an escape rather than as a raw non-ASCII character literal so that
   * the value does not depend on the encoding this source file is stored
   * in (a raw literal becomes a multi-character constant under UTF-8). */
  if (i >= 26 && i < 52)
    return (char)('\xE0' + (i - 26));

  /* Anything else is a caller bug: report it and abort the program. */
  if (i < 0)
    fprintf(stderr,"internal error: num_to_char(%d)\n",i);
  else
    fprintf(stderr,"internal error: num_to_char(%d) ran out of symbols\n",i);
  exit(1);
}
/* Usage: mk_he_affix <hunspell> <affixfile> <dictfile>
* Where <hunspell> is: 0 for aspell, 1 for hunspell. Hunspell and aspell have
* some different affix file features, and also different encoding requirements
* (aspell requires ISO-8859-8, while hunspell is, for an unknown reason,
* 10 times faster if we give it UTF-8).
*/
/* Exit with an error if snprintf() failed or had to truncate the shell
 * command it was building. n is the value snprintf() returned and size is
 * the capacity of the destination buffer. */
static void check_command_length(int n, size_t size)
{
  if (n < 0 || (size_t)n >= size) {
    fprintf(stderr, "mk_he_affix: shell command too long (file name too long?)\n");
    exit(1);
  }
}

/* Exit with an error (reported through perror) if a stream failed to open. */
static void check_stream(const FILE *fp, const char *what)
{
  if (fp == NULL) {
    perror(what);
    exit(1);
  }
}

/* Entry point. argv[1] selects the output flavour (0: aspell, non-zero:
 * hunspell), argv[2] is the affix file to write and argv[3] the word list
 * to write.
 *
 * The work is done in two passes over the output of PREFIXFILE_COMMAND,
 * which yields one specifier byte per dictionary word:
 *   1. every distinct specifier gets its own PFX rule in the affix file;
 *   2. every word is written to the word list, tagged with the rule that
 *      belongs to its specifier.
 * Finally the prefixes themselves are appended as stand-alone words.
 *
 * Returns 0 on success and 1 if an output stream reported an error when it
 * was closed; all other failures terminate the program through exit(1).
 *
 * NOTE: argv[2] and argv[3] are pasted unquoted into shell command lines in
 * hunspell mode, so they must not contain shell metacharacters.
 * NOTE: since the hunspell output is piped through "iconv -f iso-8859-8",
 * the Hebrew string literals below must be stored as ISO-8859-8 bytes.
 */
int main(int argc, char *argv[])
{
  int i, specifier;
  /* Holds raw bytes returned by fgetc(), hence unsigned: with a plain
   * (possibly signed) char, values >= 0x80 would never compare equal to
   * the int that fgetc() returns. num_to_char() aborts long before this
   * array could fill up. */
  unsigned char seen_specifiers[100];
  char rulechar;
  int already_seen=0, seen, count;
  /* First used as a boolean ("some specifier forbids the empty prefix"),
   * later replaced by the actual NEEDAFFIX flag character. */
  char needaffix=0;
  FILE *prefixfp, *wordsfp;
  FILE *afffp, *dicfp;
  int prefixes_size;
  char *prefix_is_word;
  int hunspell;
  int failed = 0;

  if(argc!=4){
    fprintf(stderr,"%d\n",argc);
    fprintf(stderr,"Usage: %s <hunspell> <affixfile> <dictfile>\n", argv[0]);
    exit(1);
  }
  hunspell=atoi(argv[1]);
  if(hunspell){
    char s[256];
    /* Unfortunately, the dictionary file should start with an approximate
     * count of the number of words. Note that this count is only approximate
     * as we also add a list of stand-alone prefixes at the end.
     */
    check_command_length(
        snprintf(s, sizeof(s), "gzip -dc hebrew.wgz | ./wunzip | wc -l > %s", argv[3]),
        sizeof(s));
    if (system(s) != 0) {
      fprintf(stderr, "%s: command failed: %s\n", argv[0], s);
      exit(1);
    }
    /* Both output files are written through iconv so that they end up in
     * UTF-8; the word list is appended after the count written above. */
    check_command_length(
        snprintf(s, sizeof(s), "iconv -f iso-8859-8 -t utf-8 >%s", argv[2]),
        sizeof(s));
    afffp = popen(s, "w");
    check_stream(afffp, argv[2]);
    check_command_length(
        snprintf(s, sizeof(s), "iconv -f iso-8859-8 -t utf-8 >>%s", argv[3]),
        sizeof(s));
    dicfp = popen(s, "w");
    check_stream(dicfp, argv[3]);
  } else {
    afffp = fopen(argv[2], "w");
    check_stream(afffp, argv[2]);
    dicfp = fopen(argv[3], "w");
    check_stream(dicfp, argv[3]);
  }

  /* Affix file header. */
  fprintf(afffp, "# This file was generated automatically from data prepared\n"
                 "# by the Hspell project (http://hspell.ivrix.org.il/).\n"
                 "# Hspell version %d.%d%s was used.\n"
                 "# The conversion was carried out in %s\n",
          HSPELL_VERSION_MAJOR,HSPELL_VERSION_MINOR,HSPELL_VERSION_EXTRA,
          __DATE__);
  fprintf(afffp, "# Copyright 2004-2012, Nadav Har'El and Dan Kenigsberg\n");
  fprintf(afffp, "# The dictionary (this file and the corresponding word list)\n"
                 "# is licensed under the GNU Affero General Public License\n"
                 "# (AGPL) version 3.\n");
  if(hunspell){
    fprintf(afffp,
            "SET UTF-8\n"
            "TRY éåäàòçë÷'\"ùñæãâáøðîèöúôíóêõïì\n"
            "WORDCHARS àáâãäåæçèéëìîðñòôö÷øùúíïêóõ'\"\n"
            "BREAK 3\n"
            "BREAK ^\"\n"
            "BREAK \"$\n"
            "BREAK ^'\n"
            "MAP 10\n"
            "MAP êëç\n"
            "MAP íî\n"
            "MAP ïð\n"
            "MAP óô\n"
            "MAP õö\n"
            "MAP ë÷\n"
            "MAP àò # for English\n"
            "MAP âä # for Russian\n"
            "MAP öñ # for Arabic\n"
            "MAP çëø # for French\n"
            );
  }

  /* Pass 1: emit one PFX rule per distinct specifier. */
  prefixfp = popen(PREFIXFILE_COMMAND, "r");
  check_stream(prefixfp, PREFIXFILE_COMMAND);
  while ((specifier=fgetc(prefixfp))!= EOF) {
    for(i=0, seen=0; (i<already_seen) && !seen; i++) {
      if (seen_specifiers[i] == specifier) seen = 1; }
    if (seen) continue;
    seen_specifiers[already_seen++] = (unsigned char)specifier;

    /* count the number of matching prefixes; the numbers added here must
     * equal the number of PFX lines printed per prefix further down */
    for (i=1, count=0; prefixes_noH[i]!=0; i++) {
      if (masks_noH[i] & specifier) {
        if (!strcmp("å",prefixes_noH[i])) count += 2;
        else count += 4;
      }
    }

    rulechar = num_to_char(already_seen-1);
    fprintf(afffp, "PFX %c N %d\n",rulechar,count);

    /* print one rule for each legal prefix that goes with this word type,
     * and remember to double initial waw if a prefix is prepended.
     *
     * The empty prefix, prefixes_noH[0], needs special treatment. While
     * other allowed prefixes need to be explicitly added to the rules (as we
     * do below), the empty prefix is by default allowed, and if it is not
     * desired we need to explicitly disallow it with a special flag on
     * every word for which we don't want to allow the empty prefix, with a
     * special NEEDAFFIX flag.
     * Unfortunately, NEEDAFFIX is only supported by hunspell; Aspell ignores
     * it, and therefore mistakenly accepts the maqor natuy without a prefix,
     * e.g., éùåï, as in ìéùåï but without the prefix.
     */
    if (!(masks_noH[0] & specifier)){
      /* A per-rule NEEDAFFIX isn't supported because only one NEEDAFFIX is
       * allowed. So we have a single NEEDAFFIX flag, and specify it on
       * the individual words that need it.
       */
      needaffix=1;
    }
    for (i=1; prefixes_noH[i]!=0; i++) {
      if (masks_noH[i] & specifier) {
        if (!strcmp("å",prefixes_noH[i])) {
          fprintf(afffp, "PFX %c 0 %s .\n",rulechar,prefixes_noH[i]);
          fprintf(afffp, "PFX %c 0 %s\" .\n",rulechar,prefixes_noH[i]);
        } else {
          fprintf(afffp, "PFX %c 0 %s [^å]\n",rulechar,prefixes_noH[i]);
          fprintf(afffp, "PFX %c 0 %s åå\n",rulechar,prefixes_noH[i]);
          fprintf(afffp, "PFX %c 0 %s\" .\n",rulechar,prefixes_noH[i]);
          fprintf(afffp, "PFX %c 0 %så å[^å]\n",rulechar,prefixes_noH[i]);
        }
      }
    }
    fprintf(afffp, "\n");
  }
  if (hunspell && needaffix) {
    /* the flag character following the ones used for the PFX rules */
    needaffix = num_to_char(already_seen);
    fprintf(afffp, "NEEDAFFIX %c\n",needaffix);
  }
  pclose(prefixfp);
  if ((hunspell ? pclose(afffp) : fclose(afffp)) != 0) {
    fprintf(stderr, "%s: error while finishing %s\n", argv[0], argv[2]);
    failed = 1;
  }

  /* Size of the prefix table: the index of its terminating null entry.
   * Computed here, independently of the loop above, so that it is valid
   * even when no specifier at all was read. */
  for (i=1; prefixes_noH[i]!=0; i++)
    ;
  prefixes_size = i;
  prefix_is_word = calloc((size_t)prefixes_size, sizeof *prefix_is_word);
  if (prefix_is_word == NULL) {
    perror("calloc");
    exit(1);
  }

  /* Pass 2: translate hebrew.wgz+hebrew.wgz.prefix into an aspell-style
   * word list. */
  prefixfp = popen(PREFIXFILE_COMMAND, "r");
  check_stream(prefixfp, PREFIXFILE_COMMAND);
  wordsfp = popen("gzip -dc hebrew.wgz|./wunzip", "r");
  check_stream(wordsfp, "gzip -dc hebrew.wgz|./wunzip");
  while ((specifier=fgetc(prefixfp))!= EOF) {
    char word[100];
    int j;

    /* find the specifier place (which infers which aspell rule apply to its
     * word) */
    for(i=0; (i<already_seen) && (seen_specifiers[i]!=specifier) ; i++);
    if (i == already_seen) {
      /* would otherwise silently emit a flag that names no rule */
      fprintf(stderr, "%s: specifier %d was not seen in the first pass\n",
              argv[0], specifier);
      exit(1);
    }

    if (fgets(word, sizeof(word)-3, wordsfp) == NULL) {
      fprintf(stderr, "%s: word list ended before the specifier list\n",
              argv[0]);
      exit(1);
    }
    /* Remove the trailing newline kept by fgets(). This must happen before
     * the comparison with the prefixes below, which would otherwise never
     * match. */
    word[strcspn(word, "\n")] = '\0';

    /* write down whether this word is also a legal prefix (and therefore should
       not be written again later) */
    for (j=1; prefixes_noH[j]!=0; j++) {
      if (!strcmp(word,prefixes_noH[j])) {
        if (masks_noH[0] & specifier) /* this word is allowed on its own */
          prefix_is_word[j] = 1;
        break;
      }
    }

    fprintf(dicfp,"%s",word);
    putc('/', dicfp);
    putc(num_to_char(i), dicfp);
    if (hunspell && !(masks_noH[0] & specifier))
      /* because we can't specify NEEDAFFIX for several prefixes, unfortunately
       * we need to use one ("needaffix") and put it on individual words */
      putc(needaffix, dicfp);
    putc('\n', dicfp);
  }
  pclose(prefixfp);
  pclose(wordsfp);

  /* accept "dangling" prefixes, that many times precede numbers and latin */
  /* but make sure not to repeat words that already appear in the dictionary.
   * This may cause unwanted warning. */
  for (i=1; prefixes_noH[i]!=0; i++) {
    if (!prefix_is_word[i])
      fprintf(dicfp, "%s\n", prefixes_noH[i]);
  }
  free(prefix_is_word);

  if ((hunspell ? pclose(dicfp) : fclose(dicfp)) != 0) {
    fprintf(stderr, "%s: error while finishing %s\n", argv[0], argv[3]);
    failed = 1;
  }
  return failed;
}