Permalink
Fetching contributors…
Cannot retrieve contributors at this time
331 lines (302 sloc) 8.14 KB
/* Copyright (C) 2003-2004 Nadav Har'El and Dan Kenigsberg */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "linginfo.h"
#include "dmask.c"
/*
 * When zlib support is compiled in (HAVE_ZLIB), the stdio-style names used
 * by the reading code in this file (FILE, popen, pclose, getc, fgetc, fgets)
 * are redirected to gzip-aware replacements, so the same code serves both
 * builds:
 *   - with BUFFERED_ZLIB (always defined here) to the gzb_* functions
 *     declared in gzbuffered.h;
 *   - otherwise directly to zlib's gzopen/gzclose/gzgetc/gzgets.
 * Without HAVE_ZLIB nothing is redefined and the plain stdio functions
 * are used.
 * For more background see the comment in dict_radix.c, before a similar set
 * of definitions.
 */
#ifdef HAVE_ZLIB
#define BUFFERED_ZLIB
/* Drop any macro versions the C library may provide, so that the
 * redefinitions below do not clash with them. */
#undef FILE
#undef popen
#undef pclose
#undef getc
#undef fgets
#ifdef BUFFERED_ZLIB
#include "gzbuffered.h"
#undef gzopen
#undef gzdopen
#define FILE void /* void* can be either normal FILE* or gzbFile*. Eek. */
#define popen(path,mode) gzb_open(path,mode)
#define gzopen(path,mode) gzb_open(path,mode)
#define gzdopen(path,mode) gzb_dopen(path,mode)
#define pclose(f) (gzb_close((gzbFile *)(f)))
#define getc(f) (gzb_getc(((gzbFile *)(f))))
#define fgets(s,n,f) (gzb_gets((s),(n),((gzbFile *)(f))))
#else
#include <zlib.h>
#define FILE void /* FILE* is void*, a.k.a. voidp or gzFile */
#define pclose(f) (gzclose((f)))
#define popen(path,mode) (gzopen((path),(mode)))
#define getc(f) (gzgetc((f)))
#define fgets(s,n,f) (gzgets((s),(n),(f)))
#endif
/* fgetc is routed through whichever getc was selected above */
#undef fgetc
#define fgetc(f) getc(f)
#endif /* HAVE_ZLIB */
/* Table storage set up by linginfo_init(): `flat` is a single malloc'ed
 * buffer holding the NUL-terminated strings of every entry (word, then
 * description codes, then stem codes), and `lookup` is an array of pointers
 * to the start of each entry inside that buffer. */
static char *flat, **lookup;
/* Number of entries in `lookup`; set by linginfo_init() and used by
 * linginfo_lookup() as the upper bound of its search. */
static int lookuplen;
/* Debug flag defined in another translation unit; when non-zero the
 * functions in this file print trace messages to stderr. */
extern int hspell_debug;
/*
 * Translate a two-character description code into its bitmask.
 *
 * dcode  points at two characters, each an offset from 'A'; the first is
 *        the low base-26 digit and the second the high one.
 *
 * Returns the matching entry of the dmasks table (declared outside this
 * block). No range check is done on the resulting index.
 */
static int dcode2dmask(const char *dcode) {
	const int low_digit = dcode[0] - 'A';
	const int high_digit = dcode[1] - 'A';

	return dmasks[low_digit + high_digit * 26];
}
/*
 * Build a short, comma-separated textual description of the bits set in
 * dmask and store it in s.
 *
 * s      output buffer. It is reset to "" and then extended with strcat(),
 *        without any bounds checking, so the caller has to supply a buffer
 *        large enough for the longest combination of labels.
 * dmask  description bitmask built from the D_* constants.
 *
 * Returns s.
 *
 * The label literals are non-ASCII and must be kept byte-for-byte.
 * NOTE(review): they look like Hebrew text in a legacy 8-bit encoding --
 * confirm the intended encoding of this file.
 */
static char *dmask2text(char *s, int dmask) {
	const char *label;
	const int gender = dmask & D_GENDERMASK;

	s[0] = 0;

	/* word type */
	switch (dmask & D_TYPEMASK) {
	case D_NOUN: label = "ò"; break;
	case D_VERB: label = "ô"; break;
	case D_ADJ:  label = "ú"; break;
	case 0:      label = "x"; break;
	default:     label = "";  break;
	}
	strcat(s, label);

	/* In a few cases, both masculine and feminine are possible, so the two
	 * bits are tested independently.
	 * NOTE(review): both literals are empty in this copy of the file, so
	 * these calls currently append nothing -- check against upstream whether
	 * label text was lost here. */
	if (gender & D_MASCULINE) {
		strcat(s, "");
	}
	if (gender & D_FEMININE) {
		strcat(s, "");
	}

	/* person */
	switch (dmask & D_GUFMASK) {
	case D_FIRST:  label = ",1"; break;
	case D_SECOND: label = ",2"; break;
	case D_THIRD:  label = ",3"; break;
	default:       label = "";   break;
	}
	strcat(s, label);

	/* number */
	switch (dmask & D_NUMMASK) {
	case D_SINGULAR: label = ",éçéã"; break;
	case D_DOUBLE:   label = ",æåâé"; break;
	case D_PLURAL:   label = ",øáéí"; break;
	default:         label = "";      break;
	}
	strcat(s, label);

	/* tense */
	switch (dmask & D_TENSEMASK) {
	case D_PAST:        label = ",òáø";    break;
	case D_PRESENT:     label = ",äååä";   break;
	case D_FUTURE:      label = ",òúéã";   break;
	case D_IMPERATIVE:  label = ",öéååé";  break;
	case D_INFINITIVE:  label = ",î÷åø";   break;
	case D_BINFINITIVE: label = ",î÷åø,á"; break;
	default:            label = "";        break;
	}
	strcat(s, label);

	/* single-bit flags */
	if (dmask & D_SPECNOUN) {
		strcat(s, ",ôøèé");
	}
	if (dmask & D_OSMICHUT) {
		strcat(s, ",ñîéëåú");
	}

	/* the remaining labels are only emitted when some D_OMASK bit is set */
	if (!(dmask & D_OMASK)) {
		return s;
	}
	strcat(s, ",ëéðåé/");

	switch (dmask & D_OGENDERMASK) {
	case D_OMASCULINE: label = "æ"; break;
	case D_OFEMININE:  label = "ð"; break;
	default:           label = "";  break;
	}
	strcat(s, label);

	switch (dmask & D_OGUFMASK) {
	case D_OFIRST:  label = ",1"; break;
	case D_OSECOND: label = ",2"; break;
	case D_OTHIRD:  label = ",3"; break;
	default:        label = "";   break;
	}
	strcat(s, label);

	switch (dmask & D_ONUMMASK) {
	case D_OSINGULAR: label = ",éçéã"; break;
	case D_ODOUBLE:   label = ",æåâé"; break;
	case D_OPLURAL:   label = ",øáéí"; break;
	default:          label = "";      break;
	}
	strcat(s, label);

	return s;
}
/*
 * Write the textual description of the i-th analysis of a word into text.
 *
 * text  output buffer, filled by dmask2text() (no bounds checking there).
 * desc  string of two-character description codes, one pair per analysis.
 * i     zero-based index of the wanted pair.
 *
 * Returns text, or a null pointer when position i*2 of desc holds the
 * string terminator (no i-th analysis).
 */
char *linginfo_desc2text(char *text, const char *desc, int i) {
	const char *code = desc + i * 2;

	if (*code == 0) {
		return 0;
	}
	dmask2text(text, dcode2dmask(code));
	return text;
}
/*
 * Find the prefix specifier (one of the PS_* values) required by a word,
 * according to the details in its description bitmask.
 *
 * Verbs:            infinitive -> PS_L, "B" infinitive -> PS_B,
 *                   imperative -> PS_IMPER, any other non-present tense ->
 *                   PS_VERB; a present-tense verb gets PS_NONDEF when
 *                   D_OSMICHUT or a D_OMASK bit is set, else PS_ALL.
 * Nouns/adjectives: PS_NONDEF when D_OSMICHUT, a D_OMASK bit or D_SPECNOUN
 *                   is set, else PS_ALL.
 * Anything else:    PS_ALL.
 */
static int linginfo_dmask2ps(int dmask) {
	const int type = dmask & D_TYPEMASK;
	const int tense = dmask & D_TENSEMASK;
	const int bound = (dmask & D_OSMICHUT) || (dmask & D_OMASK);

	if (type == D_VERB) {
		/* The infinitive forms take precedence over every other verb rule.
		 * TODO I feel that this may lead to a bug with "lipol" and other
		 * infinitives that did not lose their initial lamed. I should
		 * correct this all the way from woo.pl */
		if (tense == D_INFINITIVE) {
			return PS_L;
		}
		if (tense == D_BINFINITIVE) {
			return PS_B;
		}
		if (tense == D_IMPERATIVE) {
			return PS_IMPER;
		}
		if (tense != D_PRESENT) {
			return PS_VERB;
		}
		return bound ? PS_NONDEF : PS_ALL;
	}

	if (type == D_NOUN || type == D_ADJ) {
		return (bound || (dmask & D_SPECNOUN)) ? PS_NONDEF : PS_ALL;
	}

	return PS_ALL;
}
/*
 * Return the prefix specifier (PS_* value) for the i-th analysis of a word.
 *
 * desc  string of two-character description codes, one pair per analysis.
 * i     zero-based index of the wanted pair.
 *
 * Returns 0 when position i*2 of desc holds the string terminator (no i-th
 * analysis); otherwise the value computed by linginfo_dmask2ps().
 */
int linginfo_desc2ps(const char *desc, int i) {
	const char *code = desc + i * 2;

	if (*code == 0) {
		return 0;
	}
	return linginfo_dmask2ps(dcode2dmask(code));
}
char *linginfo_stem2text(const char *stem, int i) {
int wp;
if (stem[i*3]==0) return 0;
wp = stem[i*3]-33+(stem[i*3+1]-33)*94+
(stem[i*3+2]-33)*94*94;
return lookup[wp];
}
/* currently linginfo_init reopens the words file, reinterprets it, and stores
* it flat in memory. If it sounds silly to you, you probably can hear. */
int linginfo_init(const char *dir) {
FILE *fp,*fpstems,*fpdesc;
char *current;
char s[1024],stem[100],desc[100];
int i=0,j;
int flatsize;
snprintf(s,sizeof(s),"%s.sizes",dir);
if(!(fp=fopen(s,"r"))){
fprintf(stderr,"Hspell: can't open %s.\n",s);
return 0;
}
fscanf(fp,"%*d %*d %*d"); /* ignore non linginfo sizes */
if(fscanf(fp,"%d %d",&flatsize,&lookuplen)!=2){
fprintf(stderr,"Hspell: can't read from %s.\n",s);
return 0;
}
fclose(fp);
current = flat = (char *)malloc(flatsize);
lookup = (char **)malloc(sizeof(char *)*lookuplen);
if (!current || !lookup) {
fprintf (stderr, "Hspell: alloc failed\n");
return 0;
}
/* read dictionary into memory */
/* TODO: have better quoting for filename, or use zlib directly */
#ifdef HAVE_ZLIB
snprintf(s,sizeof(s),"%s",dir);
#else
snprintf(s,sizeof(s),"gzip -dc '%s'",dir);
#endif
if(!(fp=popen(s,"r"))){
fprintf(stderr,"Hspell: can't open %s.\n",s);
return 0;
}
#ifdef HAVE_ZLIB
snprintf(s,sizeof(s),"%s.stems",dir);
#else
snprintf(s,sizeof(s),"gzip -dc '%s.stems'",dir);
#endif
if(!(fpstems=popen(s,"r"))){
fprintf(stderr,"Hspell: can't open %s.\n",s);
pclose(fp);
return 0;
}
#ifdef HAVE_ZLIB
snprintf(s,sizeof(s),"%s.desc",dir);
#else
snprintf(s,sizeof(s),"gzip -dc '%s.desc'",dir);
#endif
if(!(fpdesc=popen(s,"r"))){
fprintf(stderr,"Hspell: can't open %s.\n",s);
pclose(fp);
pclose(fpstems);
return 0;
}
/* The following code for reading wunzip'ed word list is copied from
* wunzip.c and repeats what was done dict_radix.c's do_read_dict(). It
* would be much nicer to read the word list only once. */
{
char sbuf[256];
int slen=0;
int c,n;
while(1){
c=fgetc(fp);
if((c>='0' && c<='9') || c==EOF){
/* new word - output old word first */
sbuf[slen]='\0';
lookup[i++] = current;
for(j=0; j<=slen; j++) current++[0]=sbuf[j];
if (!fgets(stem,sizeof(stem),fpstems)) {
fprintf(stderr, "Hspell: linginfo: unexpected end of file in stems file\n");
return 0;
}
if (!fgets(desc,sizeof(desc),fpdesc)) {
fprintf(stderr, "Hspell: linginfo: unexpected end of file in description file\n");
return 0;
}
for (j=0; desc[j]!='\n' && desc[j]!=0; j++) {
current++[0]=desc[j];
}
current++[0]=0;
for (j=0; stem[j]!='\n' && stem[j]!=0; j++) {
current++[0]=stem[j];
}
current++[0]=0;
if (c==EOF) break;
/* and read how much to go back */
n=0;
do {
/* base 10... */
n*=10;
n+=(c-'0');
} while ((c=fgetc(fp))!=EOF && c>='0' && c<='9');
slen-=n;
if(slen<0 || slen >= sizeof(sbuf)-1){
fprintf(stderr,"Hspell: bad backlength %d... giving up.\n", slen);
return 0;
}
/* we got a new letter c - continue the loop */
}
/* word letter - add it */
if(slen>=sizeof(sbuf)-1){
fprintf(stderr,"Hspell: word too long... giving up.\n");
return 0;
}
sbuf[slen++]=c;
}
}
pclose(fp);
pclose(fpstems);
pclose(fpdesc);
if (hspell_debug) {
fprintf (stderr, "linginfo: finished reading %d words and stems\n",i);
}
return 1;
}
/*
 * Binary-search the lookup table for word.
 *
 * word  NUL-terminated word to find (compared with strcmp()).
 * desc  on success, *desc is set to the string of description codes that
 *       is stored right after the word.
 * stem  on success, *stem is set to the string of stem codes that is
 *       stored right after the description codes.
 *
 * Both output pointers refer to the table's own storage.
 *
 * Returns 1 when the word is found, 0 otherwise (including when the table
 * is empty or has not been loaded).
 */
int linginfo_lookup(const char *word, char **desc, char **stem)
{
	int bottom = 0, top = lookuplen;
	/* Midpoint probed in the previous round. It starts at -1 ("none yet")
	 * so that it cannot collide with a legitimate first midpoint of 0,
	 * which is what happens in a table with a single entry. */
	int prev = -1;

	if (!lookup || lookuplen <= 0) {
		return 0;
	}
	while (top >= bottom) {
		int i = (top - bottom) / 2 + bottom;
		int res;

		/* the range no longer shrinks: the word is not in the table */
		if (i == prev) {
			return 0;
		}
		prev = i;
		if (hspell_debug) fprintf(stderr,"bot=%d i=%d top=%d) %s\n",bottom,i,top, lookup[i]);
		res = strcmp(lookup[i], word);
		if (res > 0) {
			top = i;
		} else if (res < 0) {
			bottom = i;
		} else {
			/* each entry is laid out as word\0desc\0stem\0 */
			*desc = lookup[i] + strlen(lookup[i]) + 1;
			*stem = *desc + strlen(*desc) + 1;
			return 1;
		}
	}
	return 0;
}
int linginfo_free(void) {
if (lookup) {
free(lookup);
free(flat);
}
return 1;
}