Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

first commit

  • Loading branch information...
commit 220461484c6d342a849e4399b2cca76b0718f8a7 0 parents
@tvondra authored
34 META.json
@@ -0,0 +1,34 @@
+{
+ "name": "shared_ispell",
+ "abstract": "Provides a shared ispell dictionary - initialized once and stored in shared segment.",
+ "description": "Allows you to allocate area within a shared segment and use it for ispell dictionaries.",
+ "version": "1.0.0",
+ "maintainer": "Tomas Vondra <tv@fuzzy.cz>",
+ "license": "bsd",
+ "prereqs": {
+ "runtime": {
+ "requires": {
+ "PostgreSQL": "9.0.0"
+ }
+ }
+ },
+ "provides": {
+ "shared_ispell": {
+ "file": "shared_ispell--1.0.0.sql",
+ "version": "1.0.0"
+ }
+ },
+ "resources": {
+ "repository": {
+ "url": "https://github.com/tvondra/shared_ispell.git",
+ "web": "http://github.com/tvondra/shared_ispell",
+ "type": "git"
+ }
+ },
+ "tags" : ["ispell", "shared", "fulltext", "dictionary"],
+ "meta-spec": {
+ "version": "1.0.0",
+ "url": "http://pgxn.org/meta/spec.txt"
+ },
+ "release_status" : "testing"
+}
18 Makefile
@@ -0,0 +1,18 @@
+# Build a single shared library out of several object files (PGXS MODULE_big
+# mode). MODULES is a different build mode and must not be set together with
+# MODULE_big.
+MODULE_big = shared_ispell
+OBJS = src/shared_ispell.o src/spell.o
+
+EXTENSION = shared_ispell
+DATA = sql/shared_ispell--1.0.0.sql
+
+# PGXS supplies the server include path as well as the compile, link and
+# install rules, so no custom CFLAGS or targets are needed here.
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
84 README
@@ -0,0 +1,84 @@
+Shared ISpell Dictionary
+========================
+This PostgreSQL extension provides a shared ispell dictionary, i.e.
+a dictionary that's stored in shared segment. The traditional ispell
+implementation means that each session initializes and stores the
+dictionary on its own, which means a lot of CPU/RAM is wasted.
+
+This extension allocates an area in shared segment (you have to
+choose the size in advance) and then loads the dictionary into it
+when it's used for the first time.
+
+If you need just snowball-type dictionaries, this extension is not
+really interesting for you. But if you really need an ispell
+dictionary, this may save you a lot of resources.
+
+
+Install
+-------
+Installing the extension is quite simple, especially if you're on 9.1.
+In that case all you need to do is this:
+
+ $ make install
+
+and then (after connecting to the database)
+
+ db=# CREATE EXTENSION shared_ispell;
+
+If you're on pre-9.1 version, you'll have to do the second part manually
+by running the SQL script (shared_ispell--x.y.sql) in the database. If
+needed, replace MODULE_PATHNAME by $libdir.
+
+
+Config
+------
+Now the functions are created, but you still need to load the shared
+module. This needs to be done from postgresql.conf, as the module
+needs to allocate space in the shared memory segment. So add this to
+the config file (or update the current values)
+
+ # libraries to load
+ shared_preload_libraries = 'shared_ispell'
+
+ # known GUC prefixes
+ custom_variable_classes = 'shared_ispell'
+
+ # config of the shared memory
+ shared_ispell.max_size = 30MB
+
+Yes, there's a single GUC variable that defines the maximum size of
+the shared segment. This is a hard limit, the shared segment is not
+extensible and you need to set it so that all the dictionaries fit
+into it and not much memory is wasted.
+
+Set it higher than you need, load all the dictionaries and check the
+log - after loading each dictionary, there's a LOG message with info
+about how much memory is available. Use that to tweak the GUC.
+
+The shared segment can contain several dictionaries at the same time.
+
+Using the dictionary
+--------------------
+Technically, the extension defines a 'shared_ispell' template that
+you may use to define custom dictionaries. E.g. you may do this
+
+ CREATE TEXT SEARCH DICTIONARY czech_shared (
+ TEMPLATE = shared_ispell,
+ DictFile = czech,
+ AffFile = czech,
+ StopWords = czech
+ );
+
+ CREATE TEXT SEARCH CONFIGURATION public.czech_shared
+ ( COPY = pg_catalog.simple );
+
+ ALTER TEXT SEARCH CONFIGURATION czech_shared
+ ALTER MAPPING FOR asciiword, asciihword, hword_asciipart,
+ word, hword, hword_part
+ WITH czech_shared;
+
+and then do the usual stuff, e.g.
+
+ SELECT ts_lexize('czech_shared', 'automobile');
+
+or whatever you want.
6 shared_ispell.control
@@ -0,0 +1,6 @@
+# shared ispell dictionary
+comment = 'Provides shared ispell dictionaries.'
+default_version = '1.0.0'
+relocatable = true
+
+module_pathname = '$libdir/shared_ispell'
28 sql/shared_ispell--1.0.0.sql
@@ -0,0 +1,28 @@
+-- init method of the template, implemented by dispell_init in the C module
+CREATE OR REPLACE FUNCTION shared_dispell_init(internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME', 'dispell_init'
+ LANGUAGE C IMMUTABLE;
+
+-- lexize method of the template, implemented by dispell_lexize in the C module
+CREATE OR REPLACE FUNCTION shared_dispell_lexize(internal,internal,internal,internal)
+ RETURNS internal
+ AS 'MODULE_PATHNAME', 'dispell_lexize'
+ LANGUAGE C IMMUTABLE;
+
+-- text search template built from the two functions above
+CREATE TEXT SEARCH TEMPLATE shared_ispell (
+ INIT = shared_dispell_init,
+ LEXIZE = shared_dispell_lexize
+);
+
+-- NOTE(review): the objects below are a Czech example and presumably need the
+-- czech dictionary / affix / stop word files to be installed - confirm that
+-- they really belong into the install script.
+CREATE TEXT SEARCH DICTIONARY czech_shared (
+ TEMPLATE = shared_ispell,
+ DictFile = czech,
+ AffFile = czech,
+ StopWords = czech
+);
+
+CREATE TEXT SEARCH CONFIGURATION public.czech_shared ( COPY = pg_catalog.simple );
+
+ALTER TEXT SEARCH CONFIGURATION czech_shared
+ ALTER MAPPING FOR asciiword, asciihword, hword_asciipart,
+ word, hword, hword_part
+ WITH czech_shared;
478 src/shared_ispell.c
@@ -0,0 +1,478 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+
+#include <sys/types.h>
+#include <sys/ipc.h>
+
+#include "postgres.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/fd.h"
+
+#include "commands/explain.h"
+#include "executor/executor.h"
+#include "executor/instrument.h"
+#include "utils/guc.h"
+#include "commands/defrem.h"
+#include "tsearch/ts_locale.h"
+#include "storage/lwlock.h"
+
+#include "libpq/md5.h"
+
+#include "spell.h"
+
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+
+/* private functions */
+static void ispell_shmem_startup(void);
+
+/* This segment is initialized in the first process that accesses it (see
+ * ispell_shmem_startup function).
+ */
+#define SEGMENT_NAME "shared_ispell"
+
+static int max_ispell_mem_size = (30*1024*1024); /* size of the shared segment, 30MB by default */
+
+/* Saved hook values in case of unload */
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+void _PG_init(void);
+void _PG_fini(void);
+
+/*
+ * Bookkeeping data placed at the very beginning of the shared segment. It is
+ * used to allocate memory in the rest of the segment and to find the
+ * dictionaries stored there.
+ */
+typedef struct SegmentInfo {
+ LWLockId lock; /* held by dispell_init while looking up / building a dictionary */
+ char *firstfree; /* first free address (always maxaligned) */
+ size_t available; /* free space remaining at firstfree */
+ /* head of the linked list of dictionaries stored in the segment */
+ SharedIspellDict * dict;
+} SegmentInfo;
+
+/* These are used to allocate data within shared segment */
+static SegmentInfo * segment_info = NULL;
+
+static char * shalloc(int bytes);
+static void copyIspellDict(IspellDict * dict, SharedIspellDict * copy);
+
+/*
+ * Module load callback.
+ *
+ * Registers the shared_ispell.max_size GUC, requests the shared memory and
+ * the LWLock needed for the dictionary segment and installs the shmem
+ * startup hook. Raises an ERROR when the library is not loaded through
+ * shared_preload_libraries.
+ */
+void
+_PG_init(void)
+{
+    /*
+     * Storage for the GUC, in kilobytes. The GUC is declared with
+     * GUC_UNIT_KB so that settings like '30MB' are accepted and converted
+     * properly. (Declaring it in blocks while using the value as a number of
+     * bytes made such settings fall below the minimum.)
+     */
+    static int max_size_kb = 30 * 1024;
+
+    if (! process_shared_preload_libraries_in_progress)
+        elog(ERROR, "shared_ispell has to be loaded using shared_preload_libraries");
+
+    /* Define custom GUC variables. */
+    DefineCustomIntVariable("shared_ispell.max_size",
+                            "Max amount of memory to allocate for ispell dictionaries.",
+                            NULL,
+                            &max_size_kb,
+                            30 * 1024,          /* default: 30MB */
+                            1024,               /* minimum: 1MB */
+                            INT_MAX / 1024,     /* the size in bytes has to fit into an int */
+                            PGC_POSTMASTER,
+                            GUC_UNIT_KB,
+#if (PG_VERSION_NUM >= 90100)
+                            NULL,
+#endif
+                            NULL,
+                            NULL);
+
+    EmitWarningsOnPlaceholders("shared_ispell");
+
+    /* The rest of the module works with the segment size in bytes. */
+    max_ispell_mem_size = max_size_kb * 1024;
+
+    /*
+     * Request additional shared resources. (These are no-ops if we're not in
+     * the postmaster process.) We'll allocate or attach to the shared
+     * resources in ispell_shmem_startup().
+     */
+    RequestAddinShmemSpace(max_ispell_mem_size);
+    RequestAddinLWLocks(1);
+
+    /* Install hooks. */
+    prev_shmem_startup_hook = shmem_startup_hook;
+    shmem_startup_hook = ispell_shmem_startup;
+}
+
+
+/*
+ * Module unload callback.
+ *
+ * Puts back the shmem startup hook that was saved by _PG_init() before it
+ * installed ispell_shmem_startup.
+ */
+void
+_PG_fini(void)
+{
+ /* Uninstall hooks. */
+ shmem_startup_hook = prev_shmem_startup_hook;
+}
+
+
+/*
+ * Shared memory startup hook - this is probably the most important part.
+ *
+ * Calls the previously installed hook (if any), then creates or attaches to
+ * the shared segment. A newly created segment is zeroed and its SegmentInfo
+ * header (lock, first free address, available space) is initialized.
+ */
+static
+void ispell_shmem_startup(void) {
+
+    bool found = FALSE;
+    char * segment;
+
+    if (prev_shmem_startup_hook)
+        prev_shmem_startup_hook();
+
+    /* Create or attach to the shared memory segment. */
+    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+    segment = ShmemInitStruct(SEGMENT_NAME,
+                              max_ispell_mem_size,
+                              &found);
+
+    /*
+     * Remember the segment in every process, not just in the one creating
+     * it. Otherwise a process that finds an existing segment would keep
+     * segment_info == NULL and crash on the first dictionary access.
+     */
+    segment_info = (SegmentInfo*)segment;
+
+    elog(DEBUG1, "initializing shared ispell segment (size: %d B)",
+         max_ispell_mem_size);
+
+    if (! found) {
+
+        memset(segment, 0, max_ispell_mem_size);
+
+        /* the allocatable area starts right after the (aligned) header */
+        segment_info->lock = LWLockAssign();
+        segment_info->firstfree = segment + MAXALIGN(sizeof(SegmentInfo));
+        segment_info->available = max_ispell_mem_size - (int)(segment_info->firstfree - segment);
+
+        elog(DEBUG1, "shared memory segment (shared ispell) successfully created");
+
+    }
+
+    LWLockRelease(AddinShmemInitLock);
+
+}
+
+/*
+ * Walk the list of dictionaries kept in the shared segment and return the
+ * entry built from the given dictionary and affix file names, or NULL when
+ * the list contains no such entry.
+ */
+static
+SharedIspellDict * get_shared_dict(char * words, char * affixes) {
+
+    SharedIspellDict * entry;
+
+    for (entry = segment_info->dict; entry != NULL; entry = entry->next) {
+
+        if (strcmp(entry->dictFile, words) != 0)
+            continue;
+
+        if (strcmp(entry->affixFile, affixes) != 0)
+            continue;
+
+        return entry;
+    }
+
+    return NULL;
+}
+
+Datum dispell_init(PG_FUNCTION_ARGS);
+Datum dispell_lexize(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(dispell_init);
+PG_FUNCTION_INFO_V1(dispell_lexize);
+
+StopList stoplist;
+
+/*
+ * Init method of the shared_ispell template.
+ *
+ * Parses the dictionary options (DictFile, AffFile, StopWords) and returns
+ * the matching SharedIspellDict from the shared segment. When the segment
+ * holds no dictionary for the given pair of files yet, the dictionary is
+ * built in local memory and then copied into the shared segment.
+ *
+ * Raises an ERROR on duplicate or unrecognized options and when DictFile or
+ * AffFile is missing.
+ */
+Datum
+dispell_init(PG_FUNCTION_ARGS)
+{
+    List *dictoptions = (List *) PG_GETARG_POINTER(0);
+    char *dictFile = NULL, *affFile = NULL, *stopFile = NULL;
+    bool affloaded = false,
+         dictloaded = false,
+         stoploaded = false;
+    ListCell *l;
+
+    IspellDict * dict;
+    SharedIspellDict * shdict;
+
+    foreach(l, dictoptions)
+    {
+        DefElem *defel = (DefElem *) lfirst(l);
+
+        if (pg_strcasecmp(defel->defname, "DictFile") == 0)
+        {
+            if (dictloaded)
+                ereport(ERROR,
+                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                         errmsg("multiple DictFile parameters")));
+            dictFile = defGetString(defel);
+            dictloaded = true;
+        }
+        else if (pg_strcasecmp(defel->defname, "AffFile") == 0)
+        {
+            if (affloaded)
+                ereport(ERROR,
+                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                         errmsg("multiple AffFile parameters")));
+            affFile = defGetString(defel);
+            affloaded = true;
+        }
+        else if (pg_strcasecmp(defel->defname, "StopWords") == 0)
+        {
+            if (stoploaded)
+                ereport(ERROR,
+                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                         errmsg("multiple StopWords parameters")));
+            stopFile = defGetString(defel);
+            stoploaded = true;
+        }
+        else
+        {
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("unrecognized Ispell parameter: \"%s\"",
+                            defel->defname)));
+        }
+    }
+
+    /*
+     * Both file names are mandatory. Check them before searching the shared
+     * list, because the lookup passes them to strcmp().
+     */
+    if (!affloaded)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("missing AffFile parameter")));
+
+    if (!dictloaded)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("missing DictFile parameter")));
+
+    /* search if the dictionary is already initialized */
+    LWLockAcquire(segment_info->lock, LW_EXCLUSIVE);
+
+    shdict = get_shared_dict(dictFile, affFile);
+
+    /* init if needed */
+    if (shdict == NULL) {
+
+        /* the build functions assume a zeroed IspellDict */
+        dict = (IspellDict *)palloc0(sizeof(IspellDict));
+
+        SharedNIStartBuild(dict);
+
+        SharedNIImportDictionary(dict,
+                                 get_tsearch_config_filename(dictFile, "dict"));
+        SharedNIImportAffixes(dict,
+                              get_tsearch_config_filename(affFile, "affix"));
+
+        /* NOTE: the StopWords file is accepted, but it is not loaded yet */
+
+        SharedNISortDictionary(dict);
+        SharedNISortAffixes(dict);
+
+        SharedNIFinishBuild(dict);
+
+        /* copy the dictionary into the shared segment */
+        shdict = (SharedIspellDict*)shalloc(sizeof(SharedIspellDict));
+
+        shdict->dictFile = shalloc(strlen(dictFile)+1);
+        shdict->affixFile = shalloc(strlen(affFile)+1);
+
+        strcpy(shdict->dictFile, dictFile);
+        strcpy(shdict->affixFile, affFile);
+
+        copyIspellDict(dict, shdict);
+
+        /* publish the dictionary at the head of the shared list */
+        shdict->next = segment_info->dict;
+        segment_info->dict = shdict;
+
+        /* 'available' is a size_t, so it must not be printed using %d */
+        elog(LOG, "shared ispell init done, remaining %lu B",
+             (unsigned long) segment_info->available);
+
+    }
+
+    LWLockRelease(segment_info->lock);
+
+    PG_RETURN_POINTER(shdict);
+}
+
+/*
+ * Lexize method of the shared_ispell template.
+ *
+ * Lower-cases the input, normalizes it using the shared dictionary and
+ * removes the lexemes found in the stop list, compacting the result array
+ * in place. Returns NULL for empty input and for words that can't be
+ * normalized.
+ */
+Datum
+dispell_lexize(PG_FUNCTION_ARGS)
+{
+    SharedIspellDict *d = (SharedIspellDict *) PG_GETARG_POINTER(0);
+    char *in = (char *) PG_GETARG_POINTER(1);
+    int32 len = PG_GETARG_INT32(2);
+    char *lowered;
+    TSLexeme *lexemes;
+    TSLexeme *src;
+    TSLexeme *dst;
+
+    if (len <= 0)
+        PG_RETURN_POINTER(NULL);
+
+    lowered = lowerstr_with_len(in, len);
+    lexemes = SharedNINormalizeWord(d, lowered);
+
+    if (lexemes == NULL)
+        PG_RETURN_POINTER(NULL);
+
+    /* 'src' reads every lexeme, 'dst' marks where the next kept one goes */
+    dst = lexemes;
+    for (src = lexemes; src->lexeme; src++)
+    {
+        if (searchstoplist(&stoplist, src->lexeme))
+        {
+            pfree(src->lexeme);
+            src->lexeme = NULL;
+            continue;
+        }
+
+        memcpy(dst, src, sizeof(TSLexeme));
+        dst++;
+    }
+
+    /* terminate the compacted array */
+    dst->lexeme = NULL;
+
+    PG_RETURN_POINTER(lexemes);
+}
+
+/*
+ * Allocate a zeroed, maxaligned chunk of the shared segment.
+ *
+ * The chunk is cut off at segment_info->firstfree; an ERROR is raised when
+ * the remaining space is not sufficient.
+ */
+static
+char * shalloc(int bytes) {
+
+    char * chunk;
+    int needed = MAXALIGN(bytes);
+
+    if (needed > segment_info->available) {
+        elog(ERROR, "the shared segment (shared ispell) is too small");
+    }
+
+    chunk = segment_info->firstfree;
+    memset(chunk, 0, needed);
+
+    /* move the free pointer past the chunk */
+    segment_info->firstfree = chunk + needed;
+    segment_info->available -= needed;
+
+    return chunk;
+
+}
+
+/*
+ * Recursively copy a dictionary node (and the whole subtree below it) into
+ * the shared segment. Returns NULL for a NULL node.
+ */
+static
+SPNode * copySPNode(SPNode * node) {
+
+    SPNode * copy;
+    size_t nbytes;
+    int idx;
+
+    if (node == NULL)
+        return NULL;
+
+    /* header plus 'length' data items */
+    nbytes = offsetof(SPNode,data) + sizeof(SPNodeData) * node->length;
+
+    copy = (SPNode*)shalloc(nbytes);
+    memcpy(copy, node, nbytes);
+
+    /* replace the child pointers with pointers to the shared copies */
+    for (idx = 0; idx < node->length; idx++)
+        copy->data[idx].node = copySPNode(node->data[idx].node);
+
+    return copy;
+}
+
+/* Duplicate a NUL-terminated string into the shared segment. */
+static
+char * shstrcpy(char * str) {
+    size_t size = strlen(str) + 1;    /* including the terminator */
+    char * dup = shalloc(size);
+
+    memcpy(dup, str, size);
+
+    return dup;
+}
+
+/*
+ * Copy a RegisNode and all the nodes linked through 'next' into the shared
+ * segment. The node must not be NULL.
+ */
+static
+RegisNode * copyRegisNode(RegisNode * node) {
+
+    size_t nbytes = offsetof(RegisNode, data) + node->len;
+    RegisNode * copy = (RegisNode *)shalloc(nbytes);
+
+    memcpy(copy, node, nbytes);
+
+    if (node->next == NULL)
+        return copy;
+
+    copy->next = copyRegisNode(node->next);
+
+    return copy;
+}
+
+/*
+ * Copy a single AFFIX into the shared segment, including the 'find' and
+ * 'repl' strings and (when isregis is set) the chain of RegisNodes.
+ *
+ * NOTE(review): when isregis is not set, the 'reg' member is copied by the
+ * memcpy only, i.e. nothing it may reference is duplicated into the shared
+ * segment (see the FIXME below).
+ */
+static
+AFFIX * copyAffix(AFFIX * affix) {
+
+ AFFIX * copy = (AFFIX*)shalloc(sizeof(AFFIX));
+
+ memcpy(copy, affix, sizeof(AFFIX));
+
+ /* the strings have to live in the shared segment too */
+ copy->find = shstrcpy(affix->find);
+ copy->repl = shstrcpy(affix->repl);
+
+ if (copy->isregis) {
+ copy->reg.regis.node = copyRegisNode(copy->reg.regis.node);
+ } else {
+ // FIXME handle the regex_t properly (copy the strings etc)
+ }
+
+ return copy;
+
+}
+
+/*
+ * Recursively copy an affix tree node into the shared segment, including
+ * the child nodes and the affixes referenced from each data item. Returns
+ * NULL for a NULL node.
+ */
+static
+AffixNode * copyAffixNode(AffixNode * node) {
+
+    int i, j;
+    size_t nbytes;
+    AffixNode * copy = NULL;
+
+    if (node == NULL) {
+        return NULL;
+    }
+
+    /*
+     * Size of the node has to be computed using the AffixNode types. Using
+     * the SPNode / SPNodeData sizes here copies just a part of the data
+     * array, losing e.g. the 'val' of the remaining items.
+     */
+    nbytes = offsetof(AffixNode,data) + sizeof(AffixNodeData) * node->length;
+
+    copy = (AffixNode *)shalloc(nbytes);
+    memcpy(copy, node, nbytes);
+
+    for (i = 0; i < node->length; i++) {
+
+        copy->data[i].node = copyAffixNode(node->data[i].node);
+
+        /* array of pointers to the shared copies (shalloc zeroes it) */
+        copy->data[i].aff = (AFFIX**)shalloc(sizeof(AFFIX*) * node->data[i].naff);
+
+        for (j = 0; j < node->data[i].naff; j++) {
+            copy->data[i].aff[j] = copyAffix(node->data[i].aff[j]);
+        }
+    }
+
+    return copy;
+}
+
+/*
+ * Copy a dictionary built in local memory (dict) into the SharedIspellDict
+ * allocated in the shared segment (copy): the affix trees, the dictionary
+ * tree, the affix data strings, the compound affixes and the flag values.
+ *
+ * NOTE(review): copy->Affix is only allocated here (and zeroed by shalloc),
+ * the affixes themselves are copied while copying the Suffix / Prefix trees.
+ * NOTE(review): the CompoundAffix items are copied bytewise - verify they do
+ * not contain pointers into the local memory.
+ */
+static
+void copyIspellDict(IspellDict * dict, SharedIspellDict * copy) {
+
+    int i;
+
+    copy->naffixes = dict->naffixes;
+
+    copy->Affix = (AFFIX*)shalloc(sizeof(AFFIX) * dict->naffixes);
+
+    copy->Suffix = copyAffixNode(dict->Suffix);
+    copy->Prefix = copyAffixNode(dict->Prefix);
+
+    copy->Dictionary = copySPNode(dict->Dictionary);
+
+    /* copy affix data */
+    copy->nAffixData = dict->nAffixData;
+    copy->AffixData = (char**)shalloc(sizeof(char*) * dict->nAffixData);
+    for (i = 0; i < copy->nAffixData; i++) {
+        copy->AffixData[i] = (char*)shalloc(sizeof(char) * strlen(dict->AffixData[i]) + 1);
+        strcpy(copy->AffixData[i], dict->AffixData[i]);
+    }
+
+    /* copy compound affixes */
+    /* FIXME How to copy this without the cmpaffixes? If we can get rid of this field, we
+     * could get rid of the local IspellDict copy. */
+    copy->CompoundAffix = (CMPDAffix*)shalloc(sizeof(CMPDAffix) * dict->cmpaffixes);
+    memcpy(copy->CompoundAffix, dict->CompoundAffix, sizeof(CMPDAffix) * dict->cmpaffixes);
+
+    /*
+     * Copy the whole flagval array. It is indexed by an unsigned char, so a
+     * hard-coded 255 would drop the last entry.
+     */
+    memcpy(copy->flagval, dict->flagval, sizeof(dict->flagval));
+    copy->usecompound = dict->usecompound;
+
+}
1,806 src/spell.c
@@ -0,0 +1,1806 @@
+/*-------------------------------------------------------------------------
+ *
+ * spell.c
+ * Normalizing word with ISpell
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/tsearch/spell.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "catalog/pg_collation.h"
+#include "spell.h"
+#include "tsearch/ts_locale.h"
+#include "utils/memutils.h"
+
+
+/*
+ * Initialization requires a lot of memory that's not needed
+ * after the initialization is done. During initialization,
+ * CurrentMemoryContext is the long-lived memory context associated
+ * with the dictionary cache entry. We keep the short-lived stuff
+ * in the Conf->buildCxt context.
+ */
+#define tmpalloc(sz) MemoryContextAlloc(Conf->buildCxt, (sz))
+#define tmpalloc0(sz) MemoryContextAllocZero(Conf->buildCxt, (sz))
+
+/*
+ * Prepare for constructing an ISpell dictionary.
+ *
+ * Creates Conf->buildCxt, the memory context used by the tmpalloc /
+ * tmpalloc0 macros and by lowerstr_ctx for the short-lived data.
+ *
+ * The IspellDict struct is assumed to be zeroed when allocated.
+ */
+void
+SharedNIStartBuild(IspellDict *Conf)
+{
+ /*
+ * The temp context is a child of CurTransactionContext, so that it will
+ * go away automatically on error.
+ */
+ Conf->buildCxt = AllocSetContextCreate(CurTransactionContext,
+ "Ispell dictionary init context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+}
+
+/*
+ * Clean up when dictionary construction is complete.
+ *
+ * Deletes the build context created by SharedNIStartBuild. After this call
+ * compact_palloc0 must not be used for this dictionary anymore (it asserts
+ * that buildCxt is set).
+ */
+void
+SharedNIFinishBuild(IspellDict *Conf)
+{
+ /* Release no-longer-needed temp memory */
+ MemoryContextDelete(Conf->buildCxt);
+ /* Just for cleanliness, zero the now-dangling pointers */
+ Conf->buildCxt = NULL;
+ Conf->Spell = NULL;
+ Conf->firstfree = NULL;
+}
+
+
+/*
+ * "Compact" palloc: allocate without extra palloc overhead.
+ *
+ * Since we have no need to free the ispell data items individually, there's
+ * not much value in the per-chunk overhead normally consumed by palloc.
+ * Getting rid of it is helpful since ispell can allocate a lot of small nodes.
+ *
+ * We currently pre-zero all data allocated this way, even though some of it
+ * doesn't need that. The cpalloc and cpalloc0 macros are just documentation
+ * to indicate which allocations actually require zeroing.
+ */
+#define COMPACT_ALLOC_CHUNK 8192 /* amount to get from palloc at once */
+#define COMPACT_MAX_REQ 1024 /* must be < COMPACT_ALLOC_CHUNK */
+
+/*
+ * Allocate 'size' zeroed bytes for the dictionary being built.
+ *
+ * Requests above COMPACT_MAX_REQ are passed straight to palloc0, the rest is
+ * carved (maxaligned) out of chunks of COMPACT_ALLOC_CHUNK bytes.
+ */
+static void *
+compact_palloc0(IspellDict *Conf, size_t size)
+{
+    void *chunk;
+
+    /* Should only be called during init */
+    Assert(Conf->buildCxt != NULL);
+
+    /* Large requests get a regular allocation */
+    if (size > COMPACT_MAX_REQ)
+        return palloc0(size);
+
+    /* Keep everything maxaligned */
+    size = MAXALIGN(size);
+
+    /* Start a new chunk when the current one can't satisfy the request */
+    if (Conf->avail < size)
+    {
+        Conf->firstfree = palloc0(COMPACT_ALLOC_CHUNK);
+        Conf->avail = COMPACT_ALLOC_CHUNK;
+    }
+
+    chunk = (void *) Conf->firstfree;
+
+    Conf->firstfree += size;
+    Conf->avail -= size;
+
+    return chunk;
+}
+
+#define cpalloc(size) compact_palloc0(Conf, size)
+#define cpalloc0(size) compact_palloc0(Conf, size)
+
+/* Duplicate a string into memory obtained from the compact allocator. */
+static char *
+cpstrdup(IspellDict *Conf, const char *str)
+{
+    char *dup = cpalloc(strlen(str) + 1);
+
+    /* strcpy returns the destination */
+    return strcpy(dup, str);
+}
+
+
+/*
+ * Lower-case 'src' using lowerstr(), with the result allocated in the build
+ * context (buildCxt) of the dictionary.
+ */
+static char *
+lowerstr_ctx(IspellDict *Conf, const char *src)
+{
+    MemoryContext oldCxt = MemoryContextSwitchTo(Conf->buildCxt);
+    char *lowered = lowerstr(src);
+
+    MemoryContextSwitchTo(oldCxt);
+
+    return lowered;
+}
+
+#define MAX_NORM 1024
+#define MAXNORMLEN 256
+
+#define STRNCMP(s,p) strncmp( (s), (p), strlen(p) )
+#define GETWCHAR(W,L,N,T) ( ((const uint8*)(W))[ ((T)==FF_PREFIX) ? (N) : ( (L) - 1 - (N) ) ] )
+#define GETCHAR(A,N,T) GETWCHAR( (A)->repl, (A)->replen, N, T )
+
+static char *VoidString = "";
+
+/* qsort-style comparator: orders pointers to SPELL entries by the word. */
+static int
+cmpspell(const void *s1, const void *s2)
+{
+    const SPELL *left = *(SPELL * const *) s1;
+    const SPELL *right = *(SPELL * const *) s2;
+
+    return strcmp(left->word, right->word);
+}
+/* qsort-style comparator: orders pointers to SPELL entries by the flags. */
+static int
+cmpspellaffix(const void *s1, const void *s2)
+{
+    const SPELL *left = *(SPELL * const *) s1;
+    const SPELL *right = *(SPELL * const *) s2;
+
+    return strncmp(left->p.flag, right->p.flag, MAXFLAGLEN);
+}
+
+/*
+ * Return pointer to the first occurrence of character 'c' in 'str', moving
+ * through the string one (possibly multibyte) character at a time, or NULL
+ * when the character is not there.
+ */
+static char *
+findchar(char *str, int c)
+{
+    char *cur;
+
+    for (cur = str; *cur; cur += pg_mblen(cur))
+    {
+        if (t_iseq(cur, c))
+            return cur;
+    }
+
+    return NULL;
+}
+
+
+/*
+ * Backward string compare for suffix tree operations: the strings are
+ * compared byte by byte starting from the end. When one string is a suffix
+ * of the other one, the shorter string sorts first.
+ */
+static int
+strbcmp(const unsigned char *s1, const unsigned char *s2)
+{
+    int i1 = strlen((const char *) s1) - 1;
+    int i2 = strlen((const char *) s2) - 1;
+
+    for (; i1 >= 0 && i2 >= 0; i1--, i2--)
+    {
+        if (s1[i1] != s2[i2])
+            return (s1[i1] < s2[i2]) ? -1 : 1;
+    }
+
+    /* one string ran out - compare what is left of the lengths */
+    if (i1 == i2)
+        return 0;
+
+    return (i1 < i2) ? -1 : 1;
+}
+
+/*
+ * Backward string compare limited to the last 'count' bytes. When a string
+ * runs out before 'count' bytes were compared, the shorter one sorts first.
+ */
+static int
+strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
+{
+    int i1 = strlen((const char *) s1) - 1,
+        i2 = strlen((const char *) s2) - 1,
+        left = count;
+
+    while (i1 >= 0 && i2 >= 0 && left > 0)
+    {
+        unsigned char c1 = s1[i1--];
+        unsigned char c2 = s2[i2--];
+
+        if (c1 != c2)
+            return (c1 < c2) ? -1 : 1;
+
+        left--;
+    }
+
+    /* all requested bytes matched, or both strings ended together */
+    if (left == 0 || i1 == i2)
+        return 0;
+
+    return (i1 < i2) ? -1 : 1;
+}
+
+/*
+ * Comparator for AFFIX entries: orders by type first, then by the 'repl'
+ * string - compared forward for prefixes and backward for everything else.
+ */
+static int
+cmpaffix(const void *s1, const void *s2)
+{
+    const AFFIX *left = (const AFFIX *) s1;
+    const AFFIX *right = (const AFFIX *) s2;
+
+    if (left->type != right->type)
+        return (left->type < right->type) ? -1 : 1;
+
+    if (left->type == FF_PREFIX)
+        return strcmp(left->repl, right->repl);
+
+    return strbcmp((const unsigned char *) left->repl,
+                   (const unsigned char *) right->repl);
+}
+
+/*
+ * Append a word with its flags to the Conf->Spell array, enlarging the
+ * array when it is full. The entries are allocated in the build context.
+ */
+static void
+NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
+{
+    SPELL *entry;
+
+    if (Conf->nspell >= Conf->mspell)
+    {
+        if (Conf->mspell == 0)
+        {
+            /* first allocation */
+            Conf->mspell = 1024 * 20;
+            Conf->Spell = (SPELL **) tmpalloc(Conf->mspell * sizeof(SPELL *));
+        }
+        else
+        {
+            /* double the array */
+            Conf->mspell *= 2;
+            Conf->Spell = (SPELL **) repalloc(Conf->Spell, Conf->mspell * sizeof(SPELL *));
+        }
+    }
+
+    entry = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
+    Conf->Spell[Conf->nspell] = entry;
+
+    strcpy(entry->word, word);
+    strncpy(entry->p.flag, flag, MAXFLAGLEN);
+
+    Conf->nspell++;
+}
+
+/*
+ * import dictionary
+ *
+ * Reads the dictionary file line by line. Each line is split at the first
+ * '/' into the word and its flags, the word is cut at the first whitespace,
+ * lower-cased and added to Conf->Spell using NIAddSpell. Raises an ERROR
+ * when the file can't be opened.
+ *
+ * Note caller must already have applied get_tsearch_config_filename
+ */
+void
+SharedNIImportDictionary(IspellDict *Conf, const char *filename)
+{
+ tsearch_readline_state trst;
+ char *line;
+
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open dictionary file \"%s\": %m",
+ filename)));
+
+ while ((line = tsearch_readline(&trst)) != NULL)
+ {
+ char *s,
+ *pstr;
+ const char *flag;
+
+ /* Extract flag from the line */
+ flag = NULL;
+ if ((s = findchar(line, '/')))
+ {
+ /* terminate the word, the flags start right after the slash */
+ *s++ = '\0';
+ flag = s;
+ while (*s)
+ {
+ /* we allow only single encoded flags for faster works */
+ if (pg_mblen(s) == 1 && t_isprint(s) && !t_isspace(s))
+ s++;
+ else
+ {
+ *s = '\0';
+ break;
+ }
+ }
+ }
+ else
+ flag = "";
+
+ /* Remove trailing spaces */
+ s = line;
+ while (*s)
+ {
+ if (t_isspace(s))
+ {
+ *s = '\0';
+ break;
+ }
+ s += pg_mblen(s);
+ }
+ pstr = lowerstr_ctx(Conf, line);
+
+ /* NIAddSpell makes its own copy of the word and the flags */
+ NIAddSpell(Conf, pstr, flag);
+ pfree(pstr);
+
+ pfree(line);
+ }
+ tsearch_readline_end(&trst);
+}
+
+
+/*
+ * Look up 'word' in the dictionary tree (Conf->Dictionary).
+ *
+ * Each tree level is searched by bisection for the current byte of the
+ * word. Returns 1 when the word is found, its compound flags are compatible
+ * with 'flag' and (for a nonzero affixflag) the affix data of the word
+ * contain 'affixflag'. Returns 0 otherwise.
+ */
+static int
+FindWord(SharedIspellDict *Conf, const char *word, int affixflag, int flag)
+{
+ SPNode *node = Conf->Dictionary;
+ SPNodeData *StopLow,
+ *StopHigh,
+ *StopMiddle;
+ const uint8 *ptr = (const uint8 *) word;
+
+ flag &= FF_DICTFLAGMASK;
+
+ while (node && *ptr)
+ {
+ StopLow = node->data;
+ StopHigh = node->data + node->length;
+ while (StopLow < StopHigh)
+ {
+ StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+ if (StopMiddle->val == *ptr)
+ {
+ /* last byte of the word and a word ends in this node? */
+ if (*(ptr + 1) == '\0' && StopMiddle->isword)
+ {
+ if (flag == 0)
+ {
+ if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
+ return 0;
+ }
+ else if ((flag & StopMiddle->compoundflag) == 0)
+ return 0;
+
+ if ((affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
+ return 1;
+ }
+ /* descend to the next level of the tree */
+ node = StopMiddle->node;
+ ptr++;
+ break;
+ }
+ else if (StopMiddle->val < *ptr)
+ StopLow = StopMiddle + 1;
+ else
+ StopHigh = StopMiddle;
+ }
+ /* the byte was not found on this level */
+ if (StopLow >= StopHigh)
+ break;
+ }
+ return 0;
+}
+
+/*
+ * Append an affix to the Conf->Affix array, enlarging the array when it is
+ * full.
+ *
+ * The mask decides how the affix condition is stored: "." makes the affix
+ * simple, a mask accepted by RS_isRegis is compiled using RS_compile and
+ * anything else is compiled as a regular expression (anchored with '$' for
+ * suffixes and with '^' for prefixes). Raises an ERROR when the regular
+ * expression is not valid.
+ */
+static void
+NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
+{
+ AFFIX *Affix;
+
+ if (Conf->naffixes >= Conf->maffixes)
+ {
+ if (Conf->maffixes)
+ {
+ Conf->maffixes *= 2;
+ Conf->Affix = (AFFIX *) repalloc((void *) Conf->Affix, Conf->maffixes * sizeof(AFFIX));
+ }
+ else
+ {
+ Conf->maffixes = 16;
+ Conf->Affix = (AFFIX *) palloc(Conf->maffixes * sizeof(AFFIX));
+ }
+ }
+
+ Affix = Conf->Affix + Conf->naffixes;
+
+ if (strcmp(mask, ".") == 0)
+ {
+ Affix->issimple = 1;
+ Affix->isregis = 0;
+ }
+ else if (RS_isRegis(mask))
+ {
+ Affix->issimple = 0;
+ Affix->isregis = 1;
+ RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX) ? true : false,
+ (mask && *mask) ? mask : VoidString);
+ }
+ else
+ {
+ int masklen;
+ int wmasklen;
+ int err;
+ pg_wchar *wmask;
+ char *tmask;
+
+ Affix->issimple = 0;
+ Affix->isregis = 0;
+ /* room for the mask, the anchor and the terminator */
+ tmask = (char *) tmpalloc(strlen(mask) + 3);
+ if (type == FF_SUFFIX)
+ sprintf(tmask, "%s$", mask);
+ else
+ sprintf(tmask, "^%s", mask);
+
+ masklen = strlen(tmask);
+ wmask = (pg_wchar *) tmpalloc((masklen + 1) * sizeof(pg_wchar));
+ wmasklen = pg_mb2wchar_with_len(tmask, wmask, masklen);
+
+ err = pg_regcomp(&(Affix->reg.regex), wmask, wmasklen,
+ REG_ADVANCED | REG_NOSUB,
+ DEFAULT_COLLATION_OID);
+ if (err)
+ {
+ char errstr[100];
+
+ pg_regerror(err, &(Affix->reg.regex), errstr, sizeof(errstr));
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
+ errmsg("invalid regular expression: %s", errstr)));
+ }
+ }
+
+ Affix->flagflags = flagflags;
+ /* compound-only and compound-permit affixes imply the compound flag */
+ if ((Affix->flagflags & FF_COMPOUNDONLY) || (Affix->flagflags & FF_COMPOUNDPERMITFLAG))
+ {
+ if ((Affix->flagflags & FF_COMPOUNDFLAG) == 0)
+ Affix->flagflags |= FF_COMPOUNDFLAG;
+ }
+ Affix->flag = flag;
+ Affix->type = type;
+
+ /* empty strings share the static VoidString instead of a private copy */
+ Affix->find = (find && *find) ? cpstrdup(Conf, find) : VoidString;
+ if ((Affix->replen = strlen(repl)) > 0)
+ Affix->repl = cpstrdup(Conf, repl);
+ else
+ Affix->repl = VoidString;
+ Conf->naffixes++;
+}
+
+#define PAE_WAIT_MASK 0
+#define PAE_INMASK 1
+#define PAE_WAIT_FIND 2
+#define PAE_INFIND 3
+#define PAE_WAIT_REPL 4
+#define PAE_INREPL 5
+
+/*
+ * Parse one affix entry into its mask, find and repl parts.
+ *
+ * The line is processed by a small state machine (the PAE_* states): the
+ * mask is read up to '>', then an optional '-' introduces the find string
+ * (terminated by ','), followed by the repl string. Whitespace is skipped,
+ * '#' as the first character of the entry makes the function return false
+ * and '#' inside the repl string ends the parsing.
+ *
+ * The output buffers are provided by the caller and are always
+ * NUL-terminated on return. Returns true when a mask and at least one of
+ * find / repl were found. Raises an ERROR ("syntax error") on an unexpected
+ * character.
+ */
+static bool
+parse_affentry(char *str, char *mask, char *find, char *repl)
+{
+ int state = PAE_WAIT_MASK;
+ char *pmask = mask,
+ *pfind = find,
+ *prepl = repl;
+
+ *mask = *find = *repl = '\0';
+
+ while (*str)
+ {
+ if (state == PAE_WAIT_MASK)
+ {
+ if (t_iseq(str, '#'))
+ return false;
+ else if (!t_isspace(str))
+ {
+ COPYCHAR(pmask, str);
+ pmask += pg_mblen(str);
+ state = PAE_INMASK;
+ }
+ }
+ else if (state == PAE_INMASK)
+ {
+ if (t_iseq(str, '>'))
+ {
+ *pmask = '\0';
+ state = PAE_WAIT_FIND;
+ }
+ else if (!t_isspace(str))
+ {
+ COPYCHAR(pmask, str);
+ pmask += pg_mblen(str);
+ }
+ }
+ else if (state == PAE_WAIT_FIND)
+ {
+ if (t_iseq(str, '-'))
+ {
+ state = PAE_INFIND;
+ }
+ else if (t_isalpha(str) || t_iseq(str, '\'') /* english 's */ )
+ {
+ /* no find part - this is already the repl string */
+ COPYCHAR(prepl, str);
+ prepl += pg_mblen(str);
+ state = PAE_INREPL;
+ }
+ else if (!t_isspace(str))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("syntax error")));
+ }
+ else if (state == PAE_INFIND)
+ {
+ if (t_iseq(str, ','))
+ {
+ *pfind = '\0';
+ state = PAE_WAIT_REPL;
+ }
+ else if (t_isalpha(str))
+ {
+ COPYCHAR(pfind, str);
+ pfind += pg_mblen(str);
+ }
+ else if (!t_isspace(str))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("syntax error")));
+ }
+ else if (state == PAE_WAIT_REPL)
+ {
+ if (t_iseq(str, '-'))
+ {
+ break; /* void repl */
+ }
+ else if (t_isalpha(str))
+ {
+ COPYCHAR(prepl, str);
+ prepl += pg_mblen(str);
+ state = PAE_INREPL;
+ }
+ else if (!t_isspace(str))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("syntax error")));
+ }
+ else if (state == PAE_INREPL)
+ {
+ if (t_iseq(str, '#'))
+ {
+ *prepl = '\0';
+ break;
+ }
+ else if (t_isalpha(str))
+ {
+ COPYCHAR(prepl, str);
+ prepl += pg_mblen(str);
+ }
+ else if (!t_isspace(str))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("syntax error")));
+ }
+ else
+ elog(ERROR, "unrecognized state in parse_affentry: %d", state);
+
+ str += pg_mblen(str);
+ }
+
+ /* terminate whatever was collected so far */
+ *pmask = *pfind = *prepl = '\0';
+
+ return (*mask && (*find || *repl)) ? true : false;
+}
+
+/*
+ * Store 'val' as the flag value of the first non-space character of 's' and
+ * mark the dictionary as using compounds. Raises an ERROR when there is no
+ * such character or when the character is multibyte.
+ */
+static void
+addFlagValue(IspellDict *Conf, char *s, uint32 val)
+{
+    unsigned char flagchar;
+
+    /* skip the leading whitespace */
+    for (; *s && t_isspace(s); s += pg_mblen(s))
+        ;
+
+    if (*s == '\0')
+        ereport(ERROR,
+                (errcode(ERRCODE_CONFIG_FILE_ERROR),
+                 errmsg("syntax error")));
+
+    if (pg_mblen(s) != 1)
+        ereport(ERROR,
+                (errcode(ERRCODE_CONFIG_FILE_ERROR),
+                 errmsg("multibyte flag character is not allowed")));
+
+    flagchar = *(unsigned char *) s;
+
+    Conf->flagval[flagchar] = (unsigned char) val;
+    Conf->usecompound = true;
+}
+
+/*
+ * Import affixes from a file using the SFX / PFX line format.
+ *
+ * The file is read twice. The first pass resets Conf->flagval and collects
+ * the compound-related flags (COMPOUNDFLAG, COMPOUNDBEGIN, ...) using
+ * addFlagValue; a FLAG line with a value other than "default" raises an
+ * ERROR. The second pass parses the "sfx" / "pfx" lines: a line with four
+ * fields starts a new affix class (flag, suffix or prefix, cross product),
+ * a line with more fields adds one affix of the current class using
+ * NIAddAffix. Lines that don't fit are silently skipped.
+ *
+ * Raises an ERROR when the file can't be opened.
+ */
+static void
+NIImportOOAffixes(IspellDict *Conf, const char *filename)
+{
+ char type[BUFSIZ],
+ *ptype = NULL;
+ char sflag[BUFSIZ];
+ char mask[BUFSIZ],
+ *pmask;
+ char find[BUFSIZ],
+ *pfind;
+ char repl[BUFSIZ],
+ *prepl;
+ bool isSuffix = false;
+ int flag = 0;
+ char flagflags = 0;
+ tsearch_readline_state trst;
+ int scanread = 0;
+ char scanbuf[BUFSIZ];
+ char *recoded;
+
+ /* read file to find any flag */
+ memset(Conf->flagval, 0, sizeof(Conf->flagval));
+ Conf->usecompound = false;
+
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open affix file \"%s\": %m",
+ filename)));
+
+ while ((recoded = tsearch_readline(&trst)) != NULL)
+ {
+ /* skip empty lines, lines starting with whitespace and comments */
+ if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+ {
+ pfree(recoded);
+ continue;
+ }
+
+ if (STRNCMP(recoded, "COMPOUNDFLAG") == 0)
+ addFlagValue(Conf, recoded + strlen("COMPOUNDFLAG"),
+ FF_COMPOUNDFLAG);
+ else if (STRNCMP(recoded, "COMPOUNDBEGIN") == 0)
+ addFlagValue(Conf, recoded + strlen("COMPOUNDBEGIN"),
+ FF_COMPOUNDBEGIN);
+ else if (STRNCMP(recoded, "COMPOUNDLAST") == 0)
+ addFlagValue(Conf, recoded + strlen("COMPOUNDLAST"),
+ FF_COMPOUNDLAST);
+ /* COMPOUNDLAST and COMPOUNDEND are synonyms */
+ else if (STRNCMP(recoded, "COMPOUNDEND") == 0)
+ addFlagValue(Conf, recoded + strlen("COMPOUNDEND"),
+ FF_COMPOUNDLAST);
+ else if (STRNCMP(recoded, "COMPOUNDMIDDLE") == 0)
+ addFlagValue(Conf, recoded + strlen("COMPOUNDMIDDLE"),
+ FF_COMPOUNDMIDDLE);
+ else if (STRNCMP(recoded, "ONLYINCOMPOUND") == 0)
+ addFlagValue(Conf, recoded + strlen("ONLYINCOMPOUND"),
+ FF_COMPOUNDONLY);
+ else if (STRNCMP(recoded, "COMPOUNDPERMITFLAG") == 0)
+ addFlagValue(Conf, recoded + strlen("COMPOUNDPERMITFLAG"),
+ FF_COMPOUNDPERMITFLAG);
+ else if (STRNCMP(recoded, "COMPOUNDFORBIDFLAG") == 0)
+ addFlagValue(Conf, recoded + strlen("COMPOUNDFORBIDFLAG"),
+ FF_COMPOUNDFORBIDFLAG);
+ else if (STRNCMP(recoded, "FLAG") == 0)
+ {
+ char *s = recoded + strlen("FLAG");
+
+ while (*s && t_isspace(s))
+ s += pg_mblen(s);
+
+ if (*s && STRNCMP(s, "default") != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("Ispell dictionary supports only default flag value")));
+ }
+
+ pfree(recoded);
+ }
+ tsearch_readline_end(&trst);
+
+ /* sscanf format with field widths limited to a fraction of BUFSIZ */
+ sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
+
+ /* second pass - read the affixes themselves */
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open affix file \"%s\": %m",
+ filename)));
+
+ while ((recoded = tsearch_readline(&trst)) != NULL)
+ {
+ if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
+ goto nextline;
+
+ scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
+
+ if (ptype)
+ pfree(ptype);
+ ptype = lowerstr_ctx(Conf, type);
+ if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
+ goto nextline;
+
+ if (scanread == 4)
+ {
+ /* header line of an affix class */
+ if (strlen(sflag) != 1)
+ goto nextline;
+ flag = *sflag;
+ isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
+ if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
+ flagflags = FF_CROSSPRODUCT;
+ else
+ flagflags = 0;
+ }
+ else
+ {
+ char *ptr;
+ int aflg = 0;
+
+ /* the line has to belong to the class started by the last header */
+ if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
+ goto nextline;
+ prepl = lowerstr_ctx(Conf, repl);
+ /* affix flag */
+ if ((ptr = strchr(prepl, '/')) != NULL)
+ {
+ *ptr = '\0';
+ /* read the flags from the original (not lower-cased) string */
+ ptr = repl + (ptr - prepl) + 1;
+ while (*ptr)
+ {
+ aflg |= Conf->flagval[*(unsigned char *) ptr];
+ ptr++;
+ }
+ }
+ pfind = lowerstr_ctx(Conf, find);
+ pmask = lowerstr_ctx(Conf, mask);
+ /* "0" stands for an empty string */
+ if (t_iseq(find, '0'))
+ *pfind = '\0';
+ if (t_iseq(repl, '0'))
+ *prepl = '\0';
+
+ NIAddAffix(Conf, flag, flagflags | aflg, pmask, pfind, prepl,
+ isSuffix ? FF_SUFFIX : FF_PREFIX);
+ pfree(prepl);
+ pfree(pfind);
+ pfree(pmask);
+ }
+
+nextline:
+ pfree(recoded);
+ }
+
+ tsearch_readline_end(&trst);
+ if (ptype)
+ pfree(ptype);
+}
+
+/*
+ * import affixes
+ *
+ * Reads the affix file 'filename' line by line and registers every affix
+ * entry in Conf through NIAddAffix().  Conf->flagval and Conf->usecompound
+ * are reset before parsing starts.
+ *
+ * The loop understands the statements "compoundwords", "suffixes",
+ * "prefixes" and "flag"; each of them marks the file as old format.  A line
+ * starting with COMPOUNDFLAG, COMPOUNDMIN, PFX or SFX makes the function
+ * hand the whole file over to NIImportOOAffixes() instead, unless an
+ * old-format statement was already seen, which is reported as an error.
+ *
+ * Failure to open the file is reported with ereport(ERROR).
+ *
+ * Note caller must already have applied get_tsearch_config_filename
+ */
+void
+SharedNIImportAffixes(IspellDict *Conf, const char *filename)
+{
+ char *pstr = NULL;
+ char mask[BUFSIZ];
+ char find[BUFSIZ];
+ char repl[BUFSIZ];
+ char *s;
+ bool suffixes = false;
+ bool prefixes = false;
+ int flag = 0;
+ char flagflags = 0;
+ tsearch_readline_state trst;
+ bool oldformat = false;
+ char *recoded = NULL;
+
+ if (!tsearch_readline_begin(&trst, filename))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("could not open affix file \"%s\": %m",
+ filename)));
+
+ memset(Conf->flagval, 0, sizeof(Conf->flagval));
+ Conf->usecompound = false;
+
+ while ((recoded = tsearch_readline(&trst)) != NULL)
+ {
+ /* keywords are matched on the lowercased copy, flags are read from recoded */
+ pstr = lowerstr(recoded);
+
+ /* Skip comments and empty lines */
+ if (*pstr == '#' || *pstr == '\n')
+ goto nextline;
+
+ if (STRNCMP(pstr, "compoundwords") == 0)
+ {
+ s = findchar(pstr, 'l');
+ if (s)
+ {
+ s = recoded + (s - pstr); /* we need non-lowercased
+ * string */
+ /* skip the rest of the current token, then the whitespace after it */
+ while (*s && !t_isspace(s))
+ s += pg_mblen(s);
+ while (*s && t_isspace(s))
+ s += pg_mblen(s);
+
+ /* only a single-byte character is accepted as the compound flag */
+ if (*s && pg_mblen(s) == 1)
+ {
+ Conf->flagval[*(unsigned char *) s] = FF_COMPOUNDFLAG;
+ Conf->usecompound = true;
+ }
+ oldformat = true;
+ goto nextline;
+ }
+ }
+ /* section headers select whether following entries are suffixes or prefixes */
+ if (STRNCMP(pstr, "suffixes") == 0)
+ {
+ suffixes = true;
+ prefixes = false;
+ oldformat = true;
+ goto nextline;
+ }
+ if (STRNCMP(pstr, "prefixes") == 0)
+ {
+ suffixes = false;
+ prefixes = true;
+ oldformat = true;
+ goto nextline;
+ }
+ if (STRNCMP(pstr, "flag") == 0)
+ {
+ s = recoded + 4; /* we need non-lowercased string */
+ flagflags = 0;
+
+ while (*s && t_isspace(s))
+ s += pg_mblen(s);
+ oldformat = true;
+
+ /* allow only single-encoded flags */
+ if (pg_mblen(s) != 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("multibyte flag character is not allowed")));
+
+ /* optional modifier: '*' sets FF_CROSSPRODUCT, '~' sets FF_COMPOUNDONLY */
+ if (*s == '*')
+ {
+ flagflags |= FF_CROSSPRODUCT;
+ s++;
+ }
+ else if (*s == '~')
+ {
+ flagflags |= FF_COMPOUNDONLY;
+ s++;
+ }
+
+ /* a backslash before the flag character is skipped */
+ if (*s == '\\')
+ s++;
+
+ /* allow only single-encoded flags */
+ if (pg_mblen(s) != 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("multibyte flag character is not allowed")));
+
+ /* this flag applies to all affix entries until the next "flag" line */
+ flag = *(unsigned char *) s;
+ goto nextline;
+ }
+ if (STRNCMP(recoded, "COMPOUNDFLAG") == 0 || STRNCMP(recoded, "COMPOUNDMIN") == 0 ||
+ STRNCMP(recoded, "PFX") == 0 || STRNCMP(recoded, "SFX") == 0)
+ {
+ if (oldformat)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIG_FILE_ERROR),
+ errmsg("wrong affix file format for flag")));
+ /*
+ * NOTE(review): this path returns without pfree'ing recoded and
+ * pstr, unlike the nextline path below.
+ */
+ tsearch_readline_end(&trst);
+ NIImportOOAffixes(Conf, filename);
+ return;
+ }
+ /* affix entries are ignored until a suffixes/prefixes section has started */
+ if ((!suffixes) && (!prefixes))
+ goto nextline;
+
+ if (!parse_affentry(pstr, mask, find, repl))
+ goto nextline;
+
+ NIAddAffix(Conf, flag, flagflags, mask, find, repl, suffixes ? FF_SUFFIX : FF_PREFIX);
+
+nextline:
+ pfree(recoded);
+ pfree(pstr);
+ }
+ tsearch_readline_end(&trst);
+}
+
+/*
+ * Append a new AffixData entry holding the flag strings of entries a1 and
+ * a2 joined by a single space, and return the index of the new entry.
+ *
+ * The array is doubled until it has room for the new entry plus the NULL
+ * pointer that is stored right behind it.
+ */
+static int
+MergeAffix(IspellDict *Conf, int a1, int a2)
+{
+    int         slot;
+
+    while (Conf->lenAffixData <= Conf->nAffixData + 1)
+    {
+        Conf->lenAffixData *= 2;
+        Conf->AffixData = (char **) repalloc(Conf->AffixData,
+                                             sizeof(char *) * Conf->lenAffixData);
+    }
+
+    slot = Conf->nAffixData;
+
+    /* both strings, the separating space and the terminating \0 */
+    Conf->AffixData[slot] = cpalloc(strlen(Conf->AffixData[a1]) +
+                                    strlen(Conf->AffixData[a2]) +
+                                    1 + 1);
+    sprintf(Conf->AffixData[slot], "%s %s",
+            Conf->AffixData[a1], Conf->AffixData[a2]);
+
+    Conf->AffixData[slot + 1] = NULL;
+    Conf->nAffixData = slot + 1;
+
+    return slot;
+}
+
+/*
+ * OR together Conf->flagval[] for every byte of the flag string stored in
+ * Conf->AffixData[affix] and return the bits selected by FF_DICTFLAGMASK.
+ * A NULL flag string yields 0.
+ */
+static uint32
+makeCompoundFlags(IspellDict *Conf, int affix)
+{
+    const unsigned char *p = (const unsigned char *) Conf->AffixData[affix];
+    uint32      bits = 0;
+
+    if (p != NULL)
+    {
+        for (; *p != '\0'; p++)
+            bits |= Conf->flagval[*p];
+    }
+
+    return (bits & FF_DICTFLAGMASK);
+}
+
+/*
+ * Recursively build one node of the dictionary tree from the entries
+ * Conf->Spell[low .. high-1], looking at byte position 'level' of each word.
+ *
+ * A first pass counts how often the byte at 'level' changes between
+ * consecutive entries that are longer than 'level'; that count becomes the
+ * number of SPNodeData slots.  A second pass fills the slots and recurses
+ * with level + 1 for each run of entries sharing the same byte.  The caller
+ * (SharedNISortDictionary) sorts Conf->Spell with cmpspell beforehand;
+ * presumably that ordering keeps equal bytes adjacent.
+ *
+ * Returns NULL when no entry in the range is longer than 'level'.
+ */
+static SPNode *
+mkSPNode(IspellDict *Conf, int low, int high, int level)
+{
+ int i;
+ int nchar = 0;
+ char lastchar = '\0';
+ SPNode *rs;
+ SPNodeData *data;
+ int lownew = low;
+
+ /* pass 1: count the distinct runs of the byte at 'level' */
+ for (i = low; i < high; i++)
+ if (Conf->Spell[i]->p.d.len > level && lastchar != Conf->Spell[i]->word[level])
+ {
+ nchar++;
+ lastchar = Conf->Spell[i]->word[level];
+ }
+
+ if (!nchar)
+ return NULL;
+
+ rs = (SPNode *) cpalloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
+ rs->length = nchar;
+ data = rs->data;
+
+ /* pass 2: fill one slot per run, recursing when a run ends */
+ lastchar = '\0';
+ for (i = low; i < high; i++)
+ if (Conf->Spell[i]->p.d.len > level)
+ {
+ if (lastchar != Conf->Spell[i]->word[level])
+ {
+ if (lastchar)
+ {
+ /* previous run is complete: build its subtree and move to the next slot */
+ data->node = mkSPNode(Conf, lownew, i, level + 1);
+ lownew = i;
+ data++;
+ }
+ lastchar = Conf->Spell[i]->word[level];
+ }
+ data->val = ((uint8 *) (Conf->Spell[i]->word))[level];
+ /* the word ends exactly at this byte */
+ if (Conf->Spell[i]->p.d.len == level + 1)
+ {
+ bool clearCompoundOnly = false;
+
+ /* same word seen before with a different flag set: merge the two */
+ if (data->isword && data->affix != Conf->Spell[i]->p.d.affix)
+ {
+ /*
+ * MergeAffix called a few times. If one of word is
+ * allowed to be in compound word and another isn't, then
+ * clear FF_COMPOUNDONLY flag.
+ */
+
+ clearCompoundOnly = (FF_COMPOUNDONLY & data->compoundflag
+ & makeCompoundFlags(Conf, Conf->Spell[i]->p.d.affix))
+ ? false : true;
+ data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i]->p.d.affix);
+ }
+ else
+ data->affix = Conf->Spell[i]->p.d.affix;
+ data->isword = 1;
+
+ data->compoundflag = makeCompoundFlags(Conf, data->affix);
+
+ /* FF_COMPOUNDONLY without any position bit gets all position bits */
+ if ((data->compoundflag & FF_COMPOUNDONLY) &&
+ (data->compoundflag & FF_COMPOUNDFLAG) == 0)
+ data->compoundflag |= FF_COMPOUNDFLAG;
+
+ if (clearCompoundOnly)
+ data->compoundflag &= ~FF_COMPOUNDONLY;
+ }
+ }
+
+ /* subtree for the last run */
+ data->node = mkSPNode(Conf, lownew, high, level + 1);
+
+ return rs;
+}
+
+/*
+ * Turn the imported word list into the final lookup structures:
+ * Conf->AffixData receives one copy of every distinct flag string used by
+ * the words, each Conf->Spell entry gets its flag text replaced by the index
+ * of that string plus the word length, and Conf->Dictionary is built from
+ * the re-sorted word list.
+ */
+void
+SharedNISortDictionary(IspellDict *Conf)
+{
+    int         idx;
+    int         distinct = 0;
+    int         slot = -1;
+
+    /* group words by flag string so that equal flag strings are adjacent */
+    qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
+
+    /* count the distinct flag strings */
+    for (idx = 0; idx < Conf->nspell; idx++)
+    {
+        if (idx > 0 &&
+            strncmp(Conf->Spell[idx]->p.flag,
+                    Conf->Spell[idx - 1]->p.flag, MAXFLAGLEN) == 0)
+            continue;
+        distinct++;
+    }
+
+    Conf->AffixData = (char **) palloc0(distinct * sizeof(char *));
+
+    /*
+     * Store each distinct flag string once.  The comparison is made against
+     * the stored copy, not the previous entry, because the previous entry's
+     * flag text has already been overwritten by its index and length.
+     */
+    for (idx = 0; idx < Conf->nspell; idx++)
+    {
+        SPELL      *entry = Conf->Spell[idx];
+
+        if (idx == 0 ||
+            strncmp(entry->p.flag, Conf->AffixData[slot], MAXFLAGLEN) != 0)
+        {
+            slot++;
+            Assert(slot < distinct);
+            Conf->AffixData[slot] = cpstrdup(Conf, entry->p.flag);
+        }
+
+        entry->p.d.affix = slot;
+        entry->p.d.len = strlen(entry->word);
+    }
+
+    Conf->lenAffixData = Conf->nAffixData = distinct;
+
+    /* re-sort for tree construction and build the tree */
+    qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
+    Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
+}
+
+/*
+ * Recursively build one node of an affix tree from Conf->Affix[low .. high-1],
+ * looking at the character GETCHAR() yields for position 'level' and the
+ * given 'type' (FF_PREFIX or FF_SUFFIX as passed by SharedNISortAffixes).
+ *
+ * Each slot of the node covers one run of affixes sharing that character.
+ * Affixes whose replen equals level + 1 end at the slot and are collected in
+ * its aff[] array; longer ones go into the slot's child node.
+ *
+ * Returns NULL when no affix in the range has replen greater than 'level'.
+ */
+static AffixNode *
+mkANode(IspellDict *Conf, int low, int high, int level, int type)
+{
+ int i;
+ int nchar = 0;
+ uint8 lastchar = '\0';
+ AffixNode *rs;
+ AffixNodeData *data;
+ int lownew = low;
+ int naff;
+ AFFIX **aff;
+
+ /* pass 1: count the distinct runs of the character at 'level' */
+ for (i = low; i < high; i++)
+ if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
+ {
+ nchar++;
+ lastchar = GETCHAR(Conf->Affix + i, level, type);
+ }
+
+ if (!nchar)
+ return NULL;
+
+ /* scratch list of the affixes ending in the current slot; freed before returning */
+ aff = (AFFIX **) tmpalloc(sizeof(AFFIX *) * (high - low + 1));
+ naff = 0;
+
+ rs = (AffixNode *) cpalloc0(ANHRDSZ + nchar * sizeof(AffixNodeData));
+ rs->length = nchar;
+ data = rs->data;
+
+ /* pass 2: fill one slot per run */
+ lastchar = '\0';
+ for (i = low; i < high; i++)
+ if (Conf->Affix[i].replen > level)
+ {
+ if (lastchar != GETCHAR(Conf->Affix + i, level, type))
+ {
+ if (lastchar)
+ {
+ /* previous run is complete: build its subtree and store its affix list */
+ data->node = mkANode(Conf, lownew, i, level + 1, type);
+ if (naff)
+ {
+ data->naff = naff;
+ data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
+ memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
+ naff = 0;
+ }
+ data++;
+ lownew = i;
+ }
+ lastchar = GETCHAR(Conf->Affix + i, level, type);
+ }
+ data->val = GETCHAR(Conf->Affix + i, level, type);
+ if (Conf->Affix[i].replen == level + 1)
+ { /* affix stopped */
+ aff[naff++] = Conf->Affix + i;
+ }
+ }
+
+ /* finish the last run the same way */
+ data->node = mkANode(Conf, lownew, high, level + 1, type);
+ if (naff)
+ {
+ data->naff = naff;
+ data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * naff);
+ memcpy(data->aff, aff, sizeof(AFFIX *) * naff);
+ naff = 0;
+ }
+
+ pfree(aff);
+
+ return rs;
+}
+
+/*
+ * Put a one-slot "void" node in front of the suffix tree (issuffix) or the
+ * prefix tree (!issuffix).  The slot lists every affix of that kind whose
+ * replen is 0 and points to the previous tree root as its child.
+ *
+ * Prefixes are taken from Conf->Affix[0 .. startsuffix-1], suffixes from
+ * Conf->Affix[startsuffix .. naffixes-1].
+ */
+static void
+mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
+{
+    AffixNode  *root = (AffixNode *) palloc0(ANHRDSZ + sizeof(AffixNodeData));
+    int         first;
+    int         last;
+    int         idx;
+    int         nvoid = 0;
+    int         filled = 0;
+
+    root->length = 1;
+    root->isvoid = 1;
+
+    /* pick the index range and hook the new node in as the tree root */
+    if (issuffix)
+    {
+        first = startsuffix;
+        last = Conf->naffixes;
+        root->data->node = Conf->Suffix;
+        Conf->Suffix = root;
+    }
+    else
+    {
+        first = 0;
+        last = startsuffix;
+        root->data->node = Conf->Prefix;
+        Conf->Prefix = root;
+    }
+
+    for (idx = first; idx < last; idx++)
+    {
+        if (Conf->Affix[idx].replen == 0)
+            nvoid++;
+    }
+
+    /* nothing to list: the node stays with naff == 0 */
+    if (nvoid == 0)
+        return;
+
+    root->data->aff = (AFFIX **) cpalloc(sizeof(AFFIX *) * nvoid);
+    root->data->naff = (uint32) nvoid;
+
+    for (idx = first; idx < last; idx++)
+    {
+        if (Conf->Affix[idx].replen == 0)
+            root->data->aff[filled++] = Conf->Affix + idx;
+    }
+}
+
+/*
+ * Report whether the character 'flag' occurs in any of the flag strings
+ * stored in Conf->AffixData[0 .. nAffixData-1].
+ */
+static bool
+isAffixInUse(IspellDict *Conf, char flag)
+{
+    int         remaining = Conf->nAffixData;
+    char      **entry = Conf->AffixData;
+
+    while (remaining-- > 0)
+    {
+        if (strchr(*entry, flag) != NULL)
+            return true;
+        entry++;
+    }
+
+    return false;
+}
+
+/*
+ * Sort the imported affixes and build everything that is derived from them:
+ *
+ *  - Conf->CompoundAffix: the affixes that carry a compound flag, have a
+ *    non-empty replacement and whose flag is used by the dictionary; the
+ *    list ends with an entry whose affix is NULL.  Conf->cmpaffixes is the
+ *    number of entries including that terminator.
+ *  - Conf->Prefix / Conf->Suffix: the affix trees, each topped with the
+ *    "void" node created by mkVoidAffix().
+ *
+ * Does nothing when no affixes were imported.
+ *
+ * Fixes relative to the previous version:
+ *  - the uniqueness test compared the 'issuffix' field of the slot that was
+ *    about to be filled, i.e. uninitialised memory, with the previous
+ *    entry; the kind of the current affix is compared now;
+ *  - the entry count is taken before repalloc(), because the old pointer
+ *    must not be used once the array may have been moved;
+ *  - one extra slot is allocated so the terminator fits even if every
+ *    affix is kept.
+ */
+void
+SharedNISortAffixes(IspellDict *Conf)
+{
+    AFFIX      *Affix;
+    int         i;
+    CMPDAffix  *ptr;
+    int         firstsuffix = Conf->naffixes;
+    int         ncompound;
+
+    if (Conf->naffixes == 0)
+        return;
+
+    if (Conf->naffixes > 1)
+        qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
+
+    /* naffixes entries at most, plus the NULL terminator */
+    Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * (Conf->naffixes + 1));
+    ptr->affix = NULL;
+
+    for (i = 0; i < Conf->naffixes; i++)
+    {
+        Affix = &(((AFFIX *) Conf->Affix)[i]);
+
+        /* remember where the suffixes start in the sorted array */
+        if (Affix->type == FF_SUFFIX && i < firstsuffix)
+            firstsuffix = i;
+
+        if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
+            isAffixInUse(Conf, (char) Affix->flag))
+        {
+            bool        issuffix = (Affix->type == FF_SUFFIX) ? true : false;
+
+            if (ptr == Conf->CompoundAffix ||
+                issuffix != (ptr - 1)->issuffix ||
+                strbncmp((const unsigned char *) (ptr - 1)->affix,
+                         (const unsigned char *) Affix->repl,
+                         (ptr - 1)->len))
+            {
+                /* keep only unique and minimal affixes */
+                ptr->affix = Affix->repl;
+                ptr->len = Affix->replen;
+                ptr->issuffix = issuffix;
+                ptr++;
+            }
+        }
+    }
+    ptr->affix = NULL;
+
+    /* count (terminator included) while ptr still points into the array */
+    ncompound = (int) (ptr - Conf->CompoundAffix) + 1;
+    Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * ncompound);
+    Conf->cmpaffixes = ncompound;
+
+    /* prefixes occupy [0, firstsuffix), suffixes [firstsuffix, naffixes) */
+    Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
+    Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
+    mkVoidAffix(Conf, true, firstsuffix);
+    mkVoidAffix(Conf, false, firstsuffix);
+}
+
+/*
+ * Walk an affix tree along 'word' and return the next slot that has
+ * affixes attached, or NULL when the walk ends without finding one.
+ *
+ * *level is the number of characters matched so far; it is advanced for
+ * every character consumed, so the caller can resume the walk by passing
+ * the returned slot's child node together with the same level variable.
+ * The character examined at each step is whatever GETWCHAR() yields for
+ * (word, wrdlen, *level, type).
+ *
+ * A "void" root node is answered first: if it lists affixes it is returned
+ * as is, otherwise the search continues in its child.
+ */
+static AffixNodeData *
+FindAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type)
+{
+    if (node->isvoid)
+    {
+        if (node->data->naff)
+            return node->data;
+        node = node->data->node;
+    }
+
+    while (node && *level < wrdlen)
+    {
+        AffixNodeData *lo = node->data;
+        AffixNodeData *hi = node->data + node->length;
+        bool        descended = false;
+
+        /* binary search for the current character among the node's slots */
+        while (lo < hi)
+        {
+            AffixNodeData *mid = lo + ((hi - lo) >> 1);
+            uint8       ch = GETWCHAR(word, wrdlen, *level, type);
+
+            if (mid->val < ch)
+                lo = mid + 1;
+            else if (mid->val != ch)
+                hi = mid;
+            else
+            {
+                (*level)++;
+                if (mid->naff)
+                    return mid;
+                node = mid->node;
+                descended = true;
+                break;
+            }
+        }
+
+        /* character not present in this node: the walk is over */
+        if (!descended)
+            break;
+    }
+
+    return NULL;
+}
+
+/*
+ * Undo one affix on 'word' and check whether the result is acceptable.
+ *
+ * word, len   the word form and its length in bytes
+ * Affix       the affix to strip
+ * flagflags   0 for a standalone word, otherwise the compound position
+ *             being tested (FF_COMPOUNDBEGIN / MIDDLE / LAST)
+ * newword     output buffer receiving the rebuilt word
+ * baselen     may be NULL.  For a suffix it receives the length of the
+ *             unchanged leading part; for a prefix it is read to reject
+ *             words that would consist of affixes only.
+ *
+ * Returns newword when the affix is allowed in this position and the
+ * rebuilt word passes the affix's condition, NULL otherwise.
+ */
+static char *
+CheckAffix(const char *word, size_t len, AFFIX *Affix, int flagflags, char *newword, int *baselen)
+{
+    pg_wchar   *wide;
+    size_t      wide_len;
+    int         nbytes;
+    bool        matched;
+
+    /* Is the affix allowed for this kind of (sub)word at all? */
+    if (flagflags == 0)
+    {
+        if (Affix->flagflags & FF_COMPOUNDONLY)
+            return NULL;
+    }
+    else if (flagflags & FF_COMPOUNDBEGIN)
+    {
+        if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
+            return NULL;
+        if (!(Affix->flagflags & FF_COMPOUNDBEGIN) && Affix->type == FF_SUFFIX)
+            return NULL;
+    }
+    else if (flagflags & FF_COMPOUNDMIDDLE)
+    {
+        if (!(Affix->flagflags & FF_COMPOUNDMIDDLE))
+            return NULL;
+        if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
+            return NULL;
+    }
+    else if (flagflags & FF_COMPOUNDLAST)
+    {
+        if (Affix->flagflags & FF_COMPOUNDFORBIDFLAG)
+            return NULL;
+        if (!(Affix->flagflags & FF_COMPOUNDLAST) && Affix->type == FF_PREFIX)
+            return NULL;
+    }
+
+    /* Rebuild the candidate base form in newword. */
+    if (Affix->type != FF_SUFFIX)
+    {
+        /*
+         * If the prefix covers the whole unchanged part, the word would
+         * consist of prefix and suffix only, so give up.
+         */
+        if (baselen && *baselen + strlen(Affix->find) <= Affix->replen)
+            return NULL;
+        strcpy(newword, Affix->find);
+        strcat(newword, word + Affix->replen);
+    }
+    else
+    {
+        strcpy(newword, word);
+        strcpy(newword + len - Affix->replen, Affix->find);
+        if (baselen)            /* length of the unchanged part of the word */
+            *baselen = len - Affix->replen;
+    }
+
+    /* Check the rebuilt word against the affix's condition. */
+    if (Affix->issimple)
+        return newword;
+
+    if (Affix->isregis)
+        return RS_execute(&(Affix->reg.regis), newword) ? newword : NULL;
+
+    /* full regular expression: it is matched against wide characters */
+    nbytes = strlen(newword);
+    wide = (pg_wchar *) palloc((nbytes + 1) * sizeof(pg_wchar));
+    wide_len = pg_mb2wchar_with_len(newword, wide, nbytes);
+
+    matched = (pg_regexec(&(Affix->reg.regex), wide, wide_len, 0, NULL, 0, NULL, 0) == 0);
+    pfree(wide);
+
+    return matched ? newword : NULL;
+}
+
+/*
+ * Store a copy of 'word' at *cur in the NULL-terminated result array that
+ * starts at 'forms', followed by a new NULL terminator.
+ *
+ * Nothing is stored when the array already holds MAX_NORM - 1 entries or
+ * when 'word' equals the entry stored just before 'cur'.
+ *
+ * Returns 1 if the word was stored, 0 otherwise, so the caller can add the
+ * result to its cursor.
+ */
+static int
+addToResult(char **forms, char **cur, char *word)
+{
+    if (cur - forms >= MAX_NORM - 1)
+        return 0;
+
+    /* skip an immediate duplicate of the previous entry */
+    if (forms != cur && strcmp(word, cur[-1]) == 0)
+        return 0;
+
+    cur[0] = pstrdup(word);
+    cur[1] = NULL;
+    return 1;
+}
+
+/*
+ * Find the normal (dictionary) forms of 'word'.
+ *
+ * Three kinds of candidates are tried: the word itself, the word with one
+ * prefix undone, and the word with one suffix undone, optionally followed
+ * by undoing one prefix on that result.  Every candidate that FindWord()
+ * accepts is appended to the result through addToResult().
+ *
+ * 'flag' is handed to FindWord() and CheckAffix(); callers in this file
+ * pass 0 or one of the FF_COMPOUND* position flags.
+ *
+ * Returns a palloc'd, NULL-terminated array of palloc'd strings, or NULL
+ * when the word is longer than MAXNORMLEN or no form was found.
+ */
+static char **
+NormalizeSubWord(SharedIspellDict *Conf, char *word, int flag)
+{
+ AffixNodeData *suffix = NULL,
+ *prefix = NULL;
+ int slevel = 0,
+ plevel = 0;
+ int wrdlen = strlen(word),
+ swrdlen;
+ char **forms;
+ char **cur;
+ char newword[2 * MAXNORMLEN] = "";
+ char pnewword[2 * MAXNORMLEN] = "";
+ AffixNode *snode = Conf->Suffix,
+ *pnode;
+ int i,
+ j;
+
+ if (wrdlen > MAXNORMLEN)
+ return NULL;
+ cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
+ *cur = NULL;
+
+
+ /* Check that the word itself is normal form */
+ if (FindWord(Conf, word, 0, flag))
+ {
+ *cur = pstrdup(word);
+ cur++;
+ *cur = NULL;
+ }
+
+ /* Find all other NORMAL forms of the 'word' (check only prefix) */
+ pnode = Conf->Prefix;
+ plevel = 0;
+ while (pnode)
+ {
+ /* each call continues the walk from where the previous one stopped */
+ prefix = FindAffixes(pnode, word, wrdlen, &plevel, FF_PREFIX);
+ if (!prefix)
+ break;
+ for (j = 0; j < prefix->naff; j++)
+ {
+ if (CheckAffix(word, wrdlen, prefix->aff[j], flag, newword, NULL))
+ {
+ /* prefix success */
+ if (FindWord(Conf, newword, prefix->aff[j]->flag, flag))
+ cur += addToResult(forms, cur, newword);
+ }
+ }
+ pnode = prefix->node;
+ }
+
+ /*
+ * Find all other NORMAL forms of the 'word' (check suffix and then
+ * prefix)
+ */
+ while (snode)
+ {
+ int baselen = 0;
+
+ /* find possible suffix */
+ suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
+ if (!suffix)
+ break;
+ /* foreach suffix check affix */
+ for (i = 0; i < suffix->naff; i++)
+ {
+ if (CheckAffix(word, wrdlen, suffix->aff[i], flag, newword, &baselen))
+ {
+ /* suffix success */
+ if (FindWord(Conf, newword, suffix->aff[i]->flag, flag))
+ cur += addToResult(forms, cur, newword);
+
+ /* now we will look changed word with prefixes */
+ pnode = Conf->Prefix;
+ plevel = 0;
+ swrdlen = strlen(newword);
+ while (pnode)
+ {
+ prefix = FindAffixes(pnode, newword, swrdlen, &plevel, FF_PREFIX);
+ if (!prefix)
+ break;
+ for (j = 0; j < prefix->naff; j++)
+ {
+ if (CheckAffix(newword, swrdlen, prefix->aff[j], flag, pnewword, &baselen))
+ {
+ /* prefix success */
+ /* 0 is passed when both affixes carry FF_CROSSPRODUCT, else the prefix's flag */
+ int ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
+ 0 : prefix->aff[j]->flag;
+
+ if (FindWord(Conf, pnewword, ff, flag))
+ cur += addToResult(forms, cur, pnewword);
+ }
+ }
+ pnode = prefix->node;
+ }
+ }
+ }
+
+ snode = suffix->node;
+ }
+
+ /* nothing found: hand back NULL instead of an empty array */
+ if (cur == forms)
+ {
+ pfree(forms);
+ return (NULL);
+ }
+ return (forms);
+}
+
+/*
+ * One way of splitting a compound word into stems.  Variants are chained
+ * through 'next'.
+ */
+typedef struct SplitVar
+{
+ int nstem; /* number of entries used in stem[] */
+ int lenstem; /* allocated length of stem[]; AddStem() doubles it when full */
+ char **stem; /* the stems, in the order they were added */
+ struct SplitVar *next; /* next variant in the list, or NULL */
+} SplitVar;
+
+/*
+ * Scan the compound-affix list from *ptr for the next entry that matches
+ * 'word' (of which 'len' bytes remain).
+ *
+ * With CheckInPlace the entry must match at the very start of 'word';
+ * otherwise it may occur anywhere in it.  Only entries shorter than 'len'
+ * are considered.
+ *
+ * *ptr is left on the entry after the one examined last, so repeated calls
+ * walk through the list.
+ *
+ * Returns -1 when the end of the list is reached without a match.  On a
+ * match the result is 0 if the entry is not a suffix, else the offset just
+ * past the matched affix (its length, plus its position within 'word' in
+ * the non-in-place case).
+ */
+static int
+CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len, bool CheckInPlace)
+{
+    for (; (*ptr)->affix != NULL; (*ptr)++)
+    {
+        CMPDAffix  *cand = *ptr;
+        int         endpos;
+
+        if (len <= cand->len)
+            continue;
+
+        if (CheckInPlace)
+        {
+            if (strncmp(cand->affix, word, cand->len) != 0)
+                continue;
+            endpos = cand->len;
+        }
+        else
+        {
+            char       *hit = strstr(word, cand->affix);
+
+            if (hit == NULL)
+                continue;
+            endpos = cand->len + (hit - word);
+        }
+
+        /* step past the matched entry before reporting it */
+        (*ptr)++;
+        return (cand->issuffix) ? endpos : 0;
+    }
+
+    return -1;
+}
+
+/*
+ * Create a new SplitVar.
+ *
+ * With s == NULL the result is empty and has room for 16 stems.  Otherwise
+ * it gets the same capacity and stems as 's'; the stem strings are
+ * duplicated when 'makedup' is non-zero and shared with 's' when it is 0.
+ * The 'next' link of the result is always NULL.
+ */
+static SplitVar *
+CopyVar(SplitVar *s, int makedup)
+{
+    SplitVar   *copy = (SplitVar *) palloc(sizeof(SplitVar));
+    int         k;
+
+    copy->next = NULL;
+
+    if (s == NULL)
+    {
+        copy->lenstem = 16;
+        copy->stem = (char **) palloc(sizeof(char *) * copy->lenstem);
+        copy->nstem = 0;
+        return copy;
+    }
+
+    copy->lenstem = s->lenstem;
+    copy->stem = (char **) palloc(sizeof(char *) * copy->lenstem);
+    copy->nstem = s->nstem;
+
+    for (k = 0; k < s->nstem; k++)
+    {
+        if (makedup)
+            copy->stem[k] = pstrdup(s->stem[k]);
+        else
+            copy->stem[k] = s->stem[k];
+    }
+
+    return copy;
+}
+
+/*
+ * Append 'word' to the stems of 'v', doubling the stem array first when it
+ * is full.  The pointer is stored as is; the string is not copied.
+ */
+static void
+AddStem(SplitVar *v, char *word)
+{
+    if (v->lenstem <= v->nstem)
+    {
+        v->lenstem *= 2;
+        v->stem = (char **) repalloc(v->stem, sizeof(char *) * v->lenstem);
+    }
+
+    v->stem[v->nstem++] = word;
+}
+
+/*
+ * Build the list of ways in which word[startpos ..] can be split into
+ * compound parts.
+ *
+ * Conf      dictionary to look the parts up in
+ * snode     dictionary node to resume the walk at, or NULL to start at the
+ *           root (Conf->Dictionary)
+ * orig      variant whose stems (duplicated) seed the result; may be NULL
+ * word      the whole word, wordlen its length in bytes
+ * startpos  position where the part currently being matched begins
+ * minpos    a part is only accepted if it ends after this position; when
+ *           snode is given, the walk resumes at this position
+ *
+ * Returns a palloc'd SplitVar.  Alternative splits found on the way are
+ * appended to its 'next' chain.  The remainder of the word that could not
+ * be split further is always added as the last stem.
+ *
+ * Fix relative to the previous version: when a dictionary word ends at
+ * 'level', the code decided whether it is the first part of the compound
+ * with "level == FF_COMPOUNDBEGIN", i.e. it compared a byte position with
+ * the flag constant.  The part is the first one exactly when it starts at
+ * the beginning of the word, so the test is "startpos == 0" now.
+ */
+static SplitVar *
+SplitToVariants(SharedIspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int wordlen, int startpos, int minpos)
+{
+    SplitVar   *var = NULL;
+    SPNodeData *StopLow,
+               *StopHigh,
+               *StopMiddle = NULL;
+    SPNode     *node = (snode) ? snode : Conf->Dictionary;
+    int         level = (snode) ? minpos : startpos;    /* recursive
+                                                         * minpos==level */
+    int         lenaff;
+    CMPDAffix  *caff;
+    char       *notprobed;
+    int         compoundflag = 0;
+
+    /* notprobed[i] != 0: no part ending at position i has been accepted yet */
+    notprobed = (char *) palloc(wordlen);
+    memset(notprobed, 1, wordlen);
+    var = CopyVar(orig, 1);
+
+    while (level < wordlen)
+    {
+        /* find word with epenthetic or/and compound affix */
+        caff = Conf->CompoundAffix;
+        while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) >= 0)
+        {
+            /*
+             * there is one of the compound affixes, so check whether the
+             * part up to and including it is a known word
+             */
+            char        buf[MAXNORMLEN];
+            char      **subres;
+
+            /* from here on lenaff is the length of the part, counted from startpos */
+            lenaff = level - startpos + lenaff;
+
+            if (!notprobed[startpos + lenaff - 1])
+                continue;
+
+            if (level + lenaff - 1 <= minpos)
+                continue;
+
+            if (lenaff >= MAXNORMLEN)
+                continue;       /* skip too big value */
+            if (lenaff > 0)
+                memcpy(buf, word + startpos, lenaff);
+            buf[lenaff] = '\0';
+
+            /*
+             * NOTE(review): this loop only runs with level > startpos, so
+             * "level == 0" looks unreachable here -- confirm intent.
+             */
+            if (level == 0)
+                compoundflag = FF_COMPOUNDBEGIN;
+            else if (level == wordlen - 1)
+                compoundflag = FF_COMPOUNDLAST;
+            else
+                compoundflag = FF_COMPOUNDMIDDLE;
+            subres = NormalizeSubWord(Conf, buf, compoundflag);
+            if (subres)
+            {
+                /* Yes, it was a word from dictionary */
+                SplitVar   *new = CopyVar(var, 0);
+                SplitVar   *ptr = var;
+                char      **sptr = subres;
+
+                notprobed[startpos + lenaff - 1] = 0;
+
+                while (*sptr)
+                {
+                    AddStem(new, *sptr);
+                    sptr++;
+                }
+                pfree(subres);
+
+                /* split the rest of the word and chain the results to var */
+                while (ptr->next)
+                    ptr = ptr->next;
+                ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos + lenaff, startpos + lenaff);
+
+                /* the recursive call duplicated the stems; drop the scratch copy */
+                pfree(new->stem);
+                pfree(new);
+            }
+        }
+
+        if (!node)
+            break;
+
+        /* binary search for word[level] among the slots of the current node */
+        StopLow = node->data;
+        StopHigh = node->data + node->length;
+        while (StopLow < StopHigh)
+        {
+            StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+            if (StopMiddle->val == ((uint8 *) (word))[level])
+                break;
+            else if (StopMiddle->val < ((uint8 *) (word))[level])
+                StopLow = StopMiddle + 1;
+            else
+                StopHigh = StopMiddle;
+        }
+
+        if (StopLow < StopHigh)
+        {
+            /* position of the part word[startpos .. level] within the compound */
+            if (startpos == 0)
+                compoundflag = FF_COMPOUNDBEGIN;
+            else if (level == wordlen - 1)
+                compoundflag = FF_COMPOUNDLAST;
+            else
+                compoundflag = FF_COMPOUNDMIDDLE;
+
+            /* find infinitive */
+            if (StopMiddle->isword &&
+                (StopMiddle->compoundflag & compoundflag) &&
+                notprobed[level])
+            {
+                /* ok, we found full compoundallowed word */
+                if (level > minpos)
+                {
+                    /* and its length more than minimal */
+                    if (wordlen == level + 1)
+                    {
+                        /* well, it was last word */
+                        AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
+                        pfree(notprobed);
+                        return var;
+                    }
+                    else
+                    {
+                        /* then we will search more big word at the same point */
+                        SplitVar   *ptr = var;
+
+                        while (ptr->next)
+                            ptr = ptr->next;
+                        ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
+                        /* we can find next word */
+                        level++;
+                        AddStem(var, pnstrdup(word + startpos, level - startpos));
+                        node = Conf->Dictionary;
+                        startpos = level;
+                        continue;
+                    }
+                }
+            }
+            node = StopMiddle->node;
+        }
+        else
+            node = NULL;
+        level++;
+    }
+
+    /* whatever is left becomes the last stem */
+    AddStem(var, pnstrdup(word + startpos, wordlen - startpos));
+    pfree(notprobed);
+    return var;
+}
+
+/*
+ * Append one lexeme to the result array.
+ *
+ * *lres is the start of the array and *lcur the next free entry; the array
+ * (MAX_NORM entries) is allocated on first use.  After the new entry an
+ * entry with lexeme == NULL is written as terminator.  The word is dropped
+ * silently when the array already holds MAX_NORM - 1 lexemes.  'word' is
+ * stored as is, not copied.
+ */
+static void
+addNorm(TSLexeme **lres, TSLexeme **lcur, char *word, int flags, uint16 NVariant)
+{
+    TSLexeme   *slot;
+
+    if (*lres == NULL)
+        *lcur = *lres = (TSLexeme *) palloc(MAX_NORM * sizeof(TSLexeme));
+
+    /* keep the last entry free for the terminator */
+    if (*lcur - *lres >= MAX_NORM - 1)
+        return;
+
+    slot = *lcur;
+    slot->lexeme = word;
+    slot->flags = flags;
+    slot->nvariant = NVariant;
+
+    slot++;
+    slot->lexeme = NULL;
+    *lcur = slot;
+}
+
+/*
+ * Normalize 'word' with the shared dictionary 'Conf'.
+ *
+ * First the normal forms of the whole word are collected; each gets its own
+ * variant number.  If the dictionary uses compounds, the word is then split
+ * with SplitToVariants(); for every split with more than one stem the last
+ * stem is normalized, and each of its normal forms is emitted together with
+ * the preceding stems under one shared variant number.
+ *
+ * Returns an array built by addNorm() (terminated by an entry whose lexeme
+ * is NULL), or NULL when nothing was found.
+ */
+TSLexeme *
+SharedNINormalizeWord(SharedIspellDict *Conf, char *word)
+{
+ char **res;
+ TSLexeme *lcur = NULL,
+ *lres = NULL;
+ uint16 NVariant = 1;
+
+ res = NormalizeSubWord(Conf, word, 0);
+
+ if (res)
+ {
+ char **ptr = res;
+
+ /* the strings are handed over to the result; only the array is freed */
+ while (*ptr && (lcur - lres) < MAX_NORM)
+ {
+ addNorm(&lres, &lcur, *ptr, 0, NVariant++);
+ ptr++;
+ }
+ pfree(res);
+ }
+
+ if (Conf->usecompound)
+ {
+ int wordlen = strlen(word);
+ SplitVar *ptr,
+ *var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
+ int i;
+
+ while (var)
+ {
+ if (var->nstem > 1)
+ {
+ char **subres = NormalizeSubWord(Conf, var->stem[var->nstem - 1], FF_COMPOUNDLAST);
+
+ if (subres)
+ {
+ char **subptr = subres;
+
+ while (*subptr)
+ {
+ /* the first form reuses the stem strings, later forms get copies */
+ for (i = 0; i < var->nstem - 1; i++)
+ {
+ addNorm(&lres, &lcur, (subptr == subres) ? var->stem[i] : pstrdup(var->stem[i]), 0, NVariant);
+ }
+
+ addNorm(&lres, &lcur, *subptr, 0, NVariant);
+ subptr++;
+ NVariant++;
+ }
+
+ pfree(subres);
+ /* stem[0] = NULL stops the cleanup loop below, so the handed-over stems are not freed */
+ var->stem[0] = NULL;
+ /* the last stem was replaced by its normal forms and is freed here */
+ pfree(var->stem[var->nstem - 1]);
+ }
+ }
+
+ /* free the stems that were not handed over, then the variant itself */
+ for (i = 0; i < var->nstem && var->stem[i]; i++)
+ pfree(var->stem[i]);
+ ptr = var->next;
+ pfree(var->stem);
+ pfree(var);
+ var = ptr;
+ }
+ }
+
+ return lres;
+}
214 src/spell.h
@@ -0,0 +1,214 @@
+/*-------------------------------------------------------------------------
+ *
+ * spell.h
+ *
+ * Declarations for ISpell dictionary
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ *
+ * src/include/tsearch/dicts/spell.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef __SPELL_H__
+#define __SPELL_H__
+
+#include "regex/regex.h"
+#include "tsearch/dicts/regis.h"
+#include "tsearch/ts_public.h"
+#include "storage/lwlock.h"
+
+/*
+ * Max length of a flag name. Names longer than this will be truncated
+ * to the maximum.
+ */
+#define MAXFLAGLEN 16
+
+struct SPNode;
+
+/*
+ * One slot of a dictionary tree node: describes a single byte value at the
+ * node's depth.
+ */
+typedef struct
+{
+ uint32 val:8, /* the byte of the word this slot stands for */
+ isword:1, /* set when a dictionary word ends at this byte */
+ compoundflag:4, /* FF_COMPOUND* bits of that word (see FF_DICTFLAGMASK) */
+ affix:19; /* index of the word's flag string in AffixData */
+ struct SPNode *node; /* subtree for longer words, or NULL */
+} SPNodeData;
+
+/*
+ * Names of FF_ are correlated with Hunspell options in affix file
+ * http://hunspell.sourceforge.net/
+ */
+#define FF_COMPOUNDONLY 0x01
+#define FF_COMPOUNDBEGIN 0x02
+#define FF_COMPOUNDMIDDLE 0x04
+#define FF_COMPOUNDLAST 0x08
+#define FF_COMPOUNDFLAG ( FF_COMPOUNDBEGIN | FF_COMPOUNDMIDDLE | FF_COMPOUNDLAST )
+#define FF_DICTFLAGMASK 0x0f
+
+/*
+ * Dictionary tree node: a header followed by 'length' slots.  The struct is
+ * allocated with SPNHDRSZ plus room for the actual number of slots.
+ */
+typedef struct SPNode
+{
+ uint32 length; /* number of entries in data[] */
+ SPNodeData data[1]; /* variable length */
+} SPNode;
+
+/* size of an SPNode without its data[] array */
+#define SPNHDRSZ (offsetof(SPNode,data))
+
+
+/*
+ * One word of the dictionary file together with its flags.
+ */
+typedef struct spell_struct
+{
+ union
+ {
+ /*
+ * flag is filled in by NIImportDictionary. After NISortDictionary, d
+ * is valid and flag is invalid.
+ */
+ char flag[MAXFLAGLEN];
+ struct
+ {
+ int affix; /* index of the flag string in AffixData */
+ int len; /* strlen(word) */
+ } d;
+ } p;
+ char word[1]; /* variable length, null-terminated */
+} SPELL;
+
+/* size of a SPELL without its word[] array */
+#define SPELLHDRSZ (offsetof(SPELL, word))
+
+/*
+ * One affix rule.
+ */
+typedef struct AFFIX
+{
+ uint32 flag:8, /* flag character the rule belongs to */
+ type:1, /* FF_SUFFIX or FF_PREFIX */
+ flagflags:7, /* FF_* bits of the rule */
+ issimple:1, /* rebuilt word is accepted without a condition check */
+ isregis:1, /* condition is reg.regis; else (and not issimple) reg.regex */
+ replen:14; /* presumably the length of repl -- TODO confirm */
+ char *find; /* text put back in place of the affix when undoing it */
+ char *repl; /* the affix as it appears in the word form */
+
+ /* condition the rebuilt word must satisfy; see issimple / isregis */
+ union
+ {
+ regex_t regex;
+ Regis regis;
+ } reg;
+} AFFIX;
+
+/*
+ * affixes use dictionary flags too
+ */
+#define FF_COMPOUNDPERMITFLAG 0x10
+#define FF_COMPOUNDFORBIDFLAG 0x20
+#define FF_CROSSPRODUCT 0x40
+
+/*
+ * Don't change the order of these. Initialization sorts by these,
+ * and expects prefixes to come first after sorting.
+ */
+#define FF_SUFFIX 1
+#define FF_PREFIX 0
+
+struct AffixNode;
+
+/*
+ * One slot of an affix tree node: describes a single character value at the
+ * node's depth.
+ */
+typedef struct
+{
+ uint32 val:8, /* the character this slot stands for */
+ naff:24; /* number of entries in aff[] */
+ AFFIX **aff; /* affixes that end at this character */
+ struct AffixNode *node; /* subtree for longer affixes, or NULL */
+} AffixNodeData;
+
+/*
+ * Affix tree node: a header followed by 'length' slots.  The struct is
+ * allocated with ANHRDSZ plus room for the actual number of slots.
+ */
+typedef struct AffixNode
+{
+ uint32 isvoid:1, /* set on the extra root node that lists the zero-length affixes */
+ length:31; /* number of entries in data[] */
+
+ AffixNodeData data[1]; /* variable length */
+} AffixNode;
+
+/* size of an AffixNode without its data[] array */
+#define ANHRDSZ (offsetof(AffixNode, data))
+
+/*
+ * Entry of the compound-affix list.  The list ends with an entry whose
+ * affix is NULL.
+ */
+typedef struct
+{
+ char *affix; /* the affix text; points to the repl string of an AFFIX */
+ int len; /* length of the affix text */
+ bool issuffix; /* true if the AFFIX it came from is a suffix */
+} CMPDAffix;
+
+/*
+ * Complete state of an ispell dictionary, including the fields that are
+ * only needed while it is being built.
+ */
+typedef struct
+{
+ int maffixes; /* presumably the allocated length of Affix -- TODO confirm */
+ int naffixes; /* number of valid entries in Affix */
+ AFFIX *Affix; /* all affix rules; prefixes come first after sorting */
+
+ AffixNode *Suffix; /* root of the suffix tree */
+ AffixNode *Prefix; /* root of the prefix tree */
+
+ SPNode *Dictionary; /* root of the word tree */
+ char **AffixData; /* the distinct flag strings used by the words */
+ int lenAffixData; /* allocated length of AffixData */
+ int nAffixData; /* number of valid entries in AffixData */
+
+ CMPDAffix *CompoundAffix; /* compound affixes, ends with affix == NULL */
+ int cmpaffixes; /* entries in CompoundAffix, terminator included */
+
+ unsigned char flagval[256]; /* FF_* bits for each flag character */
+ bool usecompound; /* true if compound words are handled */
+
+ /*
+ * Remaining fields are only used during dictionary construction; they are
+ * set up by NIStartBuild and cleared by NIFinishBuild.
+ */
+ MemoryContext buildCxt; /* temp context for construction */
+
+ /* Temporary array of all words in the dict file */
+ SPELL **Spell;
+ int nspell; /* number of valid entries in Spell array */
+ int mspell; /* allocated length of Spell array */
+
+ /* These are used to allocate "compact" data without palloc overhead */
+ char *firstfree; /* first free address (always maxaligned) */
+ size_t avail; /* free space remaining at firstfree */
+
+} IspellDict;
+
+/*
+ * A dictionary as kept in the list of shared dictionaries: the names of the
+ * files it was built from, the link to the next dictionary, and fields with
+ * the same names and types as the lookup fields of IspellDict.
+ */
+typedef struct SharedIspellDict
+{
+ /* this is used for selecting the dictionary */
+ char * dictFile;
+ char * affixFile;
+
+ /* FIXME should we add stopwords here too? */
+
+ /* next dictionary in the chain */
+ struct SharedIspellDict * next;
+
+ /* the copied fields */
+ int naffixes; /* number of valid entries in Affix */
+ AFFIX *Affix;
+
+ AffixNode *Suffix; /* root of the suffix tree */
+ AffixNode *Prefix; /* root of the prefix tree */
+
+ SPNode *Dictionary; /* root of the word tree */
+ char **AffixData;
+ int lenAffixData;
+ int nAffixData; /* number of valid entries in AffixData */
+
+ CMPDAffix *CompoundAffix; /* compound affixes, ends with affix == NULL */
+
+ unsigned char flagval[256];
+ bool usecompound; /* true if compound words are handled */
+
+} SharedIspellDict;