diff --git a/Makefile b/Makefile index 0717592f5e..dcfd883319 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ OBJS = src/rumsort.o src/rum_ts_utils.o src/rumtsquery.o \ src/rumbtree.o src/rumbulk.o src/rumdatapage.o \ src/rumentrypage.o src/rumget.o src/ruminsert.o \ src/rumscan.o src/rumutil.o src/rumvacuum.o src/rumvalidate.o \ - src/btree_rum.o $(WIN32RES) + src/btree_rum.o src/tf_idf.o $(WIN32RES) EXTENSION = rum DATA = rum--1.0.sql rum--1.0--1.1.sql rum--1.1.sql diff --git a/src/rum.h b/src/rum.h index 78cb8db439..3f48ed4fa7 100644 --- a/src/rum.h +++ b/src/rum.h @@ -19,6 +19,7 @@ #include "access/sdir.h" #include "lib/rbtree.h" #include "storage/bufmgr.h" +#include "utils/guc.h" #include "rumsort.h" @@ -1008,4 +1009,10 @@ extern Datum FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, Datum arg6, Datum arg7, Datum arg8, Datum arg9, Datum arg10); +/* tf_idf.c */ +extern char *TFIDFSource; +extern bool check_tf_idf_source(char **newval, void **extra, GucSource source); +extern void assign_tf_idf_source(const char *newval, void *extra); +extern float4 estimate_idf(char *lexeme, int length); + #endif /* __RUM_H__ */ diff --git a/src/rum_ts_utils.c b/src/rum_ts_utils.c index 07faabe42c..d9f79423b2 100644 --- a/src/rum_ts_utils.c +++ b/src/rum_ts_utils.c @@ -96,12 +96,13 @@ typedef struct } key; } data; uint8 wclass; + float4 idf; int32 pos; } DocRepresentation; typedef struct { - bool operandexist; + bool operandexist; WordEntryPos pos; } QueryRepresentationOperand; @@ -140,6 +141,7 @@ static WordEntryPosVector POSNULL = { #define RANK_NORM_UNIQ 0x08 #define RANK_NORM_LOGUNIQ 0x10 #define RANK_NORM_RDIVRPLUS1 0x20 +#define RANK_NORM_IDF 0x40 #define DEF_NORM_METHOD RANK_NO_NORM #define QR_GET_OPERAND(q, v) \ @@ -1090,7 +1092,7 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem) } static DocRepresentation * -get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) +get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen, bool load_idf) { QueryItem *item = GETQUERY(qr->query); WordEntry *entry, @@ -1126,6 +1128,8 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) while (entry - firstentry < nitem) { + float4 idf; + if (entry->haspos) { dimt = POSDATALEN(txt, entry); @@ -1179,12 +1183,18 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) } } + + if (load_idf) + idf = estimate_idf(STRPTR(txt) + entry->pos, entry->len); + else + idf = 1.0f; } else { doc[cur].data.item.nitem = doc[cur - 1].data.item.nitem; doc[cur].data.item.item = doc[cur - 1].data.item.item; } + doc[cur].idf = idf; doc[cur].pos = WEP_GETPOS(post[j]); doc[cur].wclass = WEP_GETWEIGHT(post[j]); cur++; @@ -1229,6 +1239,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, { double Cpos = 0.0; double InvSum = 0.0; + double Idf = 0.0; int nNoise; DocRepresentation *ptr = ext.begin; /* Added by SK */ @@ -1247,6 +1258,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, /* For rum_tsquery_distance() */ else new_cover_key += (int)(uintptr_t)ptr->data.key.item_first; + Idf += ptr->idf; ptr++; } @@ -1286,6 +1298,9 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, if (nitems > 0) Cpos *= nitems; + if (method & RANK_NORM_IDF) + Cpos *= Idf; + /* * if doc are big enough then ext.q may be equal to ext.p due to limit * of posional information. In this case we approximate number of @@ -1371,7 +1386,7 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method) qr.operandData = palloc0(sizeof(qr.operandData[0]) * query->size); qr.length = query->size; - doc = get_docrep(txt, &qr, &doclen); + doc = get_docrep(txt, &qr, &doclen, (method & RANK_NORM_IDF) ? true : false); if (!doc) { pfree(qr.operandData); diff --git a/src/rumutil.c b/src/rumutil.c index 25eaaedddd..e67209b578 100644 --- a/src/rumutil.c +++ b/src/rumutil.c @@ -49,6 +49,17 @@ _PG_init(void) PGC_USERSET, 0, NULL, NULL, NULL); + DefineCustomStringVariable("tf_tdf_source", + "Source statistics for TD/IFD calculation.", + "", + &TFIDFSource, + "", + PGC_USERSET, + 0, + check_tf_idf_source, + assign_tf_idf_source, + NULL); + rum_relopt_kind = add_reloption_kind(); add_string_reloption(rum_relopt_kind, "attach", diff --git a/src/tf_idf.c b/src/tf_idf.c new file mode 100644 index 0000000000..1c14ef2d04 --- /dev/null +++ b/src/tf_idf.c @@ -0,0 +1,346 @@ +/*------------------------------------------------------------------------- + * + * tf_idf.c + * Implementation of TD/IDF statistics calculation. + * + * Portions Copyright (c) 2017, Postgres Professional + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/namespace.h" +#include "catalog/pg_statistic.h" +#include "catalog/pg_type.h" +#include "nodes/nodeFuncs.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/syscache.h" +#include "utils/varlena.h" + +#include "rum.h" + +/* + * FIXME: + * * cache IDF for ts_query (non-prefix search?) + * * calculate IDF from RUM index + */ + +/* lookup table type for binary searching through MCELEMs */ +typedef struct +{ + text *element; + float4 frequency; +} TextFreq; + +/* type of keys for bsearch'ing through an array of TextFreqs */ +typedef struct +{ + char *lexeme; + int length; +} LexemeKey; + +typedef struct +{ + TextFreq *lookup; + int nmcelem; + float4 minfreq; +} MCelemStats; + +typedef struct +{ + Oid relId; + AttrNumber attrno; +} RelAttrInfo; + +char *TFIDFSource; +static RelAttrInfo TFIDFSourceParsed; +static bool TDIDFLoaded = false; +static MemoryContext TFIDFContext = NULL; +static MCelemStats TDIDFStats; + +#define EXIT_CHECK_TF_IDF_SOURCE(error) \ + do { \ + GUC_check_errdetail(error); \ + pfree(rawname); \ + list_free(namelist); \ + if (rel) \ + RelationClose(rel); \ + return false; \ + } while (false); + +static void load_tf_idf_source(void); +static void check_load_tf_idf_source(void); +static void forget_tf_idf_stats(void); +static int compare_lexeme_textfreq(const void *e1, const void *e2); + +bool +check_tf_idf_source(char **newval, void **extra, GucSource source) +{ + char *rawname; + char *attname; + List *namelist; + Oid namespaceId; + Oid relId; + Relation rel = NULL; + AttrNumber attrno; + int i; + RelAttrInfo *myextra; + + /* Need a modifiable copy of string */ + rawname = pstrdup(*newval); + + /* Parse string into list of identifiers */ + if (!SplitIdentifierString(rawname, '.', &namelist)) + { + /* syntax error in name list */ + EXIT_CHECK_TF_IDF_SOURCE("List syntax is invalid."); + } + + switch (list_length(namelist)) + { + case 0: + return true; + case 1: + EXIT_CHECK_TF_IDF_SOURCE("improper column name (there should be at least 2 dotted names)"); + case 2: + relId = RelnameGetRelid(linitial(namelist)); + attname = lsecond(namelist); + break; + case 3: + /* use exact schema given */ + namespaceId = LookupExplicitNamespace(linitial(namelist), true); + if (!OidIsValid(namespaceId)) + relId = InvalidOid; + else + relId = get_relname_relid(lsecond(namelist), namespaceId); + attname = lthird(namelist); + break; + default: + EXIT_CHECK_TF_IDF_SOURCE("improper column name (too many dotted names)"); + } + + if (!OidIsValid(relId)) + EXIT_CHECK_TF_IDF_SOURCE("relation not found"); + + rel = RelationIdGetRelation(relId); + if (rel->rd_rel->relkind == RELKIND_INDEX) + { + int exprnum = 0; + + attrno = pg_atoi(attname, sizeof(attrno), 10); + if (attrno <= 0 || attrno > rel->rd_index->indnatts) + EXIT_CHECK_TF_IDF_SOURCE("wrong index attribute number"); + if (rel->rd_index->indkey.values[attrno - 1] != InvalidAttrNumber) + EXIT_CHECK_TF_IDF_SOURCE("regular indexed column is specified"); + for (i = 0; i < attrno - 1; i++) + { + if (rel->rd_index->indkey.values[i] == InvalidAttrNumber) + exprnum++; + } + RelationGetIndexExpressions(rel); + if (exprType((Node *) list_nth(rel->rd_indexprs, exprnum)) != TSVECTOROID) + EXIT_CHECK_TF_IDF_SOURCE("indexed expression should be of tsvector type"); + } + else + { + TupleDesc tupDesc = rel->rd_att; + + attrno = InvalidAttrNumber; + for (i = 0; i < tupDesc->natts; i++) + { + if (namestrcmp(&(tupDesc->attrs[i]->attname), attname) == 0) + { + attrno = tupDesc->attrs[i]->attnum; + break; + } + } + if (attrno == InvalidAttrNumber) + EXIT_CHECK_TF_IDF_SOURCE("attribute not found"); + if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID) + EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type"); + } + + + myextra = (RelAttrInfo *) malloc(sizeof(RelAttrInfo)); + myextra->relId = relId; + myextra->attrno = attrno; + *extra = (void *) myextra; + + pfree(rawname); + list_free(namelist); + RelationClose(rel); + return true; +} + + +void +assign_tf_idf_source(const char *newval, void *extra) +{ + RelAttrInfo *myextra = (RelAttrInfo *) extra; + + if (myextra) + { + TFIDFSourceParsed = *myextra; + } + else + { + TFIDFSourceParsed.relId = InvalidOid; + TFIDFSourceParsed.attrno = InvalidAttrNumber; + } + + forget_tf_idf_stats(); +} + +static void +load_tf_idf_source(void) +{ + HeapTuple statsTuple; + AttStatsSlot sslot; + MemoryContext oldContext; + int i; + + if (!TFIDFContext) + TFIDFContext = AllocSetContextCreate(TopMemoryContext, + "Memory context for TF/IDF statistics", + ALLOCSET_DEFAULT_SIZES); + + if (!OidIsValid(TFIDFSourceParsed.relId) + || TFIDFSourceParsed.attrno == InvalidAttrNumber) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("statistics for TD/IDF is not defined"), + errhint("consider setting tf_idf_source GUC"))); + } + + statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(TFIDFSourceParsed.relId), + Int16GetDatum(TFIDFSourceParsed.attrno), + BoolGetDatum(true)); + + if (!statsTuple) + statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(TFIDFSourceParsed.relId), + Int16GetDatum(TFIDFSourceParsed.attrno), + BoolGetDatum(false)); + + MemoryContextReset(TFIDFContext); + TDIDFLoaded = false; + + oldContext = MemoryContextSwitchTo(TFIDFContext); + + if (!statsTuple + || !get_attstatsslot(&sslot, statsTuple, + STATISTIC_KIND_MCELEM, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS) + || sslot.nnumbers != sslot.nvalues + 2) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("statistics for TD/IDF is not found"), + errhint("consider running ANALYZE"))); + } + + TDIDFStats.nmcelem = sslot.nvalues; + TDIDFStats.minfreq = sslot.numbers[sslot.nnumbers - 2]; + /* + * Transpose the data into a single array so we can use bsearch(). + */ + TDIDFStats.lookup = (TextFreq *) palloc(sizeof(TextFreq) * TDIDFStats.nmcelem); + for (i = 0; i < TDIDFStats.nmcelem; i++) + { + /* + * The text Datums came from an array, so it cannot be compressed or + * stored out-of-line -- it's safe to use VARSIZE_ANY*. + */ + Assert(!VARATT_IS_COMPRESSED(sslot.values[i]) && !VARATT_IS_EXTERNAL(sslot.values[i])); + TDIDFStats.lookup[i].element = (text *) DatumGetPointer(sslot.values[i]); + TDIDFStats.lookup[i].frequency = sslot.numbers[i]; + } + + MemoryContextSwitchTo(oldContext); + + TDIDFLoaded = true; + + ReleaseSysCache(statsTuple); +} + +static void +check_load_tf_idf_source(void) +{ + if (!TDIDFLoaded) + load_tf_idf_source(); +} + +static void +forget_tf_idf_stats(void) +{ + if (TFIDFContext) + MemoryContextReset(TFIDFContext); + TDIDFLoaded = false; +} + +/* + * bsearch() comparator for a lexeme (non-NULL terminated string with length) + * and a TextFreq. Use length, then byte-for-byte comparison, because that's + * how ANALYZE code sorted data before storing it in a statistic tuple. + * See ts_typanalyze.c for details. + */ +static int +compare_lexeme_textfreq(const void *e1, const void *e2) +{ + const LexemeKey *key = (const LexemeKey *) e1; + const TextFreq *t = (const TextFreq *) e2; + int len1, + len2; + + len1 = key->length; + len2 = VARSIZE_ANY_EXHDR(t->element); + + /* Compare lengths first, possibly avoiding a strncmp call */ + if (len1 > len2) + return 1; + else if (len1 < len2) + return -1; + + /* Fall back on byte-for-byte comparison */ + return strncmp(key->lexeme, VARDATA_ANY(t->element), len1); +} + +float4 +estimate_idf(char *lexeme, int length) +{ + TextFreq *searchres; + LexemeKey key; + float4 selec; + + check_load_tf_idf_source(); + + key.lexeme = lexeme; + key.length = length; + + searchres = (TextFreq *) bsearch(&key, TDIDFStats.lookup, TDIDFStats.nmcelem, + sizeof(TextFreq), + compare_lexeme_textfreq); + + if (searchres) + { + /* + * The element is in MCELEM. Return precise selectivity (or + * at least as precise as ANALYZE could find out). + */ + selec = searchres->frequency; + } + else + { + /* + * The element is not in MCELEM. Punt, but assume that the + * selectivity cannot be more than minfreq / 2. + */ + selec = TDIDFStats.minfreq / 2; + } + + return 1.0f / selec; +}