From 86f185f3c1600d47c8ee8a5738461d561e4c4977 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov <a.korotkov@postgrespro.ru> Date: Sat, 7 Oct 2017 23:50:29 +0300 Subject: [PATCH 1/4] tf_idf_source GUC and its validation --- Makefile | 2 +- src/rum.h | 6 +++ src/rumutil.c | 11 +++++ src/tf_idf.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 src/tf_idf.c diff --git a/Makefile b/Makefile index 0717592f5e..dcfd883319 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ OBJS = src/rumsort.o src/rum_ts_utils.o src/rumtsquery.o \ src/rumbtree.o src/rumbulk.o src/rumdatapage.o \ src/rumentrypage.o src/rumget.o src/ruminsert.o \ src/rumscan.o src/rumutil.o src/rumvacuum.o src/rumvalidate.o \ - src/btree_rum.o $(WIN32RES) + src/btree_rum.o src/tf_idf.o $(WIN32RES) EXTENSION = rum DATA = rum--1.0.sql rum--1.0--1.1.sql rum--1.1.sql diff --git a/src/rum.h b/src/rum.h index 78cb8db439..2a5549e6e0 100644 --- a/src/rum.h +++ b/src/rum.h @@ -19,6 +19,7 @@ #include "access/sdir.h" #include "lib/rbtree.h" #include "storage/bufmgr.h" +#include "utils/guc.h" #include "rumsort.h" @@ -1008,4 +1009,9 @@ extern Datum FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, Datum arg6, Datum arg7, Datum arg8, Datum arg9, Datum arg10); +/* tf_idf.c */ +extern char *TFIDFSource; +extern bool check_tf_idf_source(char **newval, void **extra, GucSource source); +extern void assign_tf_idf_source(const char *newval, void *extra); + #endif /* __RUM_H__ */ diff --git a/src/rumutil.c b/src/rumutil.c index 25eaaedddd..e67209b578 100644 --- a/src/rumutil.c +++ b/src/rumutil.c @@ -49,6 +49,17 @@ _PG_init(void) PGC_USERSET, 0, NULL, NULL, NULL); + DefineCustomStringVariable("tf_idf_source", + "Source statistics for TF/IDF calculation.", + "", + &TFIDFSource, + "", + PGC_USERSET, + 0, + check_tf_idf_source, + assign_tf_idf_source, + NULL); + rum_relopt_kind = add_reloption_kind(); add_string_reloption(rum_relopt_kind, "attach", diff 
--git a/src/tf_idf.c b/src/tf_idf.c new file mode 100644 index 0000000000..a283c3e863 --- /dev/null +++ b/src/tf_idf.c @@ -0,0 +1,121 @@ +/*------------------------------------------------------------------------- + * + * tf_idf.c + * Implementation of TF/IDF statistics calculation. + * + * Portions Copyright (c) 2017, Postgres Professional + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/namespace.h" +#include "catalog/pg_type.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/varlena.h" + +#include "rum.h" + +char *TFIDFSource; + +#define EXIT_CHECK_TF_IDF_SOURCE(error) \ + do { \ + GUC_check_errdetail(error); \ + pfree(rawname); \ + list_free(namelist); \ + if (rel) \ + RelationClose(rel); \ + return false; \ + } while (false); + +bool +check_tf_idf_source(char **newval, void **extra, GucSource source) +{ + char *rawname; + char *attname; + List *namelist; + Oid namespaceId; + Oid relId; + Relation rel = NULL; + TupleDesc tupDesc; + AttrNumber attrno; + int i; + + /* Need a modifiable copy of string */ + rawname = pstrdup(*newval); + + /* Parse string into list of identifiers */ + if (!SplitIdentifierString(rawname, '.', &namelist)) + { + /* syntax error in name list */ + EXIT_CHECK_TF_IDF_SOURCE("List syntax is invalid."); + } + + switch (list_length(namelist)) + { + case 0: + return true; + case 1: + EXIT_CHECK_TF_IDF_SOURCE("improper column name (there should be at least 2 dotted names)"); + case 2: + relId = RelnameGetRelid(linitial(namelist)); + attname = lsecond(namelist); + break; + case 3: + /* use exact schema given */ + namespaceId = LookupExplicitNamespace(linitial(namelist), true); + if (!OidIsValid(namespaceId)) + relId = InvalidOid; + else + relId = get_relname_relid(lsecond(namelist), namespaceId); + attname = lthird(namelist); + break; + default: + EXIT_CHECK_TF_IDF_SOURCE("improper column name (too many dotted names)"); + } + + if 
(!OidIsValid(relId)) + EXIT_CHECK_TF_IDF_SOURCE("relation not found"); + + rel = RelationIdGetRelation(relId); + tupDesc = rel->rd_att; + if (rel->rd_rel->relkind == RELKIND_INDEX) + { + attrno = pg_atoi(attname, sizeof(attrno), 10); + if (attrno <= 0 || attrno > rel->rd_index->indnatts) + EXIT_CHECK_TF_IDF_SOURCE("wrong index attribute number"); + if (rel->rd_index->indkey.values[attrno - 1] != InvalidAttrNumber) + EXIT_CHECK_TF_IDF_SOURCE("regular indexed column is specified"); + } + else + { + attrno = InvalidAttrNumber; + for (i = 0; i < tupDesc->natts; i++) + { + if (namestrcmp(&(tupDesc->attrs[i]->attname), attname) == 0) + { + attrno = tupDesc->attrs[i]->attnum; + break; + } + } + + if (attrno == InvalidAttrNumber) + EXIT_CHECK_TF_IDF_SOURCE("attribute not found"); + } + + if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID) + EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type"); + + pfree(rawname); + list_free(namelist); + RelationClose(rel); + return true; +} + + +void +assign_tf_idf_source(const char *newval, void *extra) +{ + +} \ No newline at end of file From 90b1638d94cb8bcfbb44abe132336af711e64e3b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov <a.korotkov@postgrespro.ru> Date: Sun, 8 Oct 2017 00:50:50 +0300 Subject: [PATCH 2/4] Implement estimate_idf(). 
--- src/rum.h | 1 + src/tf_idf.c | 210 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 200 insertions(+), 11 deletions(-) diff --git a/src/rum.h b/src/rum.h index 2a5549e6e0..3f48ed4fa7 100644 --- a/src/rum.h +++ b/src/rum.h @@ -1013,5 +1013,6 @@ extern Datum FunctionCall10Coll(FmgrInfo *flinfo, Oid collation, extern char *TFIDFSource; extern bool check_tf_idf_source(char **newval, void **extra, GucSource source); extern void assign_tf_idf_source(const char *newval, void *extra); +extern float4 estimate_idf(char *lexeme, int length); #endif /* __RUM_H__ */ diff --git a/src/tf_idf.c b/src/tf_idf.c index a283c3e863..0d7aff5eb1 100644 --- a/src/tf_idf.c +++ b/src/tf_idf.c @@ -10,14 +10,48 @@ #include "postgres.h" #include "catalog/namespace.h" +#include "catalog/pg_statistic.h" #include "catalog/pg_type.h" #include "utils/builtins.h" #include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/syscache.h" #include "utils/varlena.h" #include "rum.h" -char *TFIDFSource; +/* lookup table type for binary searching through MCELEMs */ +typedef struct +{ + text *element; + float4 frequency; +} TextFreq; + +/* type of keys for bsearch'ing through an array of TextFreqs */ +typedef struct +{ + char *lexeme; + int length; +} LexemeKey; + +typedef struct +{ + TextFreq *lookup; + int nmcelem; + float4 minfreq; +} MCelemStats; + +typedef struct +{ + Oid relId; + AttrNumber attrno; +} RelAttrInfo; + +char *TFIDFSource; +static RelAttrInfo TFIDFSourceParsed; +static bool TDIDFLoaded = false; +static MemoryContext TFIDFContext = NULL; +static MCelemStats TDIDFStats; #define EXIT_CHECK_TF_IDF_SOURCE(error) \ do { \ @@ -29,18 +63,24 @@ char *TFIDFSource; return false; \ } while (false); +static void load_tf_idf_source(void); +static void check_load_tf_idf_source(void); +static void forget_tf_idf_stats(void); +static int compare_lexeme_textfreq(const void *e1, const void *e2); + bool check_tf_idf_source(char **newval, void **extra, GucSource source) { - 
char *rawname; - char *attname; - List *namelist; - Oid namespaceId; - Oid relId; - Relation rel = NULL; - TupleDesc tupDesc; - AttrNumber attrno; - int i; + char *rawname; + char *attname; + List *namelist; + Oid namespaceId; + Oid relId; + Relation rel = NULL; + TupleDesc tupDesc; + AttrNumber attrno; + int i; + RelAttrInfo *myextra; /* Need a modifiable copy of string */ rawname = pstrdup(*newval); @@ -107,6 +147,11 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID) EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type"); + myextra = (RelAttrInfo *) malloc(sizeof(RelAttrInfo)); + myextra->relId = relId; + myextra->attrno = attrno; + *extra = (void *) myextra; + pfree(rawname); list_free(namelist); RelationClose(rel); @@ -117,5 +162,148 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) void assign_tf_idf_source(const char *newval, void *extra) { + RelAttrInfo *myextra = (RelAttrInfo *) extra; + + TFIDFSourceParsed = *myextra; + forget_tf_idf_stats(); +} + +static void +load_tf_idf_source(void) +{ + HeapTuple statsTuple; + AttStatsSlot sslot; + MemoryContext oldContext; + int i; + + if (!TFIDFContext) + TFIDFContext = AllocSetContextCreate(TopMemoryContext, + "Memory context for TF/IDF statistics", + ALLOCSET_DEFAULT_SIZES); + + statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(TFIDFSourceParsed.relId), + Int16GetDatum(TFIDFSourceParsed.attrno), + BoolGetDatum(true)); + + if (!statsTuple) + statsTuple = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(TFIDFSourceParsed.relId), + Int16GetDatum(TFIDFSourceParsed.attrno), + BoolGetDatum(false)); + + MemoryContextReset(TFIDFContext); + TDIDFLoaded = false; + + oldContext = MemoryContextSwitchTo(TFIDFContext); + + if (!statsTuple + || !get_attstatsslot(&sslot, statsTuple, + STATISTIC_KIND_MCELEM, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS) + || sslot.nnumbers != sslot.nvalues + 2) + { + 
ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("statistics for TD/IDF is not found"), + errhint("consider running ANALYZE"))); + } + + TDIDFStats.nmcelem = sslot.nvalues; + TDIDFStats.minfreq = sslot.numbers[sslot.nnumbers - 2]; + /* + * Transpose the data into a single array so we can use bsearch(). + */ + TDIDFStats.lookup = (TextFreq *) palloc(sizeof(TextFreq) * TDIDFStats.nmcelem); + for (i = 0; i < TDIDFStats.nmcelem; i++) + { + /* + * The text Datums came from an array, so it cannot be compressed or + * stored out-of-line -- it's safe to use VARSIZE_ANY*. + */ + Assert(!VARATT_IS_COMPRESSED(sslot.values[i]) && !VARATT_IS_EXTERNAL(sslot.values[i])); + TDIDFStats.lookup[i].element = (text *) DatumGetPointer(sslot.values[i]); + TDIDFStats.lookup[i].frequency = sslot.numbers[i]; + } -} \ No newline at end of file + MemoryContextSwitchTo(oldContext); + + ReleaseSysCache(statsTuple); +} + +static void +check_load_tf_idf_source(void) +{ + if (!TDIDFLoaded) + load_tf_idf_source(); +} + +static void +forget_tf_idf_stats(void) +{ + MemoryContextReset(TFIDFContext); + TDIDFLoaded = false; +} + +/* + * bsearch() comparator for a lexeme (non-NULL terminated string with length) + * and a TextFreq. Use length, then byte-for-byte comparison, because that's + * how ANALYZE code sorted data before storing it in a statistic tuple. + * See ts_typanalyze.c for details. 
+ */ +static int +compare_lexeme_textfreq(const void *e1, const void *e2) +{ + const LexemeKey *key = (const LexemeKey *) e1; + const TextFreq *t = (const TextFreq *) e2; + int len1, + len2; + + len1 = key->length; + len2 = VARSIZE_ANY_EXHDR(t->element); + + /* Compare lengths first, possibly avoiding a strncmp call */ + if (len1 > len2) + return 1; + else if (len1 < len2) + return -1; + + /* Fall back on byte-for-byte comparison */ + return strncmp(key->lexeme, VARDATA_ANY(t->element), len1); +} + +float4 +estimate_idf(char *lexeme, int length) +{ + TextFreq *searchres; + LexemeKey key; + float4 selec; + + check_load_tf_idf_source(); + + key.lexeme = lexeme; + key.length = length; + + searchres = (TextFreq *) bsearch(&key, TDIDFStats.lookup, TDIDFStats.nmcelem, + sizeof(TextFreq), + compare_lexeme_textfreq); + + if (searchres) + { + /* + * The element is in MCELEM. Return precise selectivity (or + * at least as precise as ANALYZE could find out). + */ + selec = searchres->frequency; + } + else + { + /* + * The element is not in MCELEM. Punt, but assume that the + * selectivity cannot be more than minfreq / 2. + */ + selec = TDIDFStats.minfreq / 2; + } + + return 1.0f / selec; +} From 3bb824402e17d473694b6cbb556ec2c5b7b7241b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov <a.korotkov@postgrespro.ru> Date: Sun, 8 Oct 2017 02:18:24 +0300 Subject: [PATCH 3/4] Add IDF to scoring (very basic). 
--- src/rum_ts_utils.c | 49 +++++++++++++++++++++++++++++++++++++++++---- src/tf_idf.c | 50 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 88 insertions(+), 11 deletions(-) diff --git a/src/rum_ts_utils.c b/src/rum_ts_utils.c index 07faabe42c..5c22f88879 100644 --- a/src/rum_ts_utils.c +++ b/src/rum_ts_utils.c @@ -101,11 +101,17 @@ typedef struct typedef struct { - bool operandexist; + bool operandexist; WordEntryPos pos; } QueryRepresentationOperand; +typedef struct +{ + float4 idf; + bool idfloaded; +} QueryRepresentationIDF; + typedef struct { TSQuery query; @@ -113,6 +119,7 @@ typedef struct int *map_item_operand; QueryRepresentationOperand *operandData; + QueryRepresentationIDF *operandIdf; int length; } QueryRepresentation; @@ -140,6 +147,7 @@ static WordEntryPosVector POSNULL = { #define RANK_NORM_UNIQ 0x08 #define RANK_NORM_LOGUNIQ 0x10 #define RANK_NORM_RDIVRPLUS1 0x20 +#define RANK_NORM_IDF 0x40 #define DEF_NORM_METHOD RANK_NO_NORM #define QR_GET_OPERAND(q, v) \ @@ -1229,6 +1237,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, { double Cpos = 0.0; double InvSum = 0.0; + double Idf = 0.0; int nNoise; DocRepresentation *ptr = ext.begin; /* Added by SK */ @@ -1278,13 +1287,43 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, /* Compute the number of query terms in the cover */ for (i = 0; i < qr->length; i++) + { if (qr->operandData[i].operandexist) - nitems++; + { + if (method & RANK_NORM_IDF) + { + if (!qr->operandIdf[i].idfloaded) + { + QueryOperand *oper = (QueryOperand *) (GETQUERY(qr->query) + i); + qr->operandIdf[i].idf = + estimate_idf( + GETOPERAND(qr->query) + oper->distance, + oper->length + ); + qr->operandIdf[i].idfloaded = true; + } + + Idf += qr->operandIdf[i].idf; + } + else + { + nitems++; + } + } + } Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; - if (nitems > 0) - Cpos *= nitems; + if (method & RANK_NORM_IDF) + { + if (Idf >= 1.0) + Cpos *= Idf; + } + else + { + 
if (nitems > 0) + Cpos *= nitems; + } /* * if doc are big enough then ext.q may be equal to ext.p due to limit @@ -1369,6 +1408,8 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method) qr.query = query; qr.map_item_operand = NULL; qr.operandData = palloc0(sizeof(qr.operandData[0]) * query->size); + if (method & RANK_NORM_IDF) + qr.operandIdf = palloc0(sizeof(qr.operandIdf[0]) * query->size); qr.length = query->size; doc = get_docrep(txt, &qr, &doclen); diff --git a/src/tf_idf.c b/src/tf_idf.c index 0d7aff5eb1..995e9de572 100644 --- a/src/tf_idf.c +++ b/src/tf_idf.c @@ -12,6 +12,7 @@ #include "catalog/namespace.h" #include "catalog/pg_statistic.h" #include "catalog/pg_type.h" +#include "nodes/nodeFuncs.h" #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -20,6 +21,12 @@ #include "rum.h" +/* + * FIXME: + * * cache IDF + * * handle prefix search + */ + /* lookup table type for binary searching through MCELEMs */ typedef struct { @@ -77,7 +84,6 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) Oid namespaceId; Oid relId; Relation rel = NULL; - TupleDesc tupDesc; AttrNumber attrno; int i; RelAttrInfo *myextra; @@ -119,17 +125,27 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) EXIT_CHECK_TF_IDF_SOURCE("relation not found"); rel = RelationIdGetRelation(relId); - tupDesc = rel->rd_att; if (rel->rd_rel->relkind == RELKIND_INDEX) { + int exprnum = 0; + attrno = pg_atoi(attname, sizeof(attrno), 10); if (attrno <= 0 || attrno > rel->rd_index->indnatts) EXIT_CHECK_TF_IDF_SOURCE("wrong index attribute number"); if (rel->rd_index->indkey.values[attrno - 1] != InvalidAttrNumber) EXIT_CHECK_TF_IDF_SOURCE("regular indexed column is specified"); + for (i = 0; i < attrno - 1; i++) + { + if (rel->rd_index->indkey.values[i] == InvalidAttrNumber) + exprnum++; + } + if (exprType((Node *) list_nth(rel->rd_indexprs, exprnum)) != TSVECTOROID) + EXIT_CHECK_TF_IDF_SOURCE("indexed expression should 
be of tsvector type"); } else { + TupleDesc tupDesc = rel->rd_att; + attrno = InvalidAttrNumber; for (i = 0; i < tupDesc->natts; i++) { @@ -139,13 +155,12 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) break; } } - if (attrno == InvalidAttrNumber) EXIT_CHECK_TF_IDF_SOURCE("attribute not found"); + if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID) + EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type"); } - if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID) - EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type"); myextra = (RelAttrInfo *) malloc(sizeof(RelAttrInfo)); myextra->relId = relId; @@ -164,7 +179,16 @@ assign_tf_idf_source(const char *newval, void *extra) { RelAttrInfo *myextra = (RelAttrInfo *) extra; - TFIDFSourceParsed = *myextra; + if (myextra) + { + TFIDFSourceParsed = *myextra; + } + else + { + TFIDFSourceParsed.relId = InvalidOid; + TFIDFSourceParsed.attrno = InvalidAttrNumber; + } + forget_tf_idf_stats(); } @@ -181,6 +205,15 @@ load_tf_idf_source(void) "Memory context for TF/IDF statistics", ALLOCSET_DEFAULT_SIZES); + if (!OidIsValid(TFIDFSourceParsed.relId) + || TFIDFSourceParsed.attrno == InvalidAttrNumber) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("statistics for TD/IDF is not defined"), + errhint("consider setting tf_idf_source GUC"))); + } + statsTuple = SearchSysCache3(STATRELATTINH, ObjectIdGetDatum(TFIDFSourceParsed.relId), Int16GetDatum(TFIDFSourceParsed.attrno), @@ -228,6 +261,8 @@ load_tf_idf_source(void) MemoryContextSwitchTo(oldContext); + TDIDFLoaded = true; + ReleaseSysCache(statsTuple); } @@ -241,7 +276,8 @@ check_load_tf_idf_source(void) static void forget_tf_idf_stats(void) { - MemoryContextReset(TFIDFContext); + if (TFIDFContext) + MemoryContextReset(TFIDFContext); TDIDFLoaded = false; } From 581daf5fbe5b4b1cb56c7adefc17c4dd054ca60b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov <a.korotkov@postgrespro.ru> Date: Sun, 8 Oct 2017 
23:18:49 +0300 Subject: [PATCH 4/4] Better IDF calculation. --- src/rum_ts_utils.c | 60 +++++++++++++--------------------------------- src/tf_idf.c | 5 ++-- 2 files changed, 20 insertions(+), 45 deletions(-) diff --git a/src/rum_ts_utils.c b/src/rum_ts_utils.c index 5c22f88879..d9f79423b2 100644 --- a/src/rum_ts_utils.c +++ b/src/rum_ts_utils.c @@ -96,6 +96,7 @@ typedef struct } key; } data; uint8 wclass; + float4 idf; int32 pos; } DocRepresentation; @@ -106,12 +107,6 @@ typedef struct } QueryRepresentationOperand; -typedef struct -{ - float4 idf; - bool idfloaded; -} QueryRepresentationIDF; - typedef struct { TSQuery query; @@ -119,7 +114,6 @@ typedef struct int *map_item_operand; QueryRepresentationOperand *operandData; - QueryRepresentationIDF *operandIdf; int length; } QueryRepresentation; @@ -1098,7 +1092,7 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem) } static DocRepresentation * -get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) +get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen, bool load_idf) { QueryItem *item = GETQUERY(qr->query); WordEntry *entry, @@ -1134,6 +1128,8 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) while (entry - firstentry < nitem) { + float4 idf; + if (entry->haspos) { dimt = POSDATALEN(txt, entry); @@ -1187,12 +1183,18 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen) } } + + if (load_idf) + idf = estimate_idf(STRPTR(txt) + entry->pos, entry->len); + else + idf = 1.0f; } else { doc[cur].data.item.nitem = doc[cur - 1].data.item.nitem; doc[cur].data.item.item = doc[cur - 1].data.item.item; } + doc[cur].idf = idf; doc[cur].pos = WEP_GETPOS(post[j]); doc[cur].wclass = WEP_GETWEIGHT(post[j]); cur++; @@ -1256,6 +1258,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, /* For rum_tsquery_distance() */ else new_cover_key += (int)(uintptr_t)ptr->data.key.item_first; + Idf += ptr->idf; ptr++; } @@ -1287,43 +1290,16 @@ 
calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen, /* Compute the number of query terms in the cover */ for (i = 0; i < qr->length; i++) - { if (qr->operandData[i].operandexist) - { - if (method & RANK_NORM_IDF) - { - if (!qr->operandIdf[i].idfloaded) - { - QueryOperand *oper = (QueryOperand *) (GETQUERY(qr->query) + i); - qr->operandIdf[i].idf = - estimate_idf( - GETOPERAND(qr->query) + oper->distance, - oper->length - ); - qr->operandIdf[i].idfloaded = true; - } - - Idf += qr->operandIdf[i].idf; - } - else - { - nitems++; - } - } - } + nitems++; Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; + if (nitems > 0) + Cpos *= nitems; + if (method & RANK_NORM_IDF) - { - if (Idf >= 1.0) - Cpos *= Idf; - } - else - { - if (nitems > 0) - Cpos *= nitems; - } + Cpos *= Idf; /* * if doc are big enough then ext.q may be equal to ext.p due to limit @@ -1408,11 +1384,9 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method) qr.query = query; qr.map_item_operand = NULL; qr.operandData = palloc0(sizeof(qr.operandData[0]) * query->size); - if (method & RANK_NORM_IDF) - qr.operandIdf = palloc0(sizeof(qr.operandIdf[0]) * query->size); qr.length = query->size; - doc = get_docrep(txt, &qr, &doclen); + doc = get_docrep(txt, &qr, &doclen, (method & RANK_NORM_IDF) ? true : false); if (!doc) { pfree(qr.operandData); diff --git a/src/tf_idf.c b/src/tf_idf.c index 995e9de572..1c14ef2d04 100644 --- a/src/tf_idf.c +++ b/src/tf_idf.c @@ -23,8 +23,8 @@ /* * FIXME: - * * cache IDF - * * handle prefix search + * * cache IDF for ts_query (non-prefix search?) 
+ * * calculate IDF from RUM index */ /* lookup table type for binary searching through MCELEMs */ @@ -139,6 +139,7 @@ check_tf_idf_source(char **newval, void **extra, GucSource source) if (rel->rd_index->indkey.values[i] == InvalidAttrNumber) exprnum++; } + RelationGetIndexExpressions(rel); if (exprType((Node *) list_nth(rel->rd_indexprs, exprnum)) != TSVECTOROID) EXIT_CHECK_TF_IDF_SOURCE("indexed expression should be of tsvector type"); }