Skip to content

Commit 3bb8244

Browse files
author
Alexander Korotkov
committed
Add IDF to scoring (very basic).
1 parent 90b1638 commit 3bb8244

File tree

2 files changed

+88
-11
lines changed

2 files changed

+88
-11
lines changed

src/rum_ts_utils.c

+45-4
Original file line number | Diff line number | Diff line change
@@ -101,18 +101,25 @@ typedef struct
101101

102102
typedef struct
103103
{
104-
bool operandexist;
104+
bool operandexist;
105105
WordEntryPos pos;
106106
}
107107
QueryRepresentationOperand;
108108

109+
typedef struct
110+
{
111+
float4 idf;
112+
bool idfloaded;
113+
} QueryRepresentationIDF;
114+
109115
typedef struct
110116
{
111117
TSQuery query;
112118
/* Used in rum_tsquery_distance() */
113119
int *map_item_operand;
114120

115121
QueryRepresentationOperand *operandData;
122+
QueryRepresentationIDF *operandIdf;
116123
int length;
117124
} QueryRepresentation;
118125

@@ -140,6 +147,7 @@ static WordEntryPosVector POSNULL = {
140147
#define RANK_NORM_UNIQ 0x08
141148
#define RANK_NORM_LOGUNIQ 0x10
142149
#define RANK_NORM_RDIVRPLUS1 0x20
150+
#define RANK_NORM_IDF 0x40
143151
#define DEF_NORM_METHOD RANK_NO_NORM
144152

145153
#define QR_GET_OPERAND(q, v) \
@@ -1229,6 +1237,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen,
12291237
{
12301238
double Cpos = 0.0;
12311239
double InvSum = 0.0;
1240+
double Idf = 0.0;
12321241
int nNoise;
12331242
DocRepresentation *ptr = ext.begin;
12341243
/* Added by SK */
@@ -1278,13 +1287,43 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen,
12781287

12791288
/* Compute the number of query terms in the cover */
12801289
for (i = 0; i < qr->length; i++)
1290+
{
12811291
if (qr->operandData[i].operandexist)
1282-
nitems++;
1292+
{
1293+
if (method & RANK_NORM_IDF)
1294+
{
1295+
if (!qr->operandIdf[i].idfloaded)
1296+
{
1297+
QueryOperand *oper = (QueryOperand *) (GETQUERY(qr->query) + i);
1298+
qr->operandIdf[i].idf =
1299+
estimate_idf(
1300+
GETOPERAND(qr->query) + oper->distance,
1301+
oper->length
1302+
);
1303+
qr->operandIdf[i].idfloaded = true;
1304+
}
1305+
1306+
Idf += qr->operandIdf[i].idf;
1307+
}
1308+
else
1309+
{
1310+
nitems++;
1311+
}
1312+
}
1313+
}
12831314

12841315
Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
12851316

1286-
if (nitems > 0)
1287-
Cpos *= nitems;
1317+
if (method & RANK_NORM_IDF)
1318+
{
1319+
if (Idf >= 1.0)
1320+
Cpos *= Idf;
1321+
}
1322+
else
1323+
{
1324+
if (nitems > 0)
1325+
Cpos *= nitems;
1326+
}
12881327

12891328
/*
12901329
* if doc are big enough then ext.q may be equal to ext.p due to limit
@@ -1369,6 +1408,8 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method)
13691408
qr.query = query;
13701409
qr.map_item_operand = NULL;
13711410
qr.operandData = palloc0(sizeof(qr.operandData[0]) * query->size);
1411+
if (method & RANK_NORM_IDF)
1412+
qr.operandIdf = palloc0(sizeof(qr.operandIdf[0]) * query->size);
13721413
qr.length = query->size;
13731414

13741415
doc = get_docrep(txt, &qr, &doclen);

src/tf_idf.c

+43-7
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
#include "catalog/namespace.h"
1313
#include "catalog/pg_statistic.h"
1414
#include "catalog/pg_type.h"
15+
#include "nodes/nodeFuncs.h"
1516
#include "utils/builtins.h"
1617
#include "utils/lsyscache.h"
1718
#include "utils/memutils.h"
@@ -20,6 +21,12 @@
2021

2122
#include "rum.h"
2223

24+
/*
25+
* FIXME:
26+
* * cache IDF
27+
* * handle prefix search
28+
*/
29+
2330
/* lookup table type for binary searching through MCELEMs */
2431
typedef struct
2532
{
@@ -77,7 +84,6 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
7784
Oid namespaceId;
7885
Oid relId;
7986
Relation rel = NULL;
80-
TupleDesc tupDesc;
8187
AttrNumber attrno;
8288
int i;
8389
RelAttrInfo *myextra;
@@ -119,17 +125,27 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
119125
EXIT_CHECK_TF_IDF_SOURCE("relation not found");
120126

121127
rel = RelationIdGetRelation(relId);
122-
tupDesc = rel->rd_att;
123128
if (rel->rd_rel->relkind == RELKIND_INDEX)
124129
{
130+
int exprnum = 0;
131+
125132
attrno = pg_atoi(attname, sizeof(attrno), 10);
126133
if (attrno <= 0 || attrno > rel->rd_index->indnatts)
127134
EXIT_CHECK_TF_IDF_SOURCE("wrong index attribute number");
128135
if (rel->rd_index->indkey.values[attrno - 1] != InvalidAttrNumber)
129136
EXIT_CHECK_TF_IDF_SOURCE("regular indexed column is specified");
137+
for (i = 0; i < attrno - 1; i++)
138+
{
139+
if (rel->rd_index->indkey.values[i] == InvalidAttrNumber)
140+
exprnum++;
141+
}
142+
if (exprType((Node *) list_nth(rel->rd_indexprs, exprnum)) != TSVECTOROID)
143+
EXIT_CHECK_TF_IDF_SOURCE("indexed expression should be of tsvector type");
130144
}
131145
else
132146
{
147+
TupleDesc tupDesc = rel->rd_att;
148+
133149
attrno = InvalidAttrNumber;
134150
for (i = 0; i < tupDesc->natts; i++)
135151
{
@@ -139,13 +155,12 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
139155
break;
140156
}
141157
}
142-
143158
if (attrno == InvalidAttrNumber)
144159
EXIT_CHECK_TF_IDF_SOURCE("attribute not found");
160+
if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID)
161+
EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type");
145162
}
146163

147-
if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID)
148-
EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type");
149164

150165
myextra = (RelAttrInfo *) malloc(sizeof(RelAttrInfo));
151166
myextra->relId = relId;
@@ -164,7 +179,16 @@ assign_tf_idf_source(const char *newval, void *extra)
164179
{
165180
RelAttrInfo *myextra = (RelAttrInfo *) extra;
166181

167-
TFIDFSourceParsed = *myextra;
182+
if (myextra)
183+
{
184+
TFIDFSourceParsed = *myextra;
185+
}
186+
else
187+
{
188+
TFIDFSourceParsed.relId = InvalidOid;
189+
TFIDFSourceParsed.attrno = InvalidAttrNumber;
190+
}
191+
168192
forget_tf_idf_stats();
169193
}
170194

@@ -181,6 +205,15 @@ load_tf_idf_source(void)
181205
"Memory context for TF/IDF statistics",
182206
ALLOCSET_DEFAULT_SIZES);
183207

208+
if (!OidIsValid(TFIDFSourceParsed.relId)
209+
|| TFIDFSourceParsed.attrno == InvalidAttrNumber)
210+
{
211+
ereport(ERROR,
212+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
213+
errmsg("statistics for TD/IDF is not defined"),
214+
errhint("consider setting tf_idf_source GUC")));
215+
}
216+
184217
statsTuple = SearchSysCache3(STATRELATTINH,
185218
ObjectIdGetDatum(TFIDFSourceParsed.relId),
186219
Int16GetDatum(TFIDFSourceParsed.attrno),
@@ -228,6 +261,8 @@ load_tf_idf_source(void)
228261

229262
MemoryContextSwitchTo(oldContext);
230263

264+
TDIDFLoaded = true;
265+
231266
ReleaseSysCache(statsTuple);
232267
}
233268

@@ -241,7 +276,8 @@ check_load_tf_idf_source(void)
241276
static void
242277
forget_tf_idf_stats(void)
243278
{
244-
MemoryContextReset(TFIDFContext);
279+
if (TFIDFContext)
280+
MemoryContextReset(TFIDFContext);
245281
TDIDFLoaded = false;
246282
}
247283

0 commit comments

Comments
 (0)