Skip to content

Commit 581daf5

Browse files
author
Alexander Korotkov
committed
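Better IDF calculation: estimate IDF per document lexeme in get_docrep(), store it in DocRepresentation, and accumulate it in calc_score_docr(), replacing the per-query-operand IDF cache (QueryRepresentationIDF).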
1 parent 3bb8244 commit 581daf5

File tree

2 files changed

+20
-45
lines changed

2 files changed

+20
-45
lines changed

src/rum_ts_utils.c

+17-43
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ typedef struct
9696
} key;
9797
} data;
9898
uint8 wclass;
99+
float4 idf;
99100
int32 pos;
100101
} DocRepresentation;
101102

@@ -106,20 +107,13 @@ typedef struct
106107
}
107108
QueryRepresentationOperand;
108109

109-
typedef struct
110-
{
111-
float4 idf;
112-
bool idfloaded;
113-
} QueryRepresentationIDF;
114-
115110
typedef struct
116111
{
117112
TSQuery query;
118113
/* Used in rum_tsquery_distance() */
119114
int *map_item_operand;
120115

121116
QueryRepresentationOperand *operandData;
122-
QueryRepresentationIDF *operandIdf;
123117
int length;
124118
} QueryRepresentation;
125119

@@ -1098,7 +1092,7 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
10981092
}
10991093

11001094
static DocRepresentation *
1101-
get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen)
1095+
get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen, bool load_idf)
11021096
{
11031097
QueryItem *item = GETQUERY(qr->query);
11041098
WordEntry *entry,
@@ -1134,6 +1128,8 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen)
11341128

11351129
while (entry - firstentry < nitem)
11361130
{
1131+
float4 idf;
1132+
11371133
if (entry->haspos)
11381134
{
11391135
dimt = POSDATALEN(txt, entry);
@@ -1187,12 +1183,18 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen)
11871183

11881184
}
11891185
}
1186+
1187+
if (load_idf)
1188+
idf = estimate_idf(STRPTR(txt) + entry->pos, entry->len);
1189+
else
1190+
idf = 1.0f;
11901191
}
11911192
else
11921193
{
11931194
doc[cur].data.item.nitem = doc[cur - 1].data.item.nitem;
11941195
doc[cur].data.item.item = doc[cur - 1].data.item.item;
11951196
}
1197+
doc[cur].idf = idf;
11961198
doc[cur].pos = WEP_GETPOS(post[j]);
11971199
doc[cur].wclass = WEP_GETWEIGHT(post[j]);
11981200
cur++;
@@ -1256,6 +1258,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen,
12561258
/* For rum_tsquery_distance() */
12571259
else
12581260
new_cover_key += (int)(uintptr_t)ptr->data.key.item_first;
1261+
Idf += ptr->idf;
12591262
ptr++;
12601263
}
12611264

@@ -1287,43 +1290,16 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen,
12871290

12881291
/* Compute the number of query terms in the cover */
12891292
for (i = 0; i < qr->length; i++)
1290-
{
12911293
if (qr->operandData[i].operandexist)
1292-
{
1293-
if (method & RANK_NORM_IDF)
1294-
{
1295-
if (!qr->operandIdf[i].idfloaded)
1296-
{
1297-
QueryOperand *oper = (QueryOperand *) (GETQUERY(qr->query) + i);
1298-
qr->operandIdf[i].idf =
1299-
estimate_idf(
1300-
GETOPERAND(qr->query) + oper->distance,
1301-
oper->length
1302-
);
1303-
qr->operandIdf[i].idfloaded = true;
1304-
}
1305-
1306-
Idf += qr->operandIdf[i].idf;
1307-
}
1308-
else
1309-
{
1310-
nitems++;
1311-
}
1312-
}
1313-
}
1294+
nitems++;
13141295

13151296
Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
13161297

1298+
if (nitems > 0)
1299+
Cpos *= nitems;
1300+
13171301
if (method & RANK_NORM_IDF)
1318-
{
1319-
if (Idf >= 1.0)
1320-
Cpos *= Idf;
1321-
}
1322-
else
1323-
{
1324-
if (nitems > 0)
1325-
Cpos *= nitems;
1326-
}
1302+
Cpos *= Idf;
13271303

13281304
/*
13291305
* if doc are big enough then ext.q may be equal to ext.p due to limit
@@ -1408,11 +1384,9 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method)
14081384
qr.query = query;
14091385
qr.map_item_operand = NULL;
14101386
qr.operandData = palloc0(sizeof(qr.operandData[0]) * query->size);
1411-
if (method & RANK_NORM_IDF)
1412-
qr.operandIdf = palloc0(sizeof(qr.operandIdf[0]) * query->size);
14131387
qr.length = query->size;
14141388

1415-
doc = get_docrep(txt, &qr, &doclen);
1389+
doc = get_docrep(txt, &qr, &doclen, (method & RANK_NORM_IDF) ? true : false);
14161390
if (!doc)
14171391
{
14181392
pfree(qr.operandData);

src/tf_idf.c

+3-2
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323

2424
/*
2525
* FIXME:
26-
* * cache IDF
27-
* * handle prefix search
26+
* * cache IDF for ts_query (non-prefix search?)
27+
* * calculate IDF from RUM index
2828
*/
2929

3030
/* lookup table type for binary searching through MCELEMs */
@@ -139,6 +139,7 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
139139
if (rel->rd_index->indkey.values[i] == InvalidAttrNumber)
140140
exprnum++;
141141
}
142+
RelationGetIndexExpressions(rel);
142143
if (exprType((Node *) list_nth(rel->rd_indexprs, exprnum)) != TSVECTOROID)
143144
EXIT_CHECK_TF_IDF_SOURCE("indexed expression should be of tsvector type");
144145
}

0 commit comments

Comments
 (0)