Skip to content

Commit 90b1638

Browse files
author
Alexander Korotkov
committed
Implement estimate_idf().
1 parent 86f185f commit 90b1638

File tree

2 files changed

+200
-11
lines changed

2 files changed

+200
-11
lines changed

src/rum.h

+1
Original file line numberDiff line numberDiff line change
@@ -1013,5 +1013,6 @@ extern Datum FunctionCall10Coll(FmgrInfo *flinfo, Oid collation,
10131013
extern char *TFIDFSource;
10141014
extern bool check_tf_idf_source(char **newval, void **extra, GucSource source);
10151015
extern void assign_tf_idf_source(const char *newval, void *extra);
1016+
extern float4 estimate_idf(char *lexeme, int length);
10161017

10171018
#endif /* __RUM_H__ */

src/tf_idf.c

+199-11
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,48 @@
1010
#include "postgres.h"
1111

1212
#include "catalog/namespace.h"
13+
#include "catalog/pg_statistic.h"
1314
#include "catalog/pg_type.h"
1415
#include "utils/builtins.h"
1516
#include "utils/lsyscache.h"
17+
#include "utils/memutils.h"
18+
#include "utils/syscache.h"
1619
#include "utils/varlena.h"
1720

1821
#include "rum.h"
1922

20-
char *TFIDFSource;
23+
/*
 * Lookup table type for binary searching through MCELEMs: one most-common
 * element paired with its frequency.  load_tf_idf_source() builds an array
 * of these from the values/numbers arrays of the statistics slot.
 */
24+
typedef struct
25+
{
26+
text *element; /* the element as a text value; points at a statistics slot value */
27+
float4 frequency; /* the number stored at the same index as the element */
28+
} TextFreq;
29+
30+
/*
 * Type of keys for bsearch'ing through an array of TextFreqs.  The lexeme is
 * described by a pointer plus an explicit length; compare_lexeme_textfreq()
 * never relies on a terminating NUL byte.
 */
31+
typedef struct
32+
{
33+
char *lexeme; /* first byte of the lexeme being looked up */
34+
int length; /* number of bytes in the lexeme */
35+
} LexemeKey;
36+
37+
/*
 * Cached most-common-element statistics, as filled in by
 * load_tf_idf_source() and consulted by estimate_idf().
 */
typedef struct
38+
{
39+
TextFreq *lookup; /* array of nmcelem entries searched with bsearch() */
40+
int nmcelem; /* number of entries in lookup */
41+
float4 minfreq; /* second-to-last number of the statistics slot; used for lexemes not in lookup */
42+
} MCelemStats;
43+
44+
/*
 * Identifies the column whose statistics are used: check_tf_idf_source()
 * fills one of these in and hands it out through its "extra" argument, and
 * load_tf_idf_source() uses both fields as the statistics lookup key.
 */
typedef struct
45+
{
46+
Oid relId; /* OID of the relation */
47+
AttrNumber attrno; /* attribute number within that relation */
48+
} RelAttrInfo;
49+
50+
/* Declared extern in rum.h; presumably the string value of the TF/IDF source setting -- confirm against the GUC definition. */
char *TFIDFSource;
51+
/* Relation/attribute pair copied in by assign_tf_idf_source(). */
static RelAttrInfo TFIDFSourceParsed;
52+
/* Checked by check_load_tf_idf_source() to decide whether statistics must be loaded. */
/* NOTE(review): "TDIDF" here and in TDIDFStats looks like a typo for "TFIDF". */
static bool TDIDFLoaded = false;
53+
/* Memory context holding the loaded statistics; stays NULL until load_tf_idf_source() creates it. */
static MemoryContext TFIDFContext = NULL;
54+
/* Lookup table and minimal frequency produced by load_tf_idf_source(). */
static MCelemStats TDIDFStats;
2155

2256
#define EXIT_CHECK_TF_IDF_SOURCE(error) \
2357
do { \
@@ -29,18 +63,24 @@ char *TFIDFSource;
2963
return false; \
3064
} while (false);
3165

66+
static void load_tf_idf_source(void);
67+
static void check_load_tf_idf_source(void);
68+
static void forget_tf_idf_stats(void);
69+
static int compare_lexeme_textfreq(const void *e1, const void *e2);
70+
3271
bool
3372
check_tf_idf_source(char **newval, void **extra, GucSource source)
3473
{
35-
char *rawname;
36-
char *attname;
37-
List *namelist;
38-
Oid namespaceId;
39-
Oid relId;
40-
Relation rel = NULL;
41-
TupleDesc tupDesc;
42-
AttrNumber attrno;
43-
int i;
74+
char *rawname;
75+
char *attname;
76+
List *namelist;
77+
Oid namespaceId;
78+
Oid relId;
79+
Relation rel = NULL;
80+
TupleDesc tupDesc;
81+
AttrNumber attrno;
82+
int i;
83+
RelAttrInfo *myextra;
4484

4585
/* Need a modifiable copy of string */
4686
rawname = pstrdup(*newval);
@@ -107,6 +147,11 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
107147
if (tupDesc->attrs[attrno - 1]->atttypid != TSVECTOROID)
108148
EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type");
109149

150+
myextra = (RelAttrInfo *) malloc(sizeof(RelAttrInfo));
151+
myextra->relId = relId;
152+
myextra->attrno = attrno;
153+
*extra = (void *) myextra;
154+
110155
pfree(rawname);
111156
list_free(namelist);
112157
RelationClose(rel);
@@ -117,5 +162,148 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
117162
void
118163
assign_tf_idf_source(const char *newval, void *extra)
119164
{
165+
RelAttrInfo *myextra = (RelAttrInfo *) extra;
166+
167+
TFIDFSourceParsed = *myextra;
168+
forget_tf_idf_stats();
169+
}
170+
171+
static void
172+
load_tf_idf_source(void)
173+
{
174+
HeapTuple statsTuple;
175+
AttStatsSlot sslot;
176+
MemoryContext oldContext;
177+
int i;
178+
179+
if (!TFIDFContext)
180+
TFIDFContext = AllocSetContextCreate(TopMemoryContext,
181+
"Memory context for TF/IDF statistics",
182+
ALLOCSET_DEFAULT_SIZES);
183+
184+
statsTuple = SearchSysCache3(STATRELATTINH,
185+
ObjectIdGetDatum(TFIDFSourceParsed.relId),
186+
Int16GetDatum(TFIDFSourceParsed.attrno),
187+
BoolGetDatum(true));
188+
189+
if (!statsTuple)
190+
statsTuple = SearchSysCache3(STATRELATTINH,
191+
ObjectIdGetDatum(TFIDFSourceParsed.relId),
192+
Int16GetDatum(TFIDFSourceParsed.attrno),
193+
BoolGetDatum(false));
194+
195+
MemoryContextReset(TFIDFContext);
196+
TDIDFLoaded = false;
197+
198+
oldContext = MemoryContextSwitchTo(TFIDFContext);
199+
200+
if (!statsTuple
201+
|| !get_attstatsslot(&sslot, statsTuple,
202+
STATISTIC_KIND_MCELEM, InvalidOid,
203+
ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS)
204+
|| sslot.nnumbers != sslot.nvalues + 2)
205+
{
206+
ereport(ERROR,
207+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
208+
errmsg("statistics for TD/IDF is not found"),
209+
errhint("consider running ANALYZE")));
210+
}
211+
212+
TDIDFStats.nmcelem = sslot.nvalues;
213+
TDIDFStats.minfreq = sslot.numbers[sslot.nnumbers - 2];
214+
/*
215+
* Transpose the data into a single array so we can use bsearch().
216+
*/
217+
TDIDFStats.lookup = (TextFreq *) palloc(sizeof(TextFreq) * TDIDFStats.nmcelem);
218+
for (i = 0; i < TDIDFStats.nmcelem; i++)
219+
{
220+
/*
221+
* The text Datums came from an array, so it cannot be compressed or
222+
* stored out-of-line -- it's safe to use VARSIZE_ANY*.
223+
*/
224+
Assert(!VARATT_IS_COMPRESSED(sslot.values[i]) && !VARATT_IS_EXTERNAL(sslot.values[i]));
225+
TDIDFStats.lookup[i].element = (text *) DatumGetPointer(sslot.values[i]);
226+
TDIDFStats.lookup[i].frequency = sslot.numbers[i];
227+
}
120228

121-
}
229+
MemoryContextSwitchTo(oldContext);
230+
231+
ReleaseSysCache(statsTuple);
232+
}
233+
234+
static void
235+
check_load_tf_idf_source(void)
236+
{
237+
if (!TDIDFLoaded)
238+
load_tf_idf_source();
239+
}
240+
241+
static void
242+
forget_tf_idf_stats(void)
243+
{
244+
MemoryContextReset(TFIDFContext);
245+
TDIDFLoaded = false;
246+
}
247+
248+
/*
249+
* bsearch() comparator for a lexeme (non-NULL terminated string with length)
250+
* and a TextFreq. Use length, then byte-for-byte comparison, because that's
251+
* how ANALYZE code sorted data before storing it in a statistic tuple.
252+
* See ts_typanalyze.c for details.
253+
*/
254+
static int
255+
compare_lexeme_textfreq(const void *e1, const void *e2)
256+
{
257+
const LexemeKey *key = (const LexemeKey *) e1;
258+
const TextFreq *t = (const TextFreq *) e2;
259+
int len1,
260+
len2;
261+
262+
len1 = key->length;
263+
len2 = VARSIZE_ANY_EXHDR(t->element);
264+
265+
/* Compare lengths first, possibly avoiding a strncmp call */
266+
if (len1 > len2)
267+
return 1;
268+
else if (len1 < len2)
269+
return -1;
270+
271+
/* Fall back on byte-for-byte comparison */
272+
return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
273+
}
274+
275+
float4
276+
estimate_idf(char *lexeme, int length)
277+
{
278+
TextFreq *searchres;
279+
LexemeKey key;
280+
float4 selec;
281+
282+
check_load_tf_idf_source();
283+
284+
key.lexeme = lexeme;
285+
key.length = length;
286+
287+
searchres = (TextFreq *) bsearch(&key, TDIDFStats.lookup, TDIDFStats.nmcelem,
288+
sizeof(TextFreq),
289+
compare_lexeme_textfreq);
290+
291+
if (searchres)
292+
{
293+
/*
294+
* The element is in MCELEM. Return precise selectivity (or
295+
* at least as precise as ANALYZE could find out).
296+
*/
297+
selec = searchres->frequency;
298+
}
299+
else
300+
{
301+
/*
302+
* The element is not in MCELEM. Punt, but assume that the
303+
* selectivity cannot be more than minfreq / 2.
304+
*/
305+
selec = TDIDFStats.minfreq / 2;
306+
}
307+
308+
return 1.0f / selec;
309+
}

0 commit comments

Comments
 (0)