10
10
#include "postgres.h"
11
11
12
12
#include "catalog/namespace.h"
13
+ #include "catalog/pg_statistic.h"
13
14
#include "catalog/pg_type.h"
14
15
#include "utils/builtins.h"
15
16
#include "utils/lsyscache.h"
17
+ #include "utils/memutils.h"
18
+ #include "utils/syscache.h"
16
19
#include "utils/varlena.h"
17
20
18
21
#include "rum.h"
19
22
20
- char * TFIDFSource ;
23
+ /* lookup table type for binary searching through MCELEMs */
24
+ typedef struct
25
+ {
26
+ text * element ;
27
+ float4 frequency ;
28
+ } TextFreq ;
29
+
30
+ /* type of keys for bsearch'ing through an array of TextFreqs */
31
+ typedef struct
32
+ {
33
+ char * lexeme ;
34
+ int length ;
35
+ } LexemeKey ;
36
+
37
+ typedef struct
38
+ {
39
+ TextFreq * lookup ;
40
+ int nmcelem ;
41
+ float4 minfreq ;
42
+ } MCelemStats ;
43
+
44
+ typedef struct
45
+ {
46
+ Oid relId ;
47
+ AttrNumber attrno ;
48
+ } RelAttrInfo ;
49
+
50
+ char * TFIDFSource ;
51
+ static RelAttrInfo TFIDFSourceParsed ;
52
+ static bool TDIDFLoaded = false;
53
+ static MemoryContext TFIDFContext = NULL ;
54
+ static MCelemStats TDIDFStats ;
21
55
22
56
#define EXIT_CHECK_TF_IDF_SOURCE (error ) \
23
57
do { \
@@ -29,18 +63,24 @@ char *TFIDFSource;
29
63
return false; \
30
64
} while (false);
31
65
66
+ static void load_tf_idf_source (void );
67
+ static void check_load_tf_idf_source (void );
68
+ static void forget_tf_idf_stats (void );
69
+ static int compare_lexeme_textfreq (const void * e1 , const void * e2 );
70
+
32
71
bool
33
72
check_tf_idf_source (char * * newval , void * * extra , GucSource source )
34
73
{
35
- char * rawname ;
36
- char * attname ;
37
- List * namelist ;
38
- Oid namespaceId ;
39
- Oid relId ;
40
- Relation rel = NULL ;
41
- TupleDesc tupDesc ;
42
- AttrNumber attrno ;
43
- int i ;
74
+ char * rawname ;
75
+ char * attname ;
76
+ List * namelist ;
77
+ Oid namespaceId ;
78
+ Oid relId ;
79
+ Relation rel = NULL ;
80
+ TupleDesc tupDesc ;
81
+ AttrNumber attrno ;
82
+ int i ;
83
+ RelAttrInfo * myextra ;
44
84
45
85
/* Need a modifiable copy of string */
46
86
rawname = pstrdup (* newval );
@@ -107,6 +147,11 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
107
147
if (tupDesc -> attrs [attrno - 1 ]-> atttypid != TSVECTOROID )
108
148
EXIT_CHECK_TF_IDF_SOURCE ("attribute should be of tsvector type" );
109
149
150
+ myextra = (RelAttrInfo * ) malloc (sizeof (RelAttrInfo ));
151
+ myextra -> relId = relId ;
152
+ myextra -> attrno = attrno ;
153
+ * extra = (void * ) myextra ;
154
+
110
155
pfree (rawname );
111
156
list_free (namelist );
112
157
RelationClose (rel );
@@ -117,5 +162,148 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
117
162
void
118
163
assign_tf_idf_source (const char * newval , void * extra )
119
164
{
165
+ RelAttrInfo * myextra = (RelAttrInfo * ) extra ;
166
+
167
+ TFIDFSourceParsed = * myextra ;
168
+ forget_tf_idf_stats ();
169
+ }
170
+
171
+ static void
172
+ load_tf_idf_source (void )
173
+ {
174
+ HeapTuple statsTuple ;
175
+ AttStatsSlot sslot ;
176
+ MemoryContext oldContext ;
177
+ int i ;
178
+
179
+ if (!TFIDFContext )
180
+ TFIDFContext = AllocSetContextCreate (TopMemoryContext ,
181
+ "Memory context for TF/IDF statistics" ,
182
+ ALLOCSET_DEFAULT_SIZES );
183
+
184
+ statsTuple = SearchSysCache3 (STATRELATTINH ,
185
+ ObjectIdGetDatum (TFIDFSourceParsed .relId ),
186
+ Int16GetDatum (TFIDFSourceParsed .attrno ),
187
+ BoolGetDatum (true));
188
+
189
+ if (!statsTuple )
190
+ statsTuple = SearchSysCache3 (STATRELATTINH ,
191
+ ObjectIdGetDatum (TFIDFSourceParsed .relId ),
192
+ Int16GetDatum (TFIDFSourceParsed .attrno ),
193
+ BoolGetDatum (false));
194
+
195
+ MemoryContextReset (TFIDFContext );
196
+ TDIDFLoaded = false;
197
+
198
+ oldContext = MemoryContextSwitchTo (TFIDFContext );
199
+
200
+ if (!statsTuple
201
+ || !get_attstatsslot (& sslot , statsTuple ,
202
+ STATISTIC_KIND_MCELEM , InvalidOid ,
203
+ ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS )
204
+ || sslot .nnumbers != sslot .nvalues + 2 )
205
+ {
206
+ ereport (ERROR ,
207
+ (errcode (ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE ),
208
+ errmsg ("statistics for TD/IDF is not found" ),
209
+ errhint ("consider running ANALYZE" )));
210
+ }
211
+
212
+ TDIDFStats .nmcelem = sslot .nvalues ;
213
+ TDIDFStats .minfreq = sslot .numbers [sslot .nnumbers - 2 ];
214
+ /*
215
+ * Transpose the data into a single array so we can use bsearch().
216
+ */
217
+ TDIDFStats .lookup = (TextFreq * ) palloc (sizeof (TextFreq ) * TDIDFStats .nmcelem );
218
+ for (i = 0 ; i < TDIDFStats .nmcelem ; i ++ )
219
+ {
220
+ /*
221
+ * The text Datums came from an array, so it cannot be compressed or
222
+ * stored out-of-line -- it's safe to use VARSIZE_ANY*.
223
+ */
224
+ Assert (!VARATT_IS_COMPRESSED (sslot .values [i ]) && !VARATT_IS_EXTERNAL (sslot .values [i ]));
225
+ TDIDFStats .lookup [i ].element = (text * ) DatumGetPointer (sslot .values [i ]);
226
+ TDIDFStats .lookup [i ].frequency = sslot .numbers [i ];
227
+ }
120
228
121
- }
229
+ MemoryContextSwitchTo (oldContext );
230
+
231
+ ReleaseSysCache (statsTuple );
232
+ }
233
+
234
+ static void
235
+ check_load_tf_idf_source (void )
236
+ {
237
+ if (!TDIDFLoaded )
238
+ load_tf_idf_source ();
239
+ }
240
+
241
+ static void
242
+ forget_tf_idf_stats (void )
243
+ {
244
+ MemoryContextReset (TFIDFContext );
245
+ TDIDFLoaded = false;
246
+ }
247
+
248
+ /*
249
+ * bsearch() comparator for a lexeme (non-NULL terminated string with length)
250
+ * and a TextFreq. Use length, then byte-for-byte comparison, because that's
251
+ * how ANALYZE code sorted data before storing it in a statistic tuple.
252
+ * See ts_typanalyze.c for details.
253
+ */
254
+ static int
255
+ compare_lexeme_textfreq (const void * e1 , const void * e2 )
256
+ {
257
+ const LexemeKey * key = (const LexemeKey * ) e1 ;
258
+ const TextFreq * t = (const TextFreq * ) e2 ;
259
+ int len1 ,
260
+ len2 ;
261
+
262
+ len1 = key -> length ;
263
+ len2 = VARSIZE_ANY_EXHDR (t -> element );
264
+
265
+ /* Compare lengths first, possibly avoiding a strncmp call */
266
+ if (len1 > len2 )
267
+ return 1 ;
268
+ else if (len1 < len2 )
269
+ return -1 ;
270
+
271
+ /* Fall back on byte-for-byte comparison */
272
+ return strncmp (key -> lexeme , VARDATA_ANY (t -> element ), len1 );
273
+ }
274
+
275
+ float4
276
+ estimate_idf (char * lexeme , int length )
277
+ {
278
+ TextFreq * searchres ;
279
+ LexemeKey key ;
280
+ float4 selec ;
281
+
282
+ check_load_tf_idf_source ();
283
+
284
+ key .lexeme = lexeme ;
285
+ key .length = length ;
286
+
287
+ searchres = (TextFreq * ) bsearch (& key , TDIDFStats .lookup , TDIDFStats .nmcelem ,
288
+ sizeof (TextFreq ),
289
+ compare_lexeme_textfreq );
290
+
291
+ if (searchres )
292
+ {
293
+ /*
294
+ * The element is in MCELEM. Return precise selectivity (or
295
+ * at least as precise as ANALYZE could find out).
296
+ */
297
+ selec = searchres -> frequency ;
298
+ }
299
+ else
300
+ {
301
+ /*
302
+ * The element is not in MCELEM. Punt, but assume that the
303
+ * selectivity cannot be more than minfreq / 2.
304
+ */
305
+ selec = TDIDFStats .minfreq / 2 ;
306
+ }
307
+
308
+ return 1.0f / selec ;
309
+ }
0 commit comments