Skip to content

Commit

Permalink
Turns out Elasticsearch fieldnames are case-sensitive, so handle lower-casing of input data (during indexing) inside Postgres, so that fieldnames inside nested JSON structures can be forced to lower-case -- this isn't really a thing we can easily handle inside Elasticsearch.
Browse files Browse the repository at this point in the history

Quick testing doesn't seem to indicate any measurable performance difference.
  • Loading branch information
eeeebbbbrrrr committed Oct 20, 2015
1 parent 6018e90 commit 3b5da30
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 8 deletions.
10 changes: 4 additions & 6 deletions postgres/src/main/c/am/elasticsearch.c
Original file line number Diff line number Diff line change
Expand Up @@ -203,19 +203,17 @@ void elasticsearch_createNewIndex(ZDBIndexDescriptor *indexDescriptor, int shard
" \"analyzer\": {"
" \"default\": {"
" \"tokenizer\": \"keyword\","
" \"filter\": [\"trim\", \"lowercase\", \"truncate_32000\"]"
" \"filter\": [\"trim\", \"truncate_32000\"]"
" },"
" \"exact\": {"
" \"tokenizer\": \"keyword\","
" \"filter\": [\"trim\", \"lowercase\", \"truncate_32000\"]"
" \"filter\": [\"trim\", \"truncate_32000\"]"
" },"
" \"phrase\": {"
" \"tokenizer\": \"standard\","
" \"filter\": [\"lowercase\"]"
" \"tokenizer\": \"standard\""
" },"
" \"fulltext\": {"
" \"tokenizer\": \"standard\","
" \"filter\": [\"lowercase\"]"
" \"tokenizer\": \"standard\""
" }"
" }"
" }"
Expand Down
10 changes: 8 additions & 2 deletions postgres/src/main/c/util/zdbutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,29 @@
* limitations under the License.
*/
#include "postgres.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "executor/spi.h"
#include "lib/stringinfo.h"
#include "utils/formatting.h"
#include "utils/memutils.h"

#include "zdbutils.h"

void appendBinaryStringInfoAndStripLineBreaks(StringInfo str, const char *data, int datalen)
{
char *lcase;
int i;
Assert(str != NULL);

/* Make more room if needed */
enlargeStringInfo(str, datalen);

/* OK, append the data */
memcpy(str->data + str->len, data, datalen);
/* slam data to lowercase and copy it into the StringInfo */
lcase = str_tolower(data, (size_t) datalen, DEFAULT_COLLATION_OID);
memcpy(str->data + str->len, lcase, datalen);
pfree(lcase);

for (i=str->len; i<str->len+datalen; i++) {
switch (str->data[i]) {
case '\r':
Expand Down

0 comments on commit 3b5da30

Please sign in to comment.