Turns out Elasticsearch field names are case-sensitive, so handle lower-casing of input data (during indexing) inside Postgres, so that field names inside nested JSON structures can be forced to lower-case -- this isn't really something we can easily handle inside Elasticsearch. Quick testing doesn't seem to indicate any measurable performance difference.
zombodb · Oct 20, 2015 · 3b5da30 · 3b5da30
1 parent 6018e90
commit 3b5da30
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 8 deletions.
diff --git a/postgres/src/main/c/am/elasticsearch.c b/postgres/src/main/c/am/elasticsearch.c
@@ -203,19 +203,17 @@ void elasticsearch_createNewIndex(ZDBIndexDescriptor *indexDescriptor, int shard
 					"         \"analyzer\": {"
 					"            \"default\": {"
 					"               \"tokenizer\": \"keyword\","
-					"               \"filter\": [\"trim\", \"lowercase\", \"truncate_32000\"]"
+					"               \"filter\": [\"trim\", \"truncate_32000\"]"
 					"            },"
 					"            \"exact\": {"
 					"               \"tokenizer\": \"keyword\","
-					"               \"filter\": [\"trim\", \"lowercase\", \"truncate_32000\"]"
+					"               \"filter\": [\"trim\", \"truncate_32000\"]"
 					"            },"
 					"            \"phrase\": {"
-					"               \"tokenizer\": \"standard\","
-					"               \"filter\": [\"lowercase\"]"
+					"               \"tokenizer\": \"standard\""
 					"            },"
 					"            \"fulltext\": {"
-					"               \"tokenizer\": \"standard\","
-					"               \"filter\": [\"lowercase\"]"
+					"               \"tokenizer\": \"standard\""
 					"            }"
 					"         }"
 					"      }"

diff --git a/postgres/src/main/c/util/zdbutils.c b/postgres/src/main/c/util/zdbutils.c
@@ -14,23 +14,29 @@
  * limitations under the License.
  */
 #include "postgres.h"
+#include "catalog/pg_collation.h"
 #include "catalog/pg_type.h"
 #include "executor/spi.h"
 #include "lib/stringinfo.h"
+#include "utils/formatting.h"
 #include "utils/memutils.h"
 
 #include "zdbutils.h"
 
 void appendBinaryStringInfoAndStripLineBreaks(StringInfo str, const char *data, int datalen)
 {
+    char *lcase;
     int i;
     Assert(str != NULL);
 
     /* Make more room if needed */
     enlargeStringInfo(str, datalen);
 
-    /* OK, append the data */
-    memcpy(str->data + str->len, data, datalen);
+    /* slam data to lowercase and copy it into the StringInfo */
+    lcase = str_tolower(data, (size_t) datalen, DEFAULT_COLLATION_OID);
+    memcpy(str->data + str->len, lcase, datalen);
+    pfree(lcase);
+
     for (i=str->len; i<str->len+datalen; i++) {
         switch (str->data[i]) {
             case '\r':