Merge branch 'master' into ibm

waveform-computing · Feb 4, 2015 · 3dc2067 · 3dc2067
2 parents 89fb51e + ba5ba4a
commit 3dc2067
Show file tree

Hide file tree

Showing 7 changed files with 505 additions and 0 deletions.
diff --git a/Makefile b/Makefile
@@ -9,12 +9,14 @@ ALL_FOO:=$(ALL_SQL:%.sql=%.foo)
 
 install: install.sql
 	$(MAKE) -C pcre install
+	$(MAKE) -C unicode install
 	printf "CONNECT TO $(DBNAME);\nCREATE SCHEMA $(SCHEMANAME);\nCOMMIT;\n" | db2 +c +p -t || true
 	db2 -td! +c -s -vf $< || [ $$? -lt 4 ] && true
 
 uninstall: uninstall.sql
 	db2 -td! +c +s -vf $< || true
 	printf "CONNECT TO $(DBNAME);\nDROP SCHEMA $(SCHEMANAME) RESTRICT;\nCOMMIT;\n" | db2 +c +p -t || true
+	$(MAKE) -C unicode uninstall
 	$(MAKE) -C pcre uninstall
 
 doc:
@@ -26,6 +28,7 @@ test:
 clean: $(SUBDIRS)
 	$(MAKE) -C docs clean
 	$(MAKE) -C pcre clean
+	$(MAKE) -C unicode clean
 	$(MAKE) -C tests clean
 	rm -f foo
 	rm -f *.foo

diff --git a/docs/UNICODE_REPLACE_BAD.rst b/docs/UNICODE_REPLACE_BAD.rst
@@ -0,0 +1,79 @@
+.. _UNICODE_REPLACE_BAD:
+
+============================
+UNICODE_REPLACE_BAD function
+============================
+
+Returns **SOURCE** with characters that are invalid in UTF-8 encoding replaced
+with the string **REPL**.
+
+Prototypes
+==========
+
+.. code-block:: sql
+
+    UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000), REPL VARCHAR(100))
+    UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000))
+
+    RETURNS VARCHAR(4000)
+
+Description
+===========
+
+Under certain circumstances, DB2 will permit text containing characters invalid
+in the UTF-8 encoding scheme to be inserted into a column intended to contain
+UTF-8 encoded data. While this doesn't cause a problem for DB2 queries, it can
+cause issues for down-stream appliations. This function provides a means of
+stripping or replacing such invalid characters.
+
+Parameters
+==========
+
+SOURCE
+    The string to search for characters invalid in the UTF-8 encoding scheme.
+
+REPL
+    The string to replace any invalid sequences with. Defaults to the empty
+    string if omitted.
+
+Examples
+========
+
+Replacement of truncated UTF-8 characters:
+
+.. code-block:: sql
+
+    VALUES
+        (UNICODE_REPLACE_BAD('FOO' || X'C2', 'BAR'))
+
+::
+
+    1
+    --------------------....
+    FOOBAR
+
+
+Replacement of invalid characters in the middle of a string:
+
+.. code-block:: sql
+
+    VALUES
+        (UNICODE_REPLACE_BAD('FOO' || X'80' || BAR))
+
+::
+
+    1
+    --------------------....
+    FOOBAR
+
+
+See Also
+========
+
+* `SQL source code`_
+* `C source code`_
+* `Wikipedia UTF-8 article`_
+
+.. _C source code: https://github.com/waveform80/db2utils/blob/master/unicode/unicode_udfs.c#L119
+.. _SQL source code: https://github.com/waveform80/db2utils/blob/master/unicode.sql#L51
+.. _Wikipedia UTF-8 article: http://en.wikipedia.org/wiki/UTF-8
diff --git a/tests/test_unicode.sql b/tests/test_unicode.sql
@@ -0,0 +1,7 @@
+VALUES ASSERT_IS_NULL(UNICODE_REPLACE_BAD('FOO', NULL))!
+VALUES ASSERT_IS_NULL(UNICODE_REPLACE_BAD(NULL, NULL))!
+VALUES ASSERT_EQUALS(UNICODE_REPLACE_BAD('FOO', 'BAR'), 'FOO')!
+VALUES ASSERT_EQUALS(UNICODE_REPLACE_BAD('FOO' || X'80', 'BAR'), 'FOOBAR')!
+VALUES ASSERT_EQUALS(UNICODE_REPLACE_BAD(X'C2' || 'BAR', 'FOO'), 'FOOBAR')!
+
+-- vim: set et sw=4 sts=4:
diff --git a/unicode.sql b/unicode.sql
@@ -0,0 +1,105 @@
+-------------------------------------------------------------------------------
+--UNICODE CORRECTION FUNCTIONS
+-------------------------------------------------------------------------------
+-- Copyright (c) 2015 Dave Hughes <dave@waveform.org.uk>
+--
+-- Permission is hereby granted, free of charge, to any person obtaining a copy
+-- of this software and associated documentation files (the "Software"), to
+-- deal in the Software without restriction, including without limitation the
+-- rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+-- sell copies of the Software, and to permit persons to whom the Software is
+-- furnished to do so, subject to the following conditions:
+--
+-- The above copyright notice and this permission notice shall be included in
+-- all copies or substantial portions of the Software.
+--
+-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+-- FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+-- IN THE SOFTWARE.
+-------------------------------------------------------------------------------
+-- These functions are intended for cleaning data which contains erroneous
+-- UTF-8 data. Unfortunately, under certain circumstances, DB2 will store
+-- invalid UTF-8 data in VARCHAR fields which are intended to be UTF-8 encoded.
+-- While this doesn't affect DB2 queries it may well affect downstream
+-- applications which expect only valid UTF-8.
+--
+-- These functions rely on Bjoern Hoehrmann's excellent little UTF-8 decoder
+-- which can be found at http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+--
+-- To install these functions, do not run this script. Rather, use the Makefile
+-- with the GNU make utility. The "build", "install", and "register" targets do
+-- what they say on the tin...
+-------------------------------------------------------------------------------
+
+-- ROLES
+-------------------------------------------------------------------------------
+-- The following roles grant usage and administrative rights to the objects
+-- created by this module.
+-------------------------------------------------------------------------------
+
+CREATE ROLE UTILS_UNICODE_USER!
+CREATE ROLE UTILS_UNICODE_ADMIN!
+
+GRANT ROLE UTILS_UNICODE_USER TO ROLE UTILS_USER!
+GRANT ROLE UTILS_UNICODE_USER TO ROLE UTILS_UNICODE_ADMIN WITH ADMIN OPTION!
+GRANT ROLE UTILS_UNICODE_ADMIN TO ROLE UTILS_ADMIN WITH ADMIN OPTION!
+
+-- UNICODE_REPLACE_BAD(SOURCE, REPL)
+-- UNICODE_REPLACE_BAD(SOURCE)
+-------------------------------------------------------------------------------
+-- Corrects invalid UTF-8 sequences in SOURCE. Each such sequence is replaced
+-- with the string in REPL, or the blank string if REPL is omitted. If
+-- SOURCE or REPL are NULL, the result is NULL.
+--
+-- EXAMPLES
+-------------------------------------------------------------------------------
+-- Simple replacement of truncated UTF-8 characters:
+--
+--   UNICODE_REPLACE_BAD('FOO' || X'C2', 'BAR') = 'FOOBAR'
+--
+-- Simple replacement of invalid characters in the middle of a UTF-8 encoded
+-- string:
+--
+--   UNICODE_REPLACE_BAD('FOO' || X'80' || 'BAR') = 'FOOBAR'
+-------------------------------------------------------------------------------
+
+CREATE FUNCTION UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000), REPL VARCHAR(100))
+    RETURNS VARCHAR(4000)
+    SPECIFIC UNICODE_REPLACE_BAD1
+    EXTERNAL NAME 'unicode_udfs!unicode_udf_replace_bad'
+    LANGUAGE C
+    PARAMETER STYLE SQL
+    PARAMETER CCSID UNICODE
+    DETERMINISTIC
+    NOT FENCED
+    RETURNS NULL ON NULL INPUT
+    NO SQL
+    NO EXTERNAL ACTION
+    ALLOW PARALLEL!
+
+CREATE FUNCTION UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000))
+    RETURNS VARCHAR(4000)
+    SPECIFIC UNICODE_REPLACE_BAD2
+    LANGUAGE SQL
+    PARAMETER CCSID UNICODE
+    DETERMINISTIC
+    NO EXTERNAL ACTION
+    CONTAINS SQL
+RETURN
+    UNICODE_REPLACE_BAD(SOURCE, '')!
+
+GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD1 TO ROLE UTILS_UNICODE_USER!
+GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD2 TO ROLE UTILS_UNICODE_USER!
+GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD1 TO ROLE UTILS_UNICODE_ADMIN WITH GRANT OPTION!
+GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD2 TO ROLE UTILS_UNICODE_ADMIN WITH GRANT OPTION!
+
+COMMENT ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD1
+    IS 'Returns SOURCE string with all invalid UTF-8 sequences replaced with REPL'!
+COMMENT ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD2
+    IS 'Returns SOURCE string with all invalid UTF-8 sequences omitted'!
+
+-- vim: set et sw=4 sts=4:
diff --git a/unicode/Makefile b/unicode/Makefile
@@ -0,0 +1,57 @@
+###############################################################################
+# Makefile for the Unicode UDFs library
+#
+# This makefile was adapted from the samples/c/bldrtn script distributed with
+# IBM DB2 for Linux/UNIX/Windows. It essentially performs the same steps as
+# that script with a few minor alterations
+###############################################################################
+
+CC:=gcc
+CCFLAGS:=
+
+HARDWAREPLAT:=$(shell uname -m)
+DB2PATH:=$(shell getent passwd ${DB2INSTANCE} | cut -d':' -f6)/sqllib
+
+# Platform detection
+ifeq ($(filter x86_64 ppc64 s390x ia64, $(HARDWAREPLAT)), $(HARDWAREPLAT))
+BITWIDTH:=64
+LIB:=lib64
+EXTRA_C_FLAGS:=-m64
+else
+BITWIDTH:=32
+LIB:=lib32
+ifeq ($(HARDWAREPLAT), s390x)
+EXTRA_C_FLAGS:=-m31
+else
+EXTRA_C_FLAGS:=-m32
+endif
+endif
+
+# Compiler specific settings
+ifeq ($(CC), xlc_r)
+SHARED_LIB_FLAG:=-qmkshrobj
+else
+SHARED_LIB_FLAG:=-shared
+EXTRA_C_FLAGS:=$(EXTRA_C_FLAGS) -fpic
+endif
+LINK_FLAGS:=$(EXTRA_C_FLAGS) $(SHARED_LIB_FLAG)
+EXTRA_LFLAG:=-Wl,-rpath,$(DB2PATH)/$(LIB)
+
+install: build
+	cp unicode_udfs $(DB2PATH)/function/
+
+uninstall:
+	rm -f $(DB2PATH)/function/unicode_udfs
+
+build: unicode_udfs
+
+clean:
+	rm -f unicode_udfs.o unicode_udfs
+
+unicode_udfs: unicode_udfs.o
+	$(CC) $(LINK_FLAGS) -o unicode_udfs unicode_udfs.o $(EXTRA_LFLAG) -L$(DB2PATH)/$(LIB) -ldb2 -lpthread
+
+unicode_udfs.o: unicode_udfs.c unicode_udfs.h
+	$(CC) $(EXTRA_C_FLAGS) -I$(DB2PATH)/include -c unicode_udfs.c -D_REENTRANT
+
+.PHONY: uninstall install build clean