Skip to content

Commit

Permalink
Merge branch 'master' into ibm
Browse files Browse the repository at this point in the history
  • Loading branch information
waveform80 committed Feb 4, 2015
2 parents 89fb51e + ba5ba4a commit 3dc2067
Show file tree
Hide file tree
Showing 7 changed files with 505 additions and 0 deletions.
3 changes: 3 additions & 0 deletions Makefile
Expand Up @@ -9,12 +9,14 @@ ALL_FOO:=$(ALL_SQL:%.sql=%.foo)

install: install.sql
$(MAKE) -C pcre install
$(MAKE) -C unicode install
printf "CONNECT TO $(DBNAME);\nCREATE SCHEMA $(SCHEMANAME);\nCOMMIT;\n" | db2 +c +p -t || true
db2 -td! +c -s -vf $< || [ $$? -lt 4 ] && true

uninstall: uninstall.sql
db2 -td! +c +s -vf $< || true
printf "CONNECT TO $(DBNAME);\nDROP SCHEMA $(SCHEMANAME) RESTRICT;\nCOMMIT;\n" | db2 +c +p -t || true
$(MAKE) -C unicode uninstall
$(MAKE) -C pcre uninstall

doc:
Expand All @@ -26,6 +28,7 @@ test:
clean: $(SUBDIRS)
$(MAKE) -C docs clean
$(MAKE) -C pcre clean
$(MAKE) -C unicode clean
$(MAKE) -C tests clean
rm -f foo
rm -f *.foo
Expand Down
79 changes: 79 additions & 0 deletions docs/UNICODE_REPLACE_BAD.rst
@@ -0,0 +1,79 @@
.. _UNICODE_REPLACE_BAD:

============================
UNICODE_REPLACE_BAD function
============================

Returns **SOURCE** with characters that are invalid in UTF-8 encoding replaced
with the string **REPL**.

Prototypes
==========

.. code-block:: sql
UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000), REPL VARCHAR(100))
UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000))
RETURNS VARCHAR(4000)
Description
===========

Under certain circumstances, DB2 will permit text containing characters invalid
in the UTF-8 encoding scheme to be inserted into a column intended to contain
UTF-8 encoded data. While this doesn't cause a problem for DB2 queries, it can
cause issues for down-stream appliations. This function provides a means of
stripping or replacing such invalid characters.

Parameters
==========

SOURCE
The string to search for characters invalid in the UTF-8 encoding scheme.

REPL
The string to replace any invalid sequences with. Defaults to the empty
string if omitted.

Examples
========

Replacement of truncated UTF-8 characters:

.. code-block:: sql
VALUES
(UNICODE_REPLACE_BAD('FOO' || X'C2', 'BAR'))
::

1
--------------------....
FOOBAR


Replacement of invalid characters in the middle of a string:

.. code-block:: sql
VALUES
(UNICODE_REPLACE_BAD('FOO' || X'80' || BAR))
::

1
--------------------....
FOOBAR


See Also
========

* `SQL source code`_
* `C source code`_
* `Wikipedia UTF-8 article`_

.. _C source code: https://github.com/waveform80/db2utils/blob/master/unicode/unicode_udfs.c#L119
.. _SQL source code: https://github.com/waveform80/db2utils/blob/master/unicode.sql#L51
.. _Wikipedia UTF-8 article: http://en.wikipedia.org/wiki/UTF-8
7 changes: 7 additions & 0 deletions tests/test_unicode.sql
@@ -0,0 +1,7 @@
VALUES ASSERT_IS_NULL(UNICODE_REPLACE_BAD('FOO', NULL))!
VALUES ASSERT_IS_NULL(UNICODE_REPLACE_BAD(NULL, NULL))!
VALUES ASSERT_EQUALS(UNICODE_REPLACE_BAD('FOO', 'BAR'), 'FOO')!
VALUES ASSERT_EQUALS(UNICODE_REPLACE_BAD('FOO' || X'80', 'BAR'), 'FOOBAR')!
VALUES ASSERT_EQUALS(UNICODE_REPLACE_BAD(X'C2' || 'BAR', 'FOO'), 'FOOBAR')!

-- vim: set et sw=4 sts=4:
105 changes: 105 additions & 0 deletions unicode.sql
@@ -0,0 +1,105 @@
-------------------------------------------------------------------------------
--UNICODE CORRECTION FUNCTIONS
-------------------------------------------------------------------------------
-- Copyright (c) 2015 Dave Hughes <dave@waveform.org.uk>
--
-- Permission is hereby granted, free of charge, to any person obtaining a copy
-- of this software and associated documentation files (the "Software"), to
-- deal in the Software without restriction, including without limitation the
-- rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-- sell copies of the Software, and to permit persons to whom the Software is
-- furnished to do so, subject to the following conditions:
--
-- The above copyright notice and this permission notice shall be included in
-- all copies or substantial portions of the Software.
--
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-- FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-- IN THE SOFTWARE.
-------------------------------------------------------------------------------
-- These functions are intended for cleaning data which contains erroneous
-- UTF-8 data. Unfortunately, under certain circumstances, DB2 will store
-- invalid UTF-8 data in VARCHAR fields which are intended to be UTF-8 encoded.
-- While this doesn't affect DB2 queries it may well affect downstream
-- applications which expect only valid UTF-8.
--
-- These functions rely on Bjoern Hoehrmann's excellent little UTF-8 decoder
-- which can be found at http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
--
-- To install these functions, do not run this script. Rather, use the Makefile
-- with the GNU make utility. The "build", "install", and "register" targets do
-- what they say on the tin...
-------------------------------------------------------------------------------

-- ROLES
-------------------------------------------------------------------------------
-- The following roles grant usage and administrative rights to the objects
-- created by this module.
-------------------------------------------------------------------------------

CREATE ROLE UTILS_UNICODE_USER!
CREATE ROLE UTILS_UNICODE_ADMIN!

GRANT ROLE UTILS_UNICODE_USER TO ROLE UTILS_USER!
GRANT ROLE UTILS_UNICODE_USER TO ROLE UTILS_UNICODE_ADMIN WITH ADMIN OPTION!
GRANT ROLE UTILS_UNICODE_ADMIN TO ROLE UTILS_ADMIN WITH ADMIN OPTION!

-- UNICODE_REPLACE_BAD(SOURCE, REPL)
-- UNICODE_REPLACE_BAD(SOURCE)
-------------------------------------------------------------------------------
-- Corrects invalid UTF-8 sequences in SOURCE. Each such sequence is replaced
-- with the string in REPL, or the blank string if REPL is omitted. If
-- SOURCE or REPL are NULL, the result is NULL.
--
-- EXAMPLES
-------------------------------------------------------------------------------
-- Simple replacement of truncated UTF-8 characters:
--
-- UNICODE_REPLACE_BAD('FOO' || X'C2', 'BAR') = 'FOOBAR'
--
-- Simple replacement of invalid characters in the middle of a UTF-8 encoded
-- string:
--
-- UNICODE_REPLACE_BAD('FOO' || X'80' || 'BAR') = 'FOOBAR'
-------------------------------------------------------------------------------

CREATE FUNCTION UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000), REPL VARCHAR(100))
RETURNS VARCHAR(4000)
SPECIFIC UNICODE_REPLACE_BAD1
EXTERNAL NAME 'unicode_udfs!unicode_udf_replace_bad'
LANGUAGE C
PARAMETER STYLE SQL
PARAMETER CCSID UNICODE
DETERMINISTIC
NOT FENCED
RETURNS NULL ON NULL INPUT
NO SQL
NO EXTERNAL ACTION
ALLOW PARALLEL!

CREATE FUNCTION UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000))
RETURNS VARCHAR(4000)
SPECIFIC UNICODE_REPLACE_BAD2
LANGUAGE SQL
PARAMETER CCSID UNICODE
DETERMINISTIC
NO EXTERNAL ACTION
CONTAINS SQL
RETURN
UNICODE_REPLACE_BAD(SOURCE, '')!

GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD1 TO ROLE UTILS_UNICODE_USER!
GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD2 TO ROLE UTILS_UNICODE_USER!
GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD1 TO ROLE UTILS_UNICODE_ADMIN WITH GRANT OPTION!
GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD2 TO ROLE UTILS_UNICODE_ADMIN WITH GRANT OPTION!

COMMENT ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD1
IS 'Returns SOURCE string with all invalid UTF-8 sequences replaced with REPL'!
COMMENT ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD2
IS 'Returns SOURCE string with all invalid UTF-8 sequences omitted'!

-- vim: set et sw=4 sts=4:
57 changes: 57 additions & 0 deletions unicode/Makefile
@@ -0,0 +1,57 @@
###############################################################################
# Makefile for the Unicode UDFs library
#
# This makefile was adapted from the samples/c/bldrtn script distributed with
# IBM DB2 for Linux/UNIX/Windows. It essentially performs the same steps as
# that script with a few minor alterations
###############################################################################

CC:=gcc
CCFLAGS:=

HARDWAREPLAT:=$(shell uname -m)
DB2PATH:=$(shell getent passwd ${DB2INSTANCE} | cut -d':' -f6)/sqllib

# Platform detection
ifeq ($(filter x86_64 ppc64 s390x ia64, $(HARDWAREPLAT)), $(HARDWAREPLAT))
BITWIDTH:=64
LIB:=lib64
EXTRA_C_FLAGS:=-m64
else
BITWIDTH:=32
LIB:=lib32
ifeq ($(HARDWAREPLAT), s390x)
EXTRA_C_FLAGS:=-m31
else
EXTRA_C_FLAGS:=-m32
endif
endif

# Compiler specific settings
ifeq ($(CC), xlc_r)
SHARED_LIB_FLAG:=-qmkshrobj
else
SHARED_LIB_FLAG:=-shared
EXTRA_C_FLAGS:=$(EXTRA_C_FLAGS) -fpic
endif
LINK_FLAGS:=$(EXTRA_C_FLAGS) $(SHARED_LIB_FLAG)
EXTRA_LFLAG:=-Wl,-rpath,$(DB2PATH)/$(LIB)

install: build
cp unicode_udfs $(DB2PATH)/function/

uninstall:
rm -f $(DB2PATH)/function/unicode_udfs

build: unicode_udfs

clean:
rm -f unicode_udfs.o unicode_udfs

unicode_udfs: unicode_udfs.o
$(CC) $(LINK_FLAGS) -o unicode_udfs unicode_udfs.o $(EXTRA_LFLAG) -L$(DB2PATH)/$(LIB) -ldb2 -lpthread

unicode_udfs.o: unicode_udfs.c unicode_udfs.h
$(CC) $(EXTRA_C_FLAGS) -I$(DB2PATH)/include -c unicode_udfs.c -D_REENTRANT

.PHONY: uninstall install build clean

0 comments on commit 3dc2067

Please sign in to comment.