Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
505 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
.. _UNICODE_REPLACE_BAD: | ||
|
||
============================ | ||
UNICODE_REPLACE_BAD function | ||
============================ | ||
|
||
Returns **SOURCE** with characters that are invalid in UTF-8 encoding replaced | ||
with the string **REPL**. | ||
|
||
Prototypes | ||
========== | ||
|
||
.. code-block:: sql | ||
UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000), REPL VARCHAR(100)) | ||
UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000)) | ||
RETURNS VARCHAR(4000) | ||
Description | ||
=========== | ||
|
||
Under certain circumstances, DB2 will permit text containing characters invalid | ||
in the UTF-8 encoding scheme to be inserted into a column intended to contain | ||
UTF-8 encoded data. While this doesn't cause a problem for DB2 queries, it can | ||
cause issues for down-stream appliations. This function provides a means of | ||
stripping or replacing such invalid characters. | ||
|
||
Parameters | ||
========== | ||
|
||
SOURCE | ||
The string to search for characters invalid in the UTF-8 encoding scheme. | ||
|
||
REPL | ||
The string to replace any invalid sequences with. Defaults to the empty | ||
string if omitted. | ||
|
||
Examples | ||
======== | ||
|
||
Replacement of truncated UTF-8 characters: | ||
|
||
.. code-block:: sql | ||
VALUES | ||
(UNICODE_REPLACE_BAD('FOO' || X'C2', 'BAR')) | ||
:: | ||
|
||
1 | ||
--------------------.... | ||
FOOBAR | ||
|
||
|
||
Replacement of invalid characters in the middle of a string: | ||
|
||
.. code-block:: sql | ||
VALUES | ||
(UNICODE_REPLACE_BAD('FOO' || X'80' || BAR)) | ||
:: | ||
|
||
1 | ||
--------------------.... | ||
FOOBAR | ||
|
||
|
||
See Also | ||
======== | ||
|
||
* `SQL source code`_ | ||
* `C source code`_ | ||
* `Wikipedia UTF-8 article`_ | ||
|
||
.. _C source code: https://github.com/waveform80/db2utils/blob/master/unicode/unicode_udfs.c#L119 | ||
.. _SQL source code: https://github.com/waveform80/db2utils/blob/master/unicode.sql#L51 | ||
.. _Wikipedia UTF-8 article: http://en.wikipedia.org/wiki/UTF-8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
VALUES ASSERT_IS_NULL(UNICODE_REPLACE_BAD('FOO', NULL))! | ||
VALUES ASSERT_IS_NULL(UNICODE_REPLACE_BAD(NULL, NULL))! | ||
VALUES ASSERT_EQUALS(UNICODE_REPLACE_BAD('FOO', 'BAR'), 'FOO')! | ||
VALUES ASSERT_EQUALS(UNICODE_REPLACE_BAD('FOO' || X'80', 'BAR'), 'FOOBAR')! | ||
VALUES ASSERT_EQUALS(UNICODE_REPLACE_BAD(X'C2' || 'BAR', 'FOO'), 'FOOBAR')! | ||
|
||
-- vim: set et sw=4 sts=4: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
------------------------------------------------------------------------------- | ||
--UNICODE CORRECTION FUNCTIONS | ||
------------------------------------------------------------------------------- | ||
-- Copyright (c) 2015 Dave Hughes <dave@waveform.org.uk> | ||
-- | ||
-- Permission is hereby granted, free of charge, to any person obtaining a copy | ||
-- of this software and associated documentation files (the "Software"), to | ||
-- deal in the Software without restriction, including without limitation the | ||
-- rights to use, copy, modify, merge, publish, distribute, sublicense, and/or | ||
-- sell copies of the Software, and to permit persons to whom the Software is | ||
-- furnished to do so, subject to the following conditions: | ||
-- | ||
-- The above copyright notice and this permission notice shall be included in | ||
-- all copies or substantial portions of the Software. | ||
-- | ||
-- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
-- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
-- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
-- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
-- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
-- FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
-- IN THE SOFTWARE. | ||
------------------------------------------------------------------------------- | ||
-- These functions are intended for cleaning data which contains erroneous | ||
-- UTF-8 data. Unfortunately, under certain circumstances, DB2 will store | ||
-- invalid UTF-8 data in VARCHAR fields which are intended to be UTF-8 encoded. | ||
-- While this doesn't affect DB2 queries it may well affect downstream | ||
-- applications which expect only valid UTF-8. | ||
-- | ||
-- These functions rely on Bjoern Hoehrmann's excellent little UTF-8 decoder | ||
-- which can be found at http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ | ||
-- | ||
-- To install these functions, do not run this script. Rather, use the Makefile | ||
-- with the GNU make utility. The "build", "install", and "register" targets do | ||
-- what they say on the tin... | ||
------------------------------------------------------------------------------- | ||
|
||
-- ROLES | ||
------------------------------------------------------------------------------- | ||
-- The following roles grant usage and administrative rights to the objects | ||
-- created by this module. | ||
------------------------------------------------------------------------------- | ||
|
||
CREATE ROLE UTILS_UNICODE_USER! | ||
CREATE ROLE UTILS_UNICODE_ADMIN! | ||
|
||
GRANT ROLE UTILS_UNICODE_USER TO ROLE UTILS_USER! | ||
GRANT ROLE UTILS_UNICODE_USER TO ROLE UTILS_UNICODE_ADMIN WITH ADMIN OPTION! | ||
GRANT ROLE UTILS_UNICODE_ADMIN TO ROLE UTILS_ADMIN WITH ADMIN OPTION! | ||
|
||
-- UNICODE_REPLACE_BAD(SOURCE, REPL) | ||
-- UNICODE_REPLACE_BAD(SOURCE) | ||
------------------------------------------------------------------------------- | ||
-- Corrects invalid UTF-8 sequences in SOURCE. Each such sequence is replaced | ||
-- with the string in REPL, or the blank string if REPL is omitted. If | ||
-- SOURCE or REPL are NULL, the result is NULL. | ||
-- | ||
-- EXAMPLES | ||
------------------------------------------------------------------------------- | ||
-- Simple replacement of truncated UTF-8 characters: | ||
-- | ||
-- UNICODE_REPLACE_BAD('FOO' || X'C2', 'BAR') = 'FOOBAR' | ||
-- | ||
-- Simple replacement of invalid characters in the middle of a UTF-8 encoded | ||
-- string: | ||
-- | ||
-- UNICODE_REPLACE_BAD('FOO' || X'80' || 'BAR') = 'FOOBAR' | ||
------------------------------------------------------------------------------- | ||
|
||
CREATE FUNCTION UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000), REPL VARCHAR(100)) | ||
RETURNS VARCHAR(4000) | ||
SPECIFIC UNICODE_REPLACE_BAD1 | ||
EXTERNAL NAME 'unicode_udfs!unicode_udf_replace_bad' | ||
LANGUAGE C | ||
PARAMETER STYLE SQL | ||
PARAMETER CCSID UNICODE | ||
DETERMINISTIC | ||
NOT FENCED | ||
RETURNS NULL ON NULL INPUT | ||
NO SQL | ||
NO EXTERNAL ACTION | ||
ALLOW PARALLEL! | ||
|
||
CREATE FUNCTION UNICODE_REPLACE_BAD(SOURCE VARCHAR(4000)) | ||
RETURNS VARCHAR(4000) | ||
SPECIFIC UNICODE_REPLACE_BAD2 | ||
LANGUAGE SQL | ||
PARAMETER CCSID UNICODE | ||
DETERMINISTIC | ||
NO EXTERNAL ACTION | ||
CONTAINS SQL | ||
RETURN | ||
UNICODE_REPLACE_BAD(SOURCE, '')! | ||
|
||
GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD1 TO ROLE UTILS_UNICODE_USER! | ||
GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD2 TO ROLE UTILS_UNICODE_USER! | ||
GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD1 TO ROLE UTILS_UNICODE_ADMIN WITH GRANT OPTION! | ||
GRANT EXECUTE ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD2 TO ROLE UTILS_UNICODE_ADMIN WITH GRANT OPTION! | ||
|
||
COMMENT ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD1 | ||
IS 'Returns SOURCE string with all invalid UTF-8 sequences replaced with REPL'! | ||
COMMENT ON SPECIFIC FUNCTION UNICODE_REPLACE_BAD2 | ||
IS 'Returns SOURCE string with all invalid UTF-8 sequences omitted'! | ||
|
||
-- vim: set et sw=4 sts=4: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
############################################################################### | ||
# Makefile for the Unicode UDFs library | ||
# | ||
# This makefile was adapted from the samples/c/bldrtn script distributed with | ||
# IBM DB2 for Linux/UNIX/Windows. It essentially performs the same steps as | ||
# that script with a few minor alterations | ||
############################################################################### | ||
|
||
CC:=gcc | ||
CCFLAGS:= | ||
|
||
HARDWAREPLAT:=$(shell uname -m) | ||
DB2PATH:=$(shell getent passwd ${DB2INSTANCE} | cut -d':' -f6)/sqllib | ||
|
||
# Platform detection | ||
ifeq ($(filter x86_64 ppc64 s390x ia64, $(HARDWAREPLAT)), $(HARDWAREPLAT)) | ||
BITWIDTH:=64 | ||
LIB:=lib64 | ||
EXTRA_C_FLAGS:=-m64 | ||
else | ||
BITWIDTH:=32 | ||
LIB:=lib32 | ||
ifeq ($(HARDWAREPLAT), s390x) | ||
EXTRA_C_FLAGS:=-m31 | ||
else | ||
EXTRA_C_FLAGS:=-m32 | ||
endif | ||
endif | ||
|
||
# Compiler specific settings | ||
ifeq ($(CC), xlc_r) | ||
SHARED_LIB_FLAG:=-qmkshrobj | ||
else | ||
SHARED_LIB_FLAG:=-shared | ||
EXTRA_C_FLAGS:=$(EXTRA_C_FLAGS) -fpic | ||
endif | ||
LINK_FLAGS:=$(EXTRA_C_FLAGS) $(SHARED_LIB_FLAG) | ||
EXTRA_LFLAG:=-Wl,-rpath,$(DB2PATH)/$(LIB) | ||
|
||
install: build | ||
cp unicode_udfs $(DB2PATH)/function/ | ||
|
||
uninstall: | ||
rm -f $(DB2PATH)/function/unicode_udfs | ||
|
||
build: unicode_udfs | ||
|
||
clean: | ||
rm -f unicode_udfs.o unicode_udfs | ||
|
||
unicode_udfs: unicode_udfs.o | ||
$(CC) $(LINK_FLAGS) -o unicode_udfs unicode_udfs.o $(EXTRA_LFLAG) -L$(DB2PATH)/$(LIB) -ldb2 -lpthread | ||
|
||
unicode_udfs.o: unicode_udfs.c unicode_udfs.h | ||
$(CC) $(EXTRA_C_FLAGS) -I$(DB2PATH)/include -c unicode_udfs.c -D_REENTRANT | ||
|
||
.PHONY: uninstall install build clean |
Oops, something went wrong.