first implementation

vertica · Dec 16, 2011 · 9590f04 · 9590f04
1 parent 6431576
commit 9590f04
Show file tree

Hide file tree

Showing 10 changed files with 644 additions and 0 deletions.
diff --git a/tagcloud_package/LICENSE.txt b/tagcloud_package/LICENSE.txt
@@ -0,0 +1,28 @@
+
+Portions of this software Copyright (c) 2011 by Vertica, an HP
+Company.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+- Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/tagcloud_package/Makefile b/tagcloud_package/Makefile
@@ -0,0 +1,90 @@
+############################
+# Vertica Analytic Database
+#
+# Makefile to build package directory
+#
+# Copyright 2011 Vertica Systems, an HP Company
+############################
+
+# Location of the Vertica SDK and name of the vsql client; both can be
+# overridden from the environment or the command line
+# (e.g. "make SDK=/path/to/sdk VSQL=/path/to/vsql").
+SDK?=/opt/vertica/sdk
+VSQL?=vsql
+
+# Paths derived from SDK
+VERTICA_SDK_INCLUDE = $(SDK)/include
+SIMULATOR_PATH = $(SDK)/simulator
+
+# Third-party sources live under src/third-party of this package.
+# $(CURDIR) is make's own absolute working directory, so no shell is
+# forked each time the variable is expanded.
+THIRD_PARTY := $(CURDIR)/src/third-party
+THIRD_PARTY_INCLUDE = $(THIRD_PARTY)/include
+
+# Object files that make up the library; add further object names
+# to the list inside addprefix
+BUILD_FILES = $(addprefix build/,Vertica.o TagCloud.o)
+
+# Name of the .so to produce (when changing it, also update the references
+# in ddl/install.sql and ddl/uninstall.sql)
+PACKAGE_LIBNAME = lib/TagCloud.so
+
+# Compiler, compile flags and link flags
+CXX = g++
+CXXFLAGS = -g -D HAVE_LONG_LONG_INT_64 -c \
+           -I ../include -Wall -Wno-unused-value -fPIC \
+           -I $(VERTICA_SDK_INCLUDE) -I $(THIRD_PARTY_INCLUDE)
+LDFLAGS = -shared
+
+# Add optimization unless this is a debug build
+# ("make DEBUG=true" will make a non-optimized build)
+ifndef DEBUG
+CXXFLAGS+= -O3 -DNDEBUG
+# NOTE(review): CFLAGS is not referenced by any rule in this Makefile;
+# presumably kept for third-party C sources — confirm
+CFLAGS+= -O3 -DNDEBUG
+endif
+
+# Targets that are commands rather than files.  Declaring them phony keeps
+# them working even if a file of the same name exists.  The library itself
+# is a real file and is deliberately not listed, so it is only relinked
+# when an object file changes.
+.PHONY: all install uninstall test clean sim_test simulator
+all: $(PACKAGE_LIBNAME)
+
+# Main target: link all object files into the package shared library.
+# The output directory is taken from the target name, so changing
+# PACKAGE_LIBNAME does not leave a stale hard-coded directory behind.
+$(PACKAGE_LIBNAME): $(BUILD_FILES)
+	mkdir -p $(@D)
+	$(CXX) $(LDFLAGS) -o $@ $^
+
+# Pattern rule: compile src/XXX.cpp into build/XXX.o
+build/%.o: src/%.cpp
+	@mkdir -p build
+	$(CXX) $(CXXFLAGS) -o $@ $<
+
+# Compile the Vertica SDK symbols (Vertica.cpp from the SDK include directory)
+build/Vertica.o: $(VERTICA_SDK_INCLUDE)/Vertica.cpp
+	@mkdir -p build
+	$(CXX) $(CXXFLAGS) -o $@ $<
+
+# example rule to make build/XX.o from third-party/src/*.c
+#build/%.o: $(THIRD_PARTY)/src/%.c
+#	@mkdir -p build
+#	$(CXX) $(CXXFLAGS) $< -o $@ 
+
+
+# Install the library and functions by running the install DDL through vsql
+install: $(PACKAGE_LIBNAME) ddl/install.sql
+	$(VSQL) -f $(filter %.sql,$^)
+
+# Uninstall them again by running the uninstall DDL
+uninstall: ddl/uninstall.sql
+	$(VSQL) -f $<
+
+# Run the example script
+test:
+	$(VSQL) -f examples/TagCloud.sql
+
+# Remove the build and lib directories, any vsim* files and output.html
+clean:
+	rm -rf build lib
+	rm -f vsim* output.html
+
+##############
+# Advanced usage: use simulator to debug and test
+##############
+
+# Directory holding the simulator input data.  $(CURDIR) is make's own
+# absolute working directory, so no shell is forked on each expansion.
+DEBUG_DATA_PATH := $(CURDIR)/test-data
+
+# Run the simulator against the functions.  VALGRIND is not set in this
+# file, so it expands to nothing unless supplied on the command line or
+# in the environment.
+# NOTE(review): FunctionName looks like a template placeholder — confirm
+sim_test: all simulator
+	$(VALGRIND) ./vsim $(PACKAGE_LIBNAME) FunctionName $(DEBUG_DATA_PATH)/testdata.txt
+
+# Build the simulator (in SIMULATOR_PATH) and symlink it here
+simulator:
+	$(MAKE) -C $(SIMULATOR_PATH)
+	ln -f -s $(SIMULATOR_PATH)/vsim
+
diff --git a/tagcloud_package/README.txt b/tagcloud_package/README.txt
@@ -0,0 +1,103 @@
+-------------------------------
+INTRODUCTION
+-------------------------------
+
+This library contains functions to generate a Tag Cloud for a given key word
+within a provided text corpus. When using Wikipedia as the text corpus, it
+achieves an effect similar to a previous Hadoop implementation:
+http://www.youtube.com/watch?feature=player_detailpage&v=2Iz5V9MrkBg#t=120s
+
+Two types of functions are implemented to achieve the goal:
+
+The first type gathers the words relevant to the key word together with their
+relevance scores. Use RelevantWords when the text corpus is already loaded
+into Vertica, or RelevantWordsNoLoad when the text corpus is an external
+file.
+
+The second type generates a Tag Cloud in HTML from the words and their
+relevance scores, which are the output of the previous function. The function
+name is GenerateTagCloud.
+
+See examples/example_tag_cloud.html for an example of the visual result
+when 'vertica' is used as the key word and the whole of Wikipedia is used as
+the searched corpus (download available at
+http://en.wikipedia.org/wiki/Wikipedia:Database_download)
+
+
+-------------------------------
+BUILDING
+-------------------------------
+
+To build:
+
+$ make
+
+
+-------------------------------
+INSTALLING / UNINSTALLING
+-------------------------------
+
+Assuming vsql is in your path, just do:
+
+$ make install
+$ make uninstall
+
+Alternatively, you can find the DDL that 'make install' and 'make uninstall' use in:
+ ddl/install.sql
+and
+ ddl/uninstall.sql
+
+-------------------------------
+USAGE
+-------------------------------
+
+RelevantWords('key_word', 'text_columns')
+
+Arguments:
+key_word     - the search key word
+text_columns - the varchar columns containing the text corpus; there is no
+               restriction on how the column is organized/ordered, since the
+               function just treats the input as a stream of incoming words
+
+Output columns:
+weight       - the relevance score of the word
+word         - the words that the algorithm considers relevant to the key word
+
+
+
+RelevantWordsNoLoad('key_word', 'corpus_file_name')
+
+Arguments:
+key_word         - the search key word
+corpus_file_name - the file name of the text corpus, this function is helpful
+                   when the corpus data is not loaded into Vertica
+
+Output columns:
+Same as RelevantWords()
+
+
+GenerateTagCloud('score', 'word', 'html_file_name')
+
+Arguments:
+score          - the relevance score of the word from RelevantWordsNoLoad()
+                 or RelevantWords()
+word           - the relevant word
+html_file_name - the file name for the generated HTML file
+
+Output columns:
+status         - the status of HTML file generation
+
+-------------------------------
+PERFORMANCE
+-------------------------------
+
+The function is relatively disk-I/O heavy. On a laptop, using the whole 33G
+uncompressed Wikipedia as the text corpus, it finishes in about 6~7 minutes
+with disk utilization above 90%. As a comparison, simply running 'cat' on the
+text corpus into /dev/null also takes a little over 6 minutes.
+
+-------------------------------
+LICENSE
+-------------------------------
+
+Please see LICENSE.txt
diff --git a/tagcloud_package/ddl/install.sql b/tagcloud_package/ddl/install.sql
@@ -0,0 +1,10 @@
+-- Installation script: defines the shared library and the appropriate entry points
+
+select version();
+
+-- libfile is the single-quoted path <output of pwd>/lib/TagCloud.so, so this
+-- script is expected to be run from the package directory (as 'make install' does)
+\set libfile '\''`pwd`'/lib/TagCloud.so\'';
+
+-- Register the library, then one transform function per factory class in it
+CREATE LIBRARY TagCloudLib as :libfile;
+create transform function RelevantWords as language 'C++' name 'RelevantWordsFactory' library TagCloudLib;
+create transform function RelevantWordsNoLoad as language 'C++' name 'RelevantWordsNoLoadFactory' library TagCloudLib;
+create transform function GenerateTagCloud as language 'C++' name 'GenerateTagCloudFactory' library TagCloudLib;
diff --git a/tagcloud_package/ddl/uninstall.sql b/tagcloud_package/ddl/uninstall.sql
@@ -0,0 +1 @@
+-- Uninstall script: drops the TagCloudLib library
+-- (CASCADE presumably also drops the functions created from it — confirm)
+DROP LIBRARY TagCloudLib CASCADE;
diff --git a/tagcloud_package/examples/TagCloud.sql b/tagcloud_package/examples/TagCloud.sql
@@ -0,0 +1,24 @@
+-- Example script for the TagCloud functions.
+-- Both paths below are built from the output of pwd, so the script is
+-- expected to be run from the package directory (as 'make test' does).
+\set textCorpus '\''`pwd`'/examples/text_corpus.txt\''
+\set htmlFile '\''`pwd`'/output.html\''
+
+-- 1. Search the corpus file directly, without loading it into a table
+\echo ***************** Search 'vertica' in the small text_corpus.txt ***************** 
+select RelevantWordsNoLoad('vertica', :textCorpus) over() order by weight desc limit 20;
+
+
+
+-- 2. Load the corpus into a temporary work table, one line per row, and search that
+\echo ***************** Load text_corpus.txt into a table first, and then search 'vertica' in it ***************** 
+create table alltext(line varchar(64000));
+copy alltext(line) from :textCorpus DELIMITER E'\n';
+
+select RelevantWords('vertica', line) over() from alltext order by weight desc limit 20;
+
+drop table alltext cascade;
+
+
+
+-- 3. Store the top 50 words in table 'words' and render them to the HTML file
+\echo ****************************** Generate HTML to show the graphical effect *************************
+\echo ****** This generates output.html in current direcotry, use your favoriate browser to see it ******
+-- NOTE(review): presumably this drop reports an error when 'words' does not
+-- exist yet (e.g. on a first run) — confirm. The table is not dropped again
+-- at the end of the script.
+drop table words cascade;
+create table words(weight float, word varchar);
+insert into words select RelevantWordsNoLoad('vertica', :textCorpus) over() order by weight desc limit 50;
+select GenerateTagCloud(weight, word, :htmlFile) over () from words;
diff --git a/tagcloud_package/examples/example_tag_cloud.html b/tagcloud_package/examples/example_tag_cloud.html
@@ -0,0 +1 @@
+<p class="tag_cloud"></p><span style="font-size: 10px; color: green">xconomy</span><span style="font-size: 13px; color: orange">world</span><span style="font-size: 60px; color: red">database</span><span style="font-size: 10px; color: green">zacks</span><span style="font-size: 13px; color: blue">xml</span><span style="font-size: 12px; color: green">computerworld</span><span style="font-size: 15px; color: orange">minutes</span><span style="font-size: 10px; color: red">preserve</span><span style="font-size: 13px; color: red">business</span><span style="font-size: 17px; color: red">date</span><p class="tag_cloud"></p><span style="font-size: 11px; color: red">space</span><span style="font-size: 10px; color: blue">oriented</span><span style="font-size: 10px; color: red">based</span><span style="font-size: 10px; color: blue">approach</span><span style="font-size: 14px; color: orange">press</span><span style="font-size: 11px; color: black">content</span><span style="font-size: 10px; color: red">butterfly</span><span style="font-size: 15px; color: orange">update</span><span style="font-size: 13px; color: green">oracle</span><span style="font-size: 10px; color: blue">emc</span><p class="tag_cloud"></p><span style="font-size: 16px; color: red">hewlett</span><span style="font-size: 10px; color: black">newswire</span><span style="font-size: 14px; color: green">billerica</span><span style="font-size: 24px; color: blue">data</span><span style="font-size: 17px; color: black">revision</span><span style="font-size: 11px; color: red">uploads</span><span style="font-size: 18px; color: blue">source</span><span style="font-size: 13px; color: orange">infobox</span><span style="font-size: 10px; color: orange">kanaracus</span><span style="font-size: 16px; color: red">software</span><p class="tag_cloud"></p><span style="font-size: 10px; color: blue">logo</span><span style="font-size: 12px; color: red">acopia</span><span style="font-size: 10px; color: green">slaughters</span><span 
style="font-size: 20px; color: green">boston</span><span style="font-size: 10px; color: red">hpinfo</span><span style="font-size: 31px; color: blue">news</span><span style="font-size: 11px; color: orange">timestamp</span><span style="font-size: 10px; color: green">times</span><span style="font-size: 10px; color: orange">example</span><span style="font-size: 14px; color: black">height</span><p class="tag_cloud"></p><span style="font-size: 11px; color: black">appliance</span><span style="font-size: 11px; color: black">class</span><span style="font-size: 17px; color: green">lynch</span><span style="font-size: 27px; color: red">text</span><span style="font-size: 11px; color: green">feb</span><span style="font-size: 16px; color: green">products</span><span style="font-size: 10px; color: black">record</span><span style="font-size: 17px; color: black">packard</span><span style="font-size: 15px; color: black">plans</span><span style="font-size: 16px; color: orange">specific</span><p class="tag_cloud"></p><span style="font-size: 14px; color: red">integration</span><span style="font-size: 24px; color: black">buy</span><span style="font-size: 10px; color: black">won</span><span style="font-size: 18px; color: orange">publisher</span><span style="font-size: 15px; color: blue">use</span><span style="font-size: 10px; color: blue">homepage</span><span style="font-size: 11px; color: black">growth</span><span style="font-size: 16px; color: orange">pdf</span><span style="font-size: 25px; color: black">sybase</span><span style="font-size: 10px; color: black">technical</span><p class="tag_cloud"></p><span style="font-size: 11px; color: black">greenplum</span><span style="font-size: 10px; color: black">expands</span><span style="font-size: 16px; color: red">article</span><span style="font-size: 12px; color: black">dailymarkets</span><span style="font-size: 14px; color: black">shatter</span><span style="font-size: 11px; color: red">official</span><span style="font-size: 11px; color: 
black">syntax</span><span style="font-size: 24px; color: green">web</span><span style="font-size: 59px; color: red">systems</span><span style="font-size: 14px; color: black">syncsort</span><p class="tag_cloud"></p><span style="font-size: 30px; color: blue">analytics</span><span style="font-size: 11px; color: black">management</span><span style="font-size: 14px; color: red">names</span><span style="font-size: 23px; color: blue">inc</span><span style="font-size: 10px; color: red">march</span><span style="font-size: 10px; color: black">daily</span><span style="font-size: 20px; color: black">dbms</span><span style="font-size: 14px; color: blue">february</span><span style="font-size: 13px; color: black">bladesystem</span><span style="font-size: 15px; color: green">appoints</span><p class="tag_cloud"></p><span style="font-size: 41px; color: black">analytic</span><span style="font-size: 33px; color: blue">acquisition</span><span style="font-size: 16px; color: green">acquire</span><span style="font-size: 14px; color: orange">etl</span><span style="font-size: 10px; color: black">completed</span><span style="font-size: 10px; color: orange">cloud</span><span style="font-size: 15px; color: orange">get</span><span style="font-size: 11px; color: blue">christopher</span><span style="font-size: 17px; color: orange">hadoop</span><span style="font-size: 10px; color: blue">changes</span><p class="tag_cloud"></p><span style="font-size: 10px; color: black">enterprise</span><span style="font-size: 11px; color: blue">investors</span><span style="font-size: 10px; color: black">new</span><span style="font-size: 11px; color: black">extend</span><span style="font-size: 22px; color: black">ceo</span><span style="font-size: 12px; color: black">journal</span><span style="font-size: 10px; color: orange">monash</span><span style="font-size: 25px; color: orange">company</span><span style="font-size: 12px; color: orange">contributor</span><span style="font-size: 11px; color: red">completes</span>