first implementation

1 parent 6431576 commit 9590f04a2ccdcf0f2573529f53829625dd497e37 hfan committed Dec 16, 2011
28 tagcloud_package/LICENSE.txt
@@ -0,0 +1,28 @@
+
+Portions of this software Copyright (c) 2011 by Vertica, an HP
+Company. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+- Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
90 tagcloud_package/Makefile
@@ -0,0 +1,90 @@
+############################
+# Vertica Analytic Database
+#
+# Makefile to build package directory
+#
+# Copyright 2011 Vertica Systems, an HP Company
+############################
+
+SDK?=/opt/vertica/sdk
+VSQL?=vsql
+
+VERTICA_SDK_INCLUDE = $(SDK)/include
+SIMULATOR_PATH = $(SDK)/simulator
+
+THIRD_PARTY = $(shell pwd)/src/third-party
+THIRD_PARTY_INCLUDE = $(THIRD_PARTY)/include
+
+# Add in your source files below
+BUILD_FILES = build/Vertica.o \
+ $(addprefix build/,TagCloud.o)
+
+# Define the .so name here (and update the references in ddl/install.sql and ddl/uninstall.sql)
+PACKAGE_LIBNAME = lib/TagCloud.so
+
+CXX=g++
+CXXFLAGS=-g -D HAVE_LONG_LONG_INT_64 -c -I ../include -Wall -Wno-unused-value -fPIC -I $(VERTICA_SDK_INCLUDE) -I $(THIRD_PARTY_INCLUDE)
+LDFLAGS=-shared
+
+# add optimization if not a debug build
+# ("make DEBUG=true" will make a non-optimized build)
+ifndef DEBUG
+CXXFLAGS+= -O3 -DNDEBUG
+CFLAGS+= -O3 -DNDEBUG
+endif
+
+.PHONY: $(PACKAGE_LIBNAME)
+all: $(PACKAGE_LIBNAME)
+
+# Main target that builds the package library
+$(PACKAGE_LIBNAME): $(BUILD_FILES)
+ mkdir -p lib
+ $(CXX) $(LDFLAGS) -o $@ $(BUILD_FILES)
+
+# rule to make build/XXX.o from src/XXX.cpp
+build/%.o: src/%.cpp
+ @mkdir -p build
+ $(CXX) $(CXXFLAGS) $< -o $@
+
+# rule to compile symbols from the Vertica SDK:
+build/Vertica.o: $(VERTICA_SDK_INCLUDE)/Vertica.cpp
+ @mkdir -p build
+ $(CXX) $(CXXFLAGS) $(VERTICA_SDK_INCLUDE)/Vertica.cpp -o $@
+
+# example rule to make build/XX.o from third-party/src/*.c
+#build/%.o: $(THIRD_PARTY)/src/%.c
+# @mkdir -p build
+# $(CXX) $(CXXFLAGS) $< -o $@
+
+
+# Targets to install and uninstall the library and functions
+install: $(PACKAGE_LIBNAME) ddl/install.sql
+ $(VSQL) -f ddl/install.sql
+uninstall: ddl/uninstall.sql
+ $(VSQL) -f ddl/uninstall.sql
+
+# run examples
+test:
+ $(VSQL) -f examples/TagCloud.sql
+
+clean:
+ rm -rf build
+ rm -rf lib
+ rm -f vsim*
+ rm -f output.html
+
+##############
+# Advanced usage: use the simulator to debug and test
+##############
+
+DEBUG_DATA_PATH = $(shell pwd)/test-data
+
+# Run the simulator against the functions
+sim_test: all simulator
+ $(VALGRIND) ./vsim $(PACKAGE_LIBNAME) FunctionName $(DEBUG_DATA_PATH)/testdata.txt
+
+# build the simulator (in SIMULATOR_PATH) and symlink it here
+simulator:
+ $(MAKE) -C $(SIMULATOR_PATH)
+ ln -f -s $(SIMULATOR_PATH)/vsim
+
103 tagcloud_package/README.txt
@@ -0,0 +1,103 @@
+-------------------------------
+INTRODUCTION
+-------------------------------
+
+This library contains functions to generate a tag cloud for a given key
+word within a provided text corpus. When using Wikipedia as the text
+corpus, it achieves an effect similar to a previous Hadoop implementation:
+http://www.youtube.com/watch?feature=player_detailpage&v=2Iz5V9MrkBg#t=120s
+
+Two types of functions are implemented to achieve the goal:
+
+The first type gathers the relevant words and their relevance scores for
+the key word: use RelevantWords when the text corpus is already loaded
+into Vertica, or RelevantWordsNoLoad when the corpus is an external file.
+
+The second type, GenerateTagCloud, takes the words and their relevance
+scores (the output of the previous functions) and generates the tag
+cloud in HTML.
+
+See examples/example_tag_cloud.html for an example of the visual result
+when 'vertica' is used as the key word and the whole of Wikipedia is used
+as the search corpus (download available at
+http://en.wikipedia.org/wiki/Wikipedia:Database_download)
+
+
+-------------------------------
+BUILDING
+-------------------------------
+
+To build:
+
+$ make
+
+
+-------------------------------
+INSTALLING / UNINSTALLING
+-------------------------------
+
+Assuming vsql is in your path, just do:
+
+$ make install
+$ make uninstall
+
+Alternatively, you can find the DDL that 'make install' uses in:
+ ddl/install.sql
+and
+ ddl/uninstall.sql
+
+-------------------------------
+USAGE
+-------------------------------
+
+RelevantWords('key_word', 'text_columns')
+
+Arguments:
+key_word - the search key word
+text_columns - the varchar column containing the text corpus; there is no
+ restriction on how the column is organized/ordered, since the
+ function treats the input as a stream of incoming words
+
+Output columns:
+weight - the relevance score of the word
+word - the words that the algorithm considers relevant to the key word
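+
+Example (a minimal sketch, assuming the corpus has been loaded into a
+table 'alltext' with a varchar column 'line', as in examples/TagCloud.sql):
+
+ select RelevantWords('vertica', line) over()
+ from alltext order by weight desc limit 20;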
+
+
+
+RelevantWordsNoLoad('key_word', 'corpus_file_name')
+
+Arguments:
+key_word - the search key word
+corpus_file_name - the file name of the text corpus; this function is helpful
+ when the corpus data is not loaded into Vertica
+
+Output columns:
+Same as RelevantWords()
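+
+Example (a minimal sketch; the corpus path below is a placeholder --
+see examples/TagCloud.sql for a runnable version using :textCorpus):
+
+ select RelevantWordsNoLoad('vertica', '/path/to/text_corpus.txt') over()
+ order by weight desc limit 20;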
+
+
+GenerateTagCloud('score', 'word', 'html_file_name')
+
+Arguments:
+score - the relevance score of the word from RelevantWordsNoLoad()
+ or RelevantWords()
+word - the relevant word
+html_file_name - the file name for the generated HTML file
+
+Output columns:
+status - the status of HTML file generation
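+
+Example of the full two-step pipeline, adapted from examples/TagCloud.sql
+(the corpus and output paths below are placeholders):
+
+ create table words(weight float, word varchar);
+ insert into words
+ select RelevantWordsNoLoad('vertica', '/path/to/text_corpus.txt') over()
+ order by weight desc limit 50;
+ select GenerateTagCloud(weight, word, '/tmp/output.html') over() from words;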
+
+-------------------------------
+PERFORMANCE
+-------------------------------
+
+The function is relatively disk-IO heavy. On a laptop, using the whole 33GB
+uncompressed Wikipedia dump as the text corpus, it finishes in about 6-7
+minutes with disk utilization above 90%; as a comparison, simply 'cat'-ing
+the text corpus into /dev/null also takes a little over 6 minutes.
+
+-------------------------------
+LICENSE
+-------------------------------
+
+Please see LICENSE.txt
10 tagcloud_package/ddl/install.sql
@@ -0,0 +1,10 @@
+-- Installation script: define the shared library and the appropriate entry points
+
+select version();
+
+\set libfile '\''`pwd`'/lib/TagCloud.so\'';
+
+CREATE LIBRARY TagCloudLib as :libfile;
+create transform function RelevantWords as language 'C++' name 'RelevantWordsFactory' library TagCloudLib;
+create transform function RelevantWordsNoLoad as language 'C++' name 'RelevantWordsNoLoadFactory' library TagCloudLib;
+create transform function GenerateTagCloud as language 'C++' name 'GenerateTagCloudFactory' library TagCloudLib;
1 tagcloud_package/ddl/uninstall.sql
@@ -0,0 +1 @@
+DROP LIBRARY TagCloudLib CASCADE;
24 tagcloud_package/examples/TagCloud.sql
@@ -0,0 +1,24 @@
+\set textCorpus '\''`pwd`'/examples/text_corpus.txt\''
+\set htmlFile '\''`pwd`'/output.html\''
+
+\echo ***************** Search 'vertica' in the small text_corpus.txt *****************
+select RelevantWordsNoLoad('vertica', :textCorpus) over() order by weight desc limit 20;
+
+
+
+\echo ***************** Load text_corpus.txt into a table first, and then search 'vertica' in it *****************
+create table alltext(line varchar(64000));
+copy alltext(line) from :textCorpus DELIMITER E'\n';
+
+select RelevantWords('vertica', line) over() from alltext order by weight desc limit 20;
+
+drop table alltext cascade;
+
+
+
+\echo ****************************** Generate HTML to show the graphical effect *************************
+\echo ****** This generates output.html in the current directory; use your favorite browser to view it ******
+drop table words cascade;
+create table words(weight float, word varchar);
+insert into words select RelevantWordsNoLoad('vertica', :textCorpus) over() order by weight desc limit 50;
+select GenerateTagCloud(weight, word, :htmlFile) over () from words;
1 tagcloud_package/examples/example_tag_cloud.html
@@ -0,0 +1 @@
+<p class="tag_cloud"></p><span style="font-size: 10px; color: green">xconomy</span><span style="font-size: 13px; color: orange">world</span><span style="font-size: 60px; color: red">database</span><span style="font-size: 10px; color: green">zacks</span><span style="font-size: 13px; color: blue">xml</span><span style="font-size: 12px; color: green">computerworld</span><span style="font-size: 15px; color: orange">minutes</span><span style="font-size: 10px; color: red">preserve</span><span style="font-size: 13px; color: red">business</span><span style="font-size: 17px; color: red">date</span><p class="tag_cloud"></p><span style="font-size: 11px; color: red">space</span><span style="font-size: 10px; color: blue">oriented</span><span style="font-size: 10px; color: red">based</span><span style="font-size: 10px; color: blue">approach</span><span style="font-size: 14px; color: orange">press</span><span style="font-size: 11px; color: black">content</span><span style="font-size: 10px; color: red">butterfly</span><span style="font-size: 15px; color: orange">update</span><span style="font-size: 13px; color: green">oracle</span><span style="font-size: 10px; color: blue">emc</span><p class="tag_cloud"></p><span style="font-size: 16px; color: red">hewlett</span><span style="font-size: 10px; color: black">newswire</span><span style="font-size: 14px; color: green">billerica</span><span style="font-size: 24px; color: blue">data</span><span style="font-size: 17px; color: black">revision</span><span style="font-size: 11px; color: red">uploads</span><span style="font-size: 18px; color: blue">source</span><span style="font-size: 13px; color: orange">infobox</span><span style="font-size: 10px; color: orange">kanaracus</span><span style="font-size: 16px; color: red">software</span><p class="tag_cloud"></p><span style="font-size: 10px; color: blue">logo</span><span style="font-size: 12px; color: red">acopia</span><span style="font-size: 10px; color: green">slaughters</span><span style="font-size: 20px; color: green">boston</span><span style="font-size: 10px; color: red">hpinfo</span><span style="font-size: 31px; color: blue">news</span><span style="font-size: 11px; color: orange">timestamp</span><span style="font-size: 10px; color: green">times</span><span style="font-size: 10px; color: orange">example</span><span style="font-size: 14px; color: black">height</span><p class="tag_cloud"></p><span style="font-size: 11px; color: black">appliance</span><span style="font-size: 11px; color: black">class</span><span style="font-size: 17px; color: green">lynch</span><span style="font-size: 27px; color: red">text</span><span style="font-size: 11px; color: green">feb</span><span style="font-size: 16px; color: green">products</span><span style="font-size: 10px; color: black">record</span><span style="font-size: 17px; color: black">packard</span><span style="font-size: 15px; color: black">plans</span><span style="font-size: 16px; color: orange">specific</span><p class="tag_cloud"></p><span style="font-size: 14px; color: red">integration</span><span style="font-size: 24px; color: black">buy</span><span style="font-size: 10px; color: black">won</span><span style="font-size: 18px; color: orange">publisher</span><span style="font-size: 15px; color: blue">use</span><span style="font-size: 10px; color: blue">homepage</span><span style="font-size: 11px; color: black">growth</span><span style="font-size: 16px; color: orange">pdf</span><span style="font-size: 25px; color: black">sybase</span><span style="font-size: 10px; color: black">technical</span><p class="tag_cloud"></p><span style="font-size: 11px; color: black">greenplum</span><span style="font-size: 10px; color: black">expands</span><span style="font-size: 16px; color: red">article</span><span style="font-size: 12px; color: black">dailymarkets</span><span style="font-size: 14px; color: black">shatter</span><span style="font-size: 11px; color: red">official</span><span style="font-size: 11px; color: black">syntax</span><span style="font-size: 24px; color: green">web</span><span style="font-size: 59px; color: red">systems</span><span style="font-size: 14px; color: black">syncsort</span><p class="tag_cloud"></p><span style="font-size: 30px; color: blue">analytics</span><span style="font-size: 11px; color: black">management</span><span style="font-size: 14px; color: red">names</span><span style="font-size: 23px; color: blue">inc</span><span style="font-size: 10px; color: red">march</span><span style="font-size: 10px; color: black">daily</span><span style="font-size: 20px; color: black">dbms</span><span style="font-size: 14px; color: blue">february</span><span style="font-size: 13px; color: black">bladesystem</span><span style="font-size: 15px; color: green">appoints</span><p class="tag_cloud"></p><span style="font-size: 41px; color: black">analytic</span><span style="font-size: 33px; color: blue">acquisition</span><span style="font-size: 16px; color: green">acquire</span><span style="font-size: 14px; color: orange">etl</span><span style="font-size: 10px; color: black">completed</span><span style="font-size: 10px; color: orange">cloud</span><span style="font-size: 15px; color: orange">get</span><span style="font-size: 11px; color: blue">christopher</span><span style="font-size: 17px; color: orange">hadoop</span><span style="font-size: 10px; color: blue">changes</span><p class="tag_cloud"></p><span style="font-size: 10px; color: black">enterprise</span><span style="font-size: 11px; color: blue">investors</span><span style="font-size: 10px; color: black">new</span><span style="font-size: 11px; color: black">extend</span><span style="font-size: 22px; color: black">ceo</span><span style="font-size: 12px; color: black">journal</span><span style="font-size: 10px; color: orange">monash</span><span style="font-size: 25px; color: orange">company</span><span style="font-size: 12px; color: orange">contributor</span><span style="font-size: 11px; color: red">completes</span>
17 tagcloud_package/examples/text_corpus.txt
@@ -0,0 +1,17 @@
+[[David DeWitt]] and [[Michael Stonebraker]], experts in [[parallel database]]s and [[shared-nothing architecture]]s, have been critical of the breadth of problems that MapReduce can be used for.&lt;ref name=&quot;shark&quot;&gt;{{cite web| url=http://typicalprogrammer.com/?p=16| title=Database Experts Jump the MapReduce Shark}}&lt;/ref&gt; They called its interface too low-level and questioned whether it really represents the [[paradigm shift]] its proponents have claimed it is.&lt;ref name=&quot;ddandms1&quot;&gt;{{cite web| url=http://databasecolumn.vertica.com/database-innovation/mapreduce-a-major-step-backwards/| title=MapReduce: A major step backwards| author=[[David DeWitt]]| coauthors=[[Michael Stonebraker]]| publisher=databasecolumn.com| accessdate=2008-08-27}}&lt;/ref&gt; They challenged the MapReduce proponents' claims of novelty, citing [[Teradata]] as an example of [[prior art]] that has existed for over two decades. They also compared MapReduce programmers to [[CODASYL|Codasyl]] programmers, noting both are &quot;writing in a [[Low-level programming language|low-level language]] performing low-level record manipulation.&quot;&lt;ref name=&quot;ddandms1&quot;/&gt; MapReduce's use of input files and lack of [[Logical schema|schema]] support prevents the performance improvements enabled by common database system features such as [[B-tree]]s and [[Partition (database)|hash partitioning]], though projects such as [[Pig (programming language)|Pig (or PigLatin)]], [[Sawzall (programming language)|Sawzall]], [[Apache Hive]]&lt;ref name=&quot;ApacheHiveWiki&quot;&gt;{{cite web| url=https://cwiki.apache.org/confluence/display/Hive/Home| title=Apache Hive - Index of - Apache Software Foundation}}&lt;/ref&gt;, [[HBase]]&lt;ref name=&quot;HBase&quot;&gt;{{cite web| url=http://hbase.apache.org/| title=HBase - HBase Home - Apache Software Foundation}}&lt;/ref&gt; and [[BigTable]]&lt;ref name=&quot;HBase&quot;/&gt;&lt;ref name=&quot;BigTablePaper&quot;&gt;{{cite web| url=http://static.googleusercontent.com/external_content/untrusted_dlcp/labs.google.com/en/us/papers/bigtable-osdi06.pdf| title=Bigtable: A Distributed Storage System for Structured Data| format=PDF}}&lt;/ref&gt; are addressing some of these problems.{{Citation needed|date=December 2010}}
+[[Image:Go-inkscape.png|thumb|303px|...and done using [[Inkscape]]]]Wow, I was an utter fool to ever have used paint...from now on I'll take the trouble to download gimp at school; I thought gimp was unnecessary for simple tasks of creating geometric images, but apparently I was wrong...I didn't realise there was a built-in protractor. Anyway thanks, Chuck, your explanation of the formula helped me confirm the coordinates, although paint, for vertica/y-axis parts of coordinates, &quot;up&quot; means a lesser value, and &quot;down&quot; means a higher one, so it was actually (51, 289) and (353,289)....as for &quot;ancient technology&quot;, hey this method is part of the [[Euclid]]ean era! ;-) Thanks all! I'm going to save this explanation in my notepad as a wonderful example of how trigonometry affects graphical design. :D Onwards! (Oh, does anyone feel up to the task of helping me antialiase the hundreds of images (or going to be used for Xiangqi) in [[Template:xiangqi-position]] and [[Template:Game of Go Position]]?) -- [[User:Natalinasmpf|Natalinasmpf]] 20:14, 19 July 2005 (UTC)
+On November 14, 2008, DMExpress set a world record for ETL performance by extracting, transforming, cleansing, and loading 5.4 TB of data into a [[Vertica]] Analytic Database on a c-Class [[HP]] BladeSystem in 57 minutes &lt;ref&gt;[http://www.vertica.com/_pdf/ETL-World-Record-Audit-Report.pdf ETL Database Load Benchmark: Full Disclosure Report (November 14, 2008)]&lt;/ref&gt;&lt;ref&gt;[http://www.betanews.com/newswire/pr/Syncsort_and_Vertica_Shatter_Database_ETL_World_Record_Using_HP_BladeSystem_cClass/153209 BetaNews Newswire (December 2, 2008)]&lt;/ref&gt;. [[Microsoft]] and [[Unisys]] set the previous ETL world record at 2.36 TB/hr in early 2008 &lt;ref&gt;[http://blogs.msdn.com/sqlperf/archive/2008/02/27/etl-world-record.aspx SQL Server Performance: ETL World Record! (February 27, 2008) ]&lt;/ref&gt;.
+|Source=http://www.vertica.com/wp-content/uploads/2011/03/Vertica-HP.png
+HP Software is the [[Enterprise software]] division of information technology company [[Hewlett-Packard]] (HP). From September 2005 through 2010, HP purchased a total of 15 software companies.&lt;ref&gt;[http://www.eweek.com/c/a/IT-Infrastructure/HP-Is-Serious-About-Software-25-Reasons-Why-585952/ eWeek.com: “HP Is Serious About Software: 25 Reasons Why” Taft. Dec. 2010]&lt;/ref&gt; According to Software Magazine, HP is the 3rd largest software company in the world in total software revenue, behind [[IBM ]] and [[Microsoft]] as the first and second largest, respectively.&lt;ref&gt;Software Magazine, 29th Annual Software 500 Ranking. September 27, 2011 [http://online.qmags.com/SWM0911]&lt;/ref&gt; In May 2010, HP announced that Bill Veghte would serve as the executive vice president of HP Software.&lt;ref&gt;[http://news.cnet.com/8301-13860_3-20004164-56.html CNET News: “Former Microsoft exec Veghte headed to HP.” Ina Fried May 5, 2010.]&lt;/ref&gt; Veghte formerly led Microsoft's Windows business. HP continued to acquire software and technology businesses in 2010.&lt;ref&gt;[http://www.ft.com/cms/s/2/e7ace394-bec1-11df-a755-00144feab49a.htm Financial Times:“HP expands in security with ArcSight deal.” Joseph Menn. Sept. 13, 2010.]&lt;/ref&gt;&lt;ref&gt;[http://www.crn.com/news/security/227400207/hp-acquisition-train-keeps-rolling-with-arcsight.htm CRN: “HP Acquisition Train Keeps Rolling With ArcSight.” Hickey. Sept. 13, 2010.]&lt;/ref&gt; In August 2010, HP announced it would acquire [[Fortify Software]], a software security assurance company, specializing in static application security analysis.&lt;ref&gt;[http://www.seattletimes.nwsource.com/.../2012968476_apushewlettpackardfortifysoftware.html Seattle Times: &quot;HP completes deal for Fortify Software.&quot; Sept. 22, 2010.]&lt;/ref&gt;&lt;ref&gt;name=Yahoo.com&gt;[http://finance.yahoo.com/news/HP-to-Acquire-Fortify-bw-3566564602.html?x=0&amp;.v=1 HP to Acquire Fortify Software, Helping Clients Proactively Reduce Business Risks of Insecure Software] {{en icon}} , ''[[Yahoo.com]]'', August 17, 2010&lt;/ref&gt; Also in August 2010, HP announced the acquisition of Denver-based [[Stratavia]], a privately held database and application automation company for [[cloud computing]].&lt;ref&gt;[http://www.mercurynews.com/bay-area-news/ci_15901967?source=rss Mercury News. HP buys Denver software maker Stratavia to simplify 'cloud computing.' August 2010.]&lt;/ref&gt; In mid-September 2010, HP announced it had signed a definitive agreement to acquire [[ArcSight]] (Nasdaq: ARST), a security and compliance software provider, for $43.50 per share, or approximately $1.5 billion.&lt;ref&gt;[http://www.thestreet.com/story/10858380/2/hp-to-acquire-arcsight.html Thestreet.com: “HP To Acquire ArcSight.” Sept. 13, 2010]&lt;/ref&gt; On October 22, 2010, HP announced it had completed its acquisition of ArcSight.&lt;ref&gt;[http://www.mercurynews.com/breaking-news/ci_16406958?nclick_check=1 San Jose Mercury News: “Hewlett-Packard completes $1.5B ArcSight acquisition.” Russell. October 2010]&lt;/ref&gt; The acquisitions of Fortify, Arcsight and TippingPoint are now being integrated into HP's IT security software portfolio.&lt;ref&gt;[http://www.v3.co.uk/v3/news/2274840/rsa-hp-risk-management RSA: “HP calls for new approach to risk analysis.” Nichols. Feb. 2011.]&lt;/ref&gt; In Feb. 2011, HP announced it would acquire, real-time analytics platform company [[Vertica]], a privately-held firm based in Billerica, Mass.&lt;ref&gt;[http://www.dailymarkets.com/stock/2011/02/15/hewlett-packard-to-get-vertica/ DailyMarkets.com: “Hewlett-Packard To Get Vertica.” Zacks Investment Research. February 15, 2011.]&lt;/ref&gt; On March 22, 2011, HP completed its acquisition of Vertica to expands HP’s information optimization, business intelligence and analytics portfolio for large enterprise companies and the public sector.&lt;ref&gt;[http://www.computerworld.com/s/article/9209327/Update_HP_to_buy_Vertica_for_analytics?source=rss_news ComputerWorld.com: “Update: HP to buy Vertica for analytics.” Kanaracus. Feb. 2011.]&lt;/ref&gt;
+&lt;div class=&quot;boilerplate&quot; style=&quot;margin:0.5em auto;width:80%;background-color:#f7f8ff;border:2px solid #8888aa; padding:4px;font-size:85%;min-height:64px;vertica</comment>
+The data sorting and transformation capabilities described by Varsegi for mainframe systems are provided as well in DMExpress, Syncsort's [[data integration]] product for UNIX, Windows, and Linux. DMExpress is typically used for [[Extract, transform, load|ETL]], [[data warehousing]], and [[business intelligence]] applications.&lt;ref&gt;[http://www.b-eye-network.com/listen/5846 Audio Interview with Syncsort's Rich Pilkington]&lt;/ref&gt; The program is designed to transform and consolidate data from multiple sources. On November 14, 2008, DMExpress set a world record for ETL performance by extracting, transforming, cleansing, and loading 5.4 TB of data into a [[Vertica]] Analytic Database on a c-Class [[HP]] BladeSystem in 57 minutes.&lt;ref&gt;[http://www.vertica.com/_pdf/ETL-World-Record-Audit-Report.pdf ETL Database Load Benchmark: Full Disclosure Report (November 14, 2008)]&lt;/ref&gt;&lt;ref&gt;[http://www.betanews.com/newswire/pr/Syncsort_and_Vertica_Shatter_Database_ETL_World_Record_Using_HP_BladeSystem_cClass/153209 BetaNews Newswire (December 2, 2008)]&lt;/ref&gt; [[Microsoft]] and [[Unisys]] set the previous ETL world record at 2.36 TB/hr in early 2008.&lt;ref&gt;[http://blogs.msdn.com/sqlperf/archive/2008/02/27/etl-world-record.aspx SQL Server Performance: ETL World Record! (February 27, 2008) ]&lt;/ref&gt; System administrators and analysts often use DMExpress to pre-process data to speed database loads, to create and maintain aggregate data stores&lt;ref&gt;[[Data store network]]&lt;/ref&gt; from flat files, to optimize reporting,&lt;ref&gt;[http://products.databasejournal.com/dbtools/mgmt/1099337641.html Database Journal Product Guide]&lt;/ref&gt; and for [[changed data capture]] (CDC) applications.&lt;ref&gt;[http://research.pcpro.co.uk/detail/RES/1208450325_439.html PC Pro Research Paper: Enterprise Data Integration Essentials]&lt;/ref&gt; Data warehouse expert, Dr. [[Ralph Kimball]] in the first edition of his popular book, The Data Warehouse Toolkit, explained how data management and sorting products like Syncsort’s can be valuable for Database Management Systems ([[DBMS]]s):
+&lt;div class=&quot;boilerplate&quot; style=&quot;margin:0.5em auto; width:80%; clear:both; background-color:#f7f8ff; border:2px solid #8888aa; padding:4px; font-size:85%; min-height:64px; vertica</comment>
+ homepage = [http://www.vertica.com/ www.vertica.com]
+'''Vertica Systems''' is an [[Analytics|analytic]] [[Database management system|database management]] software company.&lt;ref&gt;''Network World'' staff: &quot;New database company raises funds, nabs ex-Oracle bigwigs”, [http://www.linuxworld.com/news/2007/021407-vertica-oracle.html] ''LinuxWorld'', February 14, 2007&lt;/ref&gt;&lt;ref&gt; Brodkin, J: &quot;10 enterprise software companies to watch&quot;, [http://www.networkworld.com/news/2007/041107-enterprise-software-companies-to-watch.html?page=9] ''Network World'', April 11, 2007&lt;/ref&gt; Vertica was founded in 2005 by database researcher [[Michael Stonebraker]], and Andrew Palmer; its President and CEO is [[Christopher P. Lynch]]. HP announced it would acquire the company in February 2011.&lt;ref&gt;[http://www.hp.com/hpinfo/newsroom/press/2011/110214xb.html HP News Release: “HP to Acquire Vertica: Customers Can Analyze Massive Amounts of Big Data at Speed and Scale” Feb. 2011]&lt;/ref&gt; On March 22, 2011, HP completed its acquisition of Vertica.&lt;ref&gt;
+The Vertica Analytic Database runs on [[Grid computing|grids]] of [[Linux]]-based [[Commodity computing|commodity servers]]. It is also available as a hosted DBMS provisioned by and running on the [[Amazon ec2|Amazon Elastic Compute Cloud]]. It has integration with [[Hadoop]].&lt;ref&gt;{{cite web |url=http://www.dbms2.com/2010/10/12/vertica-hadoop-connector-integration/ |title=Vertica-Hadoop integration |date=October 12, 2010 |work=DBMS2}}&lt;/ref&gt;
+In January 2010, it was reported that Vertica won the claims construction hearing,&lt;ref&gt;Monash, C: &quot;Vertica slaughters Sybase in patent litigation”,[http://www.dbms2.com/2010/01/15/vertica-sybase-ipatent-litigation/]''DBMS2'', January 14, 2010&lt;/ref&gt; successfully defending itself from a January 2008 patent-infringement lawsuit filed by [[Sybase]].&lt;ref&gt;{{cite court |litigants = Sybase, Inc. v. Vertica Systems, Inc. |court = Texas Eastern District Court |date = January 30, 2008 |url= http://dockets.justia.com/docket/court-txedce/case_no-6:2008cv00024/case_id-107871/}}&lt;/ref&gt;
+* [http://www.vertica.com Official website]
+ <comment>+vertica;</comment>
+|url=http://www.bizjournals.com/boston/print-edition/2011/04/08/billericas-vertica-plans-growth-path.html
+|url=http://www.xconomy.com/boston/2011/03/28/vertica-ceo-chris-lynch-talks-hp-acquisition-fires-back-at-netezza-ibm-in-%E2%80%9Cbig-data%E2%80%9D-battle/
+|url=http://www.vertica.com/news/press/vertica-appoints-christopher-lynch-new-president-and-ceo/
364 tagcloud_package/src/TagCloud.cpp
@@ -0,0 +1,364 @@
+/*
+Portions of this software Copyright (c) 2011 by Vertica, an HP
+Company. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+- Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <string>
+#include <map>
+#include <list>
+#include <vector>
+#include <set>
+#include <sstream>
+#include <fstream>
+#include <algorithm>
+#include <queue>
+#include <ctype.h>
+#include <limits>
+#include <stdlib.h>
+
+#include "Vertica.h"
+
+using namespace Vertica;
+
+const size_t MaxWordLen = 25;
+const size_t WordRange = 10;
+const size_t MaxStringLen = 64000;
+
+class WordFreqCalc
+{
+public:
+ WordFreqCalc(const std::string &target);
+ void updateWordFreq(const std::string &line);
+ const std::map<std::string, double> &getWordFreq() { return wordFreq; }
+
+private:
+ // function words (stop words) such as 'of', 'and', 'a', 'an', etc. should not be considered
+ std::set<std::string> funcWords;
+
+ // weighted average of frequency a word appears close to the target word
+ std::map<std::string, double> wordFreq;
+
+ const std::string &target;
+};
+
+WordFreqCalc::WordFreqCalc(const std::string &target)
+: target(target)
+{
+ std::istringstream ss("quot lt gt ref amp apos http www com html htm org url name title index domain link comment diff prev otherlinks page file first last user jpg cite php oldid there also be is was are were able to not can could dare have has had may might must need ought shall should will would a all an another any both each either every her his its my neither no other our per some that the their these this those whatever whichever your accordingly after albeit although and as because before both but consequently either for hence however if neither nevertheless nor once or since so than that then thence therefore tho' though thus till unless until when whenever where whereas wherever whether while whilst yet all another any anybody anyone anything both each either everybody everyone everything few he her hers herself him himself his it its itself many me mine myself neither no_one nobody none nothing one other ours ourselves several she some somebody someone something such that theirs them themselves these they this those us we what whatever which whichever who whoever whom whomever whose you yours yourself yourselves all another any both certain each either enough few fewer less little loads lots many more most much neither no none part several some various aboard about above absent according across after against ahead along alongside amid amidst among amongst anti around as aside astraddle astride at bar barring before behind below beneath beside besides between beyond but by circa concerning considering despite due during except excepting excluding failing following for from given in including inside into less like minus near near next notwithstanding of off on onto opposite out outside over past pending per pertaining regarding respecting round save saving since than through throughout thru till toward towards under underneath unlike until unto upon versus via with within without");
+ while (ss) {
+ std::string buf;
+ ss >> buf;
+ funcWords.insert(buf);
+ }
+}
+
+void WordFreqCalc::updateWordFreq(const std::string &line)
+{
+ std::list<std::string> prevWords;
+ bool afterTarget = false; // whether we've seen target within WordRange
+ size_t posAfterTarget = 0;
+
+ // skip if the string doesn't have the target word
+ if (line.find(target) == std::string::npos)
+ return;
+
+ // transform into lower case, and ignore all non-letter characters
+ std::string newline = line;
+ for (size_t i = 0; i < newline.size(); ++i) {
+ if (::isalpha(newline[i]))
+ newline[i] = ::tolower(newline[i]);
+ else
+ newline[i] = ' ';
+ }
+
+ std::istringstream ss(newline);
+ while (ss) {
+ std::string word;
+ ss >> word;
+
+ // ignore too long or too short words
+ if (word.size() > MaxWordLen || word.size() <= 2)
+ continue;
+
+ // skip function words
+ if (funcWords.count(word) > 0)
+ continue;
+
+ // found the target word
+ if (word == target) {
+ afterTarget = true;
+ posAfterTarget = 0;
+
+ // update the frequency of each preceding word, weighted by 1/distance from the target
+ size_t distance = 1;
+ std::list<std::string>::const_reverse_iterator rit;
+ for (rit = prevWords.rbegin(); rit != prevWords.rend(); ++rit) {
+ wordFreq[*rit] += 1/(double)distance;
+ ++distance;
+ }
+
+ prevWords.clear();
+ continue;
+ }
+
+ // keep track of this word, remembering at most the last WordRange words
+ prevWords.push_back(word);
+ while (prevWords.size() > WordRange)
+ prevWords.pop_front();
+
+ // for words closely following the target word, update their frequencies as well
+ if (afterTarget) {
+ ++posAfterTarget;
+ wordFreq[word] += 1/(double)posAfterTarget;
+ if (posAfterTarget >= WordRange)
+ afterTarget = false;
+ }
+ }
+}
+
+class RelevantWords : public TransformFunction
+{
+ virtual void processPartition(ServerInterface &srvInterface, PartitionReader &input_reader, PartitionWriter &output_writer)
+ {
+ const VString &arg0 = input_reader.getStringRef(0);
+ const std::string &target = arg0.str();
+
+ WordFreqCalc wordFreqCalc(target);
+
+ // compute the relevant words and their weights/frequencies
+ do {
+ const VString &line = input_reader.getStringRef(1);
+ if (line.isNull()) continue;
+ wordFreqCalc.updateWordFreq(line.str());
+ } while (input_reader.next());
+
+ // generate output from the computed map
+ const std::map<std::string, double> &wordFreq = wordFreqCalc.getWordFreq();
+ std::map<std::string, double>::const_iterator it;
+ for (it = wordFreq.begin(); it != wordFreq.end(); ++it) {
+ output_writer.setFloat(0, it->second);
+ VString &word = output_writer.getStringRef(1);
+ word.copy(it->first);
+ output_writer.next();
+ }
+ }
+};
+
+class RelevantWordsFactory : public TransformFunctionFactory
+{
+ virtual TransformFunction *createTransformFunction(ServerInterface &srvInterface)
+ { return vt_createFuncObj(srvInterface.allocator, RelevantWords); }
+
+ virtual void getReturnType(ServerInterface &srvInterface, const SizedColumnTypes &input_types, SizedColumnTypes &output_types)
+ {
+ output_types.addFloat("weight");
+ output_types.addVarchar(MaxWordLen, "word");
+ }
+
+ virtual void getPrototype(ServerInterface &srvInterface, ColumnTypes &argTypes, ColumnTypes &returnType)
+ {
+ argTypes.addVarchar(); // the key word
+ argTypes.addVarchar(); // the column containing text corpus
+
+ returnType.addFloat();
+ returnType.addVarchar();
+ }
+
+};
+
+RegisterFactory(RelevantWordsFactory);
+
+
+class RelevantWordsNoLoad : public TransformFunction
+{
+ virtual void processPartition(ServerInterface &srvInterface, PartitionReader &input_reader, PartitionWriter &output_writer)
+ {
+ const VString &arg0 = input_reader.getStringRef(0);
+ const std::string &target = arg0.str();
+
+ const VString &arg1 = input_reader.getStringRef(1);
+ const std::string &filename = arg1.str();
+ std::ifstream infile(filename.c_str(), std::ios::in);
+ if (!infile.good())
+ vt_report_error(0, "Could not open file %s", filename.c_str());
+
+ WordFreqCalc wordFreqCalc(target);
+
+ const size_t BLK_SIZE_BYTE = 64*1024; // 64k
+ char buf[BLK_SIZE_BYTE];
+ while (infile.good()) {
+ // read() does not null-terminate and the final block may be short,
+ // so build the string from the actual byte count reported by gcount()
+ // (note: a word spanning a block boundary will be split in two)
+ infile.read(buf, BLK_SIZE_BYTE);
+ std::streamsize nread = infile.gcount();
+ if (nread > 0)
+ wordFreqCalc.updateWordFreq(std::string(buf, nread));
+ }
+
+ // generate output from the computed map
+ const std::map<std::string, double> &wordFreq = wordFreqCalc.getWordFreq();
+ std::map<std::string, double>::const_iterator it;
+ for (it = wordFreq.begin(); it != wordFreq.end(); ++it) {
+ output_writer.setFloat(0, it->second);
+ VString &word = output_writer.getStringRef(1);
+ word.copy(it->first);
+ output_writer.next();
+ }
+ }
+};
+
+class RelevantWordsNoLoadFactory : public TransformFunctionFactory
+{
+ virtual TransformFunction *createTransformFunction(ServerInterface &srvInterface)
+ { return vt_createFuncObj(srvInterface.allocator, RelevantWordsNoLoad); }
+
+ virtual void getReturnType(ServerInterface &srvInterface, const SizedColumnTypes &input_types, SizedColumnTypes &output_types)
+ {
+ output_types.addFloat("weight");
+ output_types.addVarchar(MaxWordLen, "word");
+ }
+
+ virtual void getPrototype(ServerInterface &srvInterface, ColumnTypes &argTypes, ColumnTypes &returnType)
+ {
+ argTypes.addVarchar(); // the word
+ argTypes.addVarchar(); // file name of the text corpus
+
+ returnType.addFloat();
+ returnType.addVarchar();
+ }
+
+};
+
+RegisterFactory(RelevantWordsNoLoadFactory);
+
+
+struct RenderWord
+{
+ RenderWord(const std::string &word, int fontsize, const std::string &color)
+ : word(word), fontsize(fontsize), color(color)
+ { }
+
+ std::string word;
+ int fontsize;
+ std::string color;
+};
+
+int getFontSize(double w_max, double w_min, double w)
+{
+ const int font_max = 50;
+ const int font_min = 10;
+ // guard against division by zero when all weights are equal
+ if (w_max == w_min) return font_min;
+ // linearly map the weight range onto [font_min, font_max]
+ return (font_max - font_min) * (w - w_min) / (w_max - w_min) + font_min;
+}
+
+// shuffle helper: a comparator that returns random results does not define
+// a strict weak ordering, so std::list::sort must not be used to shuffle
+void shuffleWords(std::list<RenderWord> &words)
+{
+ std::vector<RenderWord> tmp(words.begin(), words.end());
+ std::random_shuffle(tmp.begin(), tmp.end());
+ words.assign(tmp.begin(), tmp.end());
+}
+
+class GenerateTagCloud : public TransformFunction
+{
+ virtual void processPartition(ServerInterface &srvInterface, PartitionReader &input_reader, PartitionWriter &output_writer)
+ {
+ const VString &arg2 = input_reader.getStringRef(2);
+ const std::string &filename = arg2.str();
+ std::ofstream outfile(filename.c_str(), std::ios::out | std::ios::trunc);
+ if (!outfile.good())
+ vt_report_error(0, "Could not open file %s for output", filename.c_str());
+
+ std::map<std::string, double> wordFreq;
+
+ // populate the word frequency map, and track the min/max weight needed to compute font sizes later
+ double w_min = std::numeric_limits<double>::max();
+ double w_max = -std::numeric_limits<double>::max(); // note: min() is the smallest positive double, not the most negative
+ do {
+ double weight = input_reader.getFloatRef(0);
+ const VString &word = input_reader.getStringRef(1);
+ if (word.isNull()) continue;
+ wordFreq[word.str()] = weight;
+ w_min = std::min(w_min, weight);
+ w_max = std::max(w_max, weight);
+ } while (input_reader.next());
+
+ // some predefined colors used in the output HTML
+ std::vector<std::string> colors;
+ colors.push_back("red");
+ colors.push_back("blue");
+ colors.push_back("orange");
+ colors.push_back("green");
+ colors.push_back("black");
+
+ // randomly pick a color for each word, and assign its font size according to its weight
+ std::list<RenderWord> renderList;
+ std::map<std::string, double>::const_iterator it;
+ for (it = wordFreq.begin(); it != wordFreq.end(); ++it) {
+ int fz = getFontSize(w_max, w_min, it->second);
+ const std::string &color = colors[rand() % colors.size()];
+ renderList.push_back(RenderWord(it->first, fz, color));
+ }
+ // shuffle the positions of the words
+ shuffleWords(renderList);
+
+ // generate output
+ const size_t NumWordsPerLine = 10;
+ size_t nword = 0;
+ std::list<RenderWord>::const_iterator iter;
+ std::ostringstream oss;
+ for (iter = renderList.begin(); iter != renderList.end(); ++iter) {
+ // get a new line
+ if (nword % NumWordsPerLine == 0) oss << "<p class=\"tag_cloud\"></p>";
+
+ oss << "<span style=\"font-size: " << iter->fontsize << "px; color: "
+ << iter->color << "\">"
+ << iter->word << "</span>";
+ ++nword;
+ }
+
+ // write the output to file
+ outfile << oss.str();
+ VString &word = output_writer.getStringRef(0);
+ word.copy("HTML file generated!");
+ output_writer.next();
+ }
+};
+
+class GenerateTagCloudFactory : public TransformFunctionFactory
+{
+ virtual TransformFunction *createTransformFunction(ServerInterface &srvInterface)
+ { return vt_createFuncObj(srvInterface.allocator, GenerateTagCloud); }
+
+ virtual void getReturnType(ServerInterface &srvInterface, const SizedColumnTypes &input_types, SizedColumnTypes &output_types)
+ {
+ output_types.addVarchar(MaxStringLen, "HTML generate status");
+ }
+
+ virtual void getPrototype(ServerInterface &srvInterface, ColumnTypes &argTypes, ColumnTypes &returnType)
+ {
+ argTypes.addFloat(); // weight of the word
+ argTypes.addVarchar(); // the word
+ argTypes.addVarchar(); // filename of the generated .html file
+
+ returnType.addVarchar(); // return the status
+ }
+
+};
+
+RegisterFactory(GenerateTagCloudFactory);
6 tagcloud_package/src/third-party/makefile
@@ -0,0 +1,6 @@
+##########################
+# This makefile contains commands to build whatever third-party libraries your
+# functions require. See web_package/src/third-party/makefile for an example.
+##########################
+
+all:
