Importing fawnkv/fawnds code to new git repository for public release.

commit 396d0db2b9988e4d753dabef2188650c9ebd4eb5 (0 parents)
Author: @vrv
Showing with 14,691 additions and 0 deletions.
  1. +54 −0 .gitignore
  2. +25 −0 AUTHORS
  3. +15 −0 COPYING
  4. 0  ChangeLog
  5. +24 −0 INSTALL
  6. +50 −0 INSTALL.MACOSX
  7. +10 −0 Makefile.am
  8. 0  NEWS
  9. +15 −0 NOTICE
  10. +34 −0 README
  11. +63 −0 STYLE
  12. +56 −0 acinclude.m4
  13. +99 −0 configure.ac
  14. +18 −0 fawnds/Makefile.am
  15. +13 −0 fawnds/README
  16. +61 −0 fawnds/db_structures.h
  17. +1,040 −0 fawnds/fawnds.cc
  18. +168 −0 fawnds/fawnds.h
  19. +361 −0 fawnds/fawnds_bench.cc
  20. +176 −0 fawnds/fawnds_flash.cc
  21. +38 −0 fawnds/fawnds_flash.h
  22. +321 −0 fawnds/fawnds_test.full
  23. +29 −0 fawnds/fawndsmesg.thrift
  24. +43 −0 fawnds/hash_functions.cc
  25. +30 −0 fawnds/hash_functions.h
  26. +8 −0 fawnds/runfulltest.sh
  27. +4 −0 fawnds/runtest.sh
  28. +51 −0 fawnds/testHash.cc
  29. 0  fawnkv/CODE
  30. +1,222 −0 fawnkv/FawnKVBackendHandler.cpp
  31. +448 −0 fawnkv/FawnKVFrontendHandler.cpp
  32. +262 −0 fawnkv/FawnKVServerHandler.cpp
  33. +78 −0 fawnkv/FawnKVServerHandler.h
  34. +108 −0 fawnkv/Makefile.am
  35. +1,440 −0 fawnkv/Manager.cpp
  36. +140 −0 fawnkv/Manager.h
  37. +21 −0 fawnkv/README
  38. +227 −0 fawnkv/cache.cpp
  39. +207 −0 fawnkv/cache.h
  40. +15 −0 fawnkv/fawnkv.thrift
  41. +84 −0 fawnkv/fawnkvmesg.thrift
  42. +236 −0 fawnkv/fe.cpp
  43. +80 −0 fawnkv/fe.h
  44. +89 −0 fawnkv/fe_cache.cpp
  45. +60 −0 fawnkv/fe_cache.h
  46. +179 −0 fawnkv/node.cpp
  47. +64 −0 fawnkv/node.h
  48. +783 −0 fawnkv/node_mgr.cpp
  49. +281 −0 fawnkv/node_mgr.h
  50. +346 −0 fawnkv/node_mgr_db.cpp
  51. +171 −0 fawnkv/nodehandle.cpp
  52. +46 −0 fawnkv/nodehandle.h
  53. +276 −0 fawnkv/ring.cpp
  54. +57 −0 fawnkv/ring.h
  55. +22 −0 fawnkv/virtualnode.cpp
  56. +23 −0 fawnkv/virtualnode.h
  57. +5 −0 lib/Makefile.am
  58. +34 −0 lib/cpp/Makefile.am
  59. +162 −0 lib/cpp/TFawnKV.cpp
  60. +54 −0 lib/cpp/TFawnKV.h
  61. +163 −0 lib/cpp/TFawnKVRemote.cpp
  62. +65 −0 lib/cpp/TFawnKVRemote.h
  63. +64 −0 lib/cpp/tester.cpp
  64. +59 −0 lib/cpp/tester_remote.cpp
  65. +22 −0 lib/java/FawnKVTester
  66. +12 −0 lib/java/Makefile.am
  67. +17 −0 lib/java/README
  68. +65 −0 lib/java/build.xml
  69. BIN  lib/java/libthrift.jar
  70. BIN  lib/java/slf4j-api-1.5.8.jar
  71. BIN  lib/java/slf4j-simple-1.5.8.jar
  72. +197 −0 lib/java/src/FawnKVClt.java
  73. +41 −0 lib/java/src/Tester.java
  74. +17 −0 lib/rb/Makefile.am
  75. +110 −0 lib/rb/TFawnKV.rb
  76. +43 −0 lib/rb/tester.rb
  77. +1 −0  m4/README
  78. +107 −0 m4/ax_javac_and_java.m4
  79. +23 −0 patches/fawn-thrift.patch
  80. +1 −0  test/Makefile.am
  81. +13 −0 test/fawnds/Makefile.am
  82. +342 −0 test/fawnds/fawnds_test.cc
  83. +29 −0 test/management/Makefile.am
  84. +485 −0 test/management/lattester.cpp
  85. +713 −0 test/management/ringtester.cpp
  86. +14 −0 utils/Makefile.am
  87. +149 −0 utils/dbid.cc
  88. +72 −0 utils/dbid.h
  89. +81 −0 utils/dbidtest.cc
  90. +6 −0 utils/dbparse.pl
  91. +59 −0 utils/debug.c
  92. +45 −0 utils/debug.h
  93. +80 −0 utils/fawnnet.cc
  94. +15 −0 utils/fawnnet.h
  95. +225 −0 utils/fnv.h
  96. +715 −0 utils/hashutil.cc
  97. +84 −0 utils/hashutil.h
  98. +246 −0 utils/print.cc
  99. +45 −0 utils/print.h
  100. +21 −0 utils/timing.c
  101. +43 −0 utils/timing.h
  102. +27 −0 ycsb/README
  103. +10 −0 ycsb/fawnkv/lib/README
  104. +139 −0 ycsb/fawnkv/src/com/yahoo/ycsb/db/FawnKVClient.java
54 .gitignore
@@ -0,0 +1,54 @@
+Makefile.in
+Makefile
+.deps
+aclocal.m4
+autom4te.cache/
+config.guess
+config.h.in
+config.h
+config.log
+config.status
+config.sub
+configure
+depcomp
+fawnds/Makefile.in
+fawnkv/Makefile.in
+install-sh
+lib/Makefile.in
+lib/cpp/Makefile.in
+lib/rb/Makefile.in
+ltmain.sh
+missing
+test/Makefile.in
+test/fawnds/Makefile.in
+test/frontend/Makefile.in
+test/hashbench/Makefile.in
+test/management/Makefile.in
+utils/Makefile.in
+stamp-h1
+libtool
+*.o
+*.lo
+*.la
+*.dSYM
+.libs
+gen-cpp
+gen-rb
+*~
+fawnds/fawnds_bench
+fawnkv/backend
+fawnkv/frontend
+fawnkv/manager
+lib/cpp/tester
+lib/cpp/tester_remote
+lib/java/FawnKVClt.jar
+lib/java/Tester.jar
+lib/java/build
+lib/java/gen-java
+test/fawnds/fawnds_test
+test/frontend/cache_tester
+test/frontend/fe_tester
+test/management/lattester
+test/management/ringtester
+utils/dbid_test
+fawnkv-0.1.tar.gz
25 AUTHORS
@@ -0,0 +1,25 @@
+=======
+CONTACT
+=======
+For questions regarding FAWN-KV, please contact the FAWN developers at
+fawn-dev@mailman.srv.cs.cmu.edu.
+
+
+=======
+AUTHORS
+=======
+Included below is everyone who has ever committed to the fawn source
+repository, and hence can be considered an author of FAWN-KV :)
+
+David Andersen <dga+@cs.cmu.edu>
+Alex Crichton <acrichto@andrew.cmu.edu>
+Jack Ferris <jhferris@andrew.cmu.edu>
+Jason Franklin <jfrankli@cs.cmu.edu>
+Alex Gartrell <agartrell@cmu.edu>
+Dongsu Han <dongsuh+@cs.cmu.edu>
+Michael Kaminsky <michael.e.kaminsky@intel.com>
+Wyatt Lloyd <wlloyd@cs.princeton.edu>
+Iulian Moraru <iulian@cs.cmu.edu>
+Amar Phanishayee <amarp+@cs.cmu.edu>
+Lawrence Tan <tlawrence85@cmu.edu>
+Vijay Vasudevan <vrv+@cs.cmu.edu>
15 COPYING
@@ -0,0 +1,15 @@
+FAWN-KV: A Distributed Key-Value Store for FAWN
+
+Copyright 2010 Carnegie Mellon University
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
0  ChangeLog
No changes.
24 INSTALL
@@ -0,0 +1,24 @@
+Steps to build source code
+
+0. On Debian/Ubuntu GNU/Linux install the following (to build source)
+autoconf automake autotools-dev pkg-config m4 libtool libtbb2 libtbb-dev libboost-dev bison flex ruby-dev libssl-dev li
+
+thrift-0.2.0 or later (install from source)
+ IMPORTANT: You must apply fawn-thrift.patch patch to the thrift source before compiling
+ # cd thrift-0.X.0 && patch -p1 < fawn/src/trunk/patches/fawn-thrift.patch
+ This is to ensure that our sighandler actually kills the TThreadedServer threads properly.
+ Source: Bottom of: http://www.mail-archive.com/thrift-dev@incubator.apache.org/msg05155.html
+
+ If using TSimpleServer apply this patch
+ http://issues.apache.org/jira/browse/THRIFT-567
+
+
+1. autoreconf -fis
+2. ./configure [--with-ycsb]
+3. make
+
+Configure's --with-ycsb option checks to see if you have java, javac, and ant, which are required to build the Java libraries.
+
+On OS X, you may need the following for ./configure
+LDFLAGS="-L/opt/local/lib" CXXFLAGS="-I/opt/local/include" PKG_CONFIG_PATH="/usr/local/lib/pkgconfig/"
+(Change directory location as appropriate, see INSTALL.MACOSX for more details)
50 INSTALL.MACOSX
@@ -0,0 +1,50 @@
+Here are my notes from installing fawnkv from scratch on my MacBook
+Pro way back in the summer of 2010. ~Wyatt
+Edited by Vijay in Sept 2010
+
+Follow along the steps in README.
+
+So the first step is getting thrift
+ I followed this [http://jetfar.com/installing-cassandra-and-thrift-on-snow-leopard-a-quick-start-guide/ | guide], sort of
+ already had macports
+ sudo port install boost
+    takes a while :)
+    sudo port install libevent
+ sudo port install pkgconfig
+ Download thrift from [http://incubator.apache.org/thrift/download/ | apache]
+ version thrift-incubating-0.2.0.tar.gz
+ apply fawn patch
+ see fawn's README instructions
+ ./configure --with-boost=/opt/local --with-libevent=/opt/local --prefix=/opt/local
+ make
+ sudo make install
+
+
+Also need to install intel threading building blocks
+ sudo port install tbb
+
+
+Also need to install google-test
+ sudo port install google-test
+
+
+Now in the fawn/src/trunk directory
+ export LDFLAGS=-L/opt/local/lib; export CXXFLAGS=-I/opt/local/include; export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
+ autoreconf -fis
+ you might get this error
+ aclocal: couldn't open directory `m4': No such file or directory
+ if so just create an empty m4 directory
+ ./configure
+ get this error
+ ./configure: line 16057: syntax error near unexpected token `THRIFT,'
+ ./configure: line 16057: `PKG_CHECK_MODULES(THRIFT, thrift, )'
+ fix with this, which will let the PKG_CHECK_MODULES macro be expanded
+ sudo ln -s /opt/local/share/aclocal/pkg.m4 /usr/share/aclocal/pkg.m4
+ autoreconf -fis
+    yes you have to rerun autoreconf
+
+
+make
+
+
+Success!
10 Makefile.am
@@ -0,0 +1,10 @@
+ACLOCAL_AMFLAGS=-I m4
+
+SUBDIRS = utils fawnds fawnkv lib test
+CLEANFILES = core *.core *~
+DISTCLEANFILES = autom4te*.cache config.status config.log
+MAINTAINERCLEANFILES = aclocal.m4 install-sh mkinstalldirs \
+ missing configure config.guess config.sub config.h.in \
+ ltconfig ltmain.sh COPYING INSTALL Makefile.in stamp-h.in
+
+EXTRA_DIST = ycsb
0  NEWS
No changes.
15 NOTICE
@@ -0,0 +1,15 @@
+FAWN-KV: A Distributed Key-Value Store for FAWN
+
+Copyright 2010 Carnegie Mellon University
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
34 README
@@ -0,0 +1,34 @@
+FAWN-KV: A Distributed Key-Value Store for FAWN
+
+License
+=======
+
+Copyright 2010 Carnegie Mellon University
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+===============
+Getting started
+===============
+
+See INSTALL for installation instructions
+
+**************************
+Description of Directories
+**************************
+
+fawnkv/ - the FAWN-KV key-value store
+fawnds/ - new and improved hashdb (renamed fawnds)
+utils/ - debug stuff, hash functions, printing libraries, timing code, net helpers
+test/ - unit test frameworks
+lib/ - Libraries for using FawnKV (cpp and ruby only so far)
63 STYLE
@@ -0,0 +1,63 @@
+FAWN project code style guidelines:
+ Makefiles: Always use -Wall unless there's a documented reason
+ not to. Don't commit code that has compilation
+ errors or warnings.
+
+ File naming: C files: .c .h
+ C++ files: .cc .h
+
+
+Indentation by example. Base 4 spaces:
+
+
+if (test) {
+ ....
+}
+else {
+ ...
+}
+
+/**
+ * Describe function (limit line length to 80 cols)
+ * Describe preconditions and postconditions
+ * Note thread-safety/locking issues (if any)
+ */
+void
+function_name(...)
+{
+ implementation;
+}
+
+class helloFoo {
+ members;
+};
+
+ptr<char *> foo; // NOT: ptr<char * > foo, with a space
+
+Functions that return pointers should use "return NULL;", not "return 0;";
+tests against pointers should check (NULL == pointer), not (0 == pointer).
+
+Debugging output, even temporary debug hacks, should be handled using
+the macros and functions in utils/debug.h:
+ DPRINTF(DEBUG_FOO, "This is debugging output: %d\n", foo);
+ DEBUG_PERROR(DEBUG_BAR, "Perror output");
+#ifdef DEBUG
+ if (debug_level & DEBUG_FOO) {
+ /* some debug code */
+ }
+#endif
+
+
+Use uint64_t, uint32_t, etc. instead of u_int64_t, u_int32_t to conform to the C99 standard.
+You must include <stdint.h> in the appropriate places for uint*_t to work everywhere.
+
+For other things, see:
+ http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+
+Code cleanliness tools:
+ Please consider periodically checking your code with
+ * cppcheck
+ * antiC (part of the "jlint" package).
+ Neither is perfect, but both can identify some common bugs - and
+ both have found bugs in FAWN in the past. They catch somewhat
+ non-overlapping subsets of bugs, so try to use both.
56 acinclude.m4
@@ -0,0 +1,56 @@
+dnl
+dnl AC_LIBRARY_NET: #Id: net.m4,v 1.5 1997/11/09 21:36:54 jhawk Exp #
+dnl
+dnl Written by John Hawkinson <jhawk@mit.edu>. This code is in the Public
+dnl Domain.
+dnl
+dnl This test is for network applications that need socket() and
+dnl gethostbyname() -ish functions. Under Solaris, those applications need to
+dnl link with "-lsocket -lnsl". Under IRIX, they should *not* link with
+dnl "-lsocket" because libsocket.a breaks a number of things (for instance:
+dnl gethostbyname() under IRIX 5.2, and snoop sockets under most versions of
+dnl IRIX).
+dnl
+dnl Unfortunately, many application developers are not aware of this, and
+dnl mistakenly write tests that cause -lsocket to be used under IRIX. It is
+dnl also easy to write tests that cause -lnsl to be used under operating
+dnl systems where neither are necessary (or useful), such as SunOS 4.1.4, which
+dnl uses -lnsl for TLI.
+dnl
+dnl This test exists so that every application developer does not test this in
+dnl a different, and subtly broken fashion.
+dnl
+dnl It has been argued that this test should be broken up into two separate
+dnl tests, one for the resolver libraries, and one for the libraries necessary
+dnl for using Sockets API. Unfortunately, the two are carefully intertwined and
+dnl allowing the autoconf user to use them independently potentially results in
+dnl unfortunate ordering dependencies -- as such, such component macros would
+dnl have to carefully use indirection and be aware if the other components were
+dnl executed. Since other autoconf macros do not go to this trouble, and almost
+dnl no applications use sockets without the resolver, this complexity has not
+dnl been implemented.
+dnl
+dnl The check for libresolv is in case you are attempting to link statically
+dnl and happen to have a libresolv.a lying around (and no libnsl.a).
+dnl
+AC_DEFUN([AC_LIBRARY_NET], [
+ # Most operating systems have gethostbyname() in the default searched
+ # libraries (i.e. libc):
+ AC_CHECK_FUNC(gethostbyname, ,
+ # Some OSes (eg. Solaris) place it in libnsl:
+ AC_CHECK_LIB(nsl, gethostbyname, ,
+ # Some strange OSes (SINIX) have it in libsocket:
+ AC_CHECK_LIB(socket, gethostbyname, ,
+ # Unfortunately libsocket sometimes depends on libnsl.
+ # AC_CHECK_LIB's API is essentially broken so the following
+ # ugliness is necessary:
+ AC_CHECK_LIB(socket, gethostbyname,
+ LIBS="-lsocket -lnsl $LIBS",
+ AC_CHECK_LIB(resolv, gethostbyname),
+ -lnsl)
+ )
+ )
+ )
+ AC_CHECK_FUNC(socket, , AC_CHECK_LIB(socket, socket, ,
+ AC_CHECK_LIB(socket, socket, LIBS="-lsocket -lnsl $LIBS", , -lnsl)))
+ ])
99 configure.ac
@@ -0,0 +1,99 @@
+# -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ(2.61)
+AC_INIT([fawnkv], [0.1], [fawn-dev@mailman.srv.cs.cmu.edu])
+AM_INIT_AUTOMAKE
+LT_PREREQ([2.2])
+LT_INIT([dlopen])
+
+AC_CONFIG_SRCDIR([fawnkv/node_mgr.cpp])
+AC_CONFIG_HEADER([config.h])
+
+# Checks for programs.
+AC_PROG_CXX
+AC_PROG_CC
+AC_PROG_MAKE_SET
+
+AC_PROG_LIBTOOL
+AC_CONFIG_MACRO_DIR([m4])
+
+
+AC_ARG_WITH(ycsb,
+    AC_HELP_STRING([--with-ycsb], [build the Java and YCSB libraries]),
+    [with_ycsb="$withval"],
+    [with_ycsb=no]
+    )
+
+if test "$with_ycsb" = "yes"; then
+ AX_JAVAC_AND_JAVA
+ AC_PATH_PROG([ANT], [ant])
+ have_ant="$success"
+ AM_CONDITIONAL(HAVE_ANT, test x$have_ant = xyes)
+else
+ have_ant=false
+ AM_CONDITIONAL(HAVE_ANT, test x$have_ant = xyes)
+fi
+
+# Checks for libraries.
+# FIXME: Replace `main' with a function in `-lgtest':
+AC_CHECK_LIB([gtest], [main], [LIBS="-lgtest $LIBS"; no_libgtest=false],
+ [echo "gtest unittesting framework not found."; no_libgtest=true])
+AC_LANG_CPLUSPLUS
+AC_CHECK_LIB([tbb], [main], [], [echo "libtbb not found. Please install the Intel TBB before proceeding."; exit -1])
+AC_CHECK_LIB([gtest_main], [main], [], [echo "gtest_main not found. Please install google test library before proceeding"; exit -1])
+AC_CHECK_LIB([pthread], [pthread_mutex_init], [], [echo "pthreads not found. Please install pthread library before proceeding"; exit -1])
+AC_CHECK_LIB([boost_thread-mt], [main], [], [echo "boost library not found. Please install boost library before proceeding"; exit -1])
+AC_CHECK_LIB([crypto], [main], [], [echo "openssl crypto library not found. Please install openssl library before proceeding"; exit -1])
+PKG_CHECK_MODULES(THRIFT, thrift, )
+CXXFLAGS="$CXXFLAGS $THRIFT_CFLAGS"
+
+AC_LIBRARY_NET
+
+# Checks for header files.
+AC_HEADER_STDC
+AC_CHECK_HEADERS([arpa/inet.h fcntl.h netdb.h netinet/in.h stdlib.h string.h sys/ioctl.h sys/param.h sys/socket.h sys/time.h unistd.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_HEADER_STDBOOL
+AC_C_CONST
+AC_C_INLINE
+AC_TYPE_INT32_T
+AC_TYPE_INT64_T
+AC_TYPE_OFF_T
+AC_TYPE_SIZE_T
+AC_TYPE_SSIZE_T
+AC_HEADER_TIME
+AC_TYPE_UINT16_T
+AC_TYPE_UINT32_T
+AC_TYPE_UINT64_T
+AC_TYPE_UINT8_T
+AC_C_VOLATILE
+
+
+# Checks for library functions.
+AC_PROG_GCC_TRADITIONAL
+AC_FUNC_MALLOC
+AC_FUNC_MEMCMP
+AC_FUNC_MMAP
+AC_FUNC_REALLOC
+AC_FUNC_SELECT_ARGTYPES
+AC_TYPE_SIGNAL
+AC_CHECK_FUNCS([bzero ftruncate gethostbyname getpagesize gettimeofday inet_ntoa memchr memmove memset munmap select socket strdup strerror strtol strtoul strtoull])
+
+CFLAGS="$CFLAGS -Wall -Wextra"
+CXXFLAGS="$CXXFLAGS -Wall -Wextra -Wno-unused-parameter -Wno-unused-variable"
+
+AC_CONFIG_FILES([ Makefile
+ utils/Makefile
+ fawnds/Makefile
+ fawnkv/Makefile
+ lib/Makefile
+ lib/cpp/Makefile
+ lib/rb/Makefile
+ lib/java/Makefile
+ test/Makefile
+ test/fawnds/Makefile
+ test/management/Makefile])
+
+AC_OUTPUT
18 fawnds/Makefile.am
@@ -0,0 +1,18 @@
+noinst_LTLIBRARIES = libfawnds.la
+libfawnds_la_CPPFLAGS = -I$(top_srcdir)/utils -D_FILE_OFFSET_BITS=64
+
+noinst_HEADERS = db_structures.h fawnds.h hash_functions.h fawnds_flash.h
+
+libfawnds_la_SOURCES = fawnds.cc hash_functions.cc fawnds_flash.cc
+
+# kaminsky: following line forces noinst_* libraries to build
+# shared. This can help with development because a change to
+# this library doesn't require re-building libs and programs
+# that link against this library
+#libfawnds_la_LDFLAGS = -rpath `pwd`
+
+noinst_PROGRAMS = fawnds_bench
+fawnds_bench_SOURCES = fawnds_bench.cc
+fawnds_bench_CPPFLAGS = -I$(top_srcdir)/utils
+fawnds_bench_LDADD = $(top_builddir)/utils/libfawnkvutils.la libfawnds.la $(THRIFT_LIBS)
+
13 fawnds/README
@@ -0,0 +1,13 @@
+To run fawnds_bench:
+
+Query benchmarks:
+1) Create a fawnds file of the appropriate size:
+ ./fawnds_bench -f -n 1024 /localfs/fawnds_db
+
+2) Benchmark it:
+ ./fawnds_bench -n 1024 -r 1000000 /localfs/fawnds_db
+
+The key things there:
+ -n must be consistent (size of database)
+ -r says how many random key values to GET (size of test)
+
61 fawnds/db_structures.h
@@ -0,0 +1,61 @@
+/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+#ifndef _DB_STRUCTURES_H_
+#define _DB_STRUCTURES_H_
+
+#include <stdint.h>
+#include "dbid.h"
+
+#define PROBES_BEFORE_REHASH 8
+
+
+namespace fawn {
+
+ enum keyType { TEXT_KEYS, RANDOM_KEYS };
+
+ struct DbHeader {
+ uint64_t magic_number;
+ uint64_t hashtable_size;
+ uint64_t number_elements;
+ uint64_t deleted_elements;
+ double max_deleted_ratio;
+ double max_load_factor;
+ keyType keyFormat;
+ off_t data_insertion_point; // offset to where the next record should go
+ off_t data_start_point; // offset showing where first record is
+ char startID[DBID_LENGTH];
+ char endID[DBID_LENGTH];
+ } __attribute__((__packed__));
+
+
+ /*
+ Hash Entry Format
+ D = Is slot deleted: 1 means deleted, 0 means not deleted. Needed for lazy deletion
+ V = Is slot empty: 0 means empty, 1 means taken
+ K = Key fragment
+ O = Offset bits
+ ________________________________________________
+ |DVKKKKKKKKKKKKKKOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO|
+ ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
+ */
+ struct HashEntry {
+ uint16_t present_key;
+ uint32_t offset;
+ } __attribute__((__packed__));
+
+ struct DataHeader {
+ uint32_t data_length;
+ uint32_t key_length;
+ bool deleteLog;
+ } __attribute__((__packed__));
+
+ static const int DSReadMin = 2048;
+ struct DataHeaderExtended {
+ uint32_t data_length;
+ uint32_t key_length;
+ bool deleteLog;
+ char partial_data[DSReadMin-sizeof(struct DataHeader)];
+ } __attribute__((__packed__));
+
+} // namespace fawn
+
+#endif // #define _DB_STRUCTURES_H_
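
The D/V/K/O layout sketched above corresponds to the mask constants declared later in fawnds/fawnds.h (KEYFRAGBITS = 14, so V is bit 14 and D is bit 15 of present_key, while the offset occupies the full 32-bit field). As a minimal sketch, assuming those constants copied from that header, decoding the fields of a present_key word looks like this:

    #include <cstdint>
    #include <cstdio>

    // Mask constants as declared in fawnds/fawnds.h
    static const uint32_t KEYFRAGBITS    = 14;
    static const uint32_t KEYFRAGMASK    = (1 << KEYFRAGBITS) - 1;  // bits 0-13: K
    static const uint32_t VALIDBITMASK   = KEYFRAGMASK + 1;         // bit 14: V
    static const uint32_t DELETEDBITMASK = (2 << KEYFRAGBITS);      // bit 15: D

    int main() {
        uint16_t present_key = VALIDBITMASK | 0x1abc;  // taken slot, fragment 0x1abc
        printf("valid=%d deleted=%d keyfrag=0x%04x\n",
               (present_key & VALIDBITMASK) != 0,
               (present_key & DELETEDBITMASK) != 0,
               present_key & KEYFRAGMASK);
        return 0;
    }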
1,040 fawnds/fawnds.cc
@@ -0,0 +1,1040 @@
+/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <time.h>
+#include <sys/time.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <string>
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+#include <inttypes.h>
+
+#include "hash_functions.h"
+#include "fawnds.h"
+#include "fawnds_flash.h"
+#include "debug.h"
+#include "hashutil.h"
+#include "print.h"
+#include "timing.h"
+
+using fawn::DataHeader;
+using fawn::Hashes;
+using fawn::HashUtil;
+using fawn::DBID;
+
+#ifndef O_NOATIME
+#define O_NOATIME 0 /* O_NOATIME is linux-only */
+#endif
+
+namespace fawn {
+
+ /***************************************************/
+ /************** DB CREATION FUNCTIONS **************/
+ /***************************************************/
+
+ inline uint64_t header_size() {
+ uint64_t header_size_pages = sizeof(struct DbHeader) / getpagesize();
+ return (header_size_pages+1) * getpagesize();
+ }
+
+ template <typename T>
+ FawnDS<T>* FawnDS<T>::Create_FawnDS(const char* filename,
+ uint64_t hash_table_size,
+ double max_deleted_ratio,
+ double max_load_factor,
+ keyType kt)
+ {
+ if (filename == NULL)
+ return NULL;
+ int fd;
+ if ((fd = open(filename, O_RDWR|O_CREAT|O_NOATIME, 0666)) == -1) {
+ perror("Could not open file\n");
+ return NULL;
+ }
+
+ return FawnDS<T>::Create_FawnDS_From_Fd(fd, filename,
+ hash_table_size * FawnDS<T>::EXCESS_BUCKET_FACTOR,
+ 0, // empty
+ 0, // hashtable
+ max_deleted_ratio,
+ max_load_factor,
+ kt);
+ }
+
+ template <typename T>
+ int FawnDS<T>::disable_readahead() {
+ int rc;
+#ifdef __APPLE__
+ int zero = 0;
+ if ((rc = fcntl(fd_, F_RDAHEAD, &zero)) < 0)
+ perror("couldn't fcntl F_RDAHEAD");
+#else
+ if ((rc = posix_fadvise(fd_, 0, 0, POSIX_FADV_RANDOM)) < 0)
+ perror("couldn't posix_fadvise random");
+#endif
+ return rc;
+ }
+
+
+ template <typename T>
+ FawnDS<T>* FawnDS<T>::Create_FawnDS_From_Fd(int fd,
+ const string& filename,
+ uint64_t hash_table_size,
+ uint64_t num_entries,
+ uint64_t deleted_entries,
+ double max_deleted_ratio,
+ double max_load_factor,
+ keyType kt)
+ {
+ if (fd < 0)
+ return NULL;
+
+ // Calculate hashtable size based on number of entries and other hashtable parameters
+ // numObjects = num_entries - deleted_entries
+ // size = max(numObjects * 2, hash_table_size)
+ // this means hash_table_size is a lower bound on resizing.
+
+ uint64_t numObjects = max(hash_table_size, (num_entries - deleted_entries)*2);
+ uint64_t max_entries = FindNextHashSize(numObjects);
+ /* XXX - bug, it's computing this too small for non-power-of-two! */
+ uint64_t db_size = header_size() + sizeof(struct HashEntry) * max_entries;
+
+ printf("CreateFawnDS table information:\n"
+ "\t hashtablesize: %"PRIu64"\n"
+ "\t num_entries: %"PRIu64"\n"
+ "\t deleted entries: %"PRIu64"\n"
+ "\t Database header size %"PRIu64"B\n"
+ "\t Total db size %"PRIu64"B\n"
+ "\t Maximum number of entries: %"PRIu64"\n",
+ hash_table_size, num_entries, deleted_entries,
+ header_size(), db_size, max_entries);
+
+ // extend the new file so that it can hold the header + hashtable. This
+ // requires NUMPAGES(sizeof(struct DbHeader)) +
+ // sizeof(struct HashEntry) * max_entries.
+ if (ftruncate(fd, (off_t)db_size) == -1) {
+ fprintf(stderr, "Could not extend file to %"PRIu64" bytes: %s\n",
+ db_size, strerror(errno));
+ }
+ lseek(fd, 0, SEEK_SET);
+
+ if (fill_file_with_zeros(fd, db_size) < 0) {
+ fprintf(stderr, "Could not zero out DB file.\n");
+ return NULL;
+ }
+
+ FawnDS<T>* ds = new FawnDS<T>(filename, kt);
+ ds->fd_ = fd;
+ ds->datastore = new T(fd);
+ ds->disable_readahead();
+
+
+ // malloc header/hashtable, don't mmap
+ ds->header_ = (struct DbHeader*)malloc(sizeof(struct DbHeader));
+ if (ds->header_ == NULL) {
+ perror("could not malloc header file\n");
+ delete ds;
+ return NULL;
+ }
+ // zero out the buffer
+ memset(ds->header_, 0, sizeof(struct DbHeader));
+
+ ds->hash_table_ = (struct HashEntry*)malloc(sizeof(struct HashEntry) * max_entries);
+ if (ds->hash_table_ == NULL) {
+ perror("could not malloc hash table file\n");
+ delete ds;
+ return NULL;
+ }
+
+ // zero out the buffer
+ memset(ds->hash_table_, 0, sizeof(struct HashEntry) * max_entries);
+
+ // REMEMBER TO WRITE OUT HEADER/HASHTABLE AFTER SPLIT/MERGE/REWRITE!
+
+ // populate the database header.
+ ds->header_->hashtable_size = max_entries;
+ ds->header_->number_elements = 0;
+ ds->header_->deleted_elements = 0;
+ ds->header_->max_deleted_ratio = max_deleted_ratio;
+ ds->header_->max_load_factor = max_load_factor;
+ ds->header_->keyFormat = kt;
+ ds->header_->data_insertion_point = db_size;
+ ds->header_->data_start_point = db_size;
+
+ if (lseek(ds->fd_, db_size, SEEK_SET) != (off_t)db_size) {
+ fprintf(stderr, "Could not seek to offset %"PRIu64": %s\n",
+ db_size, strerror(errno));
+ }
+
+ return ds;
+ }
+
+ template <typename T>
+ bool FawnDS<T>::setStartID(const DBID& sid)
+ {
+ memcpy(&(header_->startID), sid.const_data(), DBID_LENGTH);
+ if (!WriteOnlyHeaderToFile()) {
+ fprintf(stderr, "Could not set startID\n");
+ return false;
+ }
+ return true;
+ }
+
+ template <typename T>
+ bool FawnDS<T>::setEndID(const DBID& eid)
+ {
+ memcpy(&(header_->endID), eid.const_data(), DBID_LENGTH);
+ if (!WriteOnlyHeaderToFile()) {
+ fprintf(stderr, "Could not set startID\n");
+ return false;
+ }
+ return true;
+ }
+
+
+ template <typename T>
+ const DBID* FawnDS<T>::getStartID() const
+ {
+ return new DBID(header_->startID, DBID_LENGTH);
+ }
+
+ template <typename T>
+ const DBID* FawnDS<T>::getEndID() const
+ {
+ return new DBID(header_->endID, DBID_LENGTH);
+ }
+
+
+ template <typename T>
+ bool FawnDS<T>::WriteOnlyHeaderToFile()
+ {
+ uint64_t length = sizeof(struct DbHeader);
+ uint64_t offset = 0;
+ // write the header for the hashtable
+ if ((uint64_t)pwrite64(fd_, header_, length, offset) != length) {
+ fprintf(stderr, "Could not write malloc'd dbheader at position %d: %s\n", 0, strerror(errno));
+ return false;
+ }
+
+ return true;
+ }
+
+ // Write out Malloc'd Header/Hashtable to Disk once DB values have been inserted sequentially
+ template <typename T>
+ bool FawnDS<T>::WriteHashtableToFile()
+ {
+ if (!WriteOnlyHeaderToFile()) {
+ return false;
+ }
+
+ uint64_t offset = header_size();
+ uint64_t length = sizeof(struct HashEntry) * header_->hashtable_size;
+
+        // write the hashtable to the file, advancing the source pointer on
+        // partial writes; check for an error once the loop exits, since the
+        // loop condition only continues while pwrite64 makes progress
+        uint64_t writeLength = length;
+        const char* writeBuf = (const char*)hash_table_;
+        ssize_t nwrite = 0;
+        while (writeLength > 0 &&
+               (nwrite = pwrite64(fd_, writeBuf, writeLength, offset)) > 0) {
+            writeLength -= nwrite;
+            offset += nwrite;
+            writeBuf += nwrite;
+        }
+        if (nwrite < 0) {
+            perror("Error in writing");
+            return false;
+        }
+
+ return true;
+ }
+
+ // This assumes that the file was closed properly.
+ template <typename T>
+ bool FawnDS<T>::ReadHashtableFromFile()
+ {
+ cout << "Reading hashtable from file..." << endl;
+ header_ = (struct DbHeader*)malloc(sizeof(struct DbHeader));
+ if (header_ == NULL) {
+ perror("could not malloc header file\n");
+            return false;
+ }
+
+ uint64_t length = sizeof(struct DbHeader);
+ uint64_t offset = 0;
+ if ((uint64_t)pread64(fd_, header_, length, offset) != length) {
+ fprintf(stderr, "Could not read header for data at position %"PRIu64": %s\n",
+ offset, strerror(errno));
+ return false;
+ }
+
+
+ offset += header_size();
+ length = sizeof(struct HashEntry) * header_->hashtable_size;
+ hash_table_ = (struct HashEntry*)malloc(sizeof(struct HashEntry) * header_->hashtable_size);
+ if (hash_table_ == NULL) {
+ perror("could not malloc hash table file\n");
+ free(header_);
+            return false;
+ }
+
+        // read the hashtable from the file; h_offset counts bytes, so index
+        // the destination buffer as a char* rather than a HashEntry*, and
+        // check for an error once the loop exits
+        uint64_t readLength = length;
+        ssize_t nread = 0;
+        uint64_t h_offset = 0;
+        while (readLength > 0 &&
+               (nread = pread64(fd_, (char*)hash_table_ + h_offset,
+                                readLength, offset)) > 0) {
+            readLength -= nread;
+            offset += nread;
+            h_offset += nread;
+        }
+        if (nread < 0) {
+            perror("Error reading hashtable from file\n");
+            return false;
+        }
+
+ return true;
+ }
+
+
+ template <typename T>
+ FawnDS<T>* FawnDS<T>::Open_FawnDS(const char* filename, keyType hashtable_store)
+ {
+ FawnDS<T>* ds = new FawnDS<T>(filename, hashtable_store);
+ if (!ds->ReopenFawnDSFromFilename()) {
+ delete ds;
+ return NULL;
+ }
+ return ds;
+ }
+
+ template <typename T>
+ bool FawnDS<T>::ReopenFawnDSFromFilename()
+ {
+ int fd;
+ if ((fd = open(filename_.c_str(), O_RDWR|O_NOATIME, 0666)) == -1) {
+ perror(filename_.c_str());
+ return false;
+ }
+
+ struct stat statbuf;
+ if (fstat(fd, &statbuf) != 0) {
+ perror("could not stat file");
+ return false;
+ }
+ if (statbuf.st_size < (int)sizeof(struct DbHeader)) {
+ fprintf(stderr, "Error: database file too small\n");
+ close(fd);
+            return false;
+ }
+
+ fd_ = fd;
+ datastore = new T(fd);
+
+ if (!ReopenFawnDSFromFd()) {
+ return false;
+ }
+ return true;
+ }
+
+ template <typename T>
+ bool FawnDS<T>::ReopenFawnDSFromFd()
+ {
+ // will malloc in ReadHashtableFromFile
+ // During rewrite, we have to free these.
+ if (hash_table_)
+ free(hash_table_);
+ if (header_)
+ free(header_);
+
+ // Now read entries from file into memory
+ if (!this->ReadHashtableFromFile()) {
+ perror("could not malloc hash table file\n");
+ return NULL;
+ }
+
+ disable_readahead();
+
+ return true;
+ }
+
+ template <typename T>
+ bool FawnDS<T>::deleteFile() {
+ if (unlink(filename_.c_str()) == -1) {
+ perror("Could not delete old database");
+ return false;
+ }
+ return true;
+ }
+
+ /***************************************************/
+ /**************** DB *STRUCTORS ********************/
+ /***************************************************/
+
+ template <typename T>
+ FawnDS<T>::~FawnDS<T>()
+ {
+ if (hash_table_)
+ free(hash_table_);
+ if (header_)
+ free(header_);
+
+ if (fd_ != -1)
+ close(fd_);
+
+ delete datastore;
+ }
+
+ template <typename T>
+ FawnDS<T>::FawnDS(const string& filename, keyType kt) :
+ fd_(-1), header_(NULL), hash_table_(NULL),
+ filename_(filename)
+ {
+ if (kt == RANDOM_KEYS) {
+ Hashes::hashes[0] = Hashes::nullhash1;
+ Hashes::hashes[1] = Hashes::nullhash2;
+ Hashes::hashes[2] = Hashes::nullhash3;
+ }
+
+ }
+
+ /***************************************************/
+ /****************** DB FUNCTIONS *******************/
+ /***************************************************/
+
+ template <typename T>
+ bool FawnDS<T>::Insert(const char* key, uint32_t key_len, const char* data, uint32_t length)
+ {
+ if (key == NULL || data == NULL)
+ return false;
+
+ if (header_->number_elements == header_->hashtable_size) {
+ fprintf(stderr, "Error: hash table full!\n");
+ return false;
+ }
+
+ bool newObj = false; // Used to keep track of whether this has added an object (used for tracking number of elements)
+ int32_t hash_index = FindHashIndexInsert(key, key_len, &newObj);
+ // If hash index is not found, we *could* rebuild the table with a larger hashtable size.
+ // For now, we just return false.
+ if (hash_index == -1) {
+ fprintf(stdout, "Can't find index for given key in hash table.\n");
+ return false;
+ }
+
+        // Do the underlying write to the datastore
+ if (!datastore->Write(key, key_len, data, length, header_->data_insertion_point)) {
+ fprintf(stderr, "Can't write to underlying datastore\n");
+ return false;
+ }
+
+ // update the hashtable (since the data was successfully written).
+ uint16_t key_in_hashtable = keyfragment_from_key(key, key_len);
+ key_in_hashtable |= VALIDBITMASK; // set valid bit to 1
+ hash_table_[hash_index].present_key = key_in_hashtable;
+ hash_table_[hash_index].offset = header_->data_insertion_point;
+ header_->data_insertion_point += sizeof(struct DataHeader) + key_len + length;
+
+ // Update number of elements if Insert added a new object
+ if (newObj)
+ header_->number_elements++;
+
+ return true;
+ }
+
+ // External deletes always write a writeLog
+ template <typename T>
+ bool FawnDS<T>::Delete(const char* key, uint32_t key_len)
+ {
+ return Delete(key, key_len, true);
+ }
+
+ template <typename T>
+ bool FawnDS<T>::Delete(const char* key, uint32_t key_len, bool writeLog)
+ {
+ if (key == NULL)
+ return false;
+
+ DataHeader data_header;
+ int32_t hash_index = FindHashIndex(key, key_len, &data_header);
+
+ // Key is not in table
+ if (hash_index == -1) {
+ //fprintf(stderr, "Can't find index for given key in hash table.\n");
+ return false;
+ }
+
+ // if writeLog == false, we may be ensuring that the entry was invalidated, so it's okay to check again
+ if (writeLog && !valid(hash_index)) {
+ fprintf(stderr, "Warning: tried to delete non-existent item\n");
+ return false;
+ }
+
+ if (writeLog) {
+ if (!datastore->Delete(key, key_len, header_->data_insertion_point)) {
+ fprintf(stderr, "Could not delete from underlying datastore\n");
+ return false;
+ }
+ // Update head pointer
+ header_->data_insertion_point += sizeof(struct DataHeader) + key_len;
+
+ }
+
+
+ /********* UPDATE TABLE **********/
+ hash_table_[hash_index].present_key |= DELETEDBITMASK;
+ header_->deleted_elements++;
+
+ // Eliminate stale entries on delete (exhaustive search)
+ while ((hash_index = FindHashIndex(key, key_len, &data_header)) != -1) {
+ hash_table_[hash_index].present_key |= DELETEDBITMASK;
+ }
+
+ return true;
+ }
+
+ template <typename T>
+ bool FawnDS<T>::Get(const char* key, uint32_t key_len, string &data) const
+ {
+ if (key == NULL)
+ return false;
+
+ // use DataHeaderExtended for app readahead
+ int32_t hash_index = 0;
+
+ for (u_int hash_fn_index = 0; hash_fn_index < HASH_COUNT; hash_fn_index++) {
+ uint32_t indexkey = (*(Hashes::hashes[hash_fn_index]))(key, key_len);
+ // calculate the starting index to probe. We linearly probe
+ // PROBES_BEFORE_REHASH locations before rehashing the key to get a new
+ // starting index.
+
+ // Mask lower order bits to find hash index
+ uint32_t hash_index_start = indexkey & (header_->hashtable_size-1);
+
+ hash_index_start &= (~(PROBES_BEFORE_REHASH-1));
+
+ for (hash_index = hash_index_start;
+ (uint32_t)hash_index < hash_index_start + PROBES_BEFORE_REHASH;
+ ++hash_index) {
+ uint16_t vkey = verifykey(hash_index);
+ if (!valid(hash_index)) {
+ return false;
+ }
+ else if (deleted(hash_index)) {
+ continue;
+ }
+ else if (vkey == keyfragment_from_key(key, key_len)) {
+ off_t datapos = hash_table_[hash_index].offset;
+ if (datastore->Read(key, key_len, datapos, data))
+ return true;
+ }
+ }
+
+ }
+ return false;
+ }
+
+
+ /***************************************************/
+ /************* DB REWRITE/SPLIT/MERGE **************/
+ /***************************************************/
+
+ // Rewrite needs to do the following:
+
+ // (1) Ignore out of range items (after a database is split)
+ // (2) Remove deleted items (orphans)
+
+ // To do this:
+ // Scan data region. For every key:
+ // if it falls outside the range of this database,
+ // skip it.
+ // if it falls in the range of this database
+ // hash the key and do a hashtable lookup.
+ // If the entry is valid
+ // if the offset doesn't point to this item, skip this.
+ // if the offset points to this item, write it to the new database
+ // else
+ // skip it
+
+ template <typename T>
+ FawnDS<T>* FawnDS<T>::Rewrite(pthread_rwlock_t* dbLock)
+ {
+ // Create new database with a temporary name.
+ // Should eventually pass in filebase instead of /localfs
+        char *tfn = strdup("/localfs/rewrite_temp.XXXXXX");
+        if (tfn == NULL) {
+            fprintf(stderr, "Could not allocate memory for temporary filename.\n");
+            return NULL;
+        }
+        int fd = mkstemp(tfn);
+        const string temp_filename(tfn);
+        free(tfn);
+        if (fd == -1) {
+            perror("Could not create temporary database file");
+            return NULL;
+        }
+
+ FawnDS<T>* new_db = FawnDS<T>::Create_FawnDS_From_Fd(fd,
+ temp_filename,
+ header_->hashtable_size,
+ header_->number_elements,
+ header_->deleted_elements,
+ header_->max_deleted_ratio,
+ header_->max_load_factor,
+ header_->keyFormat);
+
+ if (new_db == NULL) {
+ perror("Could not create new FawnDS file.");
+ return NULL;
+ }
+
+ // Transfer all key/value pairs into new database.
+ const DBID* startKey = getStartID();
+ const DBID* endKey = getEndID();
+
+ off_t current_offset = header_->data_start_point;
+ bool done = false;
+ bool checkpointed = false;
+ while (!done) {
+ // atomically acquire db to check what the current data_insertion_point is
+ if (dbLock) pthread_rwlock_rdlock(dbLock);
+
+ off_t ending_offset = header_->data_insertion_point;
+
+ if (current_offset >= ending_offset) {
+ // We've caught up, the hashtable is written
+ // we still hold the database lock.
+ if (current_offset > ending_offset) {
+ fprintf(stderr, "This shouldn't have happened\n");
+ return NULL;
+ }
+ DPRINTF(DEBUG_STATUS, "Caught up.");
+ done = true;
+ continue;
+ }
+
+ // unlock mutex so that gets can get through.
+ if (dbLock) pthread_rwlock_unlock(dbLock);
+
+ // we know current_offset is < ending_offset, so let's process until our current knowledge of ending_offset
+ while (current_offset < ending_offset) {
+ DataHeader data_header;
+ string key;
+ if (!datastore->ReadIntoHeader(current_offset, data_header, key)) {
+ fprintf(stderr, "ReadIntoHeader failed at offset %"PRIu64".\n",
+ current_offset);
+ delete new_db;
+ if (unlink(temp_filename.c_str()) == -1) {
+ perror("Could not delete temporary file");
+ }
+ return NULL;
+ }
+
+ off_t old_offset = current_offset; // store temp
+ current_offset += sizeof(struct DataHeader) + data_header.data_length + data_header.key_length;
+
+ // if out of range, continue
+ const DBID thisKey(key);
+ if (!between(startKey, endKey, &thisKey)) {
+ continue;
+ }
+
+ // else, check hashtable
+ int32_t hash_index = FindHashIndex(key.data(), key.length(), NULL);
+ // entry is invalid (could not find key)
+ if (hash_index == -1) {
+ continue;
+ }
+
+ // This check is already done in FindHashIndex.
+ // entry is invalid
+ if (!exists(hash_index)) {
+ continue;
+ }
+
+            // entry is invalid, case 3:
+ // this delete log was propagated because the sender of this data
+ // received a delete while the db was being scanned/sent
+ // don't write the delete log, otherwise deletelogs will never disappear :)
+ // write the deletelog only after the first checkpoint pass
+            if (data_header.deleteLog) {
+                if (!new_db->Delete(key.data(), key.length(), checkpointed)) {
+                    fprintf(stderr, "Error deleting record from other database.\n");
+                    delete new_db;
+                    if (unlink(temp_filename.c_str()) == -1) {
+                        perror("Could not delete temporary file");
+                    }
+                    return NULL;
+                }
+            }
+
+ // if the entry doesn't point to this item...
+ if (hash_table_[hash_index].offset != (off_t)old_offset) {
+ // This was an older insert
+ continue;
+ }
+
+ // else we need to copy this to the new one
+ string data;
+ if (!datastore->Read(key.data(), key.length(), old_offset, data)) {
+ fprintf(stderr, "Error reading from underlying datastore.\n");
+ delete new_db;
+ if (unlink(temp_filename.c_str()) == -1) {
+ perror("Could not delete temporary file");
+ }
+ return NULL;
+ }
+
+ if (!new_db->Insert(key.data(), key.length(), data.data(), data.length())) {
+ fprintf(stderr, "Error inserting to new database.\n");
+ delete new_db;
+ if (unlink(temp_filename.c_str()) == -1) {
+ perror("Could not delete temporary file");
+ }
+ return NULL;
+ }
+ }
+
+ // Write out header/hashtable now
+ // Only checkpoint once to avoid writing the hashtable each loop
+ if (checkpointed == false) {
+ if (!new_db->WriteHashtableToFile()) {
+ perror("Could not write malloc'd hashtable to fd");
+ delete new_db;
+ if (unlink(temp_filename.c_str()) == -1) {
+ perror("Could not delete temporary file");
+ }
+ return NULL;
+ }
+ checkpointed = true;
+ }
+ }
+
+ // rewrite complete. now need to move db over
+ return new_db;
+ }
+
+ template <typename T>
+ bool FawnDS<T>::RenameAndClean(FawnDS<T>* old) {
+ // Delete old database.
+ if (unlink(old->filename_.c_str()) == -1) {
+ perror("Could not delete old database");
+ return false;
+ }
+
+ // Rename new database.
+ if (rename(filename_.c_str(), old->filename_.c_str()) == -1) {
+ perror("Failed to rename temp database");
+ return false;
+ }
+
+ // Rename local
+ filename_ = old->filename_;
+
+ // Eliminate old
+ delete old;
+
+ return true;
+ }
+
+
+
+ // These two functions allow streaming of splits.
+ template <typename T>
+ void FawnDS<T>::split_init(const string& destinationIP)
+ {
+ currSplit = header_->data_start_point;
+ precopyIP = destinationIP;
+ }
+
+ // caller frees data.
+ template <typename T>
+ bool FawnDS<T>::split_next(const DBID* startKey, const DBID* endKey, char* ret_key, uint32_t& key_length, string& data, bool& valid, bool& remove)
+ {
+ remove = false;
+ if (currSplit >= header_->data_insertion_point) {
+ // we're done!
+ return false;
+ }
+
+ DataHeader data_header;
+ string key;
+ if (!datastore->ReadIntoHeader(currSplit, data_header, key)) {
+ fprintf(stderr, "ReadIntoHeader failed at offset %"PRIu64".\n",
+ currSplit);
+ return false;
+ }
+
+ // Copy over delete log if necessary, or read into data.
+ if (data_header.deleteLog) {
+ remove = true;
+ } else if (!datastore->Read(key.data(), key.length(), currSplit, data)) {
+ fprintf(stderr, "Error reading from underlying split datastore.\n");
+ return false;
+ }
+
+ currSplit += sizeof(struct DataHeader) + data_header.key_length + data_header.data_length;
+
+ // Do range check
+ const DBID thisKey(key);
+ bool inRange = between(startKey, endKey, &thisKey);
+
+ if (inRange) {
+ // Return appropriate info
+ valid = true;
+ memcpy(ret_key, key.data(), key.length());
+ key_length = key.length();
+ return true;
+ } else {
+ cout << "CurrSplit is " << currSplit << " and inspt is " << header_->data_insertion_point;
+ valid = false;
+ return true;
+ }
+ }
+
+ template <typename T>
+ bool FawnDS<T>::Merge(FawnDS<T>* other_db, pthread_rwlock_t* dbLock)
+ {
+ DPRINTF(DEBUG_STATUS, "Merging...");
+ // Scan other_db, insert all its keys into this.
+ return other_db->SplitMergeHelper(this, dbLock, true);
+ }
+
+ template <typename T>
+ bool FawnDS<T>::Split(FawnDS<T>* other_db, pthread_rwlock_t* dbLock)
+ {
+ DPRINTF(DEBUG_STATUS, "Splitting...");
+ // Scan this db, insert specific keys into other_db
+ return SplitMergeHelper(other_db, dbLock, false);
+ }
+
+ template <typename T>
+ bool FawnDS<T>::SplitMergeHelper(FawnDS<T>* other_db, pthread_rwlock_t* dbLock, bool merge)
+ {
+ off_t current_offset = header_->data_start_point;
+ bool done = false;
+ const DBID* startKey = other_db->getStartID();
+ const DBID* endKey = other_db->getEndID();
+
+ bool checkpointed = false;
+
+ while (!done) {
+ // atomically acquire db to check what the current data_insertion_point is
+ if (dbLock != NULL)
+ pthread_rwlock_rdlock(dbLock);
+
+ off_t ending_offset = header_->data_insertion_point;
+
+ if (current_offset >= ending_offset) {
+ // We've caught up, the hashtable is written
+ // we still hold the database lock.
+ if (current_offset > ending_offset) {
+ fprintf(stderr, "This shouldn't have happened\n");
+ return false;
+ }
+ DPRINTF(DEBUG_STATUS, "Caught up. Need to update interval list...");
+ done = true;
+ continue;
+ }
+
+ // unlock mutex so that gets can get through.
+ if (dbLock != NULL)
+ pthread_rwlock_unlock(dbLock);
+
+ // we know current_offset is < ending_offset, so let's process until our current knowledge of ending_offset
+ while (current_offset < ending_offset) {
+ DataHeader data_header;
+ string key;
+ if (!datastore->ReadIntoHeader(current_offset, data_header, key)) {
+ fprintf(stderr, "ReadIntoHeader failed at offset %"PRIu64".\n",
+ current_offset);
+ return false;
+ }
+ off_t old_offset = current_offset; // store temp
+ current_offset += sizeof(struct DataHeader) + data_header.key_length + data_header.data_length;
+
+ if (data_header.deleteLog) {
+ // otherdb must be updated if this was a delete log
+ // but delete just has to invalidate the header, no delete log or value update
+ // write the deletelog only after the first checkpoint pass
+ // compaction will ensure that this value is eventually cleaned up
+ if (!other_db->Delete(key.data(), key.length(), checkpointed)) {
+ fprintf(stderr, "Error deleting record from other database.\n");
+ return false;
+ }
+ printf("delete log...\n");
+ continue;
+ }
+ const DBID thisKey(key);
+ bool inRange = between(startKey, endKey, &thisKey);
+
+ // on a merge, the new db's range isn't setup yet, so let's
+ // just insert everything. if this is ever wrong,
+ // compaction will clean out entries that don't belong. but
+ // merging by definition means you want to insert
+ // everything.
+
+ string data;
+ if (!datastore->Read(key.data(), key.length(), old_offset, data)) {
+ fprintf(stderr, "Error reading from my merging datastore.\n");
+ }
+
+ if (inRange) {
+ if (!other_db->Insert(key.data(), key.length(), data.data(), data.length())) {
+ fprintf(stderr, "Error inserting to other database.\n");
+ //return false;
+ }
+ // We no longer need to delete -- keys moved to the new
+ // database do not have to be touched here. During
+ // compaction, we do a key-range check: since these keys
+ // will be outside the range of the DB, we can delete
+ // them then.
+
+ } else if (merge) {
+ // must lock db to prevent concurrent insertion into "live datastore"?
+ if (!other_db->Insert(key.data(), key.length(), data.data(), data.length())) {
+ fprintf(stderr, "Error inserting to other database.\n");
+ }
+ }
+ }
+
+ // current_offset = ending_offset, but ending_offset might have been updated.
+ // let's write the hashtable and check again.
+ // we write the hashtable here so that we don't hold the lock while the hashtable write is going on
+ // otherwise we may hold the lock for this database for several seconds.
+
+ // Only checkpoint once to avoid writing the hashtable each loop
+ if (checkpointed == false) {
+ DPRINTF(DEBUG_STATUS, "writing to hashtable...");
+ if (!other_db->WriteHashtableToFile()) {
+ fprintf(stderr, "Could not write hashtable to file after split/merge\n");
+ return false;
+ }
+ checkpointed = true;
+ }
+
+ }
+ // return (expect return from split to release dblock after inserting)
+ return true;
+ }
+
+
+ /***************************************************/
+ /***************** CORE FUNCTIONS ******************/
+ /***************************************************/
+ template <typename T>
+ uint32_t FawnDS<T>::FindNextHashSize(uint32_t number)
+ {
+        // Round number up to the next power of 2 (returns number unchanged if it is already a power of 2)
+ number--;
+ number = (number >> 1) | number;
+ number = (number >> 2) | number;
+ number = (number >> 4) | number;
+ number = (number >> 8) | number;
+ number = (number >> 16) | number;
+ number++;
+ return number;
+ }
+
+ // Grab lowest order KEYFRAG bits to use as keyfragment
+ // Key should be at least 2 bytes long for this to work properly
+ template <typename T>
+ inline uint16_t FawnDS<T>::keyfragment_from_key(const char *key, uint32_t key_len) const {
+ if (key_len < sizeof(uint16_t)) {
+ return (uint16_t) (key[0] & KEYFRAGMASK);
+ }
+ return (uint16_t) ((key[key_len-2]<<8) + key[key_len-1]) & KEYFRAGMASK;
+ }
+
+ template <typename T>
+ int32_t FawnDS<T>::FindHashIndexInsert(const char* key, uint32_t key_len, bool* newObj)
+ {
+ int32_t hash_index = 0;
+ //uint32_t searchkey = searchkey_from_key(key);
+ //uint16_t hashedkey = hashkey_from_searchkey(searchkey);
+
+ //*hashkey = hashedkey; // Save computation
+
+ for (u_int hash_fn_index = 0; hash_fn_index < HASH_COUNT; hash_fn_index++) {
+ uint32_t indexkey = (*(Hashes::hashes[hash_fn_index]))(key, key_len);
+ uint32_t hash_index_start = indexkey & (header_->hashtable_size-1);
+ if (hash_fn_index == 2)
+ print_payload((const u_char*) &indexkey, sizeof(indexkey));
+
+ hash_index_start &= (~(PROBES_BEFORE_REHASH-1));
+
+ for (hash_index = hash_index_start;
+ (uint32_t)hash_index < hash_index_start + PROBES_BEFORE_REHASH;
+ ++hash_index) {
+
+ uint16_t vkey = verifykey(hash_index);
+ // Find first open spot or find the same key
+ if (!exists(hash_index)) {
+ *newObj = true; // This is a new object, not an updated object for an existing key.
+ return hash_index;
+ } else if (vkey == keyfragment_from_key(key, key_len)) {
+ off_t datapos = hash_table_[hash_index].offset;
+ DataHeader data_header;
+ string ckey;
+ if (datastore->ReadIntoHeader(datapos, data_header, ckey) &&
+ ckey.length() == key_len &&
+ memcmp(ckey.data(), key, key_len) == 0)
+ return hash_index;
+ }
+ }
+ }
+ return -1;
+ }
+
+ template <typename T>
+ int32_t FawnDS<T>::FindHashIndex(const char* key, uint32_t key_len, DataHeader* data_header) const
+ {
+ DataHeader dummy_header;
+ int32_t hash_index = 0;
+ //uint32_t searchkey = searchkey_from_key(key);
+ //uint16_t hashedkey = hashkey_from_searchkey(searchkey);
+ //searchkey = searchkey & INDEXMASK; // pick out Index bits
+
+ if (data_header == NULL)
+ data_header = &dummy_header;
+
+ for (u_int hash_fn_index = 0; hash_fn_index < HASH_COUNT; hash_fn_index++) {
+ uint32_t indexkey = (*(Hashes::hashes[hash_fn_index]))(key, key_len);
+ // calculate the starting index to probe. We linearly probe
+ // PROBES_BEFORE_REHASH locations before rehashing the key to get a new
+ // starting index.
+
+ // Mask lower order bits to find hash index
+ uint32_t hash_index_start = indexkey & (header_->hashtable_size-1);
+
+ hash_index_start &= (~(PROBES_BEFORE_REHASH-1));
+
+ for (hash_index = hash_index_start;
+ (uint32_t)hash_index < hash_index_start + PROBES_BEFORE_REHASH;
+ ++hash_index) {
+
+ uint16_t vkey = verifykey(hash_index);
+
+ // If empty entry, key could not be found.
+ if (!valid(hash_index)) {
+ return -1;
+ }
+ // Skip over deleted entries
+ else if (deleted(hash_index)) {
+ continue;
+ }
+ else if (vkey == keyfragment_from_key(key, key_len)) {
+ off_t datapos = hash_table_[hash_index].offset;
+ DataHeader data_header;
+ string ckey;
+ if (datastore->ReadIntoHeader(datapos, data_header, ckey) &&
+ ckey.length() == key_len &&
+ memcmp(ckey.data(), key, key_len) == 0)
+ return hash_index;
+ }
+ }
+
+ }
+
+ return -1;
+ }
+
+ template class FawnDS<FawnDS_Flash>;
+} // namespace fawn
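
A note on the lookup scheme implemented above: each of the HASH_COUNT hash functions yields one aligned window of PROBES_BEFORE_REHASH slots, so a key is probed in at most HASH_COUNT * PROBES_BEFORE_REHASH locations before Insert or Get gives up. Here is a standalone sketch of the window computation, assuming a power-of-two table size (which FindNextHashSize guarantees) and a hypothetical hash value:

    #include <cstdint>
    #include <cstdio>

    #define PROBES_BEFORE_REHASH 8  // as defined in fawnds/db_structures.h

    // First slot of the aligned probe window for one hash function's output.
    static uint32_t probe_window_start(uint32_t indexkey, uint64_t hashtable_size) {
        uint32_t start = indexkey & (hashtable_size - 1);  // mask low-order bits
        start &= ~(PROBES_BEFORE_REHASH - 1);              // align down to window
        return start;
    }

    int main() {
        uint32_t start = probe_window_start(0x9e3779b9u, 1024);  // hypothetical inputs
        for (uint32_t i = start; i < start + PROBES_BEFORE_REHASH; ++i)
            printf("probe slot %u\n", i);
        return 0;
    }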
168 fawnds/fawnds.h
@@ -0,0 +1,168 @@
+/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+#ifndef _FAWNDS_H_
+#define _FAWNDS_H_
+
+#include <sys/types.h>
+#include <string>
+#include <stdint.h>
+
+#include "hash_functions.h"
+#include "db_structures.h"
+#include "dbid.h"
+#include <pthread.h>
+
+#ifdef __APPLE__
+#define pread64 pread
+#define pwrite64 pwrite
+#endif // #ifdef __APPLE__
+
+#ifndef _LARGEFILE64_SOURCE
+#define _LARGEFILE64_SOURCE
+#endif // #ifndef _LARGEFILE64_SOURCE
+#define _FILE_OFFSET_BITS 64
+
+using namespace std;
+
+namespace fawn {
+
+ template <typename T>
+ class FawnDS {
+ public:
+ static FawnDS<T>* Create_FawnDS(const char* filename, uint64_t hash_table_size,
+ double max_deleted_ratio, double max_load_factor,
+ keyType kt = TEXT_KEYS)
+ __attribute__ ((warn_unused_result));
+
+ static FawnDS<T>* Open_FawnDS(const char* filename, keyType kt = TEXT_KEYS);
+
+ const DBID* getStartID() const;
+ const DBID* getEndID() const;
+ bool setStartID(const DBID& sid);
+ bool setEndID(const DBID& eid);
+
+ virtual ~FawnDS();
+
+ bool Insert(const char* key, uint32_t key_len, const char* data, uint32_t length)
+ __attribute__ ((warn_unused_result));
+ bool Delete(const char* key, uint32_t key_len) __attribute__ ((warn_unused_result));
+ bool Delete(const char* key, uint32_t key_len, bool writeLog) __attribute__ ((warn_unused_result));
+ bool Get(const char* key, uint32_t key_len, string &data) const
+ __attribute__ ((warn_unused_result));
+
+// bool Insert(const char* key, const char* data, uint32_t length)
+// __attribute__ ((warn_unused_result));
+// bool Delete(const char* key) __attribute__ ((warn_unused_result));
+// bool Delete(const char* key, bool writeLog) __attribute__ ((warn_unused_result));
+// bool Get(const char* key, string &data) const
+// __attribute__ ((warn_unused_result));
+
+ FawnDS<T>* Rewrite(pthread_rwlock_t* dbLock) __attribute__ ((warn_unused_result));
+ bool RenameAndClean(FawnDS<T>* old) __attribute__ ((warn_unused_result));
+ bool Merge(FawnDS<T>* other_db, pthread_rwlock_t* listLock) __attribute__ ((warn_unused_result));
+ bool Split(FawnDS<T>* other_db, pthread_rwlock_t* listLock)
+ __attribute__ ((warn_unused_result));
+
+ void split_init(const string& destinationIP);
+ bool split_next(const DBID* startKey, const DBID* endKey, char* ret_key, uint32_t& key_length, string& data, bool& valid, bool& remove);
+
+
+ bool SplitMergeHelper(FawnDS<T>* other_db, pthread_rwlock_t* listLock, bool merge)
+ __attribute__ ((warn_unused_result));
+
+ // Used to write header/hashtable sequentially for split/merge/rewrite
+ bool WriteHashtableToFile() __attribute__ ((warn_unused_result));
+
+ // Used to write header/hashtable sequentially for split/merge/rewrite
+ bool WriteOnlyHeaderToFile() __attribute__ ((warn_unused_result));
+
+ // Used to read header/hashtable sequentially when header is malloc'd
+ bool ReadHashtableFromFile() __attribute__ ((warn_unused_result));
+
+
+ //inline string getPrecopyIP() { return precopyIP; }
+
+ // per db lock
+ string precopyIP;
+
+ bool deleteFile();
+ private:
+ FawnDS<T>(const string& filename, keyType storeType);
+ inline uint16_t keyfragment_from_key(const char* key, uint32_t key_len) const;
+
+ // Locate the hashtable entry that this key occupies or would occupy. Methods
+ // that use this utility function can check if that entry exists or not by
+ // checking the valid flag. Returns -1 on failure.
+ int32_t FindHashIndexInsert(const char* key, uint32_t key_len, bool* newObj);
+ int32_t FindHashIndex(const char* key, uint32_t key_len, DataHeader* data_header) const;
+
+ // This function is used to populate the member variables of this object with
+ // a different file.
+ bool ReopenFawnDSFromFd();
+ bool ReopenFawnDSFromFilename();
+
+ // This function returns a FawnDS object that does not have its filename
+ // object variable set. This variable is important for functions such as
+ // Rewrite(), Merge() and Split() so that we can delete the original file and
+ // overwrite it with the new file. Therefore, it is the caller's
+ // responsibility to either fill in that field or to never use the Rewrite(),
+ // Merge() or Split() methods on the returned object.
+
+ // Helper function to malloc or mmap header/hashtable
+ static FawnDS<T>* Create_FawnDS_From_Fd(int fd,
+ const string& filename,
+ uint64_t hash_table_size,
+ uint64_t number_entries,
+ uint64_t deleted_entries,
+ double max_deleted_ratio,
+ double max_load_factor,
+ keyType kt)
+ __attribute__ ((warn_unused_result));
+
+ static uint32_t FindNextHashSize(uint32_t number);
+
+ // Incrementing keyfragbits above 15 requires
+ // more modifications to code (e.g. hashkey is 16 bits in (Insert())
+ static const uint32_t KEYFRAGBITS = 14;
+ static const uint32_t KEYFRAGMASK = (1 << KEYFRAGBITS) - 1;
+ static const uint32_t DELETEDBITMASK = (2 << KEYFRAGBITS);
+ static const uint32_t INDEXBITS = 16;
+ static const uint32_t INDEXMASK = (1 << INDEXBITS) - 1;
+ static const uint32_t VALIDBITMASK = KEYFRAGMASK+1;
+
+ // Check if highest bit is 1; means deleted.
+ inline bool deleted(int32_t index) const
+ {
+ return (hash_table_[index].present_key & DELETEDBITMASK);
+ }
+
+ inline bool valid(int32_t index) const
+ {
+ return (hash_table_[index].present_key & VALIDBITMASK);
+ }
+
+ // Used by Insert: check that entry is not deleted or empty
+ inline bool exists(int32_t index) const
+ {
+ return valid(index) && !deleted(index);
+ }
+
+ inline uint16_t verifykey(int32_t index) const
+ {
+ return (hash_table_[index].present_key & KEYFRAGMASK);
+ }
+
+ int disable_readahead();
+
+ int fd_;
+ struct DbHeader* header_;
+ struct HashEntry* hash_table_;
+ T* datastore;
+ string filename_;
+ off_t currSplit;
+
+ static const double EXCESS_BUCKET_FACTOR = 1.1;
+ };
+
+} // namespace fawn
+
+#endif // #ifndef _FAWNDS_H_
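
To illustrate how the pieces of this interface fit together, here is a minimal, hypothetical usage sketch; the store path and the max_deleted_ratio/max_load_factor values (0.9 and 0.8) are illustrative placeholders, not recommended settings:

    #include <cstdio>
    #include <string>
    #include "fawnds.h"
    #include "fawnds_flash.h"

    using namespace fawn;

    int main() {
        // Create a store with room for ~1024 hash entries.
        FawnDS<FawnDS_Flash>* db =
            FawnDS<FawnDS_Flash>::Create_FawnDS("/tmp/example_db", 1024,
                                                0.9, 0.8, TEXT_KEYS);
        if (db == NULL)
            return 1;

        const char key[] = "hello";  // keys should be at least 2 bytes long
        const char val[] = "world";
        if (!db->Insert(key, sizeof(key) - 1, val, sizeof(val) - 1))
            fprintf(stderr, "Insert failed\n");

        string data;
        if (db->Get(key, sizeof(key) - 1, data))
            printf("got: %s\n", data.c_str());

        // Persist the malloc'd header/hashtable before closing.
        if (!db->WriteHashtableToFile())
            fprintf(stderr, "could not write hashtable\n");

        delete db;
        return 0;
    }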
361 fawnds/fawnds_bench.cc
@@ -0,0 +1,361 @@
+/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+#include "fawnds.h"
+#include "fawnds_flash.h"
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <sstream>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include "print.h"
+#include "dbid.h"
+#include "hashutil.h"
+#include "timing.h"
+
+enum { SCAN_RANDOM, SCAN_SEQUENTIAL };
+
+using namespace std;
+using namespace fawn;
+using fawn::HashUtil;
+
+struct benchData {
+ FawnDS<FawnDS_Flash>** f;
+ u_int num_records;
+ u_int num_to_scan;
+ u_int offset;
+ u_int numThreads;
+};
+
+uint32_t readyCount = 0;
+pthread_mutex_t count_lock;
+vector<double> search_times;
+u_int max_record = 0;
+
+
+void
+usage()
+{
+ fprintf(stderr,
+ "fawnds_bench [-hfsmw] [-r num_scan] [-n num_records] [-p num_db] [-t threads] [-b \"dir1 dir2 dir3\"] [-a num_procs] <dbfile>\n"
+ );
+}
+
+void
+help()
+{
+ usage();
+    fprintf(stderr,
+            "   -h    help (this text)\n"
+            "   -f    create and fill the database(s)\n"
+            "   -s    sequential scan\n"
+            "   -r #  number of entries to randomly scan\n"
+            "   -n #  number of entries to fill or scan\n"
+            "   -c #  start a compacting Rewrite() after # writes\n"
+            "   -b    space-separated directories for multiple files\n"
+            "   -t #  use # threads (default: unthreaded)\n"
+            "   -a #  turn on thread affinity, specify number of processors\n"
+            "   -w    random write test\n"
+            "   -S #  set value size (bytes)\n"
+            );
+}
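+
+// Illustrative invocations (paths and sizes are arbitrary examples):
+//   create and fill 1M records across 4 per-thread stores:
+//     ./fawnds_bench -f -n 1000000 -t 4 /tmp/bench_db
+//   then issue 100000 random reads per thread against the same stores:
+//     ./fawnds_bench -r 100000 -n 1000000 -t 4 /tmp/bench_db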
+
+void *compactThread(void* p) {
+ FawnDS<FawnDS_Flash>** dbs = (FawnDS<FawnDS_Flash>**)p;
+ pthread_rwlock_t tempLock;
+ pthread_rwlock_init(&tempLock, NULL);
+ if (!dbs[0]->Rewrite(&tempLock)) {
+ perror("Rewrite failed!\n");
+ }
+ pthread_rwlock_destroy(&tempLock);
+ return NULL;
+}
+
+void *randomReadThread(void* p)
+{
+ benchData* bd = (benchData*)p;
+ FawnDS<FawnDS_Flash>** dbs = bd->f;
+ u_int num_records = bd->num_records;
+ u_int num_to_scan = bd->num_to_scan;
+ u_int offsetVal = bd->offset;
+ FawnDS<FawnDS_Flash> *mydb = dbs[offsetVal];
+
+ struct timeval tv_start, tv_end;
+ char *l = (char *)malloc(sizeof(char) * num_to_scan * sizeof(uint32_t));
+
+ for (u_int i = 0; i < num_to_scan; i++) {
+ u_int val = (offsetVal * num_records) + (rand()%num_records);
+ if (val < max_record) {
+ string ps_key((const char *)&val, sizeof(val));
+ uint32_t key_id = HashUtil::BobHash(ps_key);
+ DBID key((char *)&key_id, sizeof(u_int32_t));
+ memcpy(l + (i * sizeof(uint32_t)), key.data(), key.get_actual_size());
+ }
+ else {
+ i--;
+ }
+ }
+
+ pthread_mutex_lock(&count_lock);
+ readyCount++;
+ pthread_mutex_unlock(&count_lock);
+ struct timespec req;
+ req.tv_sec = 0;
+ req.tv_nsec = 20000;
+
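+    // Spin until every thread has built its key list so the timed region
+    // below measures only Get() traffic; readyCount is advanced under
+    // count_lock, and the unlocked read keeps this barrier cheap.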
+ while (readyCount < bd->numThreads) {
+ nanosleep(&req, NULL);
+ }
+ gettimeofday(&tv_start, NULL);
+ string data;
+ const char *key = l;
+ for (u_int i = 0; i < num_to_scan; ++i) {
+        if (!mydb->Get(key, sizeof(uint32_t), data)) {
+            fprintf(stderr, "Get failed.\n");
+        }
+ key += sizeof(uint32_t);
+ }
+
+ gettimeofday(&tv_end, NULL);
+ double dbsearch_time = timeval_diff(&tv_start, &tv_end);
+ //printf("Time to search DB: %f seconds\n", dbsearch_time);
+ //printf("Query rate: %f\n", ((double)num_to_scan / dbsearch_time) );
+ free(l);
+
+ pthread_mutex_lock(&count_lock);
+ search_times.push_back(dbsearch_time);
+ pthread_mutex_unlock(&count_lock);
+ return NULL;
+}
+
+void bench(int argc, char** argv) {
+ extern char *optarg;
+ extern int optind;
+
+ int ch;
+ char* dbname = NULL;
+ vector<string> fileBases;
+
+ u_int num_to_scan = 0;
+ int mode = SCAN_RANDOM;
+ bool createdb = false;
+ int writeTest = 0;
+ int compactAt = -1;
+ int numThreads = 1;
+ bool useThreads = false;
+ bool setAffinity = false;
+ int numProcs = 1;
+ int valuesize = 1024;
+
+ pthread_t compactThreadId_;
+ while ((ch = getopt(argc, argv, "hfn:r:p:swc:t:b:a:S:")) != -1)
+ switch (ch) {
+ case 'n':
+ max_record = atoi(optarg);
+ break;
+ case 'r':
+ num_to_scan = atoi(optarg);
+ break;
+ case 'f':
+ createdb = true;
+ break;
+ case 's':
+ mode = SCAN_SEQUENTIAL;
+ break;
+ case 'w':
+ writeTest = 1;
+ break;
+ case 'c':
+ compactAt = atoi(optarg);
+ break;
+ case 't':
+ useThreads = true;
+ numThreads = atoi(optarg);
+ break;
+ case 'b':
+ tokenize(optarg, fileBases, " ");
+ break;
+ case 'S':
+ valuesize = atoi(optarg);
+ break;
+ case 'a':
+ setAffinity = true;
+ numProcs = atoi(optarg);
+ break;
+ case 'h':
+ help();
+ exit(0);
+ default:
+ usage();
+ exit(-1);
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (fileBases.size() == 0 && argc != 1) {
+ usage();
+ exit(-1);
+ }
+
+ if (fileBases.size() == 0)
+ dbname = argv[0];
+
+ struct timeval tv_start, tv_end;
+ gettimeofday(&tv_start, NULL);
+
+ string value(valuesize, 'a');
+
+ FawnDS<FawnDS_Flash> **dbs = (FawnDS<FawnDS_Flash>**)malloc(numThreads * sizeof(FawnDS<FawnDS_Flash>*));
+
+    // Partition records evenly across the per-thread stores, rounding the
+    // per-store bucket size up when max_record does not divide evenly.
+    int num_recs_per_db = (int) (max_record / numThreads);
+
+    int bucket = num_recs_per_db;
+    if (max_record % numThreads != 0)
+        bucket += 1;
+
+ pthread_t* workerThreadIds_ = (pthread_t*) malloc (numThreads * sizeof(pthread_t));
+ benchData* bd = (benchData*) malloc (numThreads * sizeof(benchData));
+
+ u_int fileBaseOffset = 0;
+ for (int i = 0; i < numThreads; i++) {
+ ostringstream dbname_i;
+ if (fileBases.size() > 0) {
+ dbname_i << fileBases[fileBaseOffset] << "_" << i;
+ fileBaseOffset++;
+ if (fileBaseOffset == fileBases.size())
+ fileBaseOffset = 0;
+
+ } else {
+ dbname_i << dbname << "_" << i;
+ }
+ if (createdb) {
+            dbs[i] = FawnDS<FawnDS_Flash>::Create_FawnDS(dbname_i.str().c_str(),
+                                                         num_recs_per_db * 2, .9, .8);
+ } else {
+ printf("reading file %s\n", dbname_i.str().c_str());
+ dbs[i] = FawnDS<FawnDS_Flash>::Open_FawnDS(dbname_i.str().c_str());
+ }
+ }
+
+ if (createdb && !writeTest) {
+ // Fill it sequentially if we're not testing writing
+ for (u_int i = 0; i < max_record; ++i) {
+ int num = i;
+ string ps_key((const char *)&num, sizeof(num));
+ u_int32_t key_id = HashUtil::BobHash(ps_key);
+ DBID key((char *)&key_id, sizeof(u_int32_t));
+
+ int dbi = (int)(i / bucket);
+
+            if (!dbs[dbi]->Insert(key.data(), key.get_actual_size(), value.data(), valuesize)) {
+                fprintf(stderr, "Insert failed\n");
+            }
+ }
+
+ // this is required since we're not splitting/merging/rewriting initially
+ for (int i = 0; i < numThreads; i++) {
+ if (!dbs[i]->WriteHashtableToFile()) {
+ perror("Could not write hashtable.\n");
+ }
+ }
+ }
+ gettimeofday(&tv_end, NULL);
+
+ if (createdb) {
+ double dbcreate_time = timeval_diff(&tv_start, &tv_end);
+ printf("Time to create DB: %f seconds\n", dbcreate_time);
+ }
+
+ srand((tv_end.tv_sec << 2) + tv_end.tv_usec);
+
+    // The write test below inserts a randomly ordered set of test elements
+
+ if (writeTest) {
+ vector<int> l;
+ for (u_int i = 0; i < num_to_scan; i++) {
+ l.push_back(rand()%max_record);
+ }
+ int n = l.size();
+ for (int i = 0; i < n; i++) {
+ u_int val = l[i];
+ string ps_key((const char *)&val, sizeof(val));
+ u_int32_t key_id = HashUtil::BobHash(ps_key);
+ DBID key((char *)&key_id, sizeof(u_int32_t));
+
+ if (i == compactAt) {
+ cout << "Compacting..." << endl;
+ pthread_create(&compactThreadId_, NULL,
+ compactThread, dbs);
+
+ }
+
+            // Route the write to the store that owns record 'val' (same
+            // partitioning as the fill loop, and stays within dbs[] bounds)
+            int dbi = (int)(val / bucket);
+            if (!dbs[dbi]->Insert(key.data(), key.get_actual_size(), value.data(), valuesize)) {
+                fprintf(stderr, "Insert failed\n");
+            }
+ }
+
+ } else {
+ pthread_mutex_init(&count_lock, NULL);
+ for (int i = 0; i < numThreads; i++) {
+ bd[i].f = dbs;
+ bd[i].num_to_scan = num_to_scan;
+ bd[i].num_records = bucket;
+ bd[i].offset = i;
+ bd[i].numThreads = numThreads;
+            if (useThreads) {
+                pthread_attr_t attr;
+                pthread_attr_init(&attr);
+#ifdef CPU_ZERO /* GNU/Linux-only; cpu_set_t itself is a type, not a macro */
+                if (setAffinity) {
+                    cpu_set_t cpuset;
+                    CPU_ZERO(&cpuset);
+                    CPU_SET(i % numProcs, &cpuset);
+                    pthread_attr_setaffinity_np(&attr, sizeof(cpuset), &cpuset);
+                }
+#endif
+                pthread_create(&workerThreadIds_[i], &attr,
+                               randomReadThread, &bd[i]);
+                pthread_attr_destroy(&attr);
+            } else {
+                randomReadThread(&bd[i]);
+            }
+ }
+ }
+
+ if (useThreads) {
+ for (int i = 0; i < numThreads; i++) {
+ pthread_join(workerThreadIds_[i], NULL);
+ }
+ }
+
+ if (compactAt != -1) {
+ pthread_join(compactThreadId_, NULL);
+ }
+ pthread_mutex_destroy(&count_lock);
+ free(workerThreadIds_);
+ free(bd);
+
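+    // Wall-clock completion is bounded by the slowest thread, so the
+    // aggregate rate divides total queries by the maximum per-thread time.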
+ double totalTime = 0;
+ for (int i = 0; i < numThreads; i++) {
+ totalTime = max(totalTime, search_times[i]);
+ }
+ double totalQueries = num_to_scan * numThreads;
+
+ cout << "Aggregate Query Rate: " << totalQueries/totalTime << " queries per second" << endl;
+}
+
+
+int main(int argc, char** argv) {
+ bench(argc, argv);
+ return 0;
+}
176 fawnds/fawnds_flash.cc
@@ -0,0 +1,176 @@
+/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <time.h>
+#include <sys/time.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <string>
+#include <assert.h>
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+#include <inttypes.h>
+
+#include "hash_functions.h"
+#include "fawnds_flash.h"
+#include "debug.h"
+#include "hashutil.h"
+#include "print.h"
+#include "timing.h"
+
+using fawn::DataHeader;
+using fawn::Hashes;
+using fawn::HashUtil;
+
+#ifndef O_NOATIME
+#define O_NOATIME 0 /* O_NOATIME is linux-only */
+#endif
+
+namespace fawn {
+ /***************************************************/
+ /****************** DB FUNCTIONS *******************/
+ /***************************************************/
+
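+    // On-disk log-entry layout, as produced by Write() below and parsed
+    // back by ReadIntoHeader()/Read():
+    //
+    //   [struct DataHeader][key: key_length bytes][data: data_length bytes]
+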
+ bool FawnDS_Flash::Write(const char* key, uint32_t key_len, const char* data, uint32_t length, off_t offset)
+ {
+ struct DataHeader data_header;
+ data_header.data_length = length;
+ data_header.key_length = key_len;
+ data_header.deleteLog = false;
+
+ struct iovec iov[3];
+ iov[0].iov_base = &data_header;
+ iov[0].iov_len = sizeof(struct DataHeader);
+ iov[1].iov_base = const_cast<char *>(key);
+ iov[1].iov_len = key_len;
+ iov[2].iov_base = const_cast<char *>(data);
+ iov[2].iov_len = length;
+
+        if (lseek(fd_, offset, SEEK_SET) != offset) {
+            fprintf(stderr, "Could not seek to offset %" PRIu64 ": %s\n",
+                    (uint64_t)offset, strerror(errno));
+            return false;
+        }
+
+ if (writev(fd_, iov, 3) != (ssize_t) (sizeof(struct DataHeader) + key_len + length)) {
+ fprintf(stderr, "Could not write iovec structure: %s\n", strerror(errno));
+ return false;
+ }
+
+ return true;
+ }
+
+ bool FawnDS_Flash::Delete(const char* key, uint32_t key_len, off_t offset)
+ {
+ /********* DELETE LOG **********/
+ // Just needs the key and the fact that it's deleted to be appended.
+ struct DataHeader delete_header;
+ delete_header.data_length = 0;
+ delete_header.key_length = key_len;
+ delete_header.deleteLog = true;
+
+        if ((uint64_t)pwrite64(fd_, &delete_header, sizeof(struct DataHeader),
+                               offset) != sizeof(struct DataHeader)) {
+            fprintf(stderr, "Could not write delete header at position %" PRIu64 ": %s\n",
+                    (uint64_t)offset, strerror(errno));
+            return false;
+        }
+
+        if ((uint64_t)pwrite64(fd_, key, key_len,
+                               offset + sizeof(struct DataHeader)) != key_len) {
+            fprintf(stderr, "Could not write deleted key at position %" PRIu64 ": %s\n",
+                    (uint64_t)offset + sizeof(struct DataHeader), strerror(errno));
+            return false;
+        }
+
+ return true;
+ }
+
+ bool FawnDS_Flash::ReadIntoHeader(off_t offset, DataHeader &data_header, string &key)
+ {
+ ssize_t n_read = pread64(fd_, &data_header, sizeof(struct DataHeader), offset);
+ if (n_read < (ssize_t)sizeof(struct DataHeader)) {
+ fprintf(stderr, "Read %lu bytes from DataHeader, expected %lu\n", n_read, sizeof(struct DataHeader));
+ return false;
+ }
+ uint32_t key_len = data_header.key_length;
+ char *mdata = (char *)malloc(key_len);
+
+ n_read = pread64(fd_, mdata, key_len, offset + sizeof(struct DataHeader));
+        if (n_read < (ssize_t)key_len) {
+            fprintf(stderr, "Read %zd bytes of key, expected %u\n", n_read, key_len);
+            free(mdata);   // don't leak the key buffer on a short read
+            return false;
+        }
+ key.assign(mdata, key_len);
+ free(mdata);
+ return true;
+ }
+
+ bool FawnDS_Flash::Read(const char* key,
+ uint32_t key_len,
+ off_t offset,
+ string &data)
+ {
+ DataHeader data_header;
+ string inputkey;
+
+ if (!ReadIntoHeader(offset, data_header, inputkey)) {
+ return false;
+ }
+
+        // Hashing on a key fragment can produce collisions, so we compare
+        // the stored key against the full lookup key to detect them.
+        if (inputkey.length() != key_len ||
+            memcmp(inputkey.data(), key, key_len) != 0) {
+            return false;
+        }
+
+ size_t length = data_header.data_length;
+ if (length == 0) {
+ return true;
+ }
+
+#if 0
+ // Readahead code -- n_read was removed because it was put into ReadIntoHeader
+ // For readahead, you have to re-introduce that code into this function
+ if (length < (n_read - sizeof(struct DataHeader))) {
+ //printf("GDOEX skipped pread\n");
+ data.assign(data_header.partial_data, length);
+ } else
+#endif
+ {
+ char *mdata = (char *)malloc(length);
+ //printf("GDOEX pread64: %x (%d)\n", datapos + sizeof(DataHeader), length);
+            if ((uint64_t)pread64(fd_, mdata, length,
+                                  offset + key_len + sizeof(struct DataHeader)) != length) {
+                fprintf(stderr, "Could not read data at position %" PRIu64 ": %s\n",
+                        (uint64_t)(offset + key_len + sizeof(struct DataHeader)),
+                        strerror(errno));
+                free(mdata);
+                return false;
+            }
+ data.assign(mdata, length);
+ free(mdata);
+ /* SPEED note: May be worth some day eliminating the redundant
+ * data copy in this by figuring out how to read directly into the
+ * string's buffer. Only matters for values > 2k where we
+ * can't do readahead. */
+ /* Better speed note: Fix them *both*. Don't use a string, use
+ * our own thing, and do all of the preads() directly into it.
+ * Treat it like an mbuf/skbuf, and be able to advance the
+ * "start" pointer into the buffer so we never have to copy
+ * regardless of how we get the data. */
+ }
+ return true;
+ }
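+
+    // Minimal round-trip sketch (hypothetical standalone use -- within
+    // FawnDS<T> the store owns the file descriptor and derives offsets
+    // from its append log; the FawnDS_Flash(fd) constructor shown here
+    // is assumed, not part of this diff):
+    //
+    //   FawnDS_Flash ds(fd);                        // fd: open data file
+    //   ds.Write(key, key_len, val, val_len, off);  // append record at off
+    //   string out;
+    //   bool hit = ds.Read(key, key_len, off, out); // out == val on hit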
+
+} // namespace fawn
38 fawnds/fawnds_flash.h
@@ -0,0 +1,38 @@
+/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+#ifndef _FAWNDS_FLASH_H_
+#define _FAWNDS_FLASH_H_
+
+// Feature-test macros must be defined before any system header is included
+// for them to take effect.
+#ifndef _LARGEFILE64_SOURCE
+#define _LARGEFILE64_SOURCE
+#endif // #ifndef _LARGEFILE64_SOURCE
+#define _FILE_OFFSET_BITS 64
+
+#include <sys/types.h>
+#include <string>
+#include <stdint.h>
+#include "db_structures.h"
+
+#ifdef __APPLE__
+#define pread64 pread
+#define pwrite64 pwrite
+#endif // #ifdef __APPLE__
+
+using namespace std;