wiseio · deads · Jun 12, 2016 · Jun 10, 2016 · Jun 11, 2016 · Jun 11, 2016
diff --git a/README.md b/README.md
@@ -3,21 +3,41 @@ ParaText
 
 ParaText is a C++ library to read text files in parallel on multi-core
 machines. The alpha release includes a CSV reader and Python bindings.
+The library itself has no dependencies other than the standard library.
 
-ParaText has the following dependencies for its Python bindings:
+Dependencies
+------------
+ParaText has the following dependencies:
 
-   - a C++ compiler that is C++11 compliant (gcc 4.8 or above, clang 3.4 or above)
-   - SWIG 2.0.11 or above
-   - Python (2.7 or above)
+   - a fully C++11-compliant C++ compiler (gcc 4.8 or above, clang 3.4 or above)
+   - SWIG 2.0.7 or above (Python 2 bindings)
+   - SWIG 3.0.8 or above (Python 3 bindings)
+   - Python 2.7 or 3.5
    - setuptools
+   - numpy
 
-Though Pandas is optional, it must be installed to use ParaText to
-read CSV files into Pandas.
+Pandas is required only if using ParaText to read CSV files into
+Pandas. The SWIG available from Ubuntu 14.04 does not work with Python 3.
 
-The ParaText library can be built with two commands:
+Anaconda packages the latest version of SWIG that works properly
+with Python 3. You can install it as follows:
+
+```
+conda install swig
+```
+
+Building the Python Bindings
+----------------------------
+
+First, go into the `python` directory:
 
 ```
    cd python/
+```
+
+Then run `setup.py`:
+
+```
    python setup.py build install
 ```
 
@@ -127,7 +147,7 @@ ANOREXIA array([0, 0, 0, 0, 0], dtype=uint8) ['no' 'yes' 'nan']
 ```
 
 All categorical columns in this data set have 3 or fewer levels so
-they are all `uint8_t`. A string representation uses at least 8 times
+they are all `uint8`. A string representation uses at least 8 times
 as much space, but it can also be less computationally efficient. An
 integer representation is ideal for learning on categorical columns.
 Integer comparisons over contiguous integer buffers are pretty cheap
@@ -158,49 +178,51 @@ Column Types Supported
 
 Wise ParaText supports three kinds of columns:
 
-    - numeric: for numeric data.
-    - categorical: for categorical data.
-    - text: for large strings like e-mails and text documents.
+    * numeric: for numeric data.
+
+    * categorical: for categorical data.
+
+    * text: for large strings like e-mails and text documents.
 
 In the library, we distinguish between semantics and data type. The
 semantics defines how to interpret a column. The data type (`uint8`,
-`int64`, `float`, etc.) defines how its encoded.
+`int64`, `float`, etc.) defines how the column values are encoded.
 
 Parameters
 ----------
 
-Most CSV loading functions in ParaText have the following
+Most CSV loading functions in ParaText have the following parameters:
 
-    - `cat_names`: A list of column names to force as categorical regardless
+    * `cat_names`: A list of column names to force as categorical regardless
     of the inferred type.
 
-    - `text_names`: A list of column names that should be treated as rich text
+    * `text_names`: A list of column names that should be treated as rich text
     regardless of its inferred type.
 
-    - `num_names`: A list of column names that should be treated as
+    * `num_names`: A list of column names that should be treated as
     numeric regardless of its inferred type.
 
-    - `num_threads`:  The number of parser threads to spawn. The default
+    * `num_threads`:  The number of parser threads to spawn. The default
     is the number of cores.
 
-    - `allow_quoted_newlines`:  Allows multi-line text fields. This
+    * `allow_quoted_newlines`:  Allows multi-line text fields. This
     is turned off by default.
 
-    - `no_header`: Do not auto-detect the presence of a header. Assume
+    * `no_header`: Do not auto-detect the presence of a header. Assume
     the first line is data. This is turned off by default.
 
-    - `max_level_name_length`: If a field's length exceeds this value,
+    * `max_level_name_length`: If a field's length exceeds this value,
     the entire column is treated as text rather than
     categorical. The default is unlimited.
 
-    - `max_levels`: The maximum number of levels of a categorical column.
-    (default=max integer)
+    * `max_levels`: The maximum number of levels of a categorical column.
+    The default is unlimited.
 
-    - `number_only`: Whether it can be safely assumed the columns only
-    contain numbers. This is turned off by default.
+    * `number_only`: Whether it can be safely assumed the columns only
+    contain numbers. This is turned off by default.
 
-    - `block_size`: The number of bytes to read at a time in each worker.
-    (default=32768)
+    * `block_size`: The number of bytes to read at a time in each worker
+    thread. The default is 32768 bytes.
 
 Other Notes
 -----------
@@ -215,4 +237,3 @@ column.  Only the interpretation of a column (numeric, categorical, or
 text) can be forced.
 
 3. DateTime support will be added in a future release.
-
diff --git a/python/paratext/core.py b/python/paratext/core.py
@@ -28,6 +28,8 @@
 
 import paratext_internal as pti
 
+from six.moves import range
+
 import random
 import numpy as np
 import string
@@ -189,7 +191,7 @@ def internal_csv_loader_transfer(loader, forget=True, expand=False):
          same string object to save space.
 
     """
-    for i in xrange(0, loader.get_num_columns()):
+    for i in range(loader.get_num_columns()):
         col = loader.get_column(i)
         info = loader.get_column_info(i)
         semantics = 'num'
@@ -372,7 +374,7 @@ def baseline_average_columns(filename, type_check=False, *args, **kwargs):
     params = _get_params(*args, **kwargs)
     summer = pti.ParseAndSum();
     summer.load(filename, params, type_check)
-    d = {summer.get_column_name(i): summer.get_avg(i) for i in xrange(0, summer.get_num_columns())}
+    d = {summer.get_column_name(i): summer.get_avg(i) for i in range(summer.get_num_columns())}
     return d
 
 

diff --git a/python/setup.py b/python/setup.py
@@ -2,18 +2,22 @@
 import sys, os, os.path, string, subprocess
 import json
 
+# First, check for the presence of swig, which we will need to build
+# the Python bindings.
 p = subprocess.Popen(["which", "swig"])
 p.communicate("")
 if p.returncode != 0:
     print("Error: you must install SWIG first.")
     sys.exit(1)
 
+# The multi-threaded reader will core dump unless -pthread is given.
 extra_link_args = []
 extra_compile_args = ["-std=c++11", "-Wall", "-Wextra", "-pthread"]
 extra_libraries = []
 
 if sys.platform == 'darwin':
     extra_compile_args += ["-m64", "-D_REENTRANT"]
+    extra_link_args += []
     extra_libraries += []
 elif sys.platform.startswith("linux"):
     extra_compile_args += []
@@ -37,11 +41,11 @@
 
 init_py = open("paratext/__init__.py", "w")
 
-init_py.write("""#!/usr/bin/python
+init_py.write("""
 __all__ = ['paratext']
 
-import core, helpers
-from core import *
+from paratext.core import *
+from paratext.helpers import *
 
 import paratext_internal
 import warnings
@@ -54,18 +58,23 @@
 
 print(version)
 
-swig_cmd = ["swig", "-c++", "-python", "-I../src/", "-outdir", "./", "../src/paratext_internal.i"]
+swig_cmd = ["swig", "-c++", "-python"]
+
+if sys.version_info >= (3,):
+    swig_cmd += ["-py3"]
+
+swig_cmd += ["-I../src/", "-outdir", "./", "../src/paratext_internal.i"]
 
 print("running swig: ", swig_cmd)
 p = subprocess.Popen(swig_cmd)
 p.communicate("")
 if p.returncode != 0:
-    print("Error: building")
+    print("Error generating SWIG wrappers.")
     sys.exit(1)
 
 setup(name='paratext',
       version=version,
-      description='Reads text files in parallel. The first release includes a paralell CSV reader.',
+      description='Reads text files in parallel. The first release includes a parallel CSV reader.',
       long_description="""
 See README
 """,
@@ -80,7 +89,7 @@
       py_modules=["paratext_internal"],
       author="Damian Eads",
       author_email="damian@wise.io",
-      license="GNU General Public License",
+      license="Apache License",
       packages = ['paratext'],
       url = 'http://wise.io',
       include_package_data = True,

diff --git a/src/python/numpy_helper.hpp b/src/python/numpy_helper.hpp
@@ -60,6 +60,14 @@ template <> struct numpy_type<std::string>  { static const long id = NPY_OBJECT;
 template <> struct numpy_type<unsigned long>  { static const long id = NPY_ULONG; };
 #endif
 
+inline PyObject *as_python_string(const std::string &in) {  // Builds a Python string object from the bytes of `in`; the C API call is chosen at compile time.
+#if PY_MAJOR_VERSION < 3  // Python 2 build
+  return PyString_FromStringAndSize(in.c_str(), in.size());  // explicit length is passed, so the size comes from in.size() rather than a NUL scan
+#else  // Python 3 build
+  return PyUnicode_FromStringAndSize(in.c_str(), in.size());  // NOTE(review): CPython documents this as decoding UTF-8 and returning NULL on failure -- confirm callers handle a NULL result
+#endif
+}
+
 template <class Container, class Enable=void>
 struct build_array_impl {};
 
@@ -98,10 +106,9 @@ struct build_array_impl<Container, typename std::enable_if<std::is_same<typename
     try {
       for (size_t i = 0; i < container.size(); i++) {
         PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i);
-        PyObject *newobj = PyString_FromStringAndSize(container[i].c_str(), container[i].size());
+        PyObject *newobj = as_python_string(container[i]);
         Py_XDECREF(*ref);
         *ref = newobj;
-        //Py_XINCREF(*ref);
       }
     }
     catch (...) {
@@ -160,10 +167,9 @@ struct build_array_from_range_impl<Iterator, typename std::enable_if<std::is_sam
       size_t i = 0;
       for (Iterator it = range.first; it != range.second; it++, i++) {
         PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i);
-        PyObject *newobj = PyString_FromStringAndSize((*it).c_str(), (*it).size());
+        PyObject *newobj = as_python_string(*it);
         Py_XDECREF(*ref);
         *ref = newobj;
-        //Py_XINCREF(*ref);
       }
     }
     catch (...) {
@@ -219,11 +225,10 @@ struct string_array_output_iterator  : public std::iterator<std::forward_iterato
   string_array_output_iterator(PyArrayObject *array) : i(0), array(array) {}
 
   inline string_array_output_iterator &operator++() {
-    PyObject *s = PyString_FromStringAndSize(output.c_str(), output.size());
+    PyObject *s = as_python_string(output);
     PyObject **ref = (PyObject **)PyArray_GETPTR1((PyArrayObject*)array, i);
     Py_XDECREF(*ref);
     *ref = s;
-    //Py_XINCREF(*ref);
     i++;
     return *this;
   }
@@ -236,7 +241,7 @@ struct string_array_output_iterator  : public std::iterator<std::forward_iterato
     return output;
   }
 
-  int i;
+  long i;
   std::string output;
   PyArrayObject *array;
 };