Skip to content

Commit

Permalink
Merge pull request #486 from onyxfish/agate
Browse files Browse the repository at this point in the history
Agate integration
  • Loading branch information
James McKinney committed Jan 23, 2016
2 parents 9deaede + d8ed031 commit 3f0b6be
Show file tree
Hide file tree
Showing 49 changed files with 306 additions and 1,123 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
language: python
python:
- "2.6"
- "2.7"
- "pypy"
- "3.3"
- "3.4"
- "3.5"
install:
- if [[ $TRAVIS_PYTHON_VERSION == "2.6" ]]; then pip install -r requirements-py2.txt --use-mirrors --allow-external argparse; fi
- if [[ $TRAVIS_PYTHON_VERSION == "2.7" ]]; then pip install -r requirements-py2.txt; fi
- if [[ $TRAVIS_PYTHON_VERSION == "pypy" ]]; then pip install -r requirements-py2.txt; fi
- if [[ $TRAVIS_PYTHON_VERSION == "3.3" ]]; then pip install -r requirements-py3.txt; fi
- if [[ $TRAVIS_PYTHON_VERSION == "3.4" ]]; then pip install -r requirements-py3.txt; fi
- if [[ $TRAVIS_PYTHON_VERSION == "3.5" ]]; then pip install -r requirements-py3.txt; fi
script: nosetests
12 changes: 11 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
0.9.2
1.0.0
-----

This is a major release of csvkit. The entire backend has been rewritten to leverage the agate data analysis library, rather than bespoke implementations. In addition to the specific changes enumerated below there may be small changes to the way the output of the various tools is formatted. For example:

* If `--no-header-row` is set, the output will have column names A, B, C, etc. instead of column1, column2, column3, etc.

If you have built data workflows on top of csvkit you should not upgrade without thorough testing.

* in2csv DBF conversion now works with Python 3.
* "import csvkit as csv" will now defer to agate readers/writers.
* CSVKitReader, CSVKitWriter, CSVKitDictReader, and CSVKitDictWriter have been removed. Use agate.reader, agate.writer, agate.DictReader and agate.DictWriter.
* in2csv "csv itself" conversions now use agate.Table.
* in2csv now correctly guesses format when file has an uppercase extension.

0.9.1
Expand Down
40 changes: 13 additions & 27 deletions csvkit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,20 @@
#!/usr/bin/env python

"""
This module contains csvkit's superpowered replacement for the builtin :mod:`csv` module. For Python 2 users, the greatest improvement over the standard library is full unicode support. Python 3's :mod:`csv` module supports unicode internally, so this module is provided primarily for compatibility purposes.
This module contains csvkit's superpowered alternative to the standard Python
CSV reader and writer. It can be used as a drop-in replacement for the standard
module.
* Python 2: :mod:`csvkit.py2`.
* Python 3: :mod:`csvkit.py3`.
"""

import six
.. warning::
if six.PY2:
from csvkit import py2

CSVKitReader = py2.CSVKitReader
CSVKitWriter = py2.CSVKitWriter
CSVKitDictReader = py2.CSVKitDictReader
CSVKitDictWriter = py2.CSVKitDictWriter
reader = py2.reader
writer = py2.writer
DictReader = py2.CSVKitDictReader
DictWriter = py2.CSVKitDictWriter
else:
from csvkit import py3
Since version 1.0 csvkit relies on `agate <http://agate.rtfd.org>`_'s
CSV reader and writer. This module is supported for legacy purposes only and you
should migrate to using agate.
"""

CSVKitReader = py3.CSVKitReader
CSVKitWriter = py3.CSVKitWriter
CSVKitDictReader = py3.CSVKitDictReader
CSVKitDictWriter = py3.CSVKitDictWriter
reader = py3.reader
writer = py3.writer
DictReader = py3.CSVKitDictReader
DictWriter = py3.CSVKitDictWriter
import agate

reader = agate.reader
writer = agate.writer
DictReader = agate.DictReader
DictWriter = agate.DictWriter
31 changes: 15 additions & 16 deletions csvkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
import os.path
import sys

import agate
import six

from csvkit import CSVKitReader
from csvkit.exceptions import ColumnIdentifierError, RequiredHeaderError

def lazy_opener(fn):
Expand Down Expand Up @@ -117,7 +117,7 @@ def _init_common_parser(self):
"""
Prepare a base argparse argument parser so that flags are consistent across different shell command tools.
If you want to constrain which common args are present, you can pass a string for 'omitflags'. Any argument
whose single-letter form is contained in 'omitflags' will be left out of the configured parser. Use 'f' for
whose single-letter form is contained in 'omitflags' will be left out of the configured parser. Use 'f' for
file.
"""
self.argparser = argparse.ArgumentParser(description=self.description, epilog=self.epilog)
Expand Down Expand Up @@ -169,7 +169,7 @@ def _init_common_parser(self):
if 'zero' not in self.override_flags:
self.argparser.add_argument('--zero', dest='zero_based', action='store_true',
help='When interpreting or displaying column numbers, use zero-based numbering instead of the default 1-based numbering.')

def _open_input_file(self, path):
"""
Open the input file specified on the command line.
Expand Down Expand Up @@ -278,7 +278,7 @@ def print_column_names(self):
except:
zero_based=False

rows = CSVKitReader(f, **self.reader_kwargs)
rows = agate.reader(f, **self.reader_kwargs)
column_names = next(rows)

for i, c in enumerate(column_names):
Expand Down Expand Up @@ -317,15 +317,15 @@ def match_column_identifier(column_names, c, zero_based=False):
def parse_column_identifiers(ids, column_names, zero_based=False, excluded_columns=None):
"""
Parse a comma-separated list of column indices AND/OR names into a list of integer indices.
Ranges of integers can be specified with two integers separated by a '-' or ':' character. Ranges of
Ranges of integers can be specified with two integers separated by a '-' or ':' character. Ranges of
non-integers (e.g. column names) are not supported.
Note: Column indices are 1-based.
Note: Column indices are 1-based.
"""
columns = []

# If not specified, start with all columns
# If not specified, start with all columns
if not ids:
columns = range(len(column_names))
columns = range(len(column_names))

if columns and not excluded_columns:
return columns
Expand All @@ -343,7 +343,7 @@ def parse_column_identifiers(ids, column_names, zero_based=False, excluded_colum
a,b = c.split('-',1)
else:
raise

try:
if a:
a = int(a)
Expand All @@ -353,15 +353,15 @@ def parse_column_identifiers(ids, column_names, zero_based=False, excluded_colum
b = int(b) + 1
else:
b = len(column_names) + 1

except ValueError:
raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.")

for x in range(a,b):
columns.append(match_column_identifier(column_names, x, zero_based))

excludes = []

if excluded_columns:
for c in excluded_columns.split(','):
c = c.strip()
Expand All @@ -375,7 +375,7 @@ def parse_column_identifiers(ids, column_names, zero_based=False, excluded_colum
a,b = c.split('-',1)
else:
raise

try:
if a:
a = int(a)
Expand All @@ -385,12 +385,11 @@ def parse_column_identifiers(ids, column_names, zero_based=False, excluded_colum
b = int(b) + 1
else:
b = len(column_names)

except ValueError:
raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.")

for x in range(a,b):
excludes.append(match_column_identifier(column_names, x, zero_based))

return [c for c in columns if c not in excludes]

14 changes: 7 additions & 7 deletions csvkit/convert/csvitself.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@

import six

from csvkit import table
import agate

def csv2csv(f, **kwargs):
"""
"Convert" a CSV into a new CSV by normalizing types and correcting for other anomalies.
"""
tab = table.Table.from_csv(f, **kwargs)
table = agate.Table.from_csv(f, **kwargs)

o = six.StringIO()
output = tab.to_csv(o)
output = o.getvalue()
o.close()
output = six.StringIO()
table.to_csv(output)
result = output.getvalue()
output.close()

return output
return result
36 changes: 8 additions & 28 deletions csvkit/convert/dbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,21 @@
Note: dbf is only supported/imported for Python 2.
"""

import agate
import dbf
import six

from csvkit import table

def dbf2csv(f, **kwargs):
"""
Convert a dBASE .dbf file to csv.
"""
with dbf.Table(f.name) as db:
headers = db.field_names

column_ids = range(len(headers))

data_columns = [[] for c in headers]

for row in db:
for i, d in enumerate(row):
try:
data_columns[i].append(six.text_type(row[column_ids[i]]).strip())
except IndexError:
# Non-rectangular data is truncated
break

columns = []

for i, c in enumerate(data_columns):
columns.append(table.Column(column_ids[i], headers[i], c))

tab = table.Table(columns=columns)

o = six.StringIO()
output = tab.to_csv(o)
output = o.getvalue()
o.close()
column_names = db.field_names
table = agate.Table(db, column_names)

return output
output = six.StringIO()
table.to_csv(output)
result = output.getvalue()
output.close()

return result
36 changes: 21 additions & 15 deletions csvkit/convert/fixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,25 @@
from collections import namedtuple
from codecs import iterdecode

import agate
import six

from csvkit import CSVKitReader, CSVKitWriter

def fixed2csv(f, schema, output=None, **kwargs):
"""
Convert a fixed-width file to csv using a CSV-formatted schema description.
A schema CSV must start with a header row with (at least) columns labeled "column","start", and "length". (Other columns will be ignored.) For each subsequent row, therefore, those columns will be used to identify a column name, the starting index of the column (an integer), and the length of the column (also an integer).
Values in the 'start' column are assumed to be zero-based, unless the first value for 'start' is 1, in which case all values are assumed to be one-based.
A schema CSV must start with a header row with (at least) columns labeled
"column", "start", and "length". (Other columns will be ignored.) For each
subsequent row, therefore, those columns will be used to identify a column
name, the starting index of the column (an integer), and the length of the
column (also an integer).
Values in the 'start' column are assumed to be zero-based, unless the first
value for 'start' is 1, in which case all values are assumed to be
one-based.
If output is specified, rows will be written to that object, otherwise the complete data will be returned.
If output is specified, rows will be written to that object, otherwise the
complete data will be returned.
"""
streaming = True if output else False

Expand All @@ -27,23 +33,23 @@ def fixed2csv(f, schema, output=None, **kwargs):
except KeyError:
encoding = None

writer = CSVKitWriter(output)
writer = agate.writer(output)

reader = FixedWidthReader(f, schema, encoding=encoding)
writer.writerows(reader)

if not streaming:
data = output.getvalue()
return data

# Return empty string when streaming
return ''

class FixedWidthReader(six.Iterator):
"""
Given a fixed-width file and a schema file, produce an analog to a csv reader that yields a row
Given a fixed-width file and a schema file, produce an analog to a csv reader that yields a row
of strings for each line in the fixed-width file, preceded with a row of headers as provided in the schema. (This might be problematic if fixed-width-files ever have header rows also, but I haven't seen that.)
The schema_file should be in CSV format with a header row which has columns 'column', 'start', and 'length'. (Other columns will be ignored.) Values in the 'start' column are assumed to be "zero-based" unless the first value is "1" in which case all values are assumed to be "one-based."
"""
def __init__(self, f, schema, encoding=None):
Expand All @@ -55,7 +61,7 @@ def __init__(self, f, schema, encoding=None):

def __iter__(self):
return self

def __next__(self):
if self.header:
self.header = False
Expand All @@ -67,12 +73,12 @@ def __next__(self):

class FixedWidthRowParser(object):
"""
Instantiated with a schema, able to return a sequence of trimmed strings representing fields given a fixed-length line. Flexible about where the columns are, as long as they are headed with the literal names 'column', 'start', and 'length'.
Instantiated with a schema, able to return a sequence of trimmed strings representing fields given a fixed-length line. Flexible about where the columns are, as long as they are headed with the literal names 'column', 'start', and 'length'.
"""
def __init__(self, schema):
self.fields = [] # A list of FixedWidthFields

schema_reader = CSVKitReader(schema)
schema_reader = agate.reader(schema)
schema_decoder = SchemaDecoder(next(schema_reader))

for i,row in enumerate(schema_reader):
Expand Down Expand Up @@ -111,7 +117,7 @@ class SchemaDecoder(object):

def __init__(self, header, **kwargs):
"""
Constructs a schema row decoder.
Constructs a schema row decoder.
"""
for p, val_type in self.REQUIRED_COLUMNS:
try:
Expand All @@ -125,7 +131,7 @@ def __init__(self, header, **kwargs):
def __call__(self, row):
"""
Return a tuple (column, start, length) based on this instance's parameters.
If the first time this is called, the row's 'start' value is 1, then all 'start'
If the first time this is called, the row's 'start' value is 1, then all 'start'
values including the first will be one less than in the actual input data, to adjust for
one-based specifications. Values for 'start' and 'length' will be cast to integers.
"""
Expand Down

0 comments on commit 3f0b6be

Please sign in to comment.