Skip to content

Commit

Permalink
Merge pull request #486 from onyxfish/agate
Browse files Browse the repository at this point in the history
Agate integration
  • Loading branch information
James McKinney committed Jan 23, 2016
2 parents 9deaede + d8ed031 commit 3f0b6be
Show file tree
Hide file tree
Showing 49 changed files with 306 additions and 1,123 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
language: python
python:
- "2.6"
- "2.7"
- "pypy"
- "3.3"
- "3.4"
- "3.5"
install:
- if [[ $TRAVIS_PYTHON_VERSION == "2.6" ]]; then pip install -r requirements-py2.txt --use-mirrors --allow-external argparse; fi
- if [[ $TRAVIS_PYTHON_VERSION == "2.7" ]]; then pip install -r requirements-py2.txt; fi
- if [[ $TRAVIS_PYTHON_VERSION == "pypy" ]]; then pip install -r requirements-py2.txt; fi
- if [[ $TRAVIS_PYTHON_VERSION == "3.3" ]]; then pip install -r requirements-py3.txt; fi
- if [[ $TRAVIS_PYTHON_VERSION == "3.4" ]]; then pip install -r requirements-py3.txt; fi
- if [[ $TRAVIS_PYTHON_VERSION == "3.5" ]]; then pip install -r requirements-py3.txt; fi
script: nosetests
12 changes: 11 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
0.9.2
1.0.0
-----

This is a major release of csvkit. The entire backend has been rewritten to leverage the agate data analysis library, rather than bespoke implementations. In addition to the specific changes enumerated below there may be small changes to the way the output of the various tools is formatted. For example:

* If `--no-header-row` is set, the output will have column names A, B, C, etc. instead of column1, column2, column3, etc.

If you have built data workflows on top of csvkit you should not upgrade without thorough testing.

* in2csv DBF conversion now works with Python 3.
* "import csvkit as csv" will now defer to agate readers/writers.
* CSVKitReader, CSVKitWriter, CSVKitDictReader, and CSVKitDictWriter have been removed. Use agate.reader, agate.writer, agate.DictReader and agate.DictWriter.
* in2csv "csv itself" conversions now use agate.Table.
* in2csv now correctly guesses format when file has an uppercase extension.

0.9.1
Expand Down
40 changes: 13 additions & 27 deletions csvkit/__init__.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,20 @@
#!/usr/bin/env python

"""
This module contains csvkit's superpowered replacement for the builtin :mod:`csv` module. For Python 2 users, the greatest improvement over the standard library is full unicode support. Python 3's :mod:`csv` module supports unicode internally, so this module is provided primarily for compatibility purposes.
This module contains csvkit's superpowered alternative to the standard Python
CSV reader and writer. It can be used as a drop-in replacement for the standard
module.
* Python 2: :mod:`csvkit.py2`.
* Python 3: :mod:`csvkit.py3`.
"""

import six
.. warning::
if six.PY2:
from csvkit import py2

CSVKitReader = py2.CSVKitReader
CSVKitWriter = py2.CSVKitWriter
CSVKitDictReader = py2.CSVKitDictReader
CSVKitDictWriter = py2.CSVKitDictWriter
reader = py2.reader
writer = py2.writer
DictReader = py2.CSVKitDictReader
DictWriter = py2.CSVKitDictWriter
else:
from csvkit import py3
Since version 1.0 csvkit relies on `agate <http://agate.rtfd.org>`_'s
CSV reader and writer. This module is supported for legacy purposes only and you
should migrate to using agate.
"""

CSVKitReader = py3.CSVKitReader
CSVKitWriter = py3.CSVKitWriter
CSVKitDictReader = py3.CSVKitDictReader
CSVKitDictWriter = py3.CSVKitDictWriter
reader = py3.reader
writer = py3.writer
DictReader = py3.CSVKitDictReader
DictWriter = py3.CSVKitDictWriter
import agate

reader = agate.reader
writer = agate.writer
DictReader = agate.DictReader
DictWriter = agate.DictWriter
31 changes: 15 additions & 16 deletions csvkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
import os.path
import sys

import agate
import six

from csvkit import CSVKitReader
from csvkit.exceptions import ColumnIdentifierError, RequiredHeaderError

def lazy_opener(fn):
Expand Down Expand Up @@ -117,7 +117,7 @@ def _init_common_parser(self):
"""
Prepare a base argparse argument parser so that flags are consistent across different shell command tools.
If you want to constrain which common args are present, you can pass a string for 'omitflags'. Any argument
whose single-letter form is contained in 'omitflags' will be left out of the configured parser. Use 'f' for
whose single-letter form is contained in 'omitflags' will be left out of the configured parser. Use 'f' for
file.
"""
self.argparser = argparse.ArgumentParser(description=self.description, epilog=self.epilog)
Expand Down Expand Up @@ -169,7 +169,7 @@ def _init_common_parser(self):
if 'zero' not in self.override_flags:
self.argparser.add_argument('--zero', dest='zero_based', action='store_true',
help='When interpreting or displaying column numbers, use zero-based numbering instead of the default 1-based numbering.')

def _open_input_file(self, path):
"""
Open the input file specified on the command line.
Expand Down Expand Up @@ -278,7 +278,7 @@ def print_column_names(self):
except:
zero_based=False

rows = CSVKitReader(f, **self.reader_kwargs)
rows = agate.reader(f, **self.reader_kwargs)
column_names = next(rows)

for i, c in enumerate(column_names):
Expand Down Expand Up @@ -317,15 +317,15 @@ def match_column_identifier(column_names, c, zero_based=False):
def parse_column_identifiers(ids, column_names, zero_based=False, excluded_columns=None):
"""
Parse a comma-separated list of column indices AND/OR names into a list of integer indices.
Ranges of integers can be specified with two integers separated by a '-' or ':' character. Ranges of
Ranges of integers can be specified with two integers separated by a '-' or ':' character. Ranges of
non-integers (e.g. column names) are not supported.
Note: Column indices are 1-based.
Note: Column indices are 1-based.
"""
columns = []

# If not specified, start with all columns
# If not specified, start with all columns
if not ids:
columns = range(len(column_names))
columns = range(len(column_names))

if columns and not excluded_columns:
return columns
Expand All @@ -343,7 +343,7 @@ def parse_column_identifiers(ids, column_names, zero_based=False, excluded_colum
a,b = c.split('-',1)
else:
raise

try:
if a:
a = int(a)
Expand All @@ -353,15 +353,15 @@ def parse_column_identifiers(ids, column_names, zero_based=False, excluded_colum
b = int(b) + 1
else:
b = len(column_names) + 1

except ValueError:
raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.")

for x in range(a,b):
columns.append(match_column_identifier(column_names, x, zero_based))

excludes = []

if excluded_columns:
for c in excluded_columns.split(','):
c = c.strip()
Expand All @@ -375,7 +375,7 @@ def parse_column_identifiers(ids, column_names, zero_based=False, excluded_colum
a,b = c.split('-',1)
else:
raise

try:
if a:
a = int(a)
Expand All @@ -385,12 +385,11 @@ def parse_column_identifiers(ids, column_names, zero_based=False, excluded_colum
b = int(b) + 1
else:
b = len(column_names)

except ValueError:
raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.")

for x in range(a,b):
excludes.append(match_column_identifier(column_names, x, zero_based))

return [c for c in columns if c not in excludes]

14 changes: 7 additions & 7 deletions csvkit/convert/csvitself.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@

import six

from csvkit import table
import agate

def csv2csv(f, **kwargs):
"""
"Convert" a CSV into a new CSV by normalizing types and correcting for other anomalies.
"""
tab = table.Table.from_csv(f, **kwargs)
table = agate.Table.from_csv(f, **kwargs)

o = six.StringIO()
output = tab.to_csv(o)
output = o.getvalue()
o.close()
output = six.StringIO()
table.to_csv(output)
result = output.getvalue()
output.close()

return output
return result
36 changes: 8 additions & 28 deletions csvkit/convert/dbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,41 +4,21 @@
Note: dbf is only supported/imported for Python 2.
"""

import agate
import dbf
import six

from csvkit import table

def dbf2csv(f, **kwargs):
"""
Convert a dBASE .dbf file to csv.
"""
with dbf.Table(f.name) as db:
headers = db.field_names

column_ids = range(len(headers))

data_columns = [[] for c in headers]

for row in db:
for i, d in enumerate(row):
try:
data_columns[i].append(six.text_type(row[column_ids[i]]).strip())
except IndexError:
# Non-rectangular data is truncated
break

columns = []

for i, c in enumerate(data_columns):
columns.append(table.Column(column_ids[i], headers[i], c))

tab = table.Table(columns=columns)

o = six.StringIO()
output = tab.to_csv(o)
output = o.getvalue()
o.close()
column_names = db.field_names
table = agate.Table(db, column_names)

return output
output = six.StringIO()
table.to_csv(output)
result = output.getvalue()
output.close()

return result
36 changes: 21 additions & 15 deletions csvkit/convert/fixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,25 @@
from collections import namedtuple
from codecs import iterdecode

import agate
import six

from csvkit import CSVKitReader, CSVKitWriter

def fixed2csv(f, schema, output=None, **kwargs):
"""
Convert a fixed-width file to csv using a CSV-formatted schema description.
A schema CSV must start with a header row with (at least) columns labeled "column","start", and "length". (Other columns will be ignored.) For each subsequent row, therefore, those columns will be used to identify a column name, the starting index of the column (an integer), and the length of the column (also an integer).
Values in the 'start' column are assumed to be zero-based, unless the first value for 'start' is 1, in which case all values are assumed to be one-based.
A schema CSV must start with a header row with (at least) columns labeled
"column", "start", and "length". (Other columns will be ignored.) For each
subsequent row, therefore, those columns will be used to identify a column
name, the starting index of the column (an integer), and the length of the
column (also an integer).
Values in the 'start' column are assumed to be zero-based, unless the first
value for 'start' is 1, in which case all values are assumed to be
one-based.
If output is specified, rows will be written to that object, otherwise the complete data will be returned.
If output is specified, rows will be written to that object, otherwise the
complete data will be returned.
"""
streaming = True if output else False

Expand All @@ -27,23 +33,23 @@ def fixed2csv(f, schema, output=None, **kwargs):
except KeyError:
encoding = None

writer = CSVKitWriter(output)
writer = agate.writer(output)

reader = FixedWidthReader(f, schema, encoding=encoding)
writer.writerows(reader)

if not streaming:
data = output.getvalue()
return data

# Return empty string when streaming
return ''

class FixedWidthReader(six.Iterator):
"""
Given a fixed-width file and a schema file, produce an analog to a csv reader that yields a row
Given a fixed-width file and a schema file, produce an analog to a csv reader that yields a row
of strings for each line in the fixed-width file, preceded with a row of headers as provided in the schema. (This might be problematic if fixed-width-files ever have header rows also, but I haven't seen that.)
The schema_file should be in CSV format with a header row which has columns 'column', 'start', and 'length'. (Other columns will be ignored.) Values in the 'start' column are assumed to be "zero-based" unless the first value is "1" in which case all values are assumed to be "one-based."
"""
def __init__(self, f, schema, encoding=None):
Expand All @@ -55,7 +61,7 @@ def __init__(self, f, schema, encoding=None):

def __iter__(self):
return self

def __next__(self):
if self.header:
self.header = False
Expand All @@ -67,12 +73,12 @@ def __next__(self):

class FixedWidthRowParser(object):
"""
Instantiated with a schema, able to return a sequence of trimmed strings representing fields given a fixed-length line. Flexible about where the columns are, as long as they are headed with the literal names 'column', 'start', and 'length'.
Instantiated with a schema, able to return a sequence of trimmed strings representing fields given a fixed-length line. Flexible about where the columns are, as long as they are headed with the literal names 'column', 'start', and 'length'.
"""
def __init__(self, schema):
self.fields = [] # A list of FixedWidthFields

schema_reader = CSVKitReader(schema)
schema_reader = agate.reader(schema)
schema_decoder = SchemaDecoder(next(schema_reader))

for i,row in enumerate(schema_reader):
Expand Down Expand Up @@ -111,7 +117,7 @@ class SchemaDecoder(object):

def __init__(self, header, **kwargs):
"""
Constructs a schema row decoder.
Constructs a schema row decoder.
"""
for p, val_type in self.REQUIRED_COLUMNS:
try:
Expand All @@ -125,7 +131,7 @@ def __init__(self, header, **kwargs):
def __call__(self, row):
"""
Return a tuple (column, start, length) based on this instance's parameters.
If the first time this is called, the row's 'start' value is 1, then all 'start'
If the first time this is called, the row's 'start' value is 1, then all 'start'
values including the first will be one less than in the actual input data, to adjust for
one-based specifications. Values for 'start' and 'length' will be cast to integers.
"""
Expand Down

0 comments on commit 3f0b6be

Please sign in to comment.