Skip to content

Commit

Permalink
Merge pull request #960 from wireservice/finite
Browse files Browse the repository at this point in the history
Tidy #955 and miscellaneous fixes
  • Loading branch information
jpmckinney committed May 21, 2018
2 parents 5480668 + ba579ce commit f63bea8
Show file tree
Hide file tree
Showing 20 changed files with 97 additions and 58 deletions.
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,4 @@ The following individuals have contributed code to csvkit:
* Forest Gregg
* Aliaksei Urbanski
* Reid Beels
* Rodrigo Lemos
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
1.0.4
-----

Fixes:

* :code:`--names` works with :code:`--skip-lines`.
* :doc:`/scripts/in2csv` writes XLS sheets without encoding errors in Python 2.
* :doc:`/scripts/csvsql` supports UPDATE commands.
* :doc:`/scripts/csvstat` no longer errors on non-finite numbers.

1.0.3 - March 11, 2018
----------------------

Expand Down
7 changes: 2 additions & 5 deletions csvkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,19 +354,16 @@ def print_column_names(self):
if getattr(self.args, 'no_header_row', None):
raise RequiredHeaderError('You cannot use --no-header-row with the -n or --names options.')

f = self.input_file
output = self.output_file

if getattr(self.args, 'zero_based', None):
start = 0
else:
start = 1

rows = agate.csv.reader(f, **self.reader_kwargs)
rows = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
column_names = next(rows)

for i, c in enumerate(column_names, start):
output.write('%3i: %s\n' % (i, c))
self.output_file.write('%3i: %s\n' % (i, c))

def additional_input_expected(self):
    """Return True when the tool is waiting on interactive terminal input.

    This is the case only when no input path was supplied on the command
    line and stdin is attached to a TTY (i.e. nothing is being piped in).
    """
    if self.args.input_path:
        return False
    return sys.stdin.isatty()
Expand Down
11 changes: 6 additions & 5 deletions csvkit/utilities/csvsql.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def main(self):
try:
engine = create_engine(self.args.connection_string)
except ImportError:
raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')
raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install mysql-connector-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

self.connection = engine.connect()

Expand Down Expand Up @@ -200,10 +200,11 @@ def _failsafe_main(self):
rows = self.connection.execute(q)

# Output the result of the last query as CSV
output = agate.csv.writer(self.output_file, **self.writer_kwargs)
output.writerow(rows._metadata.keys)
for row in rows:
output.writerow(row)
if rows.returns_rows:
output = agate.csv.writer(self.output_file, **self.writer_kwargs)
output.writerow(rows._metadata.keys)
for row in rows:
output.writerow(row)

transaction.commit()

Expand Down
9 changes: 6 additions & 3 deletions csvkit/utilities/csvstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,9 @@ def main(self):
else:
self.print_stats(table, column_ids, stats)

def is_finite_decimal(self, value):
    """Return True if *value* is a :class:`decimal.Decimal` that is finite
    (i.e. neither NaN nor infinity), and False otherwise.
    """
    if not isinstance(value, Decimal):
        return False
    return value.is_finite()

def print_one(self, table, column_id, operation, label=True, **kwargs):
"""
Print data for a single statistic.
Expand All @@ -190,7 +193,7 @@ def print_one(self, table, column_id, operation, label=True, **kwargs):
op = OPERATIONS[op_name]['aggregation']
stat = table.aggregate(op(column_id))

if isinstance(stat, Decimal) and stat.is_finite():
if self.is_finite_decimal(stat):
stat = format_decimal(stat, locale=agate.config.get_option('default_locale'))
except:
stat = None
Expand Down Expand Up @@ -224,7 +227,7 @@ def calculate_stats(self, table, column_id, **kwargs):
op = op_data['aggregation']
v = table.aggregate(op(column_id))

if isinstance(v, Decimal) and v.is_finite():
if self.is_finite_decimal(v):
v = format_decimal(v, locale=agate.config.get_option('default_locale'))

stats[op_name] = v
Expand Down Expand Up @@ -268,7 +271,7 @@ def print_stats(self, table, column_ids, stats):
if isinstance(column.data_type, agate.Number):
v = row[column_name]

if isinstance(v, Decimal) and v.is_finite():
if self.is_finite_decimal(v):
v = format_decimal(v, locale=agate.config.get_option('default_locale'))
else:
v = six.text_type(row[column_name])
Expand Down
28 changes: 15 additions & 13 deletions csvkit/utilities/in2csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,14 @@ def open_excel_input_file(self, path):
else:
return open(path, 'rb')

def sheet_names(self, path, filetype):
    """Return the list of sheet names in the Excel file at *path*.

    :param path: Path to the input file (opened via ``open_excel_input_file``).
    :param filetype: Either ``'xls'`` or ``'xlsx'``; any other value yields None.
    :returns: A list of sheet names, or None for a non-Excel *filetype*.
    """
    # Initialize so an unrecognized filetype returns None instead of
    # raising UnboundLocalError at the final return.
    sheet_names = None
    input_file = self.open_excel_input_file(path)
    try:
        if filetype == 'xls':
            # xlrd wants the raw bytes of the workbook.
            sheet_names = xlrd.open_workbook(file_contents=input_file.read()).sheet_names()
        elif filetype == 'xlsx':
            # read_only/data_only avoids loading cell styles and formulas.
            sheet_names = openpyxl.load_workbook(input_file, read_only=True, data_only=True).sheetnames
    finally:
        # Close the file even if workbook parsing raises.
        input_file.close()
    return sheet_names

def main(self):
path = self.args.input_path
Expand All @@ -87,22 +90,21 @@ def main(self):
if not filetype:
self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

# Set the input file.
if filetype in ('xls', 'xlsx'):
self.input_file = self.open_excel_input_file(path)
else:
self.input_file = self._open_input_file(path)

if self.args.names_only:
sheets = self.sheet_names(filetype)
sheets = self.sheet_names(path, filetype)
if sheets:
for sheet in sheets:
self.output_file.write('%s\n' % sheet)
else:
self.argparser.error('You cannot use the -n or --names options with non-Excel files.')
self.input_file.close()
return

# Set the input file.
if filetype in ('xls', 'xlsx'):
self.input_file = self.open_excel_input_file(path)
else:
self.input_file = self._open_input_file(path)

# Set the reader's arguments.
kwargs = {}

Expand Down Expand Up @@ -157,12 +159,12 @@ def main(self):
self.input_file = self.open_excel_input_file(path)

if self.args.write_sheets == '-':
sheets = self.sheet_names(filetype)
sheets = self.sheet_names(path, filetype)
else:
sheets = [int(sheet) if sheet.isdigit() else sheet for sheet in self.args.write_sheets.split(',')]

if filetype == 'xls':
tables = agate.Table.from_xls(self.input_file, sheet=sheets, **kwargs)
tables = agate.Table.from_xls(self.input_file, sheet=sheets, encoding_override=self.args.encoding_xls, **kwargs)
elif filetype == 'xlsx':
tables = agate.Table.from_xlsx(self.input_file, sheet=sheets, **kwargs)

Expand Down
4 changes: 2 additions & 2 deletions csvkit/utilities/sql2csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ def main(self):
except ImportError:
raise ImportError("You don't appear to have the necessary database backend installed for connection "
"string you're trying to use. Available backends include:\n\nPostgreSQL:\tpip install "
"psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and "
"other backends, please see the SQLAlchemy documentation on dialects at:\n\n"
"psycopg2\nMySQL:\t\tpip install mysql-connector-python\n\nFor details on connection "
"strings and other backends, please see the SQLAlchemy documentation on dialects at:\n\n"
"http://www.sqlalchemy.org/docs/dialects/\n\n")

connection = engine.connect()
Expand Down
3 changes: 3 additions & 0 deletions docs/common_arguments.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,5 +52,8 @@ All tools which accept CSV as input share a set of common command-line arguments

These arguments may be used to override csvkit's default "smart" parsing of CSV files. This is frequently necessary if the input file uses a particularly unusual style of quoting or is in an encoding that is not compatible with UTF-8. Not every command is supported by every tool, but the majority of them are.

For example, to disable CSV sniffing, set :code:`--snifflimit 0` and then set the :code:`--delimiter` and :code:`--quotechar` options yourself. To disable type inference, add the :code:`--no-inference` flag.

Note that the output of csvkit's tools is always formatted with "default" formatting options. This means that when executing multiple csvkit commands (either with a pipe or via intermediary files) it is only ever necessary to specify formatting arguments the first time. (And doing so for subsequent commands will likely cause them to fail.)

See the documentation of :doc:`/scripts/csvclean` for a description of the default formatting options.
4 changes: 2 additions & 2 deletions docs/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,13 @@ Currently, the following tools stream:
Currently, the following tools buffer:

* :doc:`/scripts/csvjoin`
* :doc:`/scripts/csvjson` unless ``--no-inference --stream --snifflimit 0`` is set and ``--skip-lines`` isn't set
* :doc:`/scripts/csvjson` unless :code:`--no-inference --stream --snifflimit 0` is set and :code:`--skip-lines` isn't set
* :doc:`/scripts/csvlook`
* :doc:`/scripts/csvpy`
* :doc:`/scripts/csvsort`
* :doc:`/scripts/csvsql`
* :doc:`/scripts/csvstat`
* :doc:`/scripts/in2csv` unless ``--format ndjson --no-inference`` is set, or unless ``--format csv --no-inference --snifflimit 0`` is set and ``--no-header-row`` and ``--skip-lines`` aren't set
* :doc:`/scripts/in2csv` unless :code:`--format ndjson --no-inference` is set, or unless :code:`--format csv --no-inference --snifflimit 0` is set and :code:`--no-header-row` and :code:`--skip-lines` aren't set

Legalese
========
Expand Down
6 changes: 5 additions & 1 deletion docs/scripts/csvclean.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Note that every csvkit tool does the following:

* removes optional quote characters, unless the `--quoting` (`-u`) option is set to change this behavior
* changes the field delimiter to a comma, if the input delimiter is set with the `--delimiter` (`-d`) or `--tabs` (`-t`) options
* changes the record delimiter to a line feed
* changes the record delimiter to a line feed (LF or ``\n``)
* changes the quote character to a double-quotation mark, if the character is set with the `--quotechar` (`-q`) option
* changes the character encoding to UTF-8, if the input encoding is set with the `--encoding` (`-e`) option

Expand Down Expand Up @@ -47,3 +47,7 @@ Test a file with known bad rows::

Line 1: Expected 3 columns, found 4 columns
Line 2: Expected 3 columns, found 2 columns

To change the line ending from line feed (LF or ``\n``) to carriage return and line feed (CRLF or ``\r\n``) use::

csvformat -M $'\r\n' examples/dummy.csv
4 changes: 2 additions & 2 deletions docs/scripts/csvsql.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ If you prefer not to enter your password in the connection string, store the pas

.. note::

Using the ``--query`` option may cause rounding (in Python 2) or introduce [Python floating point issues](https://docs.python.org/3.4/tutorial/floatingpoint.html) (in Python 3).
Using the :code:`--query` option may cause rounding (in Python 2) or introduce [Python floating point issues](https://docs.python.org/3.4/tutorial/floatingpoint.html) (in Python 3).

Examples
========
Expand All @@ -90,7 +90,7 @@ Create a table and import data from the CSV directly into PostgreSQL::
createdb test
csvsql --db postgresql:///test --tables fy09 --insert examples/realdata/FY09_EDU_Recipients_by_State.csv

For large tables it may not be practical to process the entire table. One solution to this is to analyze a sample of the table. In this case it can be useful to turn off length limits and null checks with the ``no-constraints`` option::
For large tables it may not be practical to process the entire table. One solution to this is to analyze a sample of the table. In this case it can be useful to turn off length limits and null checks with the :code:`--no-constraints` option::

head -n 20 examples/realdata/FY09_EDU_Recipients_by_State.csv | csvsql --no-constraints --tables fy09

Expand Down
2 changes: 1 addition & 1 deletion docs/scripts/csvstack.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ See also: :doc:`../common_arguments`.

.. warning::

If you redirect output to an input file like ``csvstack file.csv > file.csv``, the file will grow indefinitely.
If you redirect output to an input file like :code:`csvstack file.csv > file.csv`, the file will grow indefinitely.

Examples
========
Expand Down
15 changes: 8 additions & 7 deletions docs/tricks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ Set the encoding to ``utf-8-sig``, for example::
Specifying STDIN as a file
--------------------------

Most tools use ``STDIN`` as input if no filename is given, but tools that accept multiple inputs like :doc:`scripts/csvjoin` and :doc:`scripts/csvstack` don't. To use ``STDIN`` as an input to these tools, use ``-`` as the filename. For example, these three commands produce the same output::
Most tools use ``STDIN`` as input if no filename is given, but tools that accept multiple inputs like :doc:`/scripts/csvjoin` and :doc:`/scripts/csvstack` don't. To use ``STDIN`` as an input to these tools, use ``-`` as the filename. For example, these three commands produce the same output::

csvstat examples/dummy.csv
cat examples/dummy.csv | csvstat
cat examples/dummy.csv | csvstat -

``csvstack`` can take a filename and ``STDIN`` as input, for example::
:doc:`/scripts/csvstack` can take a filename and ``STDIN`` as input, for example::

cat examples/dummy.csv | csvstack examples/dummy3.csv -

Expand Down Expand Up @@ -63,7 +63,7 @@ If the installation is successful but csvkit's tools fail, you may need to updat
pip install --upgrade setuptools
pip install --upgrade csvkit

On macOS, if you see `OSError: [Errno 1] Operation not permitted`, try::
On macOS, if you see ``OSError: [Errno 1] Operation not permitted``, try::

sudo pip install --ignore-installed csvkit

Expand All @@ -81,7 +81,8 @@ CSV formatting and parsing
* Are values appearing in incorrect columns?
* Does the output combine multiple fields into a single column with double-quotes?
* Does the output split a single field into multiple columns?
* Are `csvstat -c 1` and `csvstat --count` reporting inconsistent row counts?
* Are :code:`csvstat -c 1` and :code:`csvstat --count` reporting inconsistent row counts?
* Do you see ``Row # has # values, but Table only has # columns.``?

These may be symptoms of CSV sniffing gone wrong. As there is no single, standard CSV format, csvkit uses Python's `csv.Sniffer <https://docs.python.org/3.5/library/csv.html#csv.Sniffer>`_ to deduce the format of a CSV file: that is, the field delimiter and quote character. By default, the entire file is sent for sniffing, which can be slow. You can send a small sample with the :code:`--snifflimit` option. If you're encountering any cases above, you can try setting :code:`--snifflimit 0` to disable sniffing and set the :code:`--delimiter` and :code:`--quotechar` options yourself.

Expand All @@ -96,7 +97,7 @@ CSV data interpretation

These may be symptoms of csvkit's type inference being too aggressive for your data. CSV is a text format, but it may contain text representing numbers, dates, booleans or other types. csvkit attempts to reverse engineer that text into proper data types—a process called "type inference".

For some data, type inference can be error prone. If necessary you can disable it with the To :code:`--no-inference` switch. This will force all columns to be treated as regular text.
For some data, type inference can be error prone. If necessary you can disable it with the :code:`--no-inference` switch. This will force all columns to be treated as regular text.

Slow performance
----------------
Expand All @@ -108,14 +109,14 @@ If a tool is too slow to be practical for your data try setting the :code:`--sni
Database errors
---------------

Are you seeing this error message, even after running :code:`pip install psycopg2` or :code:`pip install MySQL-python`?
Are you seeing this error message, even after running :code:`pip install psycopg2` or :code:`pip install mysql-connector-python`?

::

You don't appear to have the necessary database backend installed for connection string you're trying to use. Available backends include:

Postgresql: pip install psycopg2
MySQL: pip install MySQL-python
MySQL: pip install mysql-connector-python

For details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at:

Expand Down
4 changes: 2 additions & 2 deletions docs/tutorial/1_getting_started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,11 @@ Now that we understand :doc:`/scripts/in2csv`, :doc:`/scripts/csvlook` and :doc:
In addition to specifying filenames, all csvkit tools accept an input file via "standard in". This means that, using the ``|`` ("pipe") character we can use the output of one csvkit tool as the input of the next.

In the example above, the output of ``csvcut`` becomes the input to ``csvlook``. This also allow us to pipe output to standard Unix commands such as ``head``, which prints only the first ten lines of its input. Here, the output of ``csvlook`` becomes the input of ``head``.
In the example above, the output of :doc:`/scripts/csvcut` becomes the input to :doc:`/scripts/csvlook`. This also allows us to pipe output to standard Unix commands such as ``head``, which prints only the first ten lines of its input. Here, the output of :doc:`/scripts/csvlook` becomes the input of ``head``.

Piping is a core feature of csvkit. Of course, you can always write the output of each command to a file using ``>``. However, it's often faster and more convenient to use pipes to chain several commands together.

We can also pipe ``in2csv``, allowing us to combine all our previous operations into one:
We can also pipe :doc:`/scripts/in2csv`, allowing us to combine all our previous operations into one:

.. code-block:: bash
Expand Down

0 comments on commit f63bea8

Please sign in to comment.