Skip to content

Commit

Permalink
Merge pull request #960 from wireservice/finite
Browse files Browse the repository at this point in the history
Tidy #955 and miscellaneous fixes
  • Loading branch information
jpmckinney committed May 21, 2018
2 parents 5480668 + ba579ce commit f63bea8
Show file tree
Hide file tree
Showing 20 changed files with 97 additions and 58 deletions.
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,4 @@ The following individuals have contributed code to csvkit:
* Forest Gregg
* Aliaksei Urbanski
* Reid Beels
* Rodrigo Lemos
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
1.0.4
-----

Fixes:

* :code:`--names` works with :code:`--skip-lines`.
* :doc:`/scripts/in2csv` writes XLS sheets without encoding errors in Python 2.
* :doc:`/scripts/csvsql` supports UPDATE commands.
* :doc:`/scripts/csvstat` no longer errors on non-finite numbers.

1.0.3 - March 11, 2018
----------------------

Expand Down
7 changes: 2 additions & 5 deletions csvkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,19 +354,16 @@ def print_column_names(self):
if getattr(self.args, 'no_header_row', None):
raise RequiredHeaderError('You cannot use --no-header-row with the -n or --names options.')

f = self.input_file
output = self.output_file

if getattr(self.args, 'zero_based', None):
start = 0
else:
start = 1

rows = agate.csv.reader(f, **self.reader_kwargs)
rows = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
column_names = next(rows)

for i, c in enumerate(column_names, start):
output.write('%3i: %s\n' % (i, c))
self.output_file.write('%3i: %s\n' % (i, c))

def additional_input_expected(self):
    """Return True when the tool is waiting on interactive terminal input.

    This is the case only when no input path was supplied on the command
    line and stdin is attached to a TTY (i.e. nothing is being piped in).
    """
    if self.args.input_path:
        return False
    return sys.stdin.isatty()
Expand Down
11 changes: 6 additions & 5 deletions csvkit/utilities/csvsql.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def main(self):
try:
engine = create_engine(self.args.connection_string)
except ImportError:
raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')
raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install mysql-connector-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

self.connection = engine.connect()

Expand Down Expand Up @@ -200,10 +200,11 @@ def _failsafe_main(self):
rows = self.connection.execute(q)

# Output the result of the last query as CSV
output = agate.csv.writer(self.output_file, **self.writer_kwargs)
output.writerow(rows._metadata.keys)
for row in rows:
output.writerow(row)
if rows.returns_rows:
output = agate.csv.writer(self.output_file, **self.writer_kwargs)
output.writerow(rows._metadata.keys)
for row in rows:
output.writerow(row)

transaction.commit()

Expand Down
9 changes: 6 additions & 3 deletions csvkit/utilities/csvstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,9 @@ def main(self):
else:
self.print_stats(table, column_ids, stats)

def is_finite_decimal(self, value):
    """Return True if *value* is a :class:`decimal.Decimal` that is finite
    (i.e. neither NaN nor infinity), and False otherwise.
    """
    if not isinstance(value, Decimal):
        return False
    return value.is_finite()

def print_one(self, table, column_id, operation, label=True, **kwargs):
"""
Print data for a single statistic.
Expand All @@ -190,7 +193,7 @@ def print_one(self, table, column_id, operation, label=True, **kwargs):
op = OPERATIONS[op_name]['aggregation']
stat = table.aggregate(op(column_id))

if isinstance(stat, Decimal) and stat.is_finite():
if self.is_finite_decimal(stat):
stat = format_decimal(stat, locale=agate.config.get_option('default_locale'))
except:
stat = None
Expand Down Expand Up @@ -224,7 +227,7 @@ def calculate_stats(self, table, column_id, **kwargs):
op = op_data['aggregation']
v = table.aggregate(op(column_id))

if isinstance(v, Decimal) and v.is_finite():
if self.is_finite_decimal(v):
v = format_decimal(v, locale=agate.config.get_option('default_locale'))

stats[op_name] = v
Expand Down Expand Up @@ -268,7 +271,7 @@ def print_stats(self, table, column_ids, stats):
if isinstance(column.data_type, agate.Number):
v = row[column_name]

if isinstance(v, Decimal) and v.is_finite():
if self.is_finite_decimal(v):
v = format_decimal(v, locale=agate.config.get_option('default_locale'))
else:
v = six.text_type(row[column_name])
Expand Down
28 changes: 15 additions & 13 deletions csvkit/utilities/in2csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,14 @@ def open_excel_input_file(self, path):
else:
return open(path, 'rb')

def sheet_names(self, path, filetype):
    """Return the list of sheet names in the Excel file at *path*.

    :param path: Path to the input file (opened via ``open_excel_input_file``).
    :param filetype: Either ``'xls'`` or ``'xlsx'``; any other value yields None.
    :returns: A list of sheet names, or None for a non-Excel *filetype*.
    """
    # Initialize so an unrecognized filetype returns None instead of
    # raising UnboundLocalError at the final return.
    sheet_names = None
    input_file = self.open_excel_input_file(path)
    try:
        if filetype == 'xls':
            # xlrd wants the raw bytes of the workbook.
            sheet_names = xlrd.open_workbook(file_contents=input_file.read()).sheet_names()
        elif filetype == 'xlsx':
            # read_only/data_only avoids loading cell styles and formulas.
            sheet_names = openpyxl.load_workbook(input_file, read_only=True, data_only=True).sheetnames
    finally:
        # Close the file even if workbook parsing raises.
        input_file.close()
    return sheet_names

def main(self):
path = self.args.input_path
Expand All @@ -87,22 +90,21 @@ def main(self):
if not filetype:
self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

# Set the input file.
if filetype in ('xls', 'xlsx'):
self.input_file = self.open_excel_input_file(path)
else:
self.input_file = self._open_input_file(path)

if self.args.names_only:
sheets = self.sheet_names(filetype)
sheets = self.sheet_names(path, filetype)
if sheets:
for sheet in sheets:
self.output_file.write('%s\n' % sheet)
else:
self.argparser.error('You cannot use the -n or --names options with non-Excel files.')
self.input_file.close()
return

# Set the input file.
if filetype in ('xls', 'xlsx'):
self.input_file = self.open_excel_input_file(path)
else:
self.input_file = self._open_input_file(path)

# Set the reader's arguments.
kwargs = {}

Expand Down Expand Up @@ -157,12 +159,12 @@ def main(self):
self.input_file = self.open_excel_input_file(path)

if self.args.write_sheets == '-':
sheets = self.sheet_names(filetype)
sheets = self.sheet_names(path, filetype)
else:
sheets = [int(sheet) if sheet.isdigit() else sheet for sheet in self.args.write_sheets.split(',')]

if filetype == 'xls':
tables = agate.Table.from_xls(self.input_file, sheet=sheets, **kwargs)
tables = agate.Table.from_xls(self.input_file, sheet=sheets, encoding_override=self.args.encoding_xls, **kwargs)
elif filetype == 'xlsx':
tables = agate.Table.from_xlsx(self.input_file, sheet=sheets, **kwargs)

Expand Down
4 changes: 2 additions & 2 deletions csvkit/utilities/sql2csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ def main(self):
except ImportError:
raise ImportError("You don't appear to have the necessary database backend installed for connection "
"string you're trying to use. Available backends include:\n\nPostgreSQL:\tpip install "
"psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and "
"other backends, please see the SQLAlchemy documentation on dialects at:\n\n"
"psycopg2\nMySQL:\t\tpip install mysql-connector-python\n\nFor details on connection "
"strings and other backends, please see the SQLAlchemy documentation on dialects at:\n\n"
"http://www.sqlalchemy.org/docs/dialects/\n\n")

connection = engine.connect()
Expand Down
3 changes: 3 additions & 0 deletions docs/common_arguments.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,5 +52,8 @@ All tools which accept CSV as input share a set of common command-line arguments

These arguments may be used to override csvkit's default "smart" parsing of CSV files. This is frequently necessary if the input file uses a particularly unusual style of quoting or is in an encoding that is not compatible with UTF-8. Not every command is supported by every tool, but the majority of them are.

For example, to disable CSV sniffing, set :code:`--snifflimit 0` and then set the :code:`--delimiter` and :code:`--quotechar` options yourself. To disable type inference, add the :code:`--no-inference` flag.

Note that the output of csvkit's tools is always formatted with "default" formatting options. This means that when executing multiple csvkit commands (either with a pipe or via intermediary files) it is only ever necessary to specify formatting arguments the first time. (And doing so for subsequent commands will likely cause them to fail.)

See the documentation of :doc:`/scripts/csvclean` for a description of the default formatting options.
4 changes: 2 additions & 2 deletions docs/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,13 @@ Currently, the following tools stream:
Currently, the following tools buffer:

* :doc:`/scripts/csvjoin`
* :doc:`/scripts/csvjson` unless ``--no-inference --stream --snifflimit 0`` is set and ``--skip-lines`` isn't set
* :doc:`/scripts/csvjson` unless :code:`--no-inference --stream --snifflimit 0` is set and :code:`--skip-lines` isn't set
* :doc:`/scripts/csvlook`
* :doc:`/scripts/csvpy`
* :doc:`/scripts/csvsort`
* :doc:`/scripts/csvsql`
* :doc:`/scripts/csvstat`
* :doc:`/scripts/in2csv` unless ``--format ndjson --no-inference`` is set, or unless ``--format csv --no-inference --snifflimit 0`` is set and ``--no-header-row`` and ``--skip-lines`` aren't set
* :doc:`/scripts/in2csv` unless :code:`--format ndjson --no-inference` is set, or unless :code:`--format csv --no-inference --snifflimit 0` is set and :code:`--no-header-row` and :code:`--skip-lines` aren't set

Legalese
========
Expand Down
6 changes: 5 additions & 1 deletion docs/scripts/csvclean.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Note that every csvkit tool does the following:

* removes optional quote characters, unless the `--quoting` (`-u`) option is set to change this behavior
* changes the field delimiter to a comma, if the input delimiter is set with the `--delimiter` (`-d`) or `--tabs` (`-t`) options
* changes the record delimiter to a line feed
* changes the record delimiter to a line feed (LF or ``\n``)
* changes the quote character to a double-quotation mark, if the character is set with the `--quotechar` (`-q`) option
* changes the character encoding to UTF-8, if the input encoding is set with the `--encoding` (`-e`) option

Expand Down Expand Up @@ -47,3 +47,7 @@ Test a file with known bad rows::

Line 1: Expected 3 columns, found 4 columns
Line 2: Expected 3 columns, found 2 columns

To change the line ending from line feed (LF or ``\n``) to carriage return and line feed (CRLF or ``\r\n``) use::

csvformat -M $'\r\n' examples/dummy.csv
4 changes: 2 additions & 2 deletions docs/scripts/csvsql.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ If you prefer not to enter your password in the connection string, store the pas

.. note::

Using the ``--query`` option may cause rounding (in Python 2) or introduce [Python floating point issues](https://docs.python.org/3.4/tutorial/floatingpoint.html) (in Python 3).
Using the :code:`--query` option may cause rounding (in Python 2) or introduce [Python floating point issues](https://docs.python.org/3.4/tutorial/floatingpoint.html) (in Python 3).

Examples
========
Expand All @@ -90,7 +90,7 @@ Create a table and import data from the CSV directly into PostgreSQL::
createdb test
csvsql --db postgresql:///test --tables fy09 --insert examples/realdata/FY09_EDU_Recipients_by_State.csv

For large tables it may not be practical to process the entire table. One solution to this is to analyze a sample of the table. In this case it can be useful to turn off length limits and null checks with the ``no-constraints`` option::
For large tables it may not be practical to process the entire table. One solution to this is to analyze a sample of the table. In this case it can be useful to turn off length limits and null checks with the :code:`--no-constraints` option::

head -n 20 examples/realdata/FY09_EDU_Recipients_by_State.csv | csvsql --no-constraints --tables fy09

Expand Down
2 changes: 1 addition & 1 deletion docs/scripts/csvstack.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ See also: :doc:`../common_arguments`.

.. warning::

If you redirect output to an input file like ``csvstack file.csv > file.csv``, the file will grow indefinitely.
If you redirect output to an input file like :code:`csvstack file.csv > file.csv`, the file will grow indefinitely.

Examples
========
Expand Down
15 changes: 8 additions & 7 deletions docs/tricks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ Set the encoding to ``utf-8-sig``, for example::
Specifying STDIN as a file
--------------------------

Most tools use ``STDIN`` as input if no filename is given, but tools that accept multiple inputs like :doc:`scripts/csvjoin` and :doc:`scripts/csvstack` don't. To use ``STDIN`` as an input to these tools, use ``-`` as the filename. For example, these three commands produce the same output::
Most tools use ``STDIN`` as input if no filename is given, but tools that accept multiple inputs like :doc:`/scripts/csvjoin` and :doc:`/scripts/csvstack` don't. To use ``STDIN`` as an input to these tools, use ``-`` as the filename. For example, these three commands produce the same output::

csvstat examples/dummy.csv
cat examples/dummy.csv | csvstat
cat examples/dummy.csv | csvstat -

``csvstack`` can take a filename and ``STDIN`` as input, for example::
:doc:`/scripts/csvstack` can take a filename and ``STDIN`` as input, for example::

cat examples/dummy.csv | csvstack examples/dummy3.csv -

Expand Down Expand Up @@ -63,7 +63,7 @@ If the installation is successful but csvkit's tools fail, you may need to updat
pip install --upgrade setuptools
pip install --upgrade csvkit

On macOS, if you see `OSError: [Errno 1] Operation not permitted`, try::
On macOS, if you see ``OSError: [Errno 1] Operation not permitted``, try::

sudo pip install --ignore-installed csvkit

Expand All @@ -81,7 +81,8 @@ CSV formatting and parsing
* Are values appearing in incorrect columns?
* Does the output combine multiple fields into a single column with double-quotes?
* Does the output split a single field into multiple columns?
* Are `csvstat -c 1` and `csvstat --count` reporting inconsistent row counts?
* Are :code:`csvstat -c 1` and :code:`csvstat --count` reporting inconsistent row counts?
* Do you see ``Row # has # values, but Table only has # columns.``?

These may be symptoms of CSV sniffing gone wrong. As there is no single, standard CSV format, csvkit uses Python's `csv.Sniffer <https://docs.python.org/3.5/library/csv.html#csv.Sniffer>`_ to deduce the format of a CSV file: that is, the field delimiter and quote character. By default, the entire file is sent for sniffing, which can be slow. You can send a small sample with the :code:`--snifflimit` option. If you're encountering any cases above, you can try setting :code:`--snifflimit 0` to disable sniffing and set the :code:`--delimiter` and :code:`--quotechar` options yourself.

Expand All @@ -96,7 +97,7 @@ CSV data interpretation

These may be symptoms of csvkit's type inference being too aggressive for your data. CSV is a text format, but it may contain text representing numbers, dates, booleans or other types. csvkit attempts to reverse engineer that text into proper data types—a process called "type inference".

For some data, type inference can be error prone. If necessary you can disable it with the To :code:`--no-inference` switch. This will force all columns to be treated as regular text.
For some data, type inference can be error prone. If necessary you can disable it with the :code:`--no-inference` switch. This will force all columns to be treated as regular text.

Slow performance
----------------
Expand All @@ -108,14 +109,14 @@ If a tool is too slow to be practical for your data try setting the :code:`--sni
Database errors
---------------

Are you seeing this error message, even after running :code:`pip install psycopg2` or :code:`pip install MySQL-python`?
Are you seeing this error message, even after running :code:`pip install psycopg2` or :code:`pip install mysql-connector-python`?

::

You don't appear to have the necessary database backend installed for connection string you're trying to use. Available backends include:

Postgresql: pip install psycopg2
MySQL: pip install MySQL-python
MySQL: pip install mysql-connector-python

For details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at:

Expand Down
4 changes: 2 additions & 2 deletions docs/tutorial/1_getting_started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,11 @@ Now that we understand :doc:`/scripts/in2csv`, :doc:`/scripts/csvlook` and :doc:
In addition to specifying filenames, all csvkit tools accept an input file via "standard in". This means that, using the ``|`` ("pipe") character we can use the output of one csvkit tool as the input of the next.

In the example above, the output of ``csvcut`` becomes the input to ``csvlook``. This also allow us to pipe output to standard Unix commands such as ``head``, which prints only the first ten lines of its input. Here, the output of ``csvlook`` becomes the input of ``head``.
In the example above, the output of :doc:`/scripts/csvcut` becomes the input to :doc:`/scripts/csvlook`. This also allows us to pipe output to standard Unix commands such as ``head``, which prints only the first ten lines of its input. Here, the output of :doc:`/scripts/csvlook` becomes the input of ``head``.

Piping is a core feature of csvkit. Of course, you can always write the output of each command to a file using ``>``. However, it's often faster and more convenient to use pipes to chain several commands together.

We can also pipe ``in2csv``, allowing us to combine all our previous operations into one:
We can also pipe :doc:`/scripts/in2csv`, allowing us to combine all our previous operations into one:

.. code-block:: bash
Expand Down

0 comments on commit f63bea8

Please sign in to comment.