
Commit

Merge ea00709 into fcab173
jpmckinney committed Dec 20, 2022
2 parents fcab173 + ea00709 commit 3777225
Showing 33 changed files with 187 additions and 340 deletions.
2 changes: 1 addition & 1 deletion csvkit/cleanup.py
@@ -21,7 +21,7 @@ def join_rows(rows, joiner=' '):
return fixed_row


class RowChecker(object):
class RowChecker:
"""
Iterate over rows of a CSV producing cleaned rows and storing error rows.
"""
84 changes: 28 additions & 56 deletions csvkit/cli.py
@@ -2,29 +2,19 @@

import argparse
import bz2
import codecs
import gzip
import itertools
import lzma
import sys
import warnings
from os.path import splitext

import agate
import six

if six.PY3:
import lzma
elif six.PY2:
# Try import backports.lzma if available
try:
from backports import lzma
except ImportError:
lzma = None

from csvkit.exceptions import ColumnIdentifierError, RequiredHeaderError


class LazyFile(six.Iterator):
class LazyFile:
"""
A proxy for a File object that delays opening it until
a read method is called.
@@ -65,7 +55,7 @@ def __next__(self):
return next(self.f)


class CSVKitUtility(object):
class CSVKitUtility:
description = ''
epilog = ''
override_flags = ''
@@ -241,32 +231,21 @@ def _open_input_file(self, path):
"""
Open the input file specified on the command line.
"""
if six.PY2:
mode = 'Urb'
kwargs = {}
else:
mode = 'rt' # default
kwargs = {'encoding': self.args.encoding}

if not path or path == '-':
f = sys.stdin
else:
extension = splitext(path)[1]

if extension == '.gz':
f = LazyFile(gzip.open, path, mode, **kwargs)
func = gzip.open
elif extension == '.bz2':
if six.PY2:
f = LazyFile(bz2.BZ2File, path, mode, **kwargs)
else:
f = LazyFile(bz2.open, path, mode, **kwargs)
func = bz2.open
elif extension == ".xz":
if lzma is not None:
f = LazyFile(lzma.open, path, mode, **kwargs)
else:
raise RuntimeError("backports.lzma is needed for .xz support with Python 2")
func = lzma.open
else:
f = LazyFile(open, path, mode, **kwargs)
func = open

f = LazyFile(func, path, mode='rt', encoding=self.args.encoding)

return f
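
With the Python 2 branches gone, opener selection in _open_input_file reduces to a small extension-to-function mapping. A minimal, self-contained sketch of that behaviour follows; choose_opener and the example path are illustrative, not csvkit API.

import bz2
import gzip
import lzma
from os.path import splitext


def choose_opener(path):
    # One opener per compressed extension; plain files fall back to the built-in open().
    openers = {'.gz': gzip.open, '.bz2': bz2.open, '.xz': lzma.open}
    return openers.get(splitext(path)[1], open)


opener = choose_opener('example.csv.gz')  # gzip.open
# Every opener is then called the same way, in text mode:
# f = LazyFile(opener, 'example.csv.gz', mode='rt', encoding='utf-8')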

@@ -286,9 +265,6 @@ def _extract_csv_reader_kwargs(self):
if value is not None:
kwargs[arg] = value

if six.PY2 and self.args.encoding:
kwargs['encoding'] = self.args.encoding

if getattr(self.args, 'no_header_row', None):
kwargs['header'] = not self.args.no_header_row

@@ -309,9 +285,6 @@ def _install_exception_handler(self):
"""
Installs a replacement for sys.excepthook, which handles pretty-printing uncaught exceptions.
"""
if six.PY2:
sys.stderr = codecs.getwriter('utf-8')(sys.stderr)

def handler(t, value, traceback):
if self.args.verbose:
sys.__excepthook__(t, value, traceback)
@@ -323,7 +296,7 @@ def handler(t, value, traceback):
'flag or with the PYTHONIOENCODING environment variable. Use the -v flag to see '
'the complete error.\n' % self.args.encoding)
else:
sys.stderr.write('%s: %s\n' % (t.__name__, six.text_type(value)))
sys.stderr.write('%s: %s\n' % (t.__name__, str(value)))

sys.excepthook = handler

@@ -360,8 +333,7 @@ def get_column_types(self):
def get_column_offset(self):
if self.args.zero_based:
return 0
else:
return 1
return 1

def skip_lines(self):
if isinstance(self.args.skip_lines, int):
@@ -444,24 +416,24 @@ def match_column_identifier(column_names, c, column_offset=1):
Note that integer values are *always* treated as positional identifiers. If you happen to have
column names which are also integers, you must specify them using a positional index.
"""
if isinstance(c, six.string_types) and not c.isdigit() and c in column_names:
if isinstance(c, str) and not c.isdigit() and c in column_names:
return column_names.index(c)
else:
try:
c = int(c) - column_offset
# Fail out if neither a column name nor an integer
except ValueError:
raise ColumnIdentifierError("Column '%s' is invalid. It is neither an integer nor a column name. "
"Column names are: %s" % (c, repr(column_names)[1:-1]))

# Fail out if index is 0-based
if c < 0:
raise ColumnIdentifierError("Column %i is invalid. Columns are 1-based." % (c + column_offset))

# Fail out if index is out of range
if c >= len(column_names):
raise ColumnIdentifierError("Column %i is invalid. The last column is '%s' at index %i." % (
c + column_offset, column_names[-1], len(column_names) - 1 + column_offset))

try:
c = int(c) - column_offset
# Fail out if neither a column name nor an integer
except ValueError:
raise ColumnIdentifierError("Column '%s' is invalid. It is neither an integer nor a column name. "
"Column names are: %s" % (c, repr(column_names)[1:-1]))

# Fail out if index is 0-based
if c < 0:
raise ColumnIdentifierError("Column %i is invalid. Columns are 1-based." % (c + column_offset))

# Fail out if index is out of range
if c >= len(column_names):
raise ColumnIdentifierError("Column %i is invalid. The last column is '%s' at index %i." % (
c + column_offset, column_names[-1], len(column_names) - 1 + column_offset))

return c

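The match_column_identifier contract is unchanged by this refactor; a short usage sketch against a made-up header row (names and values below are hypothetical):

from csvkit.cli import match_column_identifier
from csvkit.exceptions import ColumnIdentifierError

names = ['id', 'name', '3']

match_column_identifier(names, 'name')  # 1: matched by column name
match_column_identifier(names, 2)       # 1: integers are 1-based positions by default
match_column_identifier(names, '3')     # 2: digit strings are treated as positions, not names
try:
    match_column_identifier(names, 0)
except ColumnIdentifierError:
    pass  # columns are 1-based, so 0 is rejected
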
2 changes: 1 addition & 1 deletion csvkit/convert/__init__.py
@@ -15,7 +15,7 @@ def guess_format(filename):

if extension in ('csv', 'dbf', 'fixed', 'xls', 'xlsx'):
return extension
elif extension in ['json', 'js']:
if extension in ('json', 'js'):
return 'json'

return None
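
A quick sketch of guess_format with hypothetical filenames, assuming (as in the full function, not shown here) that the format is guessed from the filename's final suffix:

from csvkit.convert import guess_format

guess_format('report.xlsx')  # 'xlsx'
guess_format('points.js')    # 'json' (.json and .js both map to JSON)
guess_format('notes.txt')    # None (unrecognized extension)
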
12 changes: 6 additions & 6 deletions csvkit/convert/fixed.py
@@ -2,9 +2,9 @@

from codecs import iterdecode
from collections import namedtuple
from io import StringIO

import agate
import six


def fixed2csv(f, schema, output=None, skip_lines=0, **kwargs):
@@ -27,10 +27,10 @@ def fixed2csv(f, schema, output=None, skip_lines=0, **kwargs):
:param skip_lines:
The number of lines to skip from the top of the file.
"""
streaming = True if output else False
streaming = bool(output)

if not streaming:
output = six.StringIO()
output = StringIO()

try:
encoding = kwargs['encoding']
@@ -59,7 +59,7 @@ def fixed2csv(f, schema, output=None, skip_lines=0, **kwargs):
return ''


class FixedWidthReader(six.Iterator):
class FixedWidthReader:
"""
Given a fixed-width file and a schema file, produce an analog to a csv
reader that yields a row of strings for each line in the fixed-width file,
@@ -95,7 +95,7 @@ def __next__(self):
FixedWidthField = namedtuple('FixedWidthField', ['name', 'start', 'length'])


class FixedWidthRowParser(object):
class FixedWidthRowParser:
"""
Instantiated with a schema, able to return a sequence of trimmed strings
representing fields given a fixed-length line. Flexible about where the
@@ -135,7 +135,7 @@ def headers(self):
return [field.name for field in self.fields]


class SchemaDecoder(object):
class SchemaDecoder:
"""
Extracts column, start, and length columns from schema rows. Once
instantiated, each time the instance is called with a row, a
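The schema-driven splitting that FixedWidthRowParser performs can be sketched with the FixedWidthField namedtuple defined above. The layout and sample line are made up, and this is a simplification rather than csvkit's implementation:

from collections import namedtuple

FixedWidthField = namedtuple('FixedWidthField', ['name', 'start', 'length'])

# Two hypothetical columns: "name" occupies characters 0-4, "age" characters 6-8.
fields = [FixedWidthField('name', 0, 5), FixedWidthField('age', 6, 3)]
line = 'alice 30 '

headers = [field.name for field in fields]                        # ['name', 'age']
row = [line[f.start:f.start + f.length].strip() for f in fields]  # ['alice', '30']
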
12 changes: 4 additions & 8 deletions csvkit/convert/geojs.py
@@ -1,14 +1,10 @@
#!/usr/bin/env python

try:
import json
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
import simplejson as json
import json
from collections import OrderedDict
from io import StringIO

import agate
import six


def geojson2csv(f, key=None, **kwargs):
@@ -58,7 +54,7 @@ def geojson2csv(f, key=None, **kwargs):
header.extend(property_fields)
header.extend(('geojson', 'type', 'longitude', 'latitude'))

o = six.StringIO()
o = StringIO()
writer = agate.csv.writer(o)

writer.writerow(header)
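geojson2csv still takes an open file-like object and, per the header built above, emits the feature properties plus geojson, type, longitude and latitude columns. A minimal sketch with a made-up single-feature FeatureCollection, assuming (as before this change) that the function returns the CSV as text:

import json
from io import StringIO

from csvkit.convert.geojs import geojson2csv

collection = {
    'type': 'FeatureCollection',
    'features': [
        {
            'type': 'Feature',
            'properties': {'name': 'City Hall'},
            'geometry': {'type': 'Point', 'coordinates': [-73.99, 40.73]},
        },
    ],
}

print(geojson2csv(StringIO(json.dumps(collection))))
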
6 changes: 3 additions & 3 deletions csvkit/exceptions.py
@@ -30,7 +30,7 @@ class CSVTestException(CustomException):
"""

def __init__(self, line_number, row, msg):
super(CSVTestException, self).__init__(msg)
super().__init__(msg)
self.line_number = line_number
self.row = row

@@ -42,7 +42,7 @@ class LengthMismatchError(CSVTestException):

def __init__(self, line_number, row, expected_length):
msg = 'Expected %i columns, found %i columns' % (expected_length, len(row))
super(LengthMismatchError, self).__init__(line_number, row, msg)
super().__init__(line_number, row, msg)

@property
def length(self):
@@ -59,7 +59,7 @@ def __init__(self, index, value, normal_type):
self.value = value
self.normal_type = normal_type
msg = 'Unable to convert "%s" to type %s (at index %i)' % (value, normal_type, index)
super(InvalidValueForTypeException, self).__init__(msg)
super().__init__(msg)


class RequiredHeaderError(CustomException):
10 changes: 4 additions & 6 deletions csvkit/grep.py
@@ -1,11 +1,10 @@
#!/usr/bin/env python

import six

from csvkit.exceptions import ColumnIdentifierError


class FilteringCSVReader(six.Iterator):
class FilteringCSVReader:
r"""
Given any row iterator, only return rows which pass the filter.
If 'header' is False, then all rows must pass the filter; by default, the first row will be passed
@@ -34,7 +33,7 @@ class FilteringCSVReader(six.Iterator):
column_names = None

def __init__(self, reader, patterns, header=True, any_match=False, inverse=False):
super(FilteringCSVReader, self).__init__()
super().__init__()

self.reader = reader
self.header = header
@@ -78,8 +77,7 @@ def test_row(self, row):

if self.any_match:
return self.inverse # False
else:
return not self.inverse # True
return not self.inverse # True


def standardize_patterns(column_names, patterns):
@@ -122,7 +120,7 @@ def pattern_as_function(obj):
return lambda x: obj in x


class regex_callable(object):
class regex_callable:

def __init__(self, pattern):
self.pattern = pattern
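FilteringCSVReader keeps its constructor signature; a small sketch with an in-memory row iterator and a dict of per-column patterns (the rows and the 'ali' pattern are made up; a plain string pattern matches by substring, as pattern_as_function shows above):

from csvkit.grep import FilteringCSVReader

rows = iter([
    ['id', 'name'],
    ['1', 'alice'],
    ['2', 'bob'],
])

filtered = FilteringCSVReader(rows, patterns={'name': 'ali'}, header=True)
print(list(filtered))  # [['id', 'name'], ['1', 'alice']]: the header row passes through untested
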
12 changes: 2 additions & 10 deletions csvkit/utilities/csvgrep.py
@@ -5,7 +5,6 @@
from argparse import FileType

import agate
import six

from csvkit.cli import CSVKitUtility
from csvkit.grep import FilteringCSVReader
@@ -16,24 +15,17 @@ class CSVGrep(CSVKitUtility):
override_flags = ['L', 'blanks', 'date-format', 'datetime-format']

def add_arguments(self):
# I feel that there ought to be a better way to do this across Python 2 and 3.
def option_parser(bytestring):
if six.PY2:
return bytestring.decode(sys.getfilesystemencoding())
else:
return bytestring

self.argparser.add_argument(
'-n', '--names', dest='names_only', action='store_true',
help='Display column names and indices from the input CSV and exit.')
self.argparser.add_argument(
'-c', '--columns', dest='columns',
help='A comma-separated list of column indices, names or ranges to be searched, e.g. "1,id,3-5".')
self.argparser.add_argument(
'-m', '--match', dest="pattern", action='store', type=option_parser,
'-m', '--match', dest="pattern", action='store',
help='A string to search for.')
self.argparser.add_argument(
'-r', '--regex', dest='regex', action='store', type=option_parser,
'-r', '--regex', dest='regex', action='store',
help='A regular expression to match.')
self.argparser.add_argument(
'-f', '--file', dest='matchfile', type=FileType('r'), action='store',
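The removed option_parser shim existed only to decode byte strings on Python 2; on Python 3 the -m and -r arguments already arrive as text, so they are passed straight through. Typical invocations, with data.csv as a placeholder filename:

csvgrep -c name -m alice data.csv
csvgrep -c 1,3-5 -r '^[0-9]+$' data.csv

The first searches the name column for the literal string alice; the second applies a regular expression to columns 1 and 3 through 5.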
