Skip to content

Commit

Permalink
Implement handling of csv maximum field size with -z flag.
Browse files Browse the repository at this point in the history
  • Loading branch information
onyxfish committed Sep 1, 2011
1 parent dee83e2 commit ef013e3
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 3 deletions.
8 changes: 7 additions & 1 deletion csvkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,12 @@ def _init_common_parser(self):
if 'p' not in self.override_flags:
self.argparser.add_argument('-p', '--escapechar', dest='escapechar',
help='Character used to escape the delimiter if quoting is set to "Quote None" and the quotechar if doublequote is not specified.')
if 'z' not in self.override_flags:
self.argparser.add_argument('-z', '--maxfieldsize', dest='maxfieldsize', type=int,
help='Maximum length of a single field in the input CSV file.')
if 'e' not in self.override_flags:
self.argparser.add_argument('-e', '--encoding', dest='encoding', default='utf-8',
help='Specify the encoding the input file.')
help='Specify the encoding the input CSV file.')
if 'v' not in self.override_flags:
self.argparser.add_argument('-v', '--verbose', dest='verbose', action='store_true',
help='Print detailed tracebacks when errors occur.')
Expand Down Expand Up @@ -148,6 +151,9 @@ def _extract_csv_reader_kwargs(self):
if self.args.escapechar:
kwargs['escapechar'] = self.args.escapechar

if self.args.maxfieldsize:
kwargs['maxfieldsize'] = self.args.maxfieldsize

return kwargs

def _extract_csv_writer_kwargs(self):
Expand Down
8 changes: 8 additions & 0 deletions csvkit/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ def __unicode__(self):
def __str__(self):
return self.msg

class FieldSizeLimitError(CustomException):
    """
    Raised when a field in the CSV file is longer than the maximum field
    size in effect, whether that is the default or a user-supplied value.
    """
    def __init__(self, limit):
        # `limit` is rendered with %i, i.e. as an integer, in the message.
        template = 'CSV contains fields longer than maximum length of %i characters. Try raising the maximum with the --maxfieldsize flag.'
        self.msg = template % limit

class ColumnIdentifierError(CustomException):
"""
Exception raised when the user supplies an invalid column identifier.
Expand Down
17 changes: 15 additions & 2 deletions csvkit/unicsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import csv
from cStringIO import StringIO

from csvkit.exceptions import FieldSizeLimitError

"""
The following classes are adapted from the CSV module documentation.
"""
Expand All @@ -24,12 +26,23 @@ class UnicodeCSVReader(object):
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, encoding='utf-8', **kwargs):
def __init__(self, f, encoding='utf-8', maxfieldsize=None, **kwargs):
# Wraps `f` in UTF8Recoder(f, encoding) and builds a csv.reader over it;
# any remaining keyword arguments are passed straight to csv.reader.
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, **kwargs)

# NOTE(review): csv.field_size_limit() changes a module-wide setting, so the
# new limit applies to every csv reader in the process, not only this one.
# The truthiness test means maxfieldsize=None (the default) or 0 leaves the
# current limit untouched.
if maxfieldsize:
csv.field_size_limit(maxfieldsize)

def next(self):
# Returns the next row of the wrapped csv reader as a list of unicode
# strings (each cell decoded from UTF-8). A csv.Error whose message reports
# an oversized field is re-raised as FieldSizeLimitError.
# NOTE(review): this page renders the diff without +/- markers; the bare
# read on the next line appears to be the line the commit replaced with the
# try block below -- confirm against the raw diff.
row = self.reader.next()
try:
row = self.reader.next()
except csv.Error, e:
# Terrible way to test for this exception, but there is no subclass
if 'field larger than field limit' in str(e):
# Called with no argument, csv.field_size_limit() returns the current limit.
raise FieldSizeLimitError(csv.field_size_limit())
else:
# NOTE(review): in Python 2, `raise e` restarts the traceback at this
# line; a bare `raise` would preserve the original traceback.
raise e

return [unicode(s, 'utf-8') for s in row]

def __iter__(self):
Expand Down

0 comments on commit ef013e3

Please sign in to comment.