From fd62299d295793f77f5fe74b9e4f94aed967663f Mon Sep 17 00:00:00 2001
From: Christopher Groskopf
Date: Mon, 26 Dec 2011 12:02:53 -0600
Subject: [PATCH] Implement new xlsx support based on openpyxl.

---
 csvkit/convert/__init__.py |   7 ++-
 csvkit/convert/xlsx.py     | 118 +++++++++++++++++++++++++++++++++++++
 csvkit/unicsv.py           |   2 +
 csvkit/utilities/in2csv.py |   2 +-
 requirements.txt           |   1 +
 5 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 csvkit/convert/xlsx.py

diff --git a/csvkit/convert/__init__.py b/csvkit/convert/__init__.py
index 1d08f9e0d..b5f3f293a 100644
--- a/csvkit/convert/__init__.py
+++ b/csvkit/convert/__init__.py
@@ -4,8 +4,9 @@
 from fixed import fixed2csv
 from js import json2csv
 from xls import xls2csv
+from xlsx import xlsx2csv
 
-SUPPORTED_FORMATS = ['fixed', 'xls', 'csv', 'json']
+SUPPORTED_FORMATS = ['fixed', 'xls', 'xlsx', 'csv', 'json']
 
 def convert(f, format, schema=None, key=None, **kwargs):
     """
@@ -24,6 +25,8 @@ def convert(f, format, schema=None, key=None, **kwargs):
         return fixed2csv(f, schema, **kwargs)
     elif format == 'xls':
         return xls2csv(f, **kwargs)
+    elif format == 'xlsx':
+        return xlsx2csv(f, **kwargs)
     elif format == 'json':
         return json2csv(f, key, **kwargs)
     elif format == 'csv':
@@ -45,6 +48,8 @@ def guess_format(filename):
 
     if extension == 'xls':
         return extension
+    elif extension == 'xlsx':
+        return extension
     elif extension in ['json', 'js']:
         return 'json'
     elif extension == 'csv':
diff --git a/csvkit/convert/xlsx.py b/csvkit/convert/xlsx.py
new file mode 100644
index 000000000..a5c48f689
--- /dev/null
+++ b/csvkit/convert/xlsx.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+from cStringIO import StringIO
+import datetime
+from types import NoneType
+
+from openpyxl.reader.excel import load_workbook
+
+from csvkit import table
+
+def normalize_empty(values, **kwargs):
+    """
+    Normalize a column which contains only empty cells: every value becomes None.
+    """
+    return None, [None] * len(values)
+
+def normalize_unicode(values, **kwargs):
+    """
+    Normalize a column of text cells; only None and '' map to None (0 and False are kept).
+    """
+    return unicode, [unicode(v) if v is not None and v != '' else None for v in values]
+
+def normalize_ints(values, **kwargs):
+    """
+    Normalize a column of integer cells (values are passed through unchanged).
+    """
+    return int, values
+
+def normalize_floats(values, **kwargs):
+    """
+    Normalize a column of float cells; empty (None) cells stay None instead of raising.
+    """
+    return float, [float(v) if v is not None else None for v in values]
+
+def normalize_datetimes(values, **kwargs):
+    """
+    Normalize a column of datetime cells (values are passed through unchanged).
+    """
+    return datetime.datetime, values
+
+def normalize_dates(values, **kwargs):
+    """
+    Normalize a column of date cells (values are passed through unchanged).
+    """
+    return datetime.date, values
+
+def normalize_booleans(values, **kwargs):
+    """
+    Normalize a column of boolean cells; empty (None or '') cells stay None, not False.
+    """
+    return bool, [bool(v) if v is not None and v != '' else None for v in values]
+
+# TODO (unspecified upstream); maps each column type to its normalizer function
+NORMALIZERS = {
+    unicode: normalize_unicode,
+    datetime.datetime: normalize_datetimes,
+    datetime.date: normalize_dates,
+    bool: normalize_booleans,
+    int: normalize_ints,
+    float: normalize_floats,
+    NoneType: normalize_empty
+}
+
+def determine_column_type(types):
+    """
+    Determine the correct type for a column from a list of cell types (NoneType is ignored).
+    """
+    types_set = set(types)
+    types_set.discard(NoneType)
+
+    if len(types_set) == 2:
+        if types_set == set([int, float]):
+            return float
+        elif types_set == set([datetime.datetime, datetime.date]):
+            return datetime.datetime
+
+    # Normalize mixed types to text
+    if len(types_set) > 1:
+        return unicode
+
+    try:
+        return types_set.pop()
+    except KeyError:
+        return NoneType
+
+def xlsx2csv(f, **kwargs):
+    """
+    Convert the active sheet of an Excel .xlsx file to csv and return it as a string.
+    """
+    book = load_workbook(f)
+    sheet = book.get_active_sheet()
+
+    tab = table.Table()
+
+    for i, column in enumerate(sheet.columns):
+        # Trim headers
+        column_name = column[0].value
+
+        # Empty column name? Truncate remaining data
+        if not column_name:
+            break
+
+        values = [c.value for c in column[1:]]
+        types = [type(v) for v in values]
+
+        column_type = determine_column_type(types)
+        t, normal_values = NORMALIZERS[column_type](values)
+
+        column = table.Column(i, column_name, normal_values, normal_type=t)
+        tab.append(column)
+
+    o = StringIO()
+    tab.to_csv(o)
+    output = o.getvalue()
+    o.close()
+
+    return output
+
diff --git a/csvkit/unicsv.py b/csvkit/unicsv.py
index a6b47dfa3..2a7a855da 100644
--- a/csvkit/unicsv.py
+++ b/csvkit/unicsv.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 import codecs
 import csv
 from cStringIO import StringIO
diff --git a/csvkit/utilities/in2csv.py b/csvkit/utilities/in2csv.py
index 38b28426a..c9b2ff315 100644
--- a/csvkit/utilities/in2csv.py
+++ b/csvkit/utilities/in2csv.py
@@ -45,7 +45,7 @@ def main(self):
 
         if isinstance(self.args.file, file):
             f = self.args.file
-        elif format == 'xls':
+        elif format in ('xls', 'xlsx'):
             f = open(self.args.file, 'rb')
         else:
             f = open(self.args.file, 'rU')
diff --git a/requirements.txt b/requirements.txt
index d00f58f97..fef4b4acf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ python-dateutil==1.5
 sqlalchemy==0.6.6
 sphinx==1.0.7
 coverage==3.5.1b1
+openpyxl==1.5.6