forked from wireservice/csvkit
/
xls.py
159 lines (129 loc) · 4.67 KB
/
xls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python
from cStringIO import StringIO
import datetime
import xlrd
from csvkit import table
from csvkit.exceptions import XLSDataError
def normalize_empty(values):
"""
Normalize a column which contains only empty cells.
"""
return None, [None] * len(values)
def normalize_text(values):
"""
Normalize a column of text cells.
"""
return unicode, [unicode(v) if v else None for v in values]
def normalize_numbers(values):
"""
Normalize a column of numeric cells.
"""
# Test if all values are whole numbers, if so coerce floats it ints
integral = True
for v in values:
if v and v % 1 != 0:
integral = False
break
if integral:
return int, [int(v) if v != '' else None for v in values]
else:
# Convert blanks to None
return float, [v if v else None for v in values]
def normalize_dates(values, datemode):
"""
Normalize a column of date cells.
"""
normal_values = []
normal_types_set = set()
for v in values:
# Convert blanks to None
if v == '':
normal_values.append(None)
continue
v_tuple = xlrd.xldate_as_tuple(v, datemode)
if v_tuple == (0, 0, 0, 0, 0, 0):
# Midnight
normal_values.append(datetime.time(*v_tuple[3:]))
normal_types_set.add(datetime.time)
elif v_tuple[3:] == (0, 0, 0):
# Date only
normal_values.append(datetime.date(*v_tuple[:3]))
normal_types_set.add(datetime.date)
elif v_tuple[:3] == (0, 0, 0):
# Time only
normal_values.append(datetime.time(*v_tuple[3:]))
normal_types_set.add(datetime.time)
else:
# Date and time
normal_values.append(datetime.datetime(*v_tuple))
normal_types_set.add(datetime.datetime)
if len(normal_types_set) == 1:
# No special handling if column contains only one type
pass
elif normal_types_set == set([datetime.datetime, datetime.date]):
# If a mix of dates and datetimes, up-convert dates to datetimes
for i, v in enumerate(normal_values):
if v.__class__ == datetime.date:
normal_values[i] = datetime.datetime.combine(v, datetime.time())
normal_types_set.remove(datetime.date)
elif normal_types_set == set([datetime.datetime, datetime.time]):
# Datetimes and times don't mix
raise XLSDataError('Column contains a mix of times and datetimes (this is not supported).')
elif normal_types_set == set([datetime.date, datetime.time]):
# Dates and times don't mix
raise XLSDataError('Column contains a mix of dates and times (this is not supported).')
# Natural serialization of dates and times by csv.writer is insufficent so they get converted back to strings at this point
return normal_types_set.pop(), normal_values
def normalize_booleans(values):
"""
Normalize a column of boolean cells.
"""
return bool, [bool(v) if v != '' else None for v in values]
NORMALIZERS = {
xlrd.biffh.XL_CELL_EMPTY: normalize_empty,
xlrd.biffh.XL_CELL_TEXT: normalize_text,
xlrd.biffh.XL_CELL_NUMBER: normalize_numbers,
xlrd.biffh.XL_CELL_DATE: normalize_dates,
xlrd.biffh.XL_CELL_BOOLEAN: normalize_booleans
}
def determine_column_type(types):
"""
Determine the correct type for a column from a list of cell types.
"""
types_set = set(types)
types_set.discard(xlrd.biffh.XL_CELL_EMPTY)
# Normalize mixed types to text
if len(types_set) > 1:
return xlrd.biffh.XL_CELL_TEXT
try:
return types_set.pop()
except KeyError:
return xlrd.biffh.XL_CELL_EMPTY
def xls2csv(f, **kwargs):
"""
Convert an Excel .xls file to csv.
"""
book = xlrd.open_workbook(file_contents=f.read())
sheet = book.sheet_by_index(0)
tab = table.Table()
for i in range(sheet.ncols):
# Trim headers
column_name = sheet.col_values(i)[0]
# Empty column name? Truncate remaining data
if not column_name:
break
values = sheet.col_values(i)[1:]
types = sheet.col_types(i)[1:]
column_type = determine_column_type(types)
# This is terrible code. TKTK
if column_type == xlrd.biffh.XL_CELL_DATE:
t, normal_values = NORMALIZERS[column_type](values, book.datemode)
else:
t, normal_values = NORMALIZERS[column_type](values)
column = table.Column(i, column_name, normal_values, normal_type=t)
tab.append(column)
o = StringIO()
output = tab.to_csv(o)
output = o.getvalue()
o.close()
return output