forked from wireservice/csvkit
-
Notifications
You must be signed in to change notification settings - Fork 1
/
typeinference.py
403 lines (320 loc) · 12.7 KB
/
typeinference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#!/usr/bin/env python
import datetime
from types import NoneType
from dateutil.parser import parse
from exceptions import InvalidValueForTypeException, InvalidValueForTypeListException
# Tokens (compared lowercased) that represent a null/missing value.
NULL_VALUES = ('na', 'n/a', 'none', 'null', '.')
# Tokens (compared lowercased) recognized as boolean true.
TRUE_VALUES = ('yes', 'y', 'true', 't')
# Tokens (compared lowercased) recognized as boolean false.
FALSE_VALUES = ('no', 'n', 'false', 'f')
# Sentinel passed as dateutil.parser.parse(default=...): any date/time field
# the input string does not specify is copied from this value, so an unchanged
# date part (NULL_DATE) means the input was a bare time, and an unchanged time
# part (NULL_TIME) means the input was a bare date.
DEFAULT_DATETIME = datetime.datetime(9999, 12, 31, 0, 0, 0)
NULL_DATE = datetime.date(9999, 12, 31)
NULL_TIME = datetime.time(0, 0, 0)
def normalize_column_type(l, normal_type=None):
    """
    Attempts to normalize a list (column) of string values to booleans, integers,
    floats, dates, times, datetimes, or strings. NAs and missing values are converted
    to empty strings. Empty strings are converted to nulls.

    Types are tried narrowest-first (null, bool, int, float, date/time kinds)
    and the first one that every value coerces to wins; unicode is the fallback.

    Optional accepts a "normal_type" argument which specifies a type that the values
    must conform to (rather than inferring). Will raise InvalidValueForTypeException
    if a value is not coercable.

    Note: this mutates `l` in place (null tokens are rewritten to '').
    The `except` handlers below deliberately rely on `i` and `x` leaking out
    of the preceding `for` loop to report the offending index and value.

    Returns a tuple of (type, normal_values).
    """
    # Optimizations: bind unbound unicode methods once (Python 2 idiom).
    lower = unicode.lower
    replace = unicode.replace
    # Convert "NA", "N/A", etc. to null types.
    for i, x in enumerate(l):
        if x is None or lower(x) in NULL_VALUES:
            l[i] = ''
    # Are they null?
    if not normal_type or normal_type == NoneType:
        try:
            for i, x in enumerate(l):
                if x != '':
                    raise ValueError('Not null')
            return NoneType, [None] * len(l)
        except ValueError:
            # Only an error if the caller demanded NoneType; otherwise fall
            # through and try the next (wider) type.
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)
    # Are they boolean?
    if not normal_type or normal_type == bool:
        try:
            normal_values = []
            append = normal_values.append
            for i, x in enumerate(l):
                if x == '':
                    append(None)
                elif x.lower() in TRUE_VALUES:
                    append(True)
                elif x.lower() in FALSE_VALUES:
                    append(False)
                else:
                    raise ValueError('Not boolean')
            return bool, normal_values
        except ValueError:
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)
    # Are they integers?
    if not normal_type or normal_type == int:
        try:
            normal_values = []
            append = normal_values.append
            for i, x in enumerate(l):
                if x == '':
                    append(None)
                    continue
                # Commas are stripped as thousands separators before parsing.
                int_x = int(replace(x, ',', ''))
                # Zero-padded values like "0123" are identifiers, not numbers:
                # short-circuit straight to unicode via TypeError.
                # NOTE(review): this uses int(x), not int_x -- for a value like
                # "0,123" int(x) raises ValueError instead, so the value falls
                # through to the float branch; confirm whether int_x was meant.
                if x[0] == '0' and int(x) != 0:
                    raise TypeError('Integer is padded with 0s, so treat it as a string instead.')
                append(int_x)
            return int, normal_values
        except TypeError:
            if normal_type == int:
                raise InvalidValueForTypeException(i, x, int)
            return unicode, [x if x != '' else None for x in l]
        except ValueError:
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)
    # Are they floats?
    if not normal_type or normal_type == float:
        try:
            normal_values = []
            append = normal_values.append
            for i, x in enumerate(l):
                if x == '':
                    append(None)
                    continue
                float_x = float(replace(x, ',', ''))
                append(float_x)
            return float, normal_values
        except ValueError:
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)
    # Are they datetimes?
    if not normal_type or normal_type in [datetime.time, datetime.date, datetime.datetime]:
        try:
            normal_values = []
            append = normal_values.append
            # Track which temporal kinds actually occurred so mixed columns
            # can be reconciled (or rejected) after the loop.
            normal_types_set = set()
            add = normal_types_set.add
            for i, x in enumerate(l):
                if x == '':
                    append(None)
                    continue
                # DEFAULT_DATETIME is a sentinel: fields absent from x keep
                # the sentinel's value, revealing what the input contained.
                d = parse(x, default=DEFAULT_DATETIME)
                # Is it only a time?
                if d.date() == NULL_DATE:
                    if normal_type and normal_type != datetime.time:
                        raise InvalidValueForTypeException(i, x, normal_type)
                    d = d.time()
                    add(datetime.time)
                # Is it only a date?
                elif d.time() == NULL_TIME:
                    if normal_type and normal_type not in [datetime.date, datetime.datetime]:
                        raise InvalidValueForTypeException(i, x, normal_type)
                    d = d.date()
                    add(datetime.date)
                # It must be a date and time
                else:
                    if normal_type and normal_type != datetime.datetime:
                        raise InvalidValueForTypeException(i, x, normal_type)
                    add(datetime.datetime)
                append(d)
            # No special handling if column contains only one type
            if len(normal_types_set) == 1:
                pass
            # If a mix of dates and datetimes, up-convert dates to datetimes
            elif normal_types_set == set([datetime.datetime, datetime.date]):
                for i, v in enumerate(normal_values):
                    if v.__class__ == datetime.date:
                        normal_values[i] = datetime.datetime.combine(v, NULL_TIME)
                normal_types_set.discard(datetime.date)
            # Datetimes and times don't mix -- fallback to using strings
            elif normal_types_set == set([datetime.datetime, datetime.time]):
                raise ValueError('Cant\'t coherently mix datetimes and times in a single column.')
            # Dates and times don't mix -- fallback to using strings
            elif normal_types_set == set([datetime.date, datetime.time]):
                raise ValueError('Can\'t coherently mix dates and times in a single column.')
            return normal_types_set.pop(), normal_values
        except ValueError:
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)
        except OverflowError:
            # dateutil can overflow on absurd numeric strings.
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)
    # Don't know what they are, so they must just be strings
    return unicode, [x if x != '' else None for x in l]
def normalize_table(rows, normal_types=None, accumulate_errors=False):
    """
    Given a sequence of sequences, normalize the lot.

    Optionally accepts a normal_types parameter which is a list of
    types that the columns must normalize to.

    If accumulate_errors is True, per-column InvalidValueForTypeException
    errors are collected and raised together as a single
    InvalidValueForTypeListException keyed by column index; otherwise the
    first error propagates immediately.

    Returns a tuple of (normal_types, normal_columns).
    """
    data_columns = []
    column_count = 0
    row_count = 0
    # Transpose rows into columns, growing the column list on the fly and
    # back-filling new columns with None for rows already seen.
    # NOTE(review): a row that is *shorter* than column_count appends nothing
    # to the trailing columns, which leaves them ragged/misaligned -- confirm
    # whether callers guarantee rectangular (or only-growing) rows.
    for row in rows:
        while column_count < len(row):
            data_columns.append([None] * row_count)
            column_count += 1
        for i, value in enumerate(row):
            data_columns[i].append(value)
        row_count += 1
    new_normal_types = []
    new_normal_columns= []
    errors = {}
    for i, column in enumerate(data_columns):
        try:
            if normal_types:
                t, c = normalize_column_type(column, normal_types[i])
            else:
                t, c = normalize_column_type(column)
            new_normal_types.append(t)
            new_normal_columns.append(c)
        except InvalidValueForTypeException, e:
            if not accumulate_errors:
                raise
            errors[i] = e
    if errors:
        raise InvalidValueForTypeListException(errors)
    return new_normal_types, new_normal_columns
# Every type the guesser may propose, ordered narrowest to widest;
# unicode matches anything and is the fallback.
AVAILABLE_TYPES = (bool, int, float, datetime.time, datetime.date, datetime.datetime, unicode)
# Union of recognized true and false tokens.
BOOL_VALUES = TRUE_VALUES + FALSE_VALUES
def can_be_null(val):
    """Return True if val is empty/falsy or is a recognized null token."""
    return (not val) or (val.lower() in NULL_VALUES)
def can_be_bool(val):
    """Return True if val (case-insensitive) is a recognized boolean token."""
    lowered = val.lower()
    return lowered in BOOL_VALUES
def can_be_int(val):
    """Return True if val parses as an integer (commas allowed as thousands
    separators), except zero-padded values like '0123', which are treated
    as strings elsewhere."""
    try:
        parsed = int(val.replace(',', ''))
    except ValueError:
        return False
    # A leading '0' on a non-zero value means identifier-style padding.
    return not (val[0] == '0' and parsed != 0)
def can_be_float(val):
    """Return True if val parses as a float; commas are stripped as
    thousands separators before parsing."""
    try:
        # Result discarded on purpose -- only parseability matters.
        # (The original bound it to an unused local.)
        float(val.replace(',', ''))
        return True
    except ValueError:
        return False
def can_be_time(val):
    """Return True if val parses as a time-only value.

    dateutil fills unspecified fields from DEFAULT_DATETIME, so a result
    whose date part equals NULL_DATE means val carried no date information.
    """
    try:
        d = parse(val, default=DEFAULT_DATETIME)
    # Narrowed from a bare except (which also swallowed KeyboardInterrupt
    # and SystemExit): parse() raises ValueError/OverflowError on bad
    # strings and TypeError on non-string input.
    except (TypeError, ValueError, OverflowError):
        return False
    return d.date() == NULL_DATE
def can_be_date(val):
    """Return True if val parses as a date-only value.

    A time part equal to NULL_TIME means no time was given; the date part
    must differ from the NULL_DATE sentinel, i.e. a date was actually given.
    """
    try:
        d = parse(val, default=DEFAULT_DATETIME)
    # Narrowed from a bare except (which also swallowed KeyboardInterrupt
    # and SystemExit): parse() raises ValueError/OverflowError on bad
    # strings and TypeError on non-string input.
    except (TypeError, ValueError, OverflowError):
        return False
    return d.time() == NULL_TIME and d.date() != NULL_DATE
def can_be_datetime(val):
    """Return True if val parses with both a date part and a time part.

    The date must differ from the NULL_DATE sentinel (a date was given) and
    the whole value must differ from DEFAULT_DATETIME (something was given).
    """
    try:
        d = parse(val, default=DEFAULT_DATETIME)
    # Narrowed from a bare except (which also swallowed KeyboardInterrupt
    # and SystemExit): parse() raises ValueError/OverflowError on bad
    # strings and TypeError on non-string input.
    except (TypeError, ValueError, OverflowError):
        return False
    return d.date() != NULL_DATE and d != DEFAULT_DATETIME
# Dispatch table: candidate type -> predicate answering "could this string
# value be coerced to that type?". unicode accepts anything, so it is always
# a surviving candidate in assess_row.
can_be = {
    bool: can_be_bool,
    int: can_be_int,
    float: can_be_float,
    datetime.time: can_be_time,
    datetime.date: can_be_date,
    datetime.datetime: can_be_datetime,
    unicode: lambda x: True,
}
def assess_row(row, limitations=None):
    """
    Given a row of data, return a sequence whose members are sets of types which
    could possibly apply to values in the given row.

    If limitations is not None/empty, it should be a sequence of the same length
    as 'row', and the return value will not include any types which were not in
    the input limitations. The expected usage model would be to iteratively call
    this for each row, passing back the return as the limitations for the next row.

    If 'limitations' is a sequence of all 'unicode' (the "widest" data type) then
    this call will return the same list immediately. To short circuit iterative
    calls to this function after that equilibrium has been reached, consider
    testing before calling using 'all_unicode' defined elsewhere in this module.
    """
    # Default changed from a mutable [] to None (classic shared-default
    # pitfall); an explicit empty sequence still behaves identically.
    if limitations:
        # All resolved? (TODO: bail out with exception?)
        if all(len(limit) == 1 for limit in limitations):
            return limitations
    else:
        # Everything is possible
        limitations = [set(AVAILABLE_TYPES) for _ in row]
    result = []
    for value, column_limits in zip(row, limitations):
        new_column_limits = set()
        for limit in column_limits:
            # Null-ish values are compatible with every candidate type.
            if not value or can_be_null(value) or can_be[limit](value):
                new_column_limits.add(limit)
        result.append(new_column_limits)
    return result
def reduce_assessment(limitations):
    """
    In some cases, an entire dataset might be reviewed and assess_row might not
    have boiled it down to unique values. And in any case, we want our list of
    sets to be a list of singular types, so collapse each candidate set to its
    narrowest sensible member.

    Unlike the original, this does not mutate the caller's sets.
    """
    result = []
    for limits in limitations:
        # Unicode is always in the set, so a singleton must be {unicode}.
        if len(limits) == 1:
            result.append(unicode)
            continue
        # Work on a copy so the caller's sets are left intact (the previous
        # implementation destructively removed/popped from them).
        candidates = limits - set([unicode])
        # Only one match other than unicode then it's exact
        if len(candidates) == 1:
            result.append(candidates.pop())
        # Dates may be represented as datetimes
        elif candidates == set([datetime.datetime, datetime.date]):
            result.append(datetime.date)
        # Bool can be misidentified as float, int, datetime, date
        elif bool in candidates:
            result.append(bool)
        # Int can be misidentified as float or date
        elif int in candidates:
            result.append(int)
        # Float can be misidentified as date
        elif float in candidates:
            result.append(float)
        # If all else fails, it's unicode
        else:
            result.append(unicode)
    return result
def generate_type_hypothesis(sample_rows):
    """
    Produce a hypothesized type for each column by narrowing the candidate
    type sets row by row over a sample, then collapsing each set to a
    single type.
    """
    candidate_limits = []
    for sample in sample_rows:
        candidate_limits = assess_row(sample, candidate_limits)
    return reduce_assessment(candidate_limits)
def fast_normalize_table(rows, column_ids, sample_size):
    """
    Normalizes a table using type guessing.

    Column types are hypothesized from the first sample_size rows, then every
    column is coerced to its hypothesized type via normalize_column_type.

    Raises InvalidValueForTypeException if a value outside the sample does not
    coerce to the guessed type.

    Returns a tuple of (normal_types, normal_columns).
    """
    data_columns = [[] for c in column_ids]
    row_count = 0
    sample_rows = []
    for row in rows:
        if row_count < sample_size:
            sample_rows.append(row)
        # Iterate the requested columns directly rather than enumerate(row)
        # with double indirection; the IndexError break is preserved for
        # rows shorter than the highest requested column id.
        for i, column_id in enumerate(column_ids):
            try:
                data_columns[i].append(row[column_id].strip())
            except IndexError:
                # Non-rectangular data is truncated
                break
        row_count += 1
    normal_types = generate_type_hypothesis(sample_rows)
    new_normal_columns = []
    for i, column in enumerate(data_columns):
        # InvalidValueForTypeException propagates to the caller; the original
        # wrapped this in a catch-and-reraise, which was a no-op.
        t, c = normalize_column_type(column, normal_types[i])
        new_normal_columns.append(c)
    return normal_types, new_normal_columns