forked from wireservice/csvkit
-
Notifications
You must be signed in to change notification settings - Fork 1
/
typeinference.py
246 lines (194 loc) · 8.74 KB
/
typeinference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
#!/usr/bin/env python
import datetime
from types import NoneType
from dateutil.parser import parse
from exceptions import InvalidValueForTypeException, InvalidValueForTypeListException
# Tokens that normalize_column_type compares against lowercased cell values.
# Cells matching NULL_VALUES are replaced with empty strings before inference.
NULL_VALUES = (
    'na',
    'n/a',
    'none',
    'null',
    '.',
)
# Lowercased spellings accepted as boolean True / False.
TRUE_VALUES = (
    'yes',
    'y',
    'true',
    't',
)
FALSE_VALUES = (
    'no',
    'n',
    'false',
    'f',
)

# Sentinels used when parsing dates: DEFAULT_DATETIME is passed as the
# ``default`` for dateutil's parser, and the parsed result is then compared
# against NULL_DATE / NULL_TIME to tell which parts were absent from the text.
NULL_DATE = datetime.date(9999, 12, 31)
NULL_TIME = datetime.time(0, 0, 0)
DEFAULT_DATETIME = datetime.datetime.combine(NULL_DATE, NULL_TIME)
def normalize_column_type(l, normal_type=None, blanks_as_nulls=True):
    """
    Attempts to normalize a list (column) of string values to booleans, integers,
    floats, dates, times, datetimes, or strings.

    Values matching NULL_VALUES (case-insensitively) are first replaced with empty
    strings. Note that this replacement is done in place on ``l``. Empty strings and
    None are converted to nulls (None) for all non-string types. For string types
    (unicode), empty strings are converted to nulls unless blanks_as_nulls is false.

    Types are tried in this order: null, boolean, integer, float,
    time/date/datetime, and finally string:

    * Integers and floats may contain commas, which are stripped before conversion.
    * If an integer value is padded with leading zeros the whole column is treated
      as strings instead (e.g. zip codes).
    * A column mixing dates and datetimes is up-converted to datetimes. A column
      mixing times with dates or datetimes falls back to strings.

    Optionally accepts a "normal_type" argument which specifies a type that the
    values must conform to (rather than inferring). Will raise
    InvalidValueForTypeException if a value is not coercible. A normal_type that is
    not one of the types above (e.g. unicode) results in string handling.

    Returns a tuple of (type, normal_values).
    """
    # Optimizations
    lower = unicode.lower
    replace = unicode.replace

    # Convert "NA", "N/A", etc. to null types.
    for i, x in enumerate(l):
        if x is not None and lower(x) in NULL_VALUES:
            l[i] = ''

    # Are they null?
    if not normal_type or normal_type == NoneType:
        try:
            for i, x in enumerate(l):
                if x != '' and x is not None:
                    raise ValueError('Not null')

            return NoneType, [None] * len(l)
        except ValueError:
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)

    # Are they boolean?
    if not normal_type or normal_type == bool:
        try:
            normal_values = []
            append = normal_values.append

            for i, x in enumerate(l):
                if x == '' or x is None:
                    append(None)
                elif x.lower() in TRUE_VALUES:
                    append(True)
                elif x.lower() in FALSE_VALUES:
                    append(False)
                else:
                    raise ValueError('Not boolean')

            return bool, normal_values
        except ValueError:
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)

    # Are they integers?
    if not normal_type or normal_type == int:
        try:
            normal_values = []
            append = normal_values.append

            for i, x in enumerate(l):
                if x == '' or x is None:
                    append(None)
                    continue

                int_x = int(replace(x, ',', ''))

                # Compare the already comma-stripped value: calling int() on the
                # raw string would raise ValueError for values such as "0,123"
                # and skip the zero-padding check entirely.
                if x[0] == '0' and int_x != 0:
                    raise TypeError('Integer is padded with 0s, so treat it as a string instead.')

                append(int_x)

            return int, normal_values
        except TypeError:
            # TypeError is the in-band signal for "zero-padded integer".
            if normal_type == int:
                raise InvalidValueForTypeException(i, x, int)

            if blanks_as_nulls:
                return unicode, [x if x != '' else None for x in l]
            else:
                return unicode, l
        except ValueError:
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)

    # Are they floats?
    if not normal_type or normal_type == float:
        try:
            normal_values = []
            append = normal_values.append

            for i, x in enumerate(l):
                if x == '' or x is None:
                    append(None)
                    continue

                float_x = float(replace(x, ',', ''))

                append(float_x)

            return float, normal_values
        except ValueError:
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)

    # Are they datetimes?
    if not normal_type or normal_type in [datetime.time, datetime.date, datetime.datetime]:
        try:
            normal_values = []
            append = normal_values.append
            # Kinds of non-null values seen in the column (time, date, datetime).
            normal_types_set = set()
            add = normal_types_set.add

            for i, x in enumerate(l):
                if x == '' or x is None:
                    append(None)
                    continue

                # Parts missing from the text are filled in from DEFAULT_DATETIME,
                # which is how time-only and date-only values are recognized below.
                d = parse(x, default=DEFAULT_DATETIME)

                # Is it only a time?
                if d.date() == NULL_DATE:
                    if normal_type and normal_type != datetime.time:
                        raise InvalidValueForTypeException(i, x, normal_type)

                    d = d.time()
                    add(datetime.time)
                # Is it only a date?
                elif d.time() == NULL_TIME:
                    if normal_type and normal_type not in [datetime.date, datetime.datetime]:
                        raise InvalidValueForTypeException(i, x, normal_type)

                    d = d.date()
                    add(datetime.date)
                # It must be a date and time
                else:
                    if normal_type and normal_type != datetime.datetime:
                        raise InvalidValueForTypeException(i, x, normal_type)

                    add(datetime.datetime)

                append(d)

            # This case can only happen if normal_type was specified and the column
            # was empty or contained all nulls (otherwise the null check above would
            # already have returned).
            if normal_type and not normal_types_set:
                return normal_type, normal_values

            # Times don't mix with dates or datetimes -- fallback to using strings
            if datetime.time in normal_types_set and len(normal_types_set) > 1:
                raise ValueError('Can\'t coherently mix times with dates or datetimes in a single column.')

            # If a mix of dates and datetimes, or if dates were explicitly requested
            # as datetimes, up-convert dates to datetimes
            if datetime.date in normal_types_set and \
                    (datetime.datetime in normal_types_set or normal_type is datetime.datetime):
                for j, v in enumerate(normal_values):
                    if v.__class__ == datetime.date:
                        normal_values[j] = datetime.datetime.combine(v, NULL_TIME)

                return datetime.datetime, normal_values

            # Exactly one kind remains at this point.
            return normal_types_set.pop(), normal_values
        except (ValueError, OverflowError, TypeError):
            if normal_type:
                raise InvalidValueForTypeException(i, x, normal_type)

    # Don't know what they are, so they must just be strings
    if blanks_as_nulls:
        return unicode, [x if x != '' else None for x in l]
    else:
        return unicode, l
def normalize_table(rows, normal_types=None, accumulate_errors=False, blanks_as_nulls=True):
    """
    Given a sequence of sequences, normalize the lot.

    Rows are transposed into columns and each column is passed to
    normalize_column_type. Rows may have different lengths: missing cells are
    filled with None so that every column has one entry per row.

    Optionally accepts a normal_types parameter which is a list of
    types that the columns must normalize to (indexed by column position).

    If accumulate_errors is false, the first InvalidValueForTypeException is
    re-raised immediately. If it is true, errors are collected per column index
    and raised together as an InvalidValueForTypeListException once all columns
    have been processed.

    Returns a tuple of (list of column types, list of normalized columns).
    """
    data_columns = []
    row_count = 0

    for row in rows:
        # A row wider than any seen so far adds columns; back-fill them with
        # nulls for all of the rows already processed.
        while len(data_columns) < len(row):
            data_columns.append([None] * row_count)

        for i, value in enumerate(row):
            data_columns[i].append(value)

        # A row narrower than the table must still contribute a cell to every
        # column, otherwise later rows' values land at the wrong row index.
        for column in data_columns[len(row):]:
            column.append(None)

        row_count += 1

    new_normal_types = []
    new_normal_columns = []
    errors = {}

    for i, column in enumerate(data_columns):
        try:
            if normal_types:
                t, c = normalize_column_type(column, normal_types[i], blanks_as_nulls=blanks_as_nulls)
            else:
                t, c = normalize_column_type(column, blanks_as_nulls=blanks_as_nulls)

            new_normal_types.append(t)
            new_normal_columns.append(c)
        except InvalidValueForTypeException as e:
            if not accumulate_errors:
                raise

            # Keyed by column index.
            errors[i] = e

    if errors:
        raise InvalidValueForTypeListException(errors)

    return new_normal_types, new_normal_columns