
Experimental drop-in implementation for Table.from_csv

1 parent 3545f1a · commit ed1caa9dd25a49acaf0d4dfccf96dec6d3dce935 · @onyxfish committed Dec 26, 2011
Showing with 24 additions and 24 deletions.
  1. +9 −9 csvkit/table.py
  2. +15 −15 csvkit/typeinference.py
csvkit/table.py
@@ -205,20 +205,20 @@ def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, **
         else:
             column_ids = range(len(headers))

-        data_columns = [[] for c in headers]
+        #for row in reader:
+        #    for i, d in enumerate(row):
+        #        try:
+        #            data_columns[i].append(row[column_ids[i]].strip())
+        #        except IndexError:
+        #            # Non-rectangular data is truncated
+        #            break

-        for row in reader:
-            for i, d in enumerate(row):
-                try:
-                    data_columns[i].append(row[column_ids[i]].strip())
-                except IndexError:
-                    # Non-rectangular data is truncated
-                    break
+        types, data_columns = typeinference.fast_normalize_table(reader, column_ids, 50)

         columns = []

         for i, c in enumerate(data_columns):
-            columns.append(Column(column_ids[i], headers[i], c))
+            columns.append(Column(column_ids[i], headers[i], c, normal_type=types[i]))

         return Table(columns, name=name)
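For context, here is a minimal sketch of how the new call can be exercised on its own, assuming this revision of csvkit is importable; the rows and column_ids below are invented for illustration:

    from csvkit import typeinference

    # Invented sample data; inside from_csv these rows come from a csv.reader
    rows = [
        ['1', 'Alice', '2011-12-26'],
        ['2', 'Bob', '2011-12-27'],
    ]
    column_ids = range(len(rows[0]))

    # A single pass over the rows collects the column values and infers
    # each column's type from a sample of (at most) the first 50 rows.
    types, data_columns = typeinference.fast_normalize_table(rows, column_ids, 50)

The returned types line up with column_ids, which is why from_csv can pass normal_type=types[i] straight into each Column.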
csvkit/typeinference.py
@@ -354,46 +354,46 @@ def reduce_assessment(limitations):
     return result

-def generate_type_hypothesis(rows, sample_size=50):
+def generate_type_hypothesis(sample_rows):
     """
     Use type-guessing to generate a hypothesis about column types based on a
     sample of rows.
     """
     limits = []

-    for row in rows[:sample_size]:
+    for row in sample_rows:
         limits = assess_row(row, limits)

     return reduce_assessment(limits)

-def fast_normalize_table(rows):
+def fast_normalize_table(rows, column_ids, sample_size):
     """
     Normalizes a table using type guessing.
     """
-    data_columns = []
-    column_count = 0
+    data_columns = [[] for c in column_ids]
     row_count = 0
+    sample_rows = []

     for row in rows:
-        while column_count < len(row):
-            data_columns.append([None] * row_count)
-            column_count += 1
+        if row_count < sample_size:
+            sample_rows.append(row)

         for i, value in enumerate(row):
-            data_columns[i].append(value)
+            try:
+                data_columns[i].append(row[column_ids[i]].strip())
+            except IndexError:
+                # Non-rectangular data is truncated
+                break

         row_count += 1

-    normal_types = generate_type_hypothesis(rows)
+    normal_types = generate_type_hypothesis(sample_rows)

-    new_normal_columns= []
+    new_normal_columns = []

     for i, column in enumerate(data_columns):
         try:
-            if normal_types:
-                t, c = normalize_column_type(column, normal_types[i])
-            else:
-                t, c = normalize_column_type(column)
+            t, c = normalize_column_type(column, normal_types[i])
             new_normal_columns.append(c)
         except InvalidValueForTypeException:

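The second hunk is cut off above, but the pattern it introduces is visible from the caller: instead of slicing rows[:sample_size] after the fact (which requires the whole table in memory as a list), the type-guessing sample is now accumulated during the same single pass that fills the columns, so rows can be a one-shot iterator such as a csv.reader. A self-contained sketch of that pattern, with a toy guess_types standing in for csvkit's generate_type_hypothesis / normalize_column_type machinery (every name outside the diff is invented):

    def guess_types(sample_rows):
        # Toy hypothesis: a column is int if every sampled value parses as
        # an int, otherwise it stays text.
        types = []
        for column in zip(*sample_rows):
            try:
                for value in column:
                    int(value)
                types.append(int)
            except ValueError:
                types.append(str)
        return types

    def stream_normalize(rows, column_ids, sample_size=50):
        # One pass: collect column values and the type-guessing sample together.
        data_columns = [[] for _ in column_ids]
        sample_rows = []

        for row_count, row in enumerate(rows):
            if row_count < sample_size:
                sample_rows.append(row)

            for i, _ in enumerate(row):
                try:
                    data_columns[i].append(row[column_ids[i]].strip())
                except IndexError:
                    # Non-rectangular data is truncated, as in the diff above
                    break

        # The type hypothesis is built from the sample only, never the full data
        return guess_types(sample_rows), data_columns

    types, columns = stream_normalize(iter([['1', 'a'], ['2', 'b']]), [0, 1])
    # types -> [int, str]; columns -> [['1', '2'], ['a', 'b']]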