Skip to content

Commit

Permalink
Implement type inference. Remove DateType and DateColumn. #210.
Browse files Browse the repository at this point in the history
  • Loading branch information
onyxfish committed Sep 6, 2015
1 parent ed68c26 commit 5dff828
Show file tree
Hide file tree
Showing 12 changed files with 150 additions and 182 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG
@@ -1,7 +1,8 @@
0.8.0
-----


* Remove DateType and DateColumn.
* Implement robust type inference. (#210)

0.7.0
-----
Expand Down
26 changes: 9 additions & 17 deletions agate/aggregations.py
Expand Up @@ -19,9 +19,9 @@
from collections import defaultdict
import datetime

from agate.column_types import BooleanType, NumberType
from agate.columns import BooleanColumn, DateColumn, DateTimeColumn, NumberColumn, TextColumn
from agate.exceptions import NullCalculationError, UnsupportedAggregationError
from agate.column_types import *
from agate.columns import *
from agate.exceptions import *

class Aggregation(object): #pragma: no cover
"""
Expand Down Expand Up @@ -146,12 +146,10 @@ def run(self, column):
class Min(Aggregation):
"""
Compute the minimum value in a column. May be applied to
:class:`.DateColumn`, :class:`.DateTimeColumn` and :class:`.NumberColumn`.
:class:`.DateTimeColumn` and :class:`.NumberColumn`.
"""
def get_aggregate_column_type(self, column):
if isinstance(column, DateColumn):
return DateType()
elif isinstance(column, DateTimeColumn):
if isinstance(column, DateTimeColumn):
return DateTimeType()
elif isinstance(column, NumberColumn):
return NumberType()
Expand All @@ -162,22 +160,18 @@ def run(self, column):
"""
:returns: :class:`datetime.date`
"""
supported_columns = (DateColumn, DateTimeColumn, NumberColumn)

if not any(isinstance(column, t) for t in supported_columns):
if not (isinstance(column, DateTimeColumn) or isinstance(column, NumberColumn)):
raise UnsupportedAggregationError(self, column)

return min(column.get_data_without_nulls())

class Max(Aggregation):
"""
Compute the maximum value in a column. May be applied to
:class:`.DateColumn`, :class:`.DateTimeColumn` and :class:`.NumberColumn`.
:class:`.DateTimeColumn` and :class:`.NumberColumn`.
"""
def get_aggregate_column_type(self, column):
if isinstance(column, DateColumn):
return DateType()
elif isinstance(column, DateTimeColumn):
if isinstance(column, DateTimeColumn):
return DateTimeType()
elif isinstance(column, NumberColumn):
return NumberType()
Expand All @@ -186,9 +180,7 @@ def run(self, column):
"""
:returns: :class:`datetime.date`
"""
supported_columns = (DateColumn, DateTimeColumn, NumberColumn)

if not any(isinstance(column, t) for t in supported_columns):
if not (isinstance(column, DateTimeColumn) or isinstance(column, NumberColumn)):
raise UnsupportedAggregationError(self, column)

return max(column.get_data_without_nulls())
Expand Down
127 changes: 90 additions & 37 deletions agate/column_types.py
Expand Up @@ -41,6 +41,13 @@ class ColumnType(object): #pragma: no cover
def __init__(self, null_values=DEFAULT_NULL_VALUES):
self.null_values = null_values

# Abstract hook: the concrete column types in this file override ``test`` to
# report whether a raw string could belong to the type during inference.
@classmethod
def test(cls, d):
raise NotImplementedError

# Abstract hook: concrete column types override ``cast`` to convert a single
# value into the type's native Python representation.
def cast(self, d):
raise NotImplementedError

def _create_column(self, table, index):
raise NotImplementedError

Expand All @@ -59,6 +66,27 @@ def __init__(self, true_values=DEFAULT_TRUE_VALUES, false_values=DEFAULT_FALSE_V
self.true_values = true_values
self.false_values = false_values

@classmethod
def test(cls, d):
    """
    Report whether a raw string could plausibly be a boolean, for use by
    type inference.

    Commas and surrounding whitespace are removed and the value is
    lowercased before it is compared against the default null, true and
    false value collections.

    :param d: The raw string value to check.
    :returns: :code:`True` if the normalized value is a recognized null,
        true or false value, otherwise :code:`False`.
    """
    normalized = d.replace(',', '').strip().lower()

    return any(
        normalized in accepted
        for accepted in (DEFAULT_NULL_VALUES, DEFAULT_TRUE_VALUES, DEFAULT_FALSE_VALUES)
    )

def cast(self, d):
"""
Cast a single value to :class:`bool`.
Expand Down Expand Up @@ -90,53 +118,39 @@ def _create_column(self, table, index):

return BooleanColumn(table, index)

class DateType(ColumnType):
class DateTimeType(ColumnType):
"""
Column type for :class:`DateColumn`.
Column type for :class:`DateTimeColumn`.
"""
def __init__(self, date_format=None, null_values=DEFAULT_NULL_VALUES):
super(DateType, self).__init__(null_values=null_values)
def __init__(self, datetime_format=None, null_values=DEFAULT_NULL_VALUES):
super(DateTimeType, self).__init__(null_values=null_values)

self.date_format = date_format
self.datetime_format = datetime_format

def cast(self, d):
@classmethod
def test(cls, d):
"""
Cast a single value to a :class:`datetime.date`.
:param date_format: An optional :func:`datetime.strptime`
format string for parsing dates in this column.
:returns: :class:`datetime.date` or :code:`None`.
Test, for purposes of type inference, if a string value could possibly
be valid for this column type.
"""
if isinstance(d, datetime.date) or d is None:
return d

if isinstance(d, six.string_types):
d = d.strip()
d = d.strip()

if d.lower() in self.null_values:
return None

if self.date_format:
return datetime.datetime.strptime(d, self.date_format).date()
if d.lower() in DEFAULT_NULL_VALUES:
return True

# Ignore numerical values--these are never dates for inference purposes
try:
return parse(d).date()
except (TypeError, ValueError):
raise CastError('Can not parse value "%s" to as datetime for DateColumn.' % d)

def _create_column(self, table, index):
from agate.columns import DateColumn

return DateColumn(table, index)
Decimal(d)
return False
except InvalidOperation:
pass

class DateTimeType(ColumnType):
"""
Column type for :class:`DateTimeColumn`.
"""
def __init__(self, datetime_format=None, null_values=DEFAULT_NULL_VALUES):
super(DateTimeType, self).__init__(null_values=null_values)
try:
parse_result = parse(d)
except:
return False

self.datetime_format = datetime_format
return True

def cast(self, d):
"""
Expand All @@ -160,7 +174,7 @@ def cast(self, d):

try:
return parse(d)
except (TypeError, ValueError):
except:
raise CastError('Can not parse value "%s" to as datetime for DateTimeColumn.' % d)

def _create_column(self, table, index):
Expand All @@ -172,6 +186,24 @@ class TimeDeltaType(ColumnType):
"""
Column type for :class:`datetime.timedelta`.
"""
@classmethod
def test(cls, d):
    """
    Test, for purposes of type inference, if a string value could possibly
    be valid for this column type.

    :param d: The raw string value to check.
    :returns: :code:`True` if the stripped value is a default null value
        or can be parsed by :mod:`pytimeparse`, otherwise :code:`False`.
    """
    d = d.strip()

    if d.lower() in DEFAULT_NULL_VALUES:
        return True

    # A ``None`` result from pytimeparse means the string is not a duration.
    return pytimeparse.parse(d) is not None

def cast(self, d):
"""
Cast a single value to :class:`datetime.timedelta`.
Expand Down Expand Up @@ -204,6 +236,23 @@ class NumberType(ColumnType):
"""
Column type for :class:`NumberColumn`.
"""
@classmethod
def test(cls, d):
    """
    Report whether a raw string could plausibly be a number, for use by
    type inference.

    Commas are removed and surrounding whitespace is stripped before the
    check, so values such as thousands-separated figures are accepted.

    :param d: The raw string value to check.
    :returns: :code:`True` if the cleaned value is a default null value or
        is accepted by :class:`decimal.Decimal`, otherwise :code:`False`.
    """
    candidate = d.replace(',', '').strip()

    if candidate.lower() in DEFAULT_NULL_VALUES:
        return True

    try:
        Decimal(candidate)
    except InvalidOperation:
        return False
    else:
        return True

def cast(self, d):
"""
Cast a single value to a :class:`decimal.Decimal`.
Expand Down Expand Up @@ -239,6 +288,10 @@ class TextType(ColumnType):
"""
@classmethod
def test(cls, d):
"""
Test, for purposes of type inference, if a string value could possibly
be valid for this column type.

Always returns :code:`True`; the value ``d`` is not inspected.
"""
return True

def cast(self, d):
Expand Down
1 change: 0 additions & 1 deletion agate/columns/__init__.py
Expand Up @@ -16,7 +16,6 @@

from agate.columns.base import *
from agate.columns.boolean import *
from agate.columns.date import *
from agate.columns.date_time import *
from agate.columns.number import *
from agate.columns.text import *
Expand Down
9 changes: 0 additions & 9 deletions agate/columns/date.py

This file was deleted.

6 changes: 2 additions & 4 deletions agate/computations.py
Expand Up @@ -73,7 +73,7 @@ def _validate(self, table):
before_column = table.columns[self._before_column_name]
after_column = table.columns[self._after_column_name]

for column_type in (NumberColumn, DateColumn, DateTimeColumn, TimeDeltaColumn):
for column_type in (NumberColumn, DateTimeColumn, TimeDeltaColumn):
if isinstance(before_column, column_type):
if not isinstance(after_column, column_type):
raise ValueError('Specified columns must be of the same type')
Expand All @@ -91,9 +91,7 @@ def _validate(self, table):
def get_computed_column_type(self, table):
before_column, after_column = self._validate(table)

if isinstance(before_column, DateColumn):
return TimeDeltaType()
elif isinstance(before_column, DateTimeColumn):
if isinstance(before_column, DateTimeColumn):
return TimeDeltaType()
elif isinstance(before_column, TimeDeltaColumn):
return TimeDeltaType()
Expand Down
44 changes: 19 additions & 25 deletions agate/inference.py
Expand Up @@ -5,18 +5,24 @@
from agate.column_types import *
from agate.exceptions import *

def infer_types(rows, overrides={}):
possible_types = {
'boolean': BooleanType(),
'date': DateType(),
'datetime': DateTimeType(),
'timedelta': TimeDeltaType(),
'number': NumberType(),
'text': TextType()
}
def infer_types(rows, force={}):
"""
Infer types for the columns in a given set of data.
:param force: A dictionary where each key is a column name and each value
is a :class:`.ColumnType` that overrides inference.
"""
# In order of preference
possible_types = [
BooleanType,
NumberType,
TimeDeltaType,
DateTimeType,
TextType
]

num_columns = len(rows[0])
hypotheses = [set(possible_types.values()) for i in range(num_columns)]
hypotheses = [set(possible_types) for i in range(num_columns)]

for row in rows:
for i in range(num_columns):
Expand All @@ -26,30 +32,18 @@ def infer_types(rows, overrides={}):
continue

for column_type in copy(h):
try:
column_type.cast(row[i])
except CastError:
if not column_type.test(row[i]):
h.remove(column_type)

preference_order = [
possible_types['boolean'],
possible_types['number'],
possible_types['timedelta'],
possible_types['datetime'],
# possible_types['date'],
possible_types['text']
]

column_types = []

for i in range(num_columns):
h = hypotheses[i]

for t in preference_order:
# Select in prefer order
for t in possible_types:
if t in h:
column_types.append(t)
break
else:
column_types.append(None)

return column_types
5 changes: 0 additions & 5 deletions docs/api/columns.rst
Expand Up @@ -15,11 +15,6 @@ agate.columns
:undoc-members:
:show-inheritance:

.. automodule:: agate.columns.date
:members:
:undoc-members:
:show-inheritance:

.. automodule:: agate.columns.date_time
:members:
:undoc-members:
Expand Down

0 comments on commit 5dff828

Please sign in to comment.