Skip to content

Commit

Permalink
Ability to disable table copies in sim framework
Browse files Browse the repository at this point in the history
By default, simulation framework always returns copies of evaluated
tables and their columns, instead of views. Add parameter and
attribute to disable copies where possible. This includes
computed columns but not pandas slices which are always copies.
Added test for copying when enabled and disabled.
  • Loading branch information
daradib committed Nov 1, 2014
1 parent 6fa9115 commit 00d52ac
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 22 deletions.
81 changes: 59 additions & 22 deletions urbansim/sim/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,21 @@ class DataFrameWrapper(object):
name : str
Name for the table.
frame : pandas.DataFrame
copy : bool, optional
Whether to always copy the DataFrame each time it is evaluated.
Attributes
----------
name : str
Table name.
copy : bool
Whether to always copy the DataFrame each time it is evaluated.
"""
def __init__(self, name, frame):
def __init__(self, name, frame, copy=True):
self.name = name
self._frame = frame
self.copy = copy

@property
def columns(self):
Expand Down Expand Up @@ -159,9 +164,17 @@ def to_frame(self, columns=None):
local_cols = [c for c in self._frame.columns
if c in columns and c not in extra_cols]
extra_cols = toolz.keyfilter(lambda c: c in columns, extra_cols)
df = self._frame[local_cols].copy()
# Slicing always returns a copy, so a copy will be created
# even if copy is False.
df = self._frame[local_cols]
if self.copy:
# Prevent warnings when assigning to a desired copy.
df.is_copy = None

This comment has been minimized.

Copy link
@jiffyclub

jiffyclub Nov 1, 2014

Member

This feels a bit like it's subverting how Pandas is supposed to work. Do you know if there's any official documentation about working with views of Pandas data?

This comment has been minimized.

Copy link
@daradib

daradib Nov 2, 2014

Author Contributor

Yes, the documentation has a section on returning a view versus a copy. But the is_copy attribute is basically undocumented.

is_copy is also a bit of a misnomer. It's usually set to None, even when running DataFrame.copy(). Looking at the source, it's only set when a slice returns a (potentially unintended) copy, to trigger SettingWithCopyWarning (or an exception) when assigning to the copy, in case a user thinks that they're assigning to a view.

I don't like messing around with mostly-undocumented attributes, but it only prevents a false warning, and if pandas copy behavior changes our test should fail.

Alternatives:

  1. Disable SettingWithCopyWarning by setting pd.set_option('mode.chained_assignment', None). I think changing a user's global settings is a bad behavior though.
  2. Call DataFrame._set_is_copy(). It's an internal method, but at least we're not changing the attribute directly.
  3. Revert back to calling .copy(). It creates a second, unnecessary copy, but it's explicit, will work even if pandas one day returns views for column slices, and will take care of setting is_copy to None for us. What do you think?
else:
df = self._frame.copy()
if self.copy:

This comment has been minimized.

Copy link
@jiffyclub

jiffyclub Nov 1, 2014

Member

You have an if self.copy: statement inside the else of another if self.copy: statement. I don't think this is probably what you meant to do.

This comment has been minimized.

Copy link
@daradib

daradib Nov 2, 2014

Author Contributor

I think it's inside the else of the if columns statement as intended. Am I overlooking something?

This comment has been minimized.

Copy link
@jiffyclub

jiffyclub Nov 2, 2014

Member

Ah, you're right. Things are getting so deeply nested here I got lost. This copy thing has new ifs springing up everywhere. 😟

This comment has been minimized.

Copy link
@daradib

daradib Nov 2, 2014

Author Contributor

If we're okay with calling DataFrame.copy() for known copies, to be more explicit (alternative 3 above), we don't need to nest the conditionals.

df = self._frame.copy()
else:
df = self._frame

with log_start_finish(
'computing {!r} columns for table {!r}'.format(
Expand Down Expand Up @@ -212,7 +225,19 @@ def get_column(self, column_name):
'getting single column {!r} from table {!r}'.format(
column_name, self.name),
logger):
return self.to_frame(columns=[column_name])[column_name]
extra_cols = _columns_for_table(self.name)
if column_name in extra_cols:
with log_start_finish(
'computing column {!r} for table {!r}'.format(
column_name, self.name),
logger):
column = extra_cols[column_name]()
else:
column = self._frame[column_name]
if self.copy:
return column.copy()
else:
return column

def __getitem__(self, key):
return self.get_column(key)
Expand Down Expand Up @@ -260,20 +285,25 @@ class TableFuncWrapper(object):
Callable that returns a DataFrame.
cache : bool, optional
Whether to cache the results of calling the wrapped function.
copy : bool, optional
Whether to always copy the DataFrame each time it is evaluated.
Attributes
__________
----------
name : str
Table name.
cache : bool
Whether caching is enabled for this table.
copy : bool
Whether to always copy the DataFrame each time it is evaluated.
"""
def __init__(self, name, func, cache=False):
def __init__(self, name, func, cache=False, copy=True):
self.name = name
self._func = func
self._argspec = inspect.getargspec(func)
self.cache = cache
self.copy = copy
self._columns = []
self._index = None
self._len = 0
Expand Down Expand Up @@ -351,7 +381,8 @@ def to_frame(self, columns=None):
"""
frame = self._call_func()
return DataFrameWrapper(self.name, frame).to_frame(columns)
return DataFrameWrapper(self.name, frame,
copy=self.copy).to_frame(columns)

def get_column(self, column_name):
"""
Expand All @@ -366,11 +397,9 @@ def get_column(self, column_name):
column : pandas.Series
"""
with log_start_finish(
'getting single column {!r} from table {!r}'.format(
column_name, self.name),
logger):
return self.to_frame(columns=[column_name])[column_name]
frame = self._call_func()
return DataFrameWrapper(self.name, frame,
copy=self.copy).get_column(column_name)

def __getitem__(self, key):
return self.get_column(key)
Expand Down Expand Up @@ -404,11 +433,15 @@ class _TableSourceWrapper(TableFuncWrapper):
----------
name : str
func : callable
copy : bool, optional
Whether to always copy the DataFrame each time it is evaluated.
Attributes
----------
name : str
Table name.
copy : bool
Whether to always copy the DataFrame each time it is evaluated.
"""
def convert(self):
Expand All @@ -418,7 +451,7 @@ def convert(self):
"""
frame = self._call_func()
return add_table(self.name, frame)
return add_table(self.name, frame, copy=self.copy)

def to_frame(self, columns=None):
"""
Expand Down Expand Up @@ -764,7 +797,7 @@ def _collect_variables(names, expressions=None):
return variables


def add_table(table_name, table, cache=False):
def add_table(table_name, table, cache=False, copy=True):
"""
Register a table with the simulation.
Expand All @@ -780,6 +813,8 @@ def add_table(table_name, table, cache=False):
cache : bool, optional
Whether to cache the results of a provided callable. Does not
apply if `table` is a DataFrame.
copy : bool, optional
Whether to always copy the DataFrame each time it is evaluated.
Returns
-------
Expand All @@ -788,9 +823,9 @@ def add_table(table_name, table, cache=False):
"""
if hasattr(table, 'columns') and hasattr(table, 'index'):
# Object looks like a DataFrame, so treat it as one.
table = DataFrameWrapper(table_name, table)
table = DataFrameWrapper(table_name, table, copy=copy)
elif isinstance(table, Callable):
table = TableFuncWrapper(table_name, table, cache)
table = TableFuncWrapper(table_name, table, cache=cache, copy=copy)
else:
raise TypeError('table must be DataFrame or function.')

Expand All @@ -803,7 +838,7 @@ def add_table(table_name, table, cache=False):
return table


def table(table_name=None, cache=False):
def table(table_name=None, cache=False, copy=True):
"""
Decorates functions that return DataFrames.
Expand All @@ -820,12 +855,12 @@ def decorator(func):
name = table_name
else:
name = func.__name__
add_table(name, func, cache=cache)
add_table(name, func, cache=cache, copy=copy)
return func
return decorator


def add_table_source(table_name, func):
def add_table_source(table_name, func, copy=True):
"""
Add a DataFrame source function to the simulation.
Expand All @@ -839,19 +874,21 @@ def add_table_source(table_name, func):
The function's argument names and keyword argument values
will be matched to registered variables when the function
needs to be evaluated by the simulation framework.
copy : bool, optional
Whether to always copy the DataFrame each time it is evaluated.
Returns
-------
wrapped : `_TableSourceWrapper`
"""
wrapped = _TableSourceWrapper(table_name, func)
wrapped = _TableSourceWrapper(table_name, func, copy=copy)
logger.debug('registering table source {}'.format(table_name))
_TABLES[table_name] = wrapped
return wrapped


def table_source(table_name=None):
def table_source(table_name=None, copy=True):
"""
Decorates functions that return DataFrames.
Expand All @@ -869,7 +906,7 @@ def decorator(func):
name = table_name
else:
name = func.__name__
add_table_source(name, func)
add_table_source(name, func, copy=copy)
return func
return decorator

Expand Down
98 changes: 98 additions & 0 deletions urbansim/sim/tests/test_simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,104 @@ def asdf(x):
pdt.assert_frame_equal(sim.get_table('table').to_frame(), df * 3)


def test_table_copy(df):
sim.add_table('test_frame_copied', df, copy=True)
sim.add_table('test_frame_uncopied', df, copy=False)
sim.add_table('test_func_copied', lambda: df, copy=True)
sim.add_table('test_func_uncopied', lambda: df, copy=False)

@sim.table(copy=True)
def test_funcd_copied():
return df

@sim.table(copy=False)
def test_funcd_uncopied():
return df

@sim.table(copy=False)
def test_funcd_copied2(test_frame_copied):
return test_frame_copied.to_frame()

@sim.table(copy=True)
def test_funcd_copied3(test_frame_copied):
return test_frame_copied.to_frame()

@sim.table(copy=True)
def test_funcd_copied4(test_frame_uncopied):
return test_frame_uncopied.to_frame()

@sim.table(copy=False)
def test_funcd_uncopied2(test_frame_uncopied):
return test_frame_uncopied.to_frame()

sim.add_table_source('test_source_copied', lambda: df, copy=True)
sim.add_table_source('test_source_uncopied', lambda: df, copy=False)

@sim.table_source(copy=True)
def test_sourced_copied():
return df

@sim.table_source(copy=False)
def test_sourced_uncopied():
return df

sim.add_table('test_copied_columns', pd.DataFrame(index=df.index),
copy=True)
sim.add_table('test_uncopied_columns', pd.DataFrame(index=df.index),
copy=False)

@sim.column('test_copied_columns', 'a')
def copied_column(col='test_frame_uncopied.a'):
return col

@sim.column('test_uncopied_columns', 'a')
def uncopied_column(col='test_frame_uncopied.a'):
return col

for name in ['test_frame_copied', 'test_func_copied',
'test_funcd_copied', 'test_funcd_copied2',
'test_funcd_copied3', 'test_funcd_copied4',
'test_source_copied', 'test_sourced_copied',
'test_copied_columns']:
table = sim.get_table(name)
if 'columns' not in name:
# Run these tests for tables without computed columns.
pdt.assert_frame_equal(table.to_frame(), df)
assert table.to_frame() is not df
pdt.assert_frame_equal(table.to_frame(), table.to_frame())
assert table.to_frame() is not table.to_frame()
pdt.assert_series_equal(table.to_frame()['a'], df['a'])
assert table.to_frame()['a'] is not df['a']
pdt.assert_series_equal(table.to_frame()['a'],
table.to_frame()['a'])
assert table.to_frame()['a'] is not table.to_frame()['a']
pdt.assert_series_equal(table['a'], df['a'])
assert table['a'] is not df['a']
pdt.assert_series_equal(table['a'], table['a'])
assert table['a'] is not table['a']

for name in ['test_frame_uncopied', 'test_func_uncopied',
'test_funcd_uncopied', 'test_funcd_uncopied2',
'test_source_uncopied', 'test_sourced_uncopied',
'test_uncopied_columns']:
table = sim.get_table(name)
if 'columns' not in name:
# Run these tests for tables without computed columns.
pdt.assert_frame_equal(table.to_frame(), df)
assert table.to_frame() is df
pdt.assert_frame_equal(table.to_frame(), table.to_frame())
assert table.to_frame() is table.to_frame()
pdt.assert_series_equal(table.to_frame()['a'], df['a'])
assert table.to_frame()['a'] is df['a']
pdt.assert_series_equal(table.to_frame()['a'],
table.to_frame()['a'])
assert table.to_frame()['a'] is table.to_frame()['a']
pdt.assert_series_equal(table['a'], df['a'])
assert table['a'] is df['a']
pdt.assert_series_equal(table['a'], table['a'])
assert table['a'] is table['a']


def test_columns_for_table():
sim.add_column(
'table1', 'col10', pd.Series([1, 2, 3], index=['a', 'b', 'c']))
Expand Down

0 comments on commit 00d52ac

Please sign in to comment.