Speed up transition models with linked tables
The biggest speed improvement came from explicitly copying new linked
table rows so that pandas skips its "are you assigning to a copy?"
checks. Further speedups came from using a groupby over the linked
table rows that need copying, which cuts down on repeated filtering.
Overall speedup is more than 10x.
jiffyclub committed May 15, 2014
1 parent 22312d1 commit 8333b15
Showing 3 changed files with 57 additions and 5 deletions.
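A minimal sketch (not part of the commit; the frame and column names are
throwaway) of the two ideas the commit message describes: replacing a
per-key filter with a single groupby pass, and copying slices explicitly
before assigning to them.

    import pandas as pd

    df = pd.DataFrame({'key': [1, 1, 2, 2, 3], 'val': range(5)})

    # Slow pattern: one boolean filter per key re-scans the whole frame.
    slow = {k: df[df['key'] == k] for k in df['key'].unique()}

    # Faster pattern: a single groupby pass yields the same sub-frames.
    fast = {k: g for k, g in df.groupby('key', sort=False)}

    # Copy a slice before mutating it so pandas skips its
    # "are you assigning to a copy?" bookkeeping.
    rows = fast[1].copy()
    rows['key'] = 99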
15 changes: 15 additions & 0 deletions urbansim/models/tests/test_util.py
@@ -1,3 +1,5 @@
import string

import numpy as np
import pandas as pd
import pytest
@@ -130,3 +132,16 @@ def test_dict_full(self):
        assert util.str_model_expression(expr_dict) == self.full_expected
        assert util.str_model_expression(
            expr_dict, add_constant=False) == self.full_expected_no_const


def test_sorted_groupby():
    df = pd.DataFrame(
        {'alpha': np.random.choice(list(string.lowercase), 100),
         'num': np.random.randint(100)})
    sorted_df = df.sort('alpha')

    expected = {name: d.to_dict() for name, d in df.groupby('alpha')}
    test = {name: d.to_dict()
            for name, d in util.sorted_groupby(sorted_df, 'alpha')}

    assert test == expected
19 changes: 14 additions & 5 deletions urbansim/models/transition.py
@@ -1,5 +1,7 @@
from __future__ import division

import itertools

import numpy as np
import pandas as pd

@@ -386,16 +388,23 @@ def _update_linked_table(table, col_name, added, copied, removed):
    updated : pandas.DataFrame
    """
-   table = table.loc[~table[col_name].isin(removed)]
+   table = table.loc[~table[col_name].isin(set(removed))]
+   sub_table = table.loc[table[col_name].isin(set(copied))]

    id_map = added.groupby(copied)
    new_rows = []

-   for copied_id, new_ids in id_map.items():
-       rows = table.query('{} == {}'.format(col_name, copied_id))
+   for copied_id, rows in sub_table.groupby(col_name, sort=False):
        # number of times we'll need to duplicate new_ids
        n_matching_rows = len(rows)
-       rows = rows.loc[rows.index.repeat(len(new_ids))]
+
+       new_ids = id_map[copied_id]
+
+       if len(new_ids) > 1:
+           rows = rows.loc[rows.index.repeat(len(new_ids))].copy()
+       else:
+           rows = rows.copy()

        rows[col_name] = new_ids * n_matching_rows
        new_rows.append(rows)

@@ -454,7 +463,7 @@ def transition(self, data, year, linked_tables=None):

        updated, added, copied, removed = self.transitioner(data, year)

-       for table_name, (table, col) in linked_tables.items():
+       for table_name, (table, col) in linked_tables.iteritems():
            updated_links[table_name] = \
                _update_linked_table(table, col, added, copied, removed)

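A standalone sketch (illustrative names, not from the diff) of the
index.repeat pattern the new loop uses to duplicate each matching row once
per new id and then overwrite the id column.

    import pandas as pd

    rows = pd.DataFrame({'building_id': [7, 7], 'size': [10, 20]})
    new_ids = [101, 102]

    # Each original row is repeated len(new_ids) times; the id column then
    # cycles through the new ids so each duplicate of a row gets a
    # different new id.
    dup = rows.loc[rows.index.repeat(len(new_ids))].copy()
    dup['building_id'] = new_ids * len(rows)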
28 changes: 28 additions & 0 deletions urbansim/models/util.py
@@ -196,3 +196,31 @@ def str_model_expression(expr, add_constant=True):
        model_expression += ' - 1'

    return model_expression


def sorted_groupby(df, groupby):
    """
    Perform a groupby on a DataFrame using a specific column
    and assuming that that column is sorted.

    Parameters
    ----------
    df : pandas.DataFrame
    groupby : object
        Column name on which to groupby. This column must be sorted.

    Returns
    -------
    generator
        Yields pairs of group_name, DataFrame.

    """
    start = 0
    prev = df[groupby].iloc[start]
    for i, x in enumerate(df[groupby]):
        if x != prev:
            yield prev, df.iloc[start:i]
            prev = x
            start = i
    # need to send back the last group
    yield prev, df.iloc[start:]
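Illustrative usage (not from the commit): sorted_groupby makes a single pass
over the column, so the frame must already be sorted on that key; with
unsorted input the same key could be yielded more than once.

    import pandas as pd
    from urbansim.models.util import sorted_groupby

    df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 3]})
    groups = {name: len(group) for name, group in sorted_groupby(df, 'key')}
    # groups == {'a': 2, 'b': 1}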
