Speed up transition models with linked tables
The biggest speed improvement came from explicitly copying new linked
table rows so that pandas skips its "are you assigning to a copy?"
checks. Further speedups came from using a groupby over the linked
table rows that need copying, which cuts down on repeated filtering.
Overall speedup is more than 10x.
jiffyclub committed May 15, 2014
1 parent 22312d1 commit 8333b15
Showing 3 changed files with 57 additions and 5 deletions.
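A minimal sketch (not part of the commit; the frame and column names are
throwaway) of the two ideas the commit message describes: replacing a
per-key filter with a single groupby pass, and copying slices explicitly
before assigning to them.

    import pandas as pd

    df = pd.DataFrame({'key': [1, 1, 2, 2, 3], 'val': range(5)})

    # Slow pattern: one boolean filter per key re-scans the whole frame.
    slow = {k: df[df['key'] == k] for k in df['key'].unique()}

    # Faster pattern: a single groupby pass yields the same sub-frames.
    fast = {k: g for k, g in df.groupby('key', sort=False)}

    # Copy a slice before mutating it so pandas skips its
    # "are you assigning to a copy?" bookkeeping.
    rows = fast[1].copy()
    rows['key'] = 99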
15 changes: 15 additions & 0 deletions urbansim/models/tests/test_util.py
@@ -1,3 +1,5 @@
import string

import numpy as np
import pandas as pd
import pytest
@@ -130,3 +132,16 @@ def test_dict_full(self):
        assert util.str_model_expression(expr_dict) == self.full_expected
        assert util.str_model_expression(
            expr_dict, add_constant=False) == self.full_expected_no_const


def test_sorted_groupby():
    df = pd.DataFrame(
        {'alpha': np.random.choice(list(string.lowercase), 100),
         'num': np.random.randint(100)})
    sorted_df = df.sort('alpha')

    expected = {name: d.to_dict() for name, d in df.groupby('alpha')}
    test = {name: d.to_dict()
            for name, d in util.sorted_groupby(sorted_df, 'alpha')}

    assert test == expected
19 changes: 14 additions & 5 deletions urbansim/models/transition.py
@@ -1,5 +1,7 @@
from __future__ import division

import itertools

import numpy as np
import pandas as pd

@@ -386,16 +388,23 @@ def _update_linked_table(table, col_name, added, copied, removed):
    updated : pandas.DataFrame
    """
-   table = table.loc[~table[col_name].isin(removed)]
+   table = table.loc[~table[col_name].isin(set(removed))]
+   sub_table = table.loc[table[col_name].isin(set(copied))]

    id_map = added.groupby(copied)
    new_rows = []

-   for copied_id, new_ids in id_map.items():
-       rows = table.query('{} == {}'.format(col_name, copied_id))
+   for copied_id, rows in sub_table.groupby(col_name, sort=False):
        # number of times we'll need to duplicate new_ids
        n_matching_rows = len(rows)
-       rows = rows.loc[rows.index.repeat(len(new_ids))]
+
+       new_ids = id_map[copied_id]
+
+       if len(new_ids) > 1:
+           rows = rows.loc[rows.index.repeat(len(new_ids))].copy()
+       else:
+           rows = rows.copy()

        rows[col_name] = new_ids * n_matching_rows
        new_rows.append(rows)

@@ -454,7 +463,7 @@ def transition(self, data, year, linked_tables=None):

        updated, added, copied, removed = self.transitioner(data, year)

-       for table_name, (table, col) in linked_tables.items():
+       for table_name, (table, col) in linked_tables.iteritems():
            updated_links[table_name] = \
                _update_linked_table(table, col, added, copied, removed)

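A standalone sketch (illustrative names, not from the diff) of the
index.repeat pattern the new loop uses to duplicate each matching row once
per new id and then overwrite the id column.

    import pandas as pd

    rows = pd.DataFrame({'building_id': [7, 7], 'size': [10, 20]})
    new_ids = [101, 102]

    # Each original row is repeated len(new_ids) times; the id column then
    # cycles through the new ids so each duplicate of a row gets a
    # different new id.
    dup = rows.loc[rows.index.repeat(len(new_ids))].copy()
    dup['building_id'] = new_ids * len(rows)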
28 changes: 28 additions & 0 deletions urbansim/models/util.py
@@ -196,3 +196,31 @@ def str_model_expression(expr, add_constant=True):
        model_expression += ' - 1'

    return model_expression


def sorted_groupby(df, groupby):
    """
    Perform a groupby on a DataFrame using a specific column
    and assuming that that column is sorted.

    Parameters
    ----------
    df : pandas.DataFrame
    groupby : object
        Column name on which to groupby. This column must be sorted.

    Returns
    -------
    generator
        Yields pairs of group_name, DataFrame.

    """
    start = 0
    prev = df[groupby].iloc[start]
    for i, x in enumerate(df[groupby]):
        if x != prev:
            yield prev, df.iloc[start:i]
            prev = x
            start = i
    # need to send back the last group
    yield prev, df.iloc[start:]
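Illustrative usage (not from the commit): sorted_groupby makes a single pass
over the column, so the frame must already be sorted on that key; with
unsorted input the same key could be yielded more than once.

    import pandas as pd
    from urbansim.models.util import sorted_groupby

    df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 3]})
    groups = {name: len(group) for name, group in sorted_groupby(df, 'key')}
    # groups == {'a': 2, 'b': 1}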
