Add HedonicModelGroup class

Manages HedonicModel instances, each of which is for a different segment (group) of the same table.
UDST · Apr 10, 2014 · c2dba01 · c2dba01
1 parent d81f84f
commit c2dba01
Show file tree

Hide file tree

Showing 2 changed files with 154 additions and 1 deletion.
diff --git a/urbansim/models/hedonic.py b/urbansim/models/hedonic.py
@@ -116,7 +116,7 @@ class HedonicModel(object):
         the results reflect actual price.
 
         By default no transformation is applied.
-    name : str, optional
+    name : optional
         Optional descriptive name for this model that may be used
         in output.
 
@@ -172,3 +172,127 @@ def predict(self, data):
             raise RuntimeError('Model has not been fit.')
         return predict(
             data, self.predict_filters, self.model_fit, self.ytransform)
+
+
+class HedonicModelGroup(object):
+    """
+    Manages a group of hedonic models that refer to different segments
+    within a single table.
+
+    Model names must match the segment names after doing a Pandas groupby.
+
+    Parameters
+    ----------
+    segmentation_col : str
+        Name of the column on which to segment.
+
+    """
+    def __init__(self, segmentation_col):
+        self.segmentation_col = segmentation_col
+        self.models = {}
+
+    def add_model(self, model):
+        """
+        Add a `HedonicModel` instance.
+
+        Parameters
+        ----------
+        model : `HedonicModel`
+            Should have a ``.name`` attribute matching one of
+            the groupby segments. Replaces any model already
+            stored under that name.
+
+        """
+        self.models[model.name] = model
+
+    def add_model_from_params(self, name, fit_filters, predict_filters,
+                              model_expression, ytransform=None):
+        """
+        Add a model by passing arguments through to `HedonicModel`.
+
+        Parameters
+        ----------
+        name : any
+            Must match a groupby segment name.
+        fit_filters : list of str
+            Filters applied before fitting the model.
+        predict_filters : list of str
+            Filters applied before calculating new data points.
+        model_expression : str
+            A patsy model expression that can be used with statsmodels.
+            Should contain both the left- and right-hand sides.
+        ytransform : callable, optional
+            A function to call on the array of predicted output.
+            For example, if the model relation is predicting the log
+            of price, you might pass ``ytransform=np.exp`` so that
+            the results reflect actual price.
+
+            By default no transformation is applied.
+
+        """
+        model = HedonicModel(
+            fit_filters, predict_filters, model_expression, ytransform, name)
+        self.models[name] = model
+
+    def _iter_groups(self, data):
+        """
+        Iterate over the groups in `data` after grouping by
+        `segmentation_col`. Skips any groups for which there
+        is no model stored.
+
+        Yields tuples of (name, df) where name is the group key
+        and df is the group DataFrame.
+
+        Parameters
+        ----------
+        data : pandas.DataFrame
+            Must have a column with the same name as `segmentation_col`.
+
+        """
+        for name, df in data.groupby(self.segmentation_col):
+            # TODO: add logging of skipped groups
+            if name in self.models:
+                yield name, df
+
+    def fit_models(self, data):
+        """
+        Fit each model that has a matching segment in `data`.
+
+        Parameters
+        ----------
+        data : pandas.DataFrame
+            Must have a column with the same name as `segmentation_col`.
+
+        Returns
+        -------
+        fits : dict of statsmodels.regression.linear_model.OLSResults
+            Keys are the segment names.
+
+        """
+        return {name: self.models[name].fit_model(df)
+                for name, df in self._iter_groups(data)}
+
+    def predict(self, data):
+        """
+        Predict new data for each group in the segmentation.
+
+        Parameters
+        ----------
+        data : pandas.DataFrame
+            Data to use for prediction. Must have a column with the
+            same name as `segmentation_col`.
+
+        Returns
+        -------
+        predicted : pandas.Series
+            Predicted data in a pandas Series. Will have the index of `data`
+            after applying filters and minus any groups that do not have
+            models. Empty when no group in `data` has a model.
+
+        """
+        results = [self.models[name].predict(df)
+                   for name, df in self._iter_groups(data)]
+        if not results:
+            # pd.concat raises ValueError when given an empty list.
+            return pd.Series(dtype=float)
+        return pd.concat(results)
diff --git a/urbansim/models/tests/test_hedonic.py b/urbansim/models/tests/test_hedonic.py
@@ -16,6 +16,12 @@ def test_df():
         index=['a', 'b', 'c', 'd', 'e'])
 
 
+@pytest.fixture
+def groupby_df(test_df):
+    test_df['group'] = list('xyxxy')  # segment label per row, in index order
+    return test_df
+
+
 def test_apply_filter_query(test_df):
     filters = ['col1 < 3', 'col2 > 6']
     filtered = hedonic.apply_filter_query(test_df, filters)
@@ -117,3 +123,26 @@ def test_HedonicModel(test_df):
     predicted = model.predict(test_df)
     expected = pd.Series([0.5, 1.5], index=['b', 'd'])
     pdt.assert_series_equal(predicted, expected)
+
+
+def test_HedonicModelGroup(groupby_df):
+    expression = 'col1 ~ col2'
+    group = hedonic.HedonicModelGroup('group')
+
+    # Register 'x' from a prebuilt model and 'y' from raw parameters.
+    group.add_model(hedonic.HedonicModel(None, None, expression, name='x'))
+    assert isinstance(group.models['x'], hedonic.HedonicModel)
+    group.add_model_from_params('y', None, None, expression)
+    y_model = group.models['y']
+    assert isinstance(y_model, hedonic.HedonicModel)
+    assert y_model.name == 'y'
+
+    fits = group.fit_models(groupby_df)
+    for segment in ('x', 'y'):
+        assert isinstance(fits[segment], RegressionResultsWrapper)
+
+    predicted = group.predict(groupby_df)
+    assert isinstance(predicted, pd.Series)
+    # Per-group results are concatenated, so sort before comparing to col1.
+    pdt.assert_series_equal(
+        predicted.sort_index(), groupby_df.col1, check_dtype=False)