Merge ca34f90 into ec56452

UDST · Jul 30, 2014 · cca73ba · cca73ba
2 parents ec56452 + ca34f90
commit cca73ba
Show file tree

Hide file tree

Showing 9 changed files with 355 additions and 11 deletions.
diff --git a/urbansim/developer/sqftproforma.py b/urbansim/developer/sqftproforma.py
@@ -554,7 +554,6 @@ def lookup(self, form, df, only_built=True):
             df['max_far_from_dua'] = df.max_dua * df.ave_unit_size / self.config.building_efficiency
             df['min_max_fars'] = df[['min_max_fars', 'max_far_from_dua']].min(axis=1)
 
-        df['min_max_fars'] = df.min_max_fars.fillna(0)
         if only_built:
             df = df.query('min_max_fars > 0 and parcel_size > 0')
 

diff --git a/urbansim/models/lcm.py b/urbansim/models/lcm.py
@@ -8,12 +8,14 @@
 import logging
 
 import numpy as np
+from numpy import random
 import pandas as pd
 from patsy import dmatrix
 from prettytable import PrettyTable
 import toolz
 
 from . import util
+from ..exceptions import ModelEvaluationError
 from ..urbanchoice import interaction, mnl
 from ..utils import yamlio
 from ..utils.logutil import log_start_finish
@@ -238,6 +240,13 @@ def fit(self, choosers, alternatives, current_choice):
             choosers, alternatives, self.sample_size, current_choice)
         model_design = dmatrix(
             self.str_model_expression, data=merged, return_type='dataframe')
+
+        if len(merged) != model_design.as_matrix().shape[0]:
+            raise ModelEvaluationError(
+                'Estimated data does not have the same length as input.  '
+                'This suggests there are null values in one or more of '
+                'the input columns.')
+
         self.log_likelihoods, self.fit_parameters = mnl.mnl_estimate(
             model_design.as_matrix(), chosen, self.sample_size)
         self.fit_parameters.index = model_design.columns
@@ -336,6 +345,12 @@ def predict(self, choosers, alternatives, debug=False):
         model_design = dmatrix(
             self.str_model_expression, data=merged, return_type='dataframe')
 
+        if len(merged) != model_design.as_matrix().shape[0]:
+            raise ModelEvaluationError(
+                'Simulated data does not have the same length as input.  '
+                'This suggests there are null values in one or more of '
+                'the input columns.')
+
         coeffs = [self.fit_parameters['Coefficient'][x]
                   for x in model_design.columns]
 
@@ -445,6 +460,65 @@ def columns_used(self):
             self.alts_columns_used(),
             self.interaction_columns_used())))
 
+    @classmethod
+    def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname):
+        """
+        Load a model from a yaml config, fit it, and write it back.
+
+        The model is read from `cfgname`, fit against the given choosers
+        and alternatives, its fit is reported via ``report_fit``, and the
+        model is then written to the same `cfgname` it was read from.
+
+        Parameters
+        ----------
+        choosers : DataFrame
+            A dataframe of rows of agents which have locations assigned.
+        chosen_fname : string
+            Name of the column in the choosers dataframe that holds the
+            location each chooser has chosen.
+        alternatives : DataFrame
+            A dataframe of locations which should include the chosen
+            locations from the choosers dataframe as well as some other
+            locations from which to sample.  Values in
+            choosers[chosen_fname] should index into this dataframe.
+        cfgname : string
+            The name of the yaml config file from which to read the
+            location choice model (and to which it is written back).
+
+        Returns
+        -------
+        model : instance of `cls`
+            The model object after ``fit`` has been called on it.
+        """
+        model = cls.from_yaml(str_or_buffer=cfgname)
+        current_choice = choosers[chosen_fname]
+        model.fit(choosers, alternatives, current_choice)
+        model.report_fit()
+        # Write to the same config file that the model was loaded from.
+        model.to_yaml(str_or_buffer=cfgname)
+        return model
+
+    @classmethod
+    def predict_from_cfg(cls, movers, locations, cfgname,
+                         location_ratio=2.0):
+        """
+        Simulate the location choices for the specified choosers
+
+        The model is loaded from `cfgname`.  If there are more locations
+        than ``len(movers) * location_ratio``, the locations are randomly
+        sampled (without replacement) down to that many rows before
+        ``predict`` is called.
+
+        Parameters
+        ----------
+        movers : DataFrame
+            A dataframe of agents doing the choosing.
+        locations : DataFrame
+            A dataframe of locations which the choosers are located in and
+            which have a supply.
+        cfgname : string
+            The name of the yaml config file from which to read the location
+            choice model.
+        location_ratio : float
+            Above the location ratio (default of 2.0) of locations to
+            choosers, the locations will be sampled to meet this ratio
+            (for performance reasons).
+
+        Returns
+        -------
+        new_units
+            The result of calling ``predict`` with ``debug=True`` on the
+            movers and the (possibly sampled) locations.
+        """
+        lcm = cls.from_yaml(str_or_buffer=cfgname)
+
+        if len(locations) > len(movers) * location_ratio:
+            print("Location ratio exceeded: %d locations and only %d choosers" %
+                  (len(locations), len(movers)))
+            # location_ratio is a float, so the product is a float as well;
+            # numpy's random.choice needs an integer sample size.
+            n_sampled = int(len(movers) * location_ratio)
+            idxes = random.choice(locations.index, size=n_sampled,
+                                  replace=False)
+            locations = locations.loc[idxes]
+            print("  after sampling %d locations are available\n" % len(locations))
+
+        new_units = lcm.predict(movers, locations, debug=True)
+        print("Assigned %d choosers to new units" % len(new_units.index))
+        return new_units
+
 
 class MNLLocationChoiceModelGroup(object):
     """
@@ -1033,4 +1107,66 @@ def columns_used(self):
         return list(toolz.unique(toolz.concatv(
             self.choosers_columns_used(),
             self.alts_columns_used(),
-            self.interaction_columns_used())))
+            self.interaction_columns_used(),
+            [self.segmentation_col])))
+
+    @classmethod
+    def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname):
+        """
+        Load a segmented model from a yaml config, fit it, and write it back.
+
+        After fitting, ``report_fit`` is called for every model held in
+        ``_group.models`` (one per segment), and the model is written to
+        the same `cfgname` it was read from.
+
+        Parameters
+        ----------
+        choosers : DataFrame
+            A dataframe of rows of agents which have locations assigned.
+        chosen_fname : string
+            Name of the column in the choosers dataframe that holds the
+            location each chooser has chosen.
+        alternatives : DataFrame
+            A dataframe of locations which should include the chosen
+            locations from the choosers dataframe as well as some other
+            locations from which to sample.  Values in
+            choosers[chosen_fname] should index into this dataframe.
+        cfgname : string
+            The name of the yaml config file from which to read the
+            location choice model (and to which it is written back).
+
+        Returns
+        -------
+        model : instance of `cls`
+            The model object after ``fit`` has been called on it.
+        """
+        model = cls.from_yaml(str_or_buffer=cfgname)
+        current_choice = choosers[chosen_fname]
+        model.fit(choosers, alternatives, current_choice)
+
+        # Report each segment's fit under its own heading.
+        for segment, segment_model in model._group.models.items():
+            heading = "LCM RESULTS FOR SEGMENT %s\n" % str(segment)
+            print(heading)
+            segment_model.report_fit()
+
+        # Write to the same config file that the model was loaded from.
+        model.to_yaml(str_or_buffer=cfgname)
+        return model
+
+    @classmethod
+    def predict_from_cfg(cls, movers, locations, cfgname,
+                         location_ratio=2.0):
+        """
+        Simulate the location choices for the specified choosers
+
+        The model is loaded from `cfgname`.  If there are more locations
+        than ``len(movers) * location_ratio``, the locations are randomly
+        sampled (without replacement) down to that many rows before
+        ``predict`` is called.
+
+        Parameters
+        ----------
+        movers : DataFrame
+            A dataframe of agents doing the choosing.
+        locations : DataFrame
+            A dataframe of locations which the choosers are located in and
+            which have a supply.
+        cfgname : string
+            The name of the yaml config file from which to read the location
+            choice model.
+        location_ratio : float
+            Above the location ratio (default of 2.0) of locations to
+            choosers, the locations will be sampled to meet this ratio
+            (for performance reasons).
+
+        Returns
+        -------
+        new_units
+            The result of calling ``predict`` with ``debug=True`` on the
+            movers and the (possibly sampled) locations.
+        """
+        lcm = cls.from_yaml(str_or_buffer=cfgname)
+
+        if len(locations) > len(movers) * location_ratio:
+            print("Location ratio exceeded: %d locations and only %d choosers" %
+                  (len(locations), len(movers)))
+            # location_ratio is a float, so the product is a float as well;
+            # numpy's random.choice needs an integer sample size.
+            n_sampled = int(len(movers) * location_ratio)
+            idxes = random.choice(locations.index, size=n_sampled,
+                                  replace=False)
+            locations = locations.loc[idxes]
+            print("  after sampling %d locations are available\n" % len(locations))
+
+        new_units = lcm.predict(movers, locations, debug=True)
+        print("Assigned %d choosers to new units" % len(new_units.index))
+        return new_units
diff --git a/urbansim/models/regression.py b/urbansim/models/regression.py
@@ -42,6 +42,13 @@ def fit_model(df, filters, model_expression):
     """
     df = util.apply_filter_query(df, filters)
     model = smf.ols(formula=model_expression, data=df)
+
+    if len(model.exog) != len(df):
+        raise ModelEvaluationError(
+            'Estimated data does not have the same length as input.  '
+            'This suggests there are null values in one or more of '
+            'the input columns.')
+
     with log_start_finish('statsmodels OLS fit', logger):
         return model.fit()
 
@@ -325,6 +332,9 @@ class instance for use during prediction.
         self.fit_parameters = _model_fit_to_table(fit)
         if debug:
             index = util.apply_filter_query(data, self.fit_filters).index
+            assert len(fit.model.exog) == len(index), (
+                "The estimate data is unequal in length to the original "
+                "dataframe, usually caused by nans")
             df = pd.DataFrame(
                 fit.model.exog, columns=fit.model.exog_names, index=index)
             df[fit.model.endog_names] = fit.model.endog
@@ -456,6 +466,41 @@ def columns_used(self):
             util.columns_in_filters(self.predict_filters),
             util.columns_in_formula(self.model_expression))))
 
+    @classmethod
+    def fit_from_cfg(cls, df, cfgname, debug=False):
+        """
+        Load a model from a yaml config, fit it, and write it back.
+
+        The summary of the fit result is printed, and the model is then
+        written to the same `cfgname` it was read from.
+
+        Parameters
+        ----------
+        df : DataFrame
+            The dataframe which contains the columns to use for the estimation.
+        cfgname : string
+            The name of the yaml config file which describes the hedonic model.
+        debug : boolean, optional (default False)
+            Whether to generate debug information on the model.
+
+        Returns
+        -------
+        hm : instance of `cls`
+            The model object after ``fit`` has been called on it.
+        """
+        hm = cls.from_yaml(str_or_buffer=cfgname)
+        ret = hm.fit(df, debug=debug)
+        # Call form of print: valid on both Python 2 and 3 for one argument.
+        print(ret.summary())
+        hm.to_yaml(str_or_buffer=cfgname)
+        return hm
+
+    @classmethod
+    def predict_from_cfg(cls, df, cfgname):
+        """
+        Load a model from a yaml config and run its prediction on `df`.
+
+        The ``describe()`` summary of the prediction is printed.
+
+        Parameters
+        ----------
+        df : DataFrame
+            The dataframe which contains the columns to use for the
+            prediction.
+        cfgname : string
+            The name of the yaml config file which describes the hedonic model.
+
+        Returns
+        -------
+        price_or_rent
+            The result of calling ``predict`` on `df`.
+        """
+        hm = cls.from_yaml(str_or_buffer=cfgname)
+
+        price_or_rent = hm.predict(df)
+        # Call form of print: valid on both Python 2 and 3 for one argument.
+        print(price_or_rent.describe())
+
+        return price_or_rent
+
 
 class RegressionModelGroup(object):
     """
@@ -896,4 +941,50 @@ def columns_used(self):
         return list(toolz.unique(toolz.concatv(
             util.columns_in_filters(self.fit_filters),
             util.columns_in_filters(self.predict_filters),
-            self._group.columns_used())))
+            self._group.columns_used(),
+            [self.segmentation_col])))
+
+    @classmethod
+    def fit_from_cfg(cls, df, cfgname, debug=False, min_segment_size=None):
+        """
+        Load a segmented model from a yaml config, fit it, and write it back.
+
+        The summary of each segment's fit result is printed, and the model
+        is then written to the same `cfgname` it was read from.
+
+        Parameters
+        ----------
+        df : DataFrame
+            The dataframe which contains the columns to use for the estimation.
+        cfgname : string
+            The name of the yaml config file which describes the hedonic model.
+        debug : boolean, optional (default False)
+            Whether to generate debug information on the model.
+        min_segment_size : int, optional
+            If given (including 0), overrides the ``min_segment_size``
+            attribute on the loaded model.
+
+        Returns
+        -------
+        hm : instance of `cls`
+            The model object after ``fit`` has been called on it.
+        """
+        hm = cls.from_yaml(str_or_buffer=cfgname)
+        # Compare against None so that an explicit 0 is not silently ignored.
+        if min_segment_size is not None:
+            hm.min_segment_size = min_segment_size
+
+        for k, v in hm.fit(df, debug=debug).items():
+            # Call form of print: valid on both Python 2 and 3.
+            print("REGRESSION RESULTS FOR SEGMENT %s\n" % str(k))
+            print(v.summary())
+        hm.to_yaml(str_or_buffer=cfgname)
+        return hm
+
+    @classmethod
+    def predict_from_cfg(cls, df, cfgname, min_segment_size=None):
+        """
+        Load a segmented model from a yaml config and predict on `df`.
+
+        The ``describe()`` summary of the prediction is printed.
+
+        Parameters
+        ----------
+        df : DataFrame
+            The dataframe which contains the columns to use for the
+            prediction.
+        cfgname : string
+            The name of the yaml config file which describes the hedonic model.
+        min_segment_size : int, optional
+            If given (including 0), overrides the ``min_segment_size``
+            attribute on the loaded model.
+
+        Returns
+        -------
+        price_or_rent
+            The result of calling ``predict`` on `df`.
+        """
+        hm = cls.from_yaml(str_or_buffer=cfgname)
+        # Compare against None so that an explicit 0 is not silently ignored.
+        if min_segment_size is not None:
+            hm.min_segment_size = min_segment_size
+
+        price_or_rent = hm.predict(df)
+        # Call form of print: valid on both Python 2 and 3 for one argument.
+        print(price_or_rent.describe())
+
+        return price_or_rent
diff --git a/urbansim/models/tests/test_lcm.py b/urbansim/models/tests/test_lcm.py
@@ -2,8 +2,10 @@
 import pandas as pd
 import pytest
 import yaml
+import os
 from pandas.util import testing as pdt
 
+from ...utils import misc
 from ...utils import testing
 
 from .. import lcm
@@ -189,7 +191,7 @@ def test_mnl_lcm_segmented(grouped_choosers, alternatives):
     assert group.choosers_columns_used() == []
     assert group.alts_columns_used() == []
     assert set(group.interaction_columns_used()) == {'var1', 'var2', 'var3'}
-    assert set(group.columns_used()) == {'var1', 'var2', 'var3'}
+    assert set(group.columns_used()) == {'group', 'var1', 'var2', 'var3'}
 
     assert group.fitted is False
     logliks = group.fit(grouped_choosers, alternatives, 'thing_id')
@@ -286,3 +288,59 @@ def test_segmented_lcm_removes_old_models(grouped_choosers, alternatives):
     group.fit(grouped_choosers, alternatives, 'thing_id')
 
     assert sorted(group._group.models.keys()) == ['x', 'y']
+
+
+def test_fit_from_cfg(choosers, alternatives):
+    """Smoke test: fit and predict an MNL LCM through a yaml config file."""
+    model_cls = lcm.MNLLocationChoiceModel
+    model = model_cls(
+        'var2 + var1:var3',  # model expression
+        5,                   # sample size
+        ['var1 != 5'],       # choosers fit filters
+        ['var1 != 7'],       # choosers predict filters
+        ['var3 != 15'],      # alternatives fit filters
+        ['var2 != 14'],      # alternatives predict filters
+        None,                # interaction predict filters
+        None,                # estimation sample size
+        None,                # choice column
+        'Test LCM')          # name
+
+    # Write the unfitted model out so the *_from_cfg helpers can load it.
+    data_dir = "fake_data_home"
+    misc._mkifnotexists(data_dir)
+    cfgname = os.path.join(data_dir, "test.yaml")
+    model.to_yaml(cfgname)
+
+    model_cls.fit_from_cfg(choosers, "thing_id", alternatives, cfgname)
+    model_cls.predict_from_cfg(choosers, alternatives, cfgname)
+
+    # Low location_ratio; presumably meant to hit the location-sampling
+    # branch (depends on the fixture sizes -- TODO confirm).
+    model_cls.predict_from_cfg(choosers, alternatives, cfgname, .2)
+
+
+def test_fit_from_cfg_segmented(grouped_choosers, alternatives):
+    """Smoke test: fit and predict a segmented LCM through a yaml config."""
+    model_cls = lcm.SegmentedMNLLocationChoiceModel
+
+    # Segment 'x' uses the default expression; 'y' gets its own.
+    group = model_cls('group', 4, default_model_expr='var2 + var1:var3')
+    group.add_segment('x')
+    group.add_segment('y', 'var3 + var1:var2')
+
+    # Write the unfitted model out so the *_from_cfg helpers can load it.
+    data_dir = "fake_data_home"
+    misc._mkifnotexists(data_dir)
+    cfgname = os.path.join(data_dir, "test.yaml")
+    group.to_yaml(cfgname)
+
+    model_cls.fit_from_cfg(
+        grouped_choosers, "thing_id", alternatives, cfgname)
+    model_cls.predict_from_cfg(grouped_choosers, alternatives, cfgname)
+
+    # Low location_ratio; presumably meant to hit the location-sampling
+    # branch (depends on the fixture sizes -- TODO confirm).
+    model_cls.predict_from_cfg(
+        grouped_choosers, alternatives, cfgname, .8)