Merge d7dbb95 into ec95b55

UDST · Jul 28, 2014 · 32e723c · 32e723c
2 parents ec95b55 + d7dbb95
commit 32e723c
Show file tree

Hide file tree

Showing 7 changed files with 122 additions and 166 deletions.
diff --git a/urbansim/developer/sqftproforma.py b/urbansim/developer/sqftproforma.py
@@ -554,7 +554,6 @@ def lookup(self, form, df, only_built=True):
             df['max_far_from_dua'] = df.max_dua * df.ave_unit_size / self.config.building_efficiency
             df['min_max_fars'] = df[['min_max_fars', 'max_far_from_dua']].min(axis=1)
 
-        df['min_max_fars'] = df.min_max_fars.fillna(0)
         if only_built:
             df = df.query('min_max_fars > 0 and parcel_size > 0')
 

diff --git a/urbansim/models/lcm.py b/urbansim/models/lcm.py
@@ -14,6 +14,7 @@
 import toolz
 
 from . import util
+from ..exceptions import ModelEvaluationError
 from ..urbanchoice import interaction, mnl
 from ..utils import yamlio
 from ..utils.logutil import log_start_finish
@@ -238,6 +239,13 @@ def fit(self, choosers, alternatives, current_choice):
             choosers, alternatives, self.sample_size, current_choice)
         model_design = dmatrix(
             self.str_model_expression, data=merged, return_type='dataframe')
+
+        if len(merged) != model_design.as_matrix().shape[0]:
+            raise ModelEvaluationError(
+                'Estimated data does not have the same length as input.  '
+                'This suggests there are null values in one or more of '
+                'the input columns.')
+
         self.log_likelihoods, self.fit_parameters = mnl.mnl_estimate(
             model_design.as_matrix(), chosen, self.sample_size)
         self.fit_parameters.index = model_design.columns
@@ -336,6 +344,12 @@ def predict(self, choosers, alternatives, debug=False):
         model_design = dmatrix(
             self.str_model_expression, data=merged, return_type='dataframe')
 
+        if len(merged) != model_design.as_matrix().shape[0]:
+            raise ModelEvaluationError(
+                'Simulated data does not have the same length as input.  '
+                'This suggests there are null values in one or more of '
+                'the input columns.')
+
         coeffs = [self.fit_parameters['Coefficient'][x]
                   for x in model_design.columns]
 

diff --git a/urbansim/models/regression.py b/urbansim/models/regression.py
@@ -42,6 +42,13 @@ def fit_model(df, filters, model_expression):
     """
     df = util.apply_filter_query(df, filters)
     model = smf.ols(formula=model_expression, data=df)
+
+    if len(model.exog) != len(df):
+        raise ModelEvaluationError(
+            'Estimated data does not have the same length as input.  '
+            'This suggests there are null values in one or more of '
+            'the input columns.')
+
     with log_start_finish('statsmodels OLS fit', logger):
         return model.fit()
 
@@ -325,6 +332,8 @@ class instance for use during prediction.
         self.fit_parameters = _model_fit_to_table(fit)
         if debug:
             index = util.apply_filter_query(data, self.fit_filters).index
+            assert len(fit.model.exog) == len(index), "The estimate data is"
+            "unequal in length to the original dataframe, usually caused by nans"
             df = pd.DataFrame(
                 fit.model.exog, columns=fit.model.exog_names, index=index)
             df[fit.model.endog_names] = fit.model.endog

diff --git a/urbansim/sim/simulation.py b/urbansim/sim/simulation.py
@@ -5,6 +5,10 @@
 
 import pandas as pd
 import toolz
+import time
+import logging
+
+logger = logging.getLogger(__name__)
 
 _TABLES = {}
 _COLUMNS = {}
@@ -45,6 +49,14 @@ def columns(self):
         """
         return list(self._frame.columns) + _list_columns_for_table(self.name)
 
+    @property
+    def local_columns(self):
+        """
+        Names of the columns stored on the wrapped frame itself.
+
+        Unlike `columns`, nothing from `_list_columns_for_table` is
+        appended, so only the frame's own columns are listed.
+
+        """
+        frame_columns = self._frame.columns
+        return list(frame_columns)
+
     @property
     def index(self):
         """
@@ -489,4 +501,6 @@ def run(models, years=None):
         for model_name in models:
             print('Running model {}'.format(model_name))
             model = get_model(model_name)
+            t1 = time.time()
             model(year=year)
+            logger.debug("Time to execute model = %.3fs" % (time.time()-t1))
diff --git a/urbansim/models/yamlmodelrunner.py → urbansim/sim/yamlmodelrunner.py b/urbansim/models/yamlmodelrunner.py → urbansim/sim/yamlmodelrunner.py
@@ -1,12 +1,27 @@
 import numpy as np
 import yaml
 import pandas as pd
+from numpy import random
 from urbansim.utils import misc
+import urbansim.sim.simulation as sim
 from urbansim.models import RegressionModel, SegmentedRegressionModel, \
     MNLLocationChoiceModel, SegmentedMNLLocationChoiceModel, \
     GrowthRateTransition
 
 
+# this is a single place to deal with nas
+def deal_with_nas_for_est_or_sim(df, subset=None):
+    """
+    Drop every row of `df` that contains a null value.
+
+    Parameters
+    ----------
+    df : DataFrame
+        The table to clean before estimation or simulation.
+    subset : iterable of str, optional
+        Column names to keep.  Names that are not columns of `df` are
+        ignored.  When omitted, all columns of `df` are kept.
+
+    Returns
+    -------
+    DataFrame
+        `df`, restricted to the `subset` columns when one is given,
+        without the rows that had a null in any remaining column.
+    """
+    if subset is not None:
+        # A real list (not ``filter``) so the column selection also works
+        # where ``filter`` returns a lazy iterator.
+        flds = [fld for fld in subset if fld in df.columns]
+        df = df[flds]
+    lenbefore = len(df)
+    df = df.dropna(how='any')
+    dropped = lenbefore - len(df)
+    if dropped:
+        # Single parenthesised argument: valid as both a print statement
+        # and a print() call.
+        print("Dropped %d rows because they contained nas" % dropped)
+    return df
+
+
 def hedonic_estimate(df, cfgname):
     """
     Parameters
@@ -21,10 +36,12 @@ def hedonic_estimate(df, cfgname):
     model_type = yaml.load(open(cfg))["model_type"]
     if model_type == "regression":
         hm = RegressionModel.from_yaml(str_or_buffer=cfg)
+        df = deal_with_nas_for_est_or_sim(df, hm.columns_used())
         print hm.fit(df, debug=True).summary()
         est_data = {"est_data": hm.est_data}
     if model_type == "segmented_regression":
         hm = SegmentedRegressionModel.from_yaml(str_or_buffer=cfg)
+        df = deal_with_nas_for_est_or_sim(df, hm.columns_used())
         hm.min_segment_size = 10
         for k, v in hm.fit(df, debug=True).items():
             print "REGRESSION RESULTS FOR SEGMENT %s\n" % str(k)
@@ -35,16 +52,16 @@ def hedonic_estimate(df, cfgname):
     return est_data
 
 
-def hedonic_simulate(df, cfgname, outdf, outfname):
+def hedonic_simulate(df, cfgname, outdf_name, outfname):
     """
     Parameters
     ----------
     df : DataFrame
         The dataframe which contains the columns to use for the estimation.
     cfgname : string
         The name of the yaml config file which describes the hedonic model.
-    outdf : DataFrame
-        The dataframe to write the simulated price/rent to.
+    outdf_name : string
+        The name of the dataframe to write the simulated price/rent to.
     outfname : string
         The column name to write the simulated price/rent to.
     """
@@ -53,12 +70,28 @@ def hedonic_simulate(df, cfgname, outdf, outfname):
     model_type = yaml.load(open(cfg))["model_type"]
     if model_type == "regression":
         hm = RegressionModel.from_yaml(str_or_buffer=cfg)
+        df = deal_with_nas_for_est_or_sim(df, hm.columns_used())
     if model_type == "segmented_regression":
         hm = SegmentedRegressionModel.from_yaml(str_or_buffer=cfg)
+        df = deal_with_nas_for_est_or_sim(df, hm.columns_used())
         hm.min_segment_size = 10
     price_or_rent = hm.predict(df)
     print price_or_rent.describe()
-    outdf.loc[price_or_rent.index.values, outfname] = price_or_rent
+    print
+    s = sim.get_table(outdf_name).get_column(outfname)
+    s.loc[price_or_rent.index.values] = price_or_rent
+    sim.add_column(outdf_name, outfname, s)
+
+
+def _to_frame_get_fields(model_type, model, output_fname, df):
+    """
+    Pull the columns a model needs out of a table and drop null rows.
+
+    Parameters
+    ----------
+    model_type : string
+        The model type; "segmented_locationchoice" additionally requests
+        the model's segmentation column.
+    model : object
+        The model, queried through `columns_used()` (and
+        `segmentation_col` for segmented location choice models).
+    output_fname : string
+        An extra column name to request alongside the model's columns.
+    df : object
+        The table to read; only its `to_frame(columns)` method is used.
+
+    Returns
+    -------
+    The result of `deal_with_nas_for_est_or_sim` applied to the frame
+    returned by `df.to_frame`.
+    """
+    extras = [output_fname]
+    if model_type == "segmented_locationchoice":
+        extras.append(model.segmentation_col)
+    wanted = model.columns_used() + extras
+    print "The following fields are used by this model:", wanted
+    print
+    return deal_with_nas_for_est_or_sim(df.to_frame(wanted))
 
 
 def lcm_estimate(choosers, chosen_fname, alternatives, cfgname):
@@ -84,10 +117,14 @@ def lcm_estimate(choosers, chosen_fname, alternatives, cfgname):
     model_type = yaml.load(open(cfg))["model_type"]
     if model_type == "locationchoice":
         lcm = MNLLocationChoiceModel.from_yaml(str_or_buffer=cfg)
+        choosers = _to_frame_get_fields(model_type, lcm, chosen_fname, choosers)
+        alternatives = deal_with_nas_for_est_or_sim(alternatives, lcm.columns_used())
         lcm.fit(choosers, alternatives, choosers[chosen_fname])
         lcm.report_fit()
     elif model_type == "segmented_locationchoice":
         lcm = SegmentedMNLLocationChoiceModel.from_yaml(str_or_buffer=cfg)
+        choosers = _to_frame_get_fields(model_type, lcm, chosen_fname, choosers)
+        alternatives = deal_with_nas_for_est_or_sim(alternatives, lcm.columns_used())
         lcm.fit(choosers, alternatives, choosers[chosen_fname])
         for k, v in lcm._group.models.items():
             print "LCM RESULTS FOR SEGMENT %s\n" % str(k)
@@ -116,11 +153,11 @@ def get_vacant_units(choosers, location_fname, locations, supply_fname):
         representing the number of agents that can be located at that location.
     """
     vacant_units = locations[supply_fname].sub(
-        choosers.groupby(location_fname).size(), fill_value=0)
+        choosers[location_fname].value_counts(), fill_value=0)
     print "There are %d total available units" % locations[supply_fname].sum()
     print "    and %d total choosers" % len(choosers.index)
     print "    but there are %d overfull buildings" % \
-        len(vacant_units[vacant_units < 0].index)
+        len(vacant_units[vacant_units < 0])
     vacant_units = vacant_units[vacant_units > 0]
     alternatives = locations.loc[np.repeat(vacant_units.index,
                                  vacant_units.values.astype('int'))] \
@@ -135,12 +172,13 @@ def _print_number_unplaced(df, fieldname="building_id"):
     Just an internal function to use to compute and print info on the number
     of unplaced agents.
     """
-    counts = df[fieldname].isnull().value_counts()
+    counts = (df[fieldname] == -1).value_counts()
     count = 0 if True not in counts else counts[True]
     print "Total currently unplaced: %d" % count
 
 
-def lcm_simulate(choosers, locations, cfgname, outdf, output_fname):
+def lcm_simulate(choosers, locations, cfgname, outdf_name, output_fname,
+                 location_ratio=2.0):
     """
     Simulate the location choices for the specified choosers
 
@@ -154,12 +192,16 @@ def lcm_simulate(choosers, locations, cfgname, outdf, output_fname):
     cfgname : string
         The name of the yaml config file from which to read the location
         choice model.
-    outdf : DataFrame
-        The dataframe to write the simulated location to.
+    outdf_name : string
+        The name of the dataframe to write the simulated location to.
     outfname : string
         The column name to write the simulated location to.
+    location_ratio : float
+        Above the location ratio (default of 2.0) of locations to choosers, the
+        locations will be sampled to meet this ratio (for performance reasons).
     """
     print "Running location choice model simulation\n"
+    outdf = sim.get_table(outdf_name)
     cfg = misc.config(cfgname)
     model_type = yaml.load(open(cfg))["model_type"]
 
@@ -168,14 +210,28 @@ def lcm_simulate(choosers, locations, cfgname, outdf, output_fname):
     elif model_type == "segmented_locationchoice":
         lcm = SegmentedMNLLocationChoiceModel.from_yaml(str_or_buffer=cfg)
 
-    movers = choosers[choosers[output_fname].isnull()]
+    choosers = _to_frame_get_fields(model_type, lcm, output_fname, choosers)
+
+    movers = choosers[choosers[output_fname] == -1]
+
+    locations = deal_with_nas_for_est_or_sim(locations, lcm.columns_used()+[output_fname])
+
+    if len(locations) > len(movers) * location_ratio:
+        print "Location ratio exceeded: %d locations and only %d choosers" % \
+              (len(locations), len(movers))
+        idxes = random.choice(locations.index, size=len(movers) * location_ratio,
+                              replace=False)
+        locations = locations.loc[idxes]
+        print "  after sampling %d locations are available\n" % len(locations)
 
     new_units = lcm.predict(movers, locations, debug=True)
     print "Assigned %d choosers to new units" % len(new_units.index)
     if len(new_units) == 0:
         return
-    outdf[output_fname].loc[new_units.index] = \
+    s = sim.get_table(outdf_name).get_column(output_fname)
+    s.loc[new_units.index] = \
         locations.loc[new_units.values][output_fname].values
+    sim.add_column(outdf_name, output_fname,  s)
     _print_number_unplaced(outdf, output_fname)
 
     if model_type == "locationchoice":
@@ -195,30 +251,32 @@ def simple_relocation(choosers, relocation_rate, fieldname='building_id'):
     """
     Parameters
     ----------
-    choosers : DataFrame
-        A dataframe of people which might be relocating.
+    choosers_name : string
+        A name of the dataframe of people which might be relocating.
     relocation_rate : float
         A number less than one describing the percent of rows to mark for
         relocation.
     fieldname : string
-        The field name in the choosers dataframe to set to np.nan for those
+        The field name in the choosers dataframe to set to -1 for those
         rows to mark for relocation.
     """
-    print "Running relocation\n"
+    choosers_name = choosers
+    choosers = sim.get_table(choosers)
+    print "Total agents: %d" % len(choosers[fieldname])
     _print_number_unplaced(choosers, fieldname)
     chooser_ids = np.random.choice(choosers.index, size=int(relocation_rate *
                                    len(choosers)), replace=False)
-    choosers[fieldname].loc[chooser_ids] = np.nan
+    s = choosers[fieldname]
+    print "Assinging for relocation..."
+    s.loc[chooser_ids] = -1
+    sim.add_column(choosers_name, fieldname, s)
     _print_number_unplaced(choosers, fieldname)
 
 
-def simple_transition(dset, dfname, rate):
+def simple_transition(dfname, rate):
     """
     Parameters
     ----------
-    choosers : dataset
-        The dataset object, in order to write the resulting transitioned
-        dataframe
     dfname : string
         The name of the dataframe in the dataset to read and write the the
         dataframe.
@@ -227,8 +285,9 @@ def simple_transition(dset, dfname, rate):
         transition model.
     """
     transition = GrowthRateTransition(rate)
-    df = dset.fetch(dfname)
+    tbl = sim.get_table(dfname)
+    df = tbl.to_frame(tbl.local_columns)
     print "%d agents before transition" % len(df.index)
     df, added, copied, removed = transition.transition(df, None)
     print "%d agents after transition" % len(df.index)
-    dset.save_tmptbl(dfname, df)
+    sim.add_table(dfname, df)