Skip to content

Commit

Permalink
mix of change to developer and lcms
Browse files Browse the repository at this point in the history
Unfortunately I didn't check in my developer changes before I started working on debug info in the regressions and pdfs.  So this push includes both.

At any rate, this is a basically working developer model.

As well as a version of yamlmodelrunner, which returns debug information for the regressions and simulations.
  • Loading branch information
fscottfoti committed Jun 13, 2014
1 parent 61be2e8 commit e3b254d
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 39 deletions.
10 changes: 4 additions & 6 deletions urbansim/developer/developer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def __init__(self, feasibility):
def max_form(f, colname):
"""
Assumes dataframe with hierarchical columns with first index equal to the
use and second index equal to the attribtue
use and second index equal to the attribute
e.g. f.columns equal to:
mixedoffice building_cost
Expand Down Expand Up @@ -118,14 +118,13 @@ def pick(self, form, target_units, parcel_size, ave_unit_size,
df = df[df.max_profit_far > 0]
df["parcel_size"] = parcel_size
df = df[df.parcel_size < max_parcel_size]
df['new_sqft'] = df.parcel_size * df.max_profit_far
ave_unit_size[ave_unit_size < min_unit_size] = min_unit_size
df['new_units'] = np.round(df.new_sqft / ave_unit_size)
df['residential_units'] = np.round(df.building_sqft / ave_unit_size)
df['current_units'] = current_units
df['net_units'] = df.new_units - df.current_units
df['net_units'] = df.residential_units - df.current_units
df = df[df.net_units > 0]

print "Describe of net units\n", df.net_units.describe()
# print "Describe of net units\n", df.net_units.describe()
print "Sum of net units that are profitable", df.net_units.sum()
if df.net_units.sum() < target_units:
print "WARNING THERE WERE NOT ENOUGH PROFITABLE UNITS TO MATCH DEMAND"
Expand Down Expand Up @@ -155,6 +154,5 @@ def merge(old_df, new_df):
maxind = np.max(old_df.index.values)
new_df.index = new_df.index + maxind + 1
concat_df = pd.concat([old_df, new_df], verify_integrity=True)
print concat_df.index.name
concat_df.index.name = 'building_id'
return concat_df
26 changes: 18 additions & 8 deletions urbansim/developer/sqftproforma.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ def _reset_defaults(self):
self.parcel_sizes = [10000.0]
self.fars = [.1, .25, .5, .75, 1.0, 1.5, 1.8, 2.0, 3.0, 4.0, 5.0, 7.0, 9.0, 11.0]
self.uses = ['retail', 'industrial', 'office', 'residential']
self.residential_uses = [False, False, False, True]
self.forms = {
'retail': {
"retail": 1.0
Expand Down Expand Up @@ -185,10 +186,13 @@ def _convert_types(self):
"""
self.fars = np.array(self.fars)
self.parking_rates = np.array([self.parking_rates[use] for use in self.uses])
self.res_ratios = {}
assert len(self.uses) == len(self.residential_uses)
for k, v in self.forms.iteritems():
self.forms[k] = np.array([self.forms[k].get(use, 0.0) for use in self.uses])
# normalize if not already
self.forms[k] /= self.forms[k].sum()
self.res_ratios[k] = pd.Series(self.forms[k])[self.residential_uses].sum()
self.costs = np.transpose(np.array([self.costs[use] for use in self.uses]))

@property
Expand Down Expand Up @@ -298,7 +302,7 @@ def _generate_lookup(self):
building_bulk = orig_bulk - parkingstalls * \
c.parking_sqft_d[parking_config]

df['build'] = building_bulk
df['building_sqft'] = building_bulk

parkingstalls = building_bulk * \
np.sum(uses_distrib * c.parking_rates) / c.sqft_per_rate
Expand All @@ -309,11 +313,11 @@ def _generate_lookup(self):
df['spaces'] = parkingstalls

if parking_config == 'underground':
df['parksqft'] = parkingstalls * \
df['park_sqft'] = parkingstalls * \
c.parking_sqft_d[parking_config]
stories = building_bulk / c.tiled_parcel_sizes
if parking_config == 'deck':
df['parksqft'] = parkingstalls * \
df['park_sqft'] = parkingstalls * \
c.parking_sqft_d[parking_config]
stories = ((building_bulk + parkingstalls *
c.parking_sqft_d[parking_config]) /
Expand All @@ -322,17 +326,17 @@ def _generate_lookup(self):
stories = building_bulk / \
(c.tiled_parcel_sizes - parkingstalls *
c.parking_sqft_d[parking_config])
df['parksqft'] = parkingstalls * \
df['park_sqft'] = parkingstalls * \
c.parking_sqft_d[parking_config]
# not all fars support surface parking
stories[np.where(stories < 0.0)] = np.nan

df['total_sqft'] = df.build + df.parksqft
df['total_sqft'] = df.building_sqft + df.park_sqft
stories /= c.parcel_coverage
df['stories'] = stories
df['build_cost_sqft'] = self._building_cost(uses_distrib, stories)

df['build_cost'] = df.build_cost_sqft * df.build
df['build_cost'] = df.build_cost_sqft * df.building_sqft
df['park_cost'] = parking_cost
df['cost'] = df.build_cost + df.park_cost

Expand Down Expand Up @@ -444,7 +448,7 @@ def lookup(self, form, df, only_built=True):
A dataframe which is indexed by the parcel ids that were passed, with the
following columns.
building_size : Series, float
building_sqft : Series, float
The number of square feet for the building to build. Keep in mind
this includes parking and common space. Will need a helpful function
to convert from gross square feet to actual usable square feet in
Expand Down Expand Up @@ -513,7 +517,7 @@ def twod_get(indexes, arr):
return arr[indexes, np.arange(indexes.size)].astype('float')

outdf = pd.DataFrame({
'building_size': twod_get(maxprofitind, building_bulks),
'building_sqft': twod_get(maxprofitind, building_bulks),
'building_cost': twod_get(maxprofitind, building_costs),
'total_cost': twod_get(maxprofitind, total_costs),
'building_revenue': twod_get(maxprofitind, building_revenue),
Expand All @@ -524,6 +528,12 @@ def twod_get(indexes, arr):
if only_built:
outdf = outdf.query('max_profit > 0')

resratio = c.res_ratios[form]
nonresratio = 1.0 - resratio
outdf["residential_sqft"] = outdf.building_sqft * c.building_efficiency * resratio
outdf["non_residential_sqft"] = outdf.building_sqft * c.building_efficiency * nonresratio
outdf["stories"] = outdf["max_profit_far"] / c.parcel_coverage

return outdf

def _debug_output(self):
Expand Down
47 changes: 38 additions & 9 deletions urbansim/models/lcm.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def __init__(self, model_expression, sample_size,
self.estimation_sample_size = estimation_sample_size
self.choice_column = choice_column
self.name = name if name is not None else 'MNLLocationChoiceModel'
self.sim_pdf = None

self.log_likelihoods = None
self.fit_parameters = None
Expand Down Expand Up @@ -271,7 +272,7 @@ def report_fit(self):

print(tbl)

def predict(self, choosers, alternatives):
def predict(self, choosers, alternatives, debug=False):
"""
Choose from among alternatives for a group of agents.
Expand All @@ -283,6 +284,10 @@ def predict(self, choosers, alternatives):
agent probabilities of choosing alternatives.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
debug : bool
If debug is set to true, we'll set the variable "sim_pdf" on
the object to store the probabilities for mapping of the
outcome.
Returns
-------
Expand All @@ -299,22 +304,33 @@ def predict(self, choosers, alternatives):
alternatives = util.apply_filter_query(
alternatives, self.alts_predict_filters)

if len(choosers) == 0:
return pd.Series()

# TODO: only using 1st item in choosers for determining probabilities.
# Need to expand options around this.
num_choosers = 1
_, merged, _ = interaction.mnl_interaction_dataset(
choosers.head(1), alternatives, len(alternatives))
choosers.head(num_choosers), alternatives, len(alternatives))
merged = util.apply_filter_query(
merged, self.interaction_predict_filters)
model_design = dmatrix(
self.str_model_expression, data=merged, return_type='dataframe')

coeffs = [self.fit_parameters['Coefficient'][x] for x in model_design.columns]

# probabilities are returned from mnl_simulate as a 2d array
# and need to be flatted for use in unit_choice.
probabilities = mnl.mnl_simulate(
model_design.as_matrix(),
self.fit_parameters['Coefficient'].values,
coeffs,
numalts=len(merged), returnprobs=True).flatten()

if debug:
# when we're not doing 1st item of choosers, this will break!
assert num_choosers == 1
self.sim_pdf = pd.Series(probabilities, index=alternatives.index)

# figure out exactly which things from which choices are drawn
alt_choices = (
merged[self.choice_column] if self.choice_column else merged.index)
Expand Down Expand Up @@ -460,8 +476,9 @@ def _iter_groups(self, data):
"""
groups = data.groupby(self.segmentation_col)

for name in self.models:
yield name, groups.get_group(name)
for name, group in groups:
print("Returning group %s" % str(name))
yield name, group

def fit(self, choosers, alternatives, current_choice):
"""
Expand Down Expand Up @@ -500,7 +517,7 @@ def fitted(self):
return (all(m.fitted for m in self.models.values())
if self.models else False)

def predict(self, choosers, alternatives):
def predict(self, choosers, alternatives, debug=False):
"""
Choose from among alternatives for a group of agents after
segmenting the `choosers` table.
Expand All @@ -514,6 +531,10 @@ def predict(self, choosers, alternatives):
Must have a column matching the .segmentation_col attribute.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
debug : bool
If debug is set to true, we'll set the variable "sim_pdf" on
the object to store the probabilities for mapping of the
outcome.
Returns
-------
Expand All @@ -526,7 +547,7 @@ def predict(self, choosers, alternatives):
results = []

for name, df in self._iter_groups(choosers):
choices = self.models[name].predict(df, alternatives)
choices = self.models[name].predict(df, alternatives, debug=debug)
# remove chosen alternatives
alternatives = alternatives.loc[~alternatives.index.isin(choices)]
results.append(choices)
Expand Down Expand Up @@ -725,7 +746,7 @@ def fitted(self):
"""
return self._group.fitted

def predict(self, choosers, alternatives):
def predict(self, choosers, alternatives, debug=False):
"""
Choose from among alternatives for a group of agents after
segmenting the `choosers` table.
Expand All @@ -739,6 +760,10 @@ def predict(self, choosers, alternatives):
Must have a column matching the .segmentation_col attribute.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
debug : bool
If debug is set to true, we'll set the variable "sim_pdf" on
the object to store the probabilities for mapping of the
outcome.
Returns
-------
Expand All @@ -752,7 +777,11 @@ def predict(self, choosers, alternatives):
choosers, self.choosers_predict_filters)
alternatives = util.apply_filter_query(
alternatives, self.alts_predict_filters)
return self._group.predict(choosers, alternatives)

if len(choosers) == 0:
return pd.Series()

return self._group.predict(choosers, alternatives, debug=debug)

def _process_model_dict(self, d):
"""
Expand Down
5 changes: 3 additions & 2 deletions urbansim/models/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def predict(self, data):
"""
model_design = dmatrix(self._rhs, data=data, return_type='dataframe')
return model_design.dot(self.params).values
return model_design.dot(self.params.loc[model_design.columns]).values


def _model_fit_to_table(fit):
Expand Down Expand Up @@ -311,7 +311,8 @@ class instance for use during prediction.
self.model_fit = fit
self.fit_parameters = _model_fit_to_table(fit)
if debug:
df = pd.DataFrame(fit.model.exog, columns=fit.model.exog_names, index=data.index)
index = util.apply_filter_query(data, self.fit_filters).index
df = pd.DataFrame(fit.model.exog, columns=fit.model.exog_names, index=index)
df[fit.model.endog_names] = fit.model.endog
df["fittedvalues"] = fit.fittedvalues
df["residuals"] = fit.resid
Expand Down
27 changes: 22 additions & 5 deletions urbansim/models/yamlmodelrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ def hedonic_estimate(df, cfgname):
model_type = yaml.load(open(cfg))["model_type"]
if model_type == "regression":
hm = RegressionModel.from_yaml(str_or_buffer=cfg)
print hm.fit(df).summary()
est_data = hm.est_data
print hm.fit(df, debug=True).summary()
est_data = {"est_data": hm.est_data}
if model_type == "segmented_regression":
hm = SegmentedRegressionModel.from_yaml(str_or_buffer=cfg)
hm.min_segment_size = 10
Expand Down Expand Up @@ -162,17 +162,34 @@ def lcm_simulate(choosers, locations, cfgname, outdf, output_fname):
print "Running location choice model simulation\n"
cfg = misc.config(cfgname)
model_type = yaml.load(open(cfg))["model_type"]

if model_type == "locationchoice":
lcm = MNLLocationChoiceModel.from_yaml(str_or_buffer=cfg)
elif model_type == "segmented_locationchoice":
lcm = SegmentedMNLLocationChoiceModel.from_yaml(str_or_buffer=cfg)

movers = choosers[choosers[output_fname].isnull()]
new_units = lcm.predict(movers, locations)

new_units = lcm.predict(movers, locations, debug=True)
print "Assigned %d choosers to new units" % len(new_units.index)
if len(new_units) == 0:
return
outdf[output_fname].loc[new_units.index] = \
locations.loc[new_units.values][output_fname].values
_print_number_unplaced(outdf, output_fname)

if model_type == "locationchoice":
sim_pdf = {"sim_pdf": lcm.sim_pdf}
elif model_type == "segmented_locationchoice":
sim_pdf = {name: lcm._group.models[name].sim_pdf for name in lcm._group.models}

# go back to the buildings from units
sim_pdf = pd.concat(sim_pdf.values(), keys=sim_pdf.keys(), axis=1)
sim_pdf.index = locations[output_fname]
sim_pdf = sim_pdf.groupby(level=0).first()

return sim_pdf


def simple_relocation(choosers, relocation_rate, fieldname='building_id'):
"""
Expand All @@ -189,8 +206,8 @@ def simple_relocation(choosers, relocation_rate, fieldname='building_id'):
"""
print "Running relocation\n"
_print_number_unplaced(choosers, fieldname)
chooser_ids = np.random.choice(choosers.index, size=relocation_rate *
len(choosers.index), replace=False)
chooser_ids = np.random.choice(choosers.index, size=int(relocation_rate *
len(choosers)), replace=False)
choosers[fieldname].loc[chooser_ids] = np.nan
_print_number_unplaced(choosers, fieldname)

Expand Down
6 changes: 5 additions & 1 deletion urbansim/utils/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,12 @@ def df(self):
return self.dset.fetch(self.name)

def build_df(obj, flds=None):
flds = None
if flds is None:
flds = obj.flds
if obj.flds is None:
return obj.df
else:
flds = obj.flds
columns = [getattr(obj, fld) for fld in flds]
df = pd.concat(columns, axis=1)
df.columns = flds
Expand Down
Loading

0 comments on commit e3b254d

Please sign in to comment.