Skip to content

Commit

Permalink
Merge ca34f90 into ec56452
Browse files Browse the repository at this point in the history
  • Loading branch information
fscottfoti committed Jul 30, 2014
2 parents ec56452 + ca34f90 commit cca73ba
Show file tree
Hide file tree
Showing 9 changed files with 355 additions and 11 deletions.
1 change: 0 additions & 1 deletion urbansim/developer/sqftproforma.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,6 @@ def lookup(self, form, df, only_built=True):
df['max_far_from_dua'] = df.max_dua * df.ave_unit_size / self.config.building_efficiency
df['min_max_fars'] = df[['min_max_fars', 'max_far_from_dua']].min(axis=1)

df['min_max_fars'] = df.min_max_fars.fillna(0)
if only_built:
df = df.query('min_max_fars > 0 and parcel_size > 0')

Expand Down
138 changes: 137 additions & 1 deletion urbansim/models/lcm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
import logging

import numpy as np
from numpy import random
import pandas as pd
from patsy import dmatrix
from prettytable import PrettyTable
import toolz

from . import util
from ..exceptions import ModelEvaluationError
from ..urbanchoice import interaction, mnl
from ..utils import yamlio
from ..utils.logutil import log_start_finish
Expand Down Expand Up @@ -238,6 +240,13 @@ def fit(self, choosers, alternatives, current_choice):
choosers, alternatives, self.sample_size, current_choice)
model_design = dmatrix(
self.str_model_expression, data=merged, return_type='dataframe')

if len(merged) != model_design.as_matrix().shape[0]:
raise ModelEvaluationError(
'Estimated data does not have the same length as input. '
'This suggests there are null values in one or more of '
'the input columns.')

self.log_likelihoods, self.fit_parameters = mnl.mnl_estimate(
model_design.as_matrix(), chosen, self.sample_size)
self.fit_parameters.index = model_design.columns
Expand Down Expand Up @@ -336,6 +345,12 @@ def predict(self, choosers, alternatives, debug=False):
model_design = dmatrix(
self.str_model_expression, data=merged, return_type='dataframe')

if len(merged) != model_design.as_matrix().shape[0]:
raise ModelEvaluationError(
'Simulated data does not have the same length as input. '
'This suggests there are null values in one or more of '
'the input columns.')

coeffs = [self.fit_parameters['Coefficient'][x]
for x in model_design.columns]

Expand Down Expand Up @@ -445,6 +460,65 @@ def columns_used(self):
self.alts_columns_used(),
self.interaction_columns_used())))

@classmethod
def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname):
    """
    Load a location choice model from a yaml config file, fit it,
    report the fit and write the model back to the same file.

    Parameters
    ----------
    choosers : DataFrame
        A dataframe of rows of agents which have locations assigned.
    chosen_fname : string
        A string indicating the column in the choosers dataframe which
        gives which location the choosers have chosen.
    alternatives : DataFrame
        A dataframe of locations which should include the chosen locations
        from the choosers dataframe as well as some other locations from
        which to sample.  Values in choosers[chosen_fname] should index
        into the alternatives dataframe.
    cfgname : string
        The name of the yaml config file from which to read the location
        choice model.

    Returns
    -------
    model
        The object returned by ``cls.from_yaml`` after it has been fit.
    """
    model = cls.from_yaml(str_or_buffer=cfgname)

    current_choice = choosers[chosen_fname]
    model.fit(choosers, alternatives, current_choice)
    model.report_fit()

    # The config file that was read is overwritten with the fitted model.
    model.to_yaml(str_or_buffer=cfgname)
    return model

@classmethod
def predict_from_cfg(cls, movers, locations, cfgname,
                     location_ratio=2.0):
    """
    Simulate the location choices for the specified choosers using the
    model stored in a yaml config file.

    Parameters
    ----------
    movers : DataFrame
        A dataframe of agents doing the choosing.
    locations : DataFrame
        A dataframe of locations which the choosers are located in and
        which have a supply.
    cfgname : string
        The name of the yaml config file from which to read the location
        choice model.
    location_ratio : float
        Above the location ratio (default of 2.0) of locations to choosers,
        the locations will be sampled to meet this ratio (for performance
        reasons).

    Returns
    -------
    new_units
        Whatever the model's ``predict`` returns for the movers and the
        (possibly sampled) locations; it must expose an ``index``.
    """
    lcm = cls.from_yaml(str_or_buffer=cfgname)

    if len(locations) > len(movers) * location_ratio:
        print("Location ratio exceeded: %d locations and only %d choosers" %
              (len(locations), len(movers)))
        # location_ratio is a float, so the product is a float as well;
        # numpy requires an integer sample size, hence the truncation.
        sample_size = int(len(movers) * location_ratio)
        idxes = random.choice(locations.index, size=sample_size,
                              replace=False)
        locations = locations.loc[idxes]
        print(" after sampling %d locations are available\n" % len(locations))

    new_units = lcm.predict(movers, locations, debug=True)
    print("Assigned %d choosers to new units" % len(new_units.index))
    return new_units


class MNLLocationChoiceModelGroup(object):
"""
Expand Down Expand Up @@ -1033,4 +1107,66 @@ def columns_used(self):
return list(toolz.unique(toolz.concatv(
self.choosers_columns_used(),
self.alts_columns_used(),
self.interaction_columns_used())))
self.interaction_columns_used(),
[self.segmentation_col])))

@classmethod
def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname):
    """
    Load a segmented location choice model from a yaml config file, fit
    it, print the fit report of every segment and write the model back to
    the same file.

    Parameters
    ----------
    choosers : DataFrame
        A dataframe of rows of agents which have locations assigned.
    chosen_fname : string
        A string indicating the column in the choosers dataframe which
        gives which location the choosers have chosen.
    alternatives : DataFrame
        A dataframe of locations which should include the chosen locations
        from the choosers dataframe as well as some other locations from
        which to sample.  Values in choosers[chosen_fname] should index
        into the alternatives dataframe.
    cfgname : string
        The name of the yaml config file from which to read the location
        choice model.

    Returns
    -------
    model
        The object returned by ``cls.from_yaml`` after it has been fit.
    """
    model = cls.from_yaml(str_or_buffer=cfgname)

    current_choice = choosers[chosen_fname]
    model.fit(choosers, alternatives, current_choice)

    # One report per segment model held by the underlying group.
    for segment_name, segment_model in model._group.models.items():
        header = "LCM RESULTS FOR SEGMENT %s\n" % str(segment_name)
        print(header)
        segment_model.report_fit()

    # The config file that was read is overwritten with the fitted model.
    model.to_yaml(str_or_buffer=cfgname)
    return model

@classmethod
def predict_from_cfg(cls, movers, locations, cfgname,
location_ratio=2.0):
"""
Simulate the location choices for the specified choosers
Parameters
----------
movers : DataFrame
A dataframe of agents doing the choosing.
locations : DataFrame
A dataframe of locations which the choosers are location in and which
have a supply.
cfgname : string
The name of the yaml config file from which to read the location
choice model.
location_ratio : float
Above the location ratio (default of 2.0) of locations to choosers, the
locations will be sampled to meet this ratio (for performance reasons).
"""
lcm = cls.from_yaml(str_or_buffer=cfgname)

if len(locations) > len(movers) * location_ratio:
print("Location ratio exceeded: %d locations and only %d choosers" %
(len(locations), len(movers)))
idxes = random.choice(locations.index, size=len(movers) * location_ratio,
replace=False)
locations = locations.loc[idxes]
print(" after sampling %d locations are available\n" % len(locations))

new_units = lcm.predict(movers, locations, debug=True)
print("Assigned %d choosers to new units" % len(new_units.index))
return new_units
93 changes: 92 additions & 1 deletion urbansim/models/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ def fit_model(df, filters, model_expression):
"""
df = util.apply_filter_query(df, filters)
model = smf.ols(formula=model_expression, data=df)

if len(model.exog) != len(df):
raise ModelEvaluationError(
'Estimated data does not have the same length as input. '
'This suggests there are null values in one or more of '
'the input columns.')

with log_start_finish('statsmodels OLS fit', logger):
return model.fit()

Expand Down Expand Up @@ -325,6 +332,9 @@ class instance for use during prediction.
self.fit_parameters = _model_fit_to_table(fit)
if debug:
index = util.apply_filter_query(data, self.fit_filters).index
assert len(fit.model.exog) == len(index), (
"The estimate data is unequal in length to the original "
"dataframe, usually caused by nans")
df = pd.DataFrame(
fit.model.exog, columns=fit.model.exog_names, index=index)
df[fit.model.endog_names] = fit.model.endog
Expand Down Expand Up @@ -456,6 +466,41 @@ def columns_used(self):
util.columns_in_filters(self.predict_filters),
util.columns_in_formula(self.model_expression))))

@classmethod
def fit_from_cfg(cls, df, cfgname, debug=False):
    """
    Load a regression model from a yaml config file, fit it, print the
    fit summary and write the model back to the same file.

    Parameters
    ----------
    df : DataFrame
        The dataframe which contains the columns to use for the estimation.
    cfgname : string
        The name of the yaml config file which describes the hedonic model.
    debug : boolean, optional (default False)
        Whether to generate debug information on the model.

    Returns
    -------
    hm
        The object returned by ``cls.from_yaml`` after it has been fit.
    """
    hm = cls.from_yaml(str_or_buffer=cfgname)
    ret = hm.fit(df, debug=debug)
    # Call form of print: valid on Python 3 and, with a single argument,
    # prints the same text on Python 2.
    print(ret.summary())
    # The config file that was read is overwritten with the fitted model.
    hm.to_yaml(str_or_buffer=cfgname)
    return hm

@classmethod
def predict_from_cfg(cls, df, cfgname):
    """
    Load a regression model from a yaml config file, run its prediction
    on `df` and print a description of the result.

    Parameters
    ----------
    df : DataFrame
        The dataframe which contains the columns to use for the prediction.
    cfgname : string
        The name of the yaml config file which describes the hedonic model.

    Returns
    -------
    price_or_rent
        Whatever the model's ``predict`` returns for `df`; it must provide
        a ``describe`` method.
    """
    hm = cls.from_yaml(str_or_buffer=cfgname)

    price_or_rent = hm.predict(df)
    # Call form of print: valid on Python 3 and, with a single argument,
    # prints the same text on Python 2.
    print(price_or_rent.describe())

    return price_or_rent


class RegressionModelGroup(object):
"""
Expand Down Expand Up @@ -896,4 +941,50 @@ def columns_used(self):
return list(toolz.unique(toolz.concatv(
util.columns_in_filters(self.fit_filters),
util.columns_in_filters(self.predict_filters),
self._group.columns_used())))
self._group.columns_used(),
[self.segmentation_col])))

@classmethod
def fit_from_cfg(cls, df, cfgname, debug=False, min_segment_size=None):
    """
    Load a segmented regression model from a yaml config file, fit it,
    print the fit summary of every segment and write the model back to
    the same file.

    Parameters
    ----------
    df : DataFrame
        The dataframe which contains the columns to use for the estimation.
    cfgname : string
        The name of the yaml config file which describes the hedonic model.
    debug : boolean, optional (default False)
        Whether to generate debug information on the model.
    min_segment_size : int, optional
        Set attribute on the model.

    Returns
    -------
    hm
        The object returned by ``cls.from_yaml`` after it has been fit.
    """
    hm = cls.from_yaml(str_or_buffer=cfgname)
    # A falsy value (None or 0) leaves the loaded model's setting untouched.
    if min_segment_size:
        hm.min_segment_size = min_segment_size

    # Call form of print: valid on Python 3 and, with a single argument,
    # prints the same text on Python 2.
    for k, v in hm.fit(df, debug=debug).items():
        print("REGRESSION RESULTS FOR SEGMENT %s\n" % str(k))
        print(v.summary())
    # The config file that was read is overwritten with the fitted model.
    hm.to_yaml(str_or_buffer=cfgname)
    return hm

@classmethod
def predict_from_cfg(cls, df, cfgname, min_segment_size=None):
    """
    Load a segmented regression model from a yaml config file, run its
    prediction on `df` and print a description of the result.

    Parameters
    ----------
    df : DataFrame
        The dataframe which contains the columns to use for the prediction.
    cfgname : string
        The name of the yaml config file which describes the hedonic model.
    min_segment_size : int, optional
        Set attribute on the model.

    Returns
    -------
    price_or_rent
        Whatever the model's ``predict`` returns for `df`; it must provide
        a ``describe`` method.
    """
    hm = cls.from_yaml(str_or_buffer=cfgname)
    # A falsy value (None or 0) leaves the loaded model's setting untouched.
    if min_segment_size:
        hm.min_segment_size = min_segment_size

    price_or_rent = hm.predict(df)
    # Call form of print: valid on Python 3 and, with a single argument,
    # prints the same text on Python 2.
    print(price_or_rent.describe())

    return price_or_rent
60 changes: 59 additions & 1 deletion urbansim/models/tests/test_lcm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
import pandas as pd
import pytest
import yaml
import os
from pandas.util import testing as pdt

from ...utils import misc
from ...utils import testing

from .. import lcm
Expand Down Expand Up @@ -189,7 +191,7 @@ def test_mnl_lcm_segmented(grouped_choosers, alternatives):
assert group.choosers_columns_used() == []
assert group.alts_columns_used() == []
assert set(group.interaction_columns_used()) == {'var1', 'var2', 'var3'}
assert set(group.columns_used()) == {'var1', 'var2', 'var3'}
assert set(group.columns_used()) == {'group', 'var1', 'var2', 'var3'}

assert group.fitted is False
logliks = group.fit(grouped_choosers, alternatives, 'thing_id')
Expand Down Expand Up @@ -286,3 +288,59 @@ def test_segmented_lcm_removes_old_models(grouped_choosers, alternatives):
group.fit(grouped_choosers, alternatives, 'thing_id')

assert sorted(group._group.models.keys()) == ['x', 'y']


def test_fit_from_cfg(choosers, alternatives):
    # Smoke test: round-trip an unfitted model through a yaml file and
    # drive fit_from_cfg / predict_from_cfg from that file.
    model_cls = lcm.MNLLocationChoiceModel
    model = model_cls(
        'var2 + var1:var3',  # model expression
        5,                   # sample size
        ['var1 != 5'],       # choosers fit filters
        ['var1 != 7'],       # choosers predict filters
        ['var3 != 15'],      # alternatives fit filters
        ['var2 != 14'],      # alternatives predict filters
        None,                # interaction predict filters
        None,                # estimation sample size
        None,                # choice column
        'Test LCM')          # name

    cfg_dir = "fake_data_home"
    misc._mkifnotexists(cfg_dir)
    cfgname = os.path.join(cfg_dir, "test.yaml")
    model.to_yaml(cfgname)

    model_cls.fit_from_cfg(choosers, "thing_id", alternatives, cfgname)

    # First with the default location ratio ...
    model_cls.predict_from_cfg(choosers, alternatives, cfgname)

    # ... then with a small explicit one (presumably low enough to take
    # the sampling branch -- depends on the fixture sizes).
    model_cls.predict_from_cfg(choosers, alternatives, cfgname, .2)


def test_fit_from_cfg_segmented(grouped_choosers, alternatives):
    # Smoke test: round-trip a two-segment model through a yaml file and
    # drive fit_from_cfg / predict_from_cfg from that file.
    model_cls = lcm.SegmentedMNLLocationChoiceModel
    default_expr = 'var2 + var1:var3'

    group = model_cls('group', 4, default_model_expr=default_expr)
    group.add_segment('x')
    group.add_segment('y', 'var3 + var1:var2')

    cfg_dir = "fake_data_home"
    misc._mkifnotexists(cfg_dir)
    cfgname = os.path.join(cfg_dir, "test.yaml")
    group.to_yaml(cfgname)

    model_cls.fit_from_cfg(
        grouped_choosers, "thing_id", alternatives, cfgname)

    # First with the default location ratio ...
    model_cls.predict_from_cfg(grouped_choosers, alternatives, cfgname)

    # ... then with a small explicit one (presumably low enough to take
    # the sampling branch -- depends on the fixture sizes).
    model_cls.predict_from_cfg(
        grouped_choosers, alternatives, cfgname, .8)
Loading

0 comments on commit cca73ba

Please sign in to comment.