Skip to content
Permalink
Browse files

Merge commit 'v0.5.0-1269-g957a43e' into debian-experimental

* commit 'v0.5.0-1269-g957a43e': (59 commits)
  TST: avoid pandas in assert with wrapped results (failure pandas>0.12)
  REF: GEEResults rename to postfix names, keep old as alias for now
  BUG: NominalGEE pandas handling, fix typo, adjust unit tests
  ENH: GEE add Wrapper, closes statsmodels#1904
  BUG: NominalGEE fix initialization for pandas, closes statsmodels#1931
  BUG/ENH: add df_resid, df_model, fixes t_test, closes statsmodels#1918
  REF: GEE rename naive_covariance -> cov_naive, same for others
  REF: rename covariance_type -> cov_type
  TST: test_glm: remove test noise
  BUG: GEE subclasses cov_params_default doesn't get attached, unit tests
  REF: GEE summary remove covariance_type argument (not possible)
  BUG/TST: GEE fix predict closes statsmodels#1919 and fix conf_int for cov_params_default add GEE to generic test, and adjust those (missing _results, summary2)
  BUG: GEE partial cleanup of cov_type, see statsmodels#1906
  ENH: cov_kwds add generic 'scaling_factor' option to match Stata
  BUG: genmod links, CDFLink, probit deriv2 use approx_fprime (not complex)
  BUG: fix used hessian in sandwich_covariance, with GLM test cases
  REF: GLM change default scale in score_obs (tests pass, possibly wrong, see PR comment)
  REF/TST generic cov_type, TST compare GLM Logit, GLM OLS (this fails)
  REF/ENH: add get_robustcov_results generically to LikelihoodModel/Results    NegativeBinomial as test case, needs cleanup for code duplication.
  ENH: add cov_type to GLM.fit, with tests for family Poisson
  ...
  • Loading branch information...
yarikoptic committed Aug 27, 2014
2 parents 2d8e5ae + 957a43e commit f48717d311063fcf93d7eb3ff625841922e486a6
Showing with 1,861 additions and 237 deletions.
  1. +271 −0 statsmodels/base/covtype.py
  2. +65 −7 statsmodels/base/model.py
  3. +86 −4 statsmodels/base/tests/test_generic_methods.py
  4. +44 −5 statsmodels/discrete/discrete_model.py
  5. +389 −1 statsmodels/discrete/tests/test_sandwich_cov.py
  6. +24 −2 statsmodels/examples/ex_ols_robustcov.py
  7. +2 −2 statsmodels/examples/try_gee.py
  8. +2 −1 statsmodels/formula/tests/test_formula.py
  9. +17 −9 statsmodels/genmod/dependence_structures/covstruct.py
  10. +11 −0 statsmodels/genmod/families/links.py
  11. +223 −107 statsmodels/genmod/generalized_estimating_equations.py
  12. +26 −3 statsmodels/genmod/generalized_linear_model.py
  13. +320 −44 statsmodels/genmod/tests/test_gee.py
  14. +8 −3 statsmodels/genmod/tests/test_glm.py
  15. +41 −14 statsmodels/regression/linear_model.py
  16. +130 −1 statsmodels/regression/tests/test_robustcov.py
  17. +10 −0 statsmodels/sandbox/distributions/extras.py
  18. +2 −1 statsmodels/sandbox/distributions/tests/testtransf.py
  19. +2 −2 statsmodels/sandbox/regression/tests/test_gmm.py
  20. +2 −1 statsmodels/sandbox/stats/multicomp.py
  21. +3 −3 statsmodels/stats/anova.py
  22. +5 −2 statsmodels/stats/correlation_tools.py
  23. +5 −1 statsmodels/stats/sandwich_covariance.py
  24. +90 −13 statsmodels/stats/tests/test_corrpsd.py
  25. +12 −0 statsmodels/stats/tests/test_pairwise.py
  26. +10 −1 statsmodels/tools/grouputils.py
  27. +45 −0 statsmodels/tools/testing.py
  28. +1 −1 statsmodels/tsa/arima_model.py
  29. +3 −2 statsmodels/tsa/base/datetools.py
  30. +2 −1 statsmodels/tsa/base/tests/test_base.py
  31. +2 −2 statsmodels/tsa/tests/test_ar.py
  32. +4 −3 statsmodels/tsa/tests/test_arima.py
  33. +4 −1 tools/nbgenerate.py
@@ -0,0 +1,271 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 04 08:00:16 2014
Author: Josef Perktold
License: BSD-3
"""

from statsmodels.compat.python import lrange, lzip, range

import numpy as np


def get_robustcov_results(self, cov_type='HC1', use_t=None, **kwds):
    """create new results instance with robust covariance as default

    Parameters
    ----------
    cov_type : string
        the type of robust sandwich estimator to use. see Notes below
    use_t : bool
        If true, then the t distribution is used for inference.
        If false, then the normal distribution is used.
    kwds : depends on cov_type
        Required or optional arguments for robust covariance calculation.
        see Notes below

    Returns
    -------
    results : results instance
        This method creates a new results instance with the requested
        robust covariance as the default covariance of the parameters.
        Inferential statistics like p-values and hypothesis tests will be
        based on this covariance matrix.

    Notes
    -----
    Warning: Some of the options and defaults in cov_kwds may be changed in
    a future version.

    The covariance keywords provide an option 'scaling_factor' to adjust the
    scaling of the covariance matrix, that is the covariance is multiplied by
    this factor if it is given and is not `None`. This allows the user to
    adjust the scaling of the covariance matrix to match other statistical
    packages.
    For example, `scaling_factor=(nobs - 1.) / (nobs - k_params)` provides a
    correction so that the robust covariance matrices match those of Stata in
    some models like GLM and discrete Models.

    The following covariance types and required or optional arguments are
    currently available:

    - 'HC0', 'HC1', 'HC2', 'HC3' and no keyword arguments:
      heteroscedasticity robust covariance
    - 'HAC' and keywords

      - `maxlag` integer (required) : number of lags to use
      - `kernel` string (optional) : kernel, default is Bartlett
      - `use_correction` bool (optional) : If true, use small sample
        correction

    - 'cluster' and required keyword `groups`, integer group indicator

      - `groups` array_like, integer (required) :
        index of clusters or groups
      - `use_correction` bool (optional) :
        If True the sandwich covariance is calculated with a small
        sample correction.
        If False the sandwich covariance is calculated without
        small sample correction.
      - `df_correction` bool (optional)
        If True (default), then the degrees of freedom for the
        inferential statistics and hypothesis tests, such as
        pvalues, f_pvalue, conf_int, and t_test and f_test, are
        based on the number of groups minus one instead of the
        total number of observations minus the number of explanatory
        variables. `df_resid` of the results instance is adjusted.
        If False, then `df_resid` of the results instance is not
        adjusted.

    - 'hac-groupsum' Driscoll and Kraay, heteroscedasticity and
      autocorrelation robust standard errors in panel data
      keywords

      - `time` array_like (required) : index of time periods
      - `maxlag` integer (required) : number of lags to use
      - `kernel` string (optional) : kernel, default is Bartlett
      - `use_correction` False or string in ['hac', 'cluster'] (optional) :
        If False the sandwich covariance is calculated without
        small sample correction.
        If `use_correction = 'cluster'` (default), then the same
        small sample correction as in the case of 'covtype='cluster''
        is used.
      - `df_correction` bool (optional)
        adjustment to df_resid, see cov_type 'cluster' above
        #TODO: we need more options here

    - 'hac-panel' heteroscedasticity and autocorrelation robust standard
      errors in panel data.
      The data needs to be sorted in this case, the time series for
      each panel unit or cluster need to be stacked.
      keywords

      - `time` array_like (required) : index of time periods
      - `maxlag` integer (required) : number of lags to use
      - `kernel` string (optional) : kernel, default is Bartlett
      - `use_correction` False or string in ['hac', 'cluster'] (optional) :
        If False the sandwich covariance is calculated without
        small sample correction.
      - `df_correction` bool (optional)
        adjustment to df_resid, see cov_type 'cluster' above
        #TODO: we need more options here

    Reminder:
    `use_correction` in "nw-groupsum" and "nw-panel" is not bool,
    needs to be in [False, 'hac', 'cluster']

    TODO: Currently there is no check for extra or misspelled keywords,
    except in the case of cov_type `HCx`
    """

    import statsmodels.stats.sandwich_covariance as sw

    # TODO: make separate function that returns a robust cov plus info
    use_self = kwds.pop('use_self', False)
    if use_self:
        res = self
    else:
        # this doesn't work for most models, use raw instance instead from fit
        res = self.__class__(self.model, self.params,
                             normalized_cov_params=self.normalized_cov_params,
                             scale=self.scale)

    res.cov_type = cov_type
    # use_t might already be defined by the class, and already set
    if use_t is None:
        use_t = self.use_t
    res.cov_kwds = {'use_t': use_t}  # store for information
    res.use_t = use_t

    adjust_df = False
    if cov_type in ['cluster', 'nw-panel', 'nw-groupsum']:
        df_correction = kwds.get('df_correction', None)
        # TODO: check also use_correction, do I need all combinations?
        if df_correction is not False:  # i.e. in [None, True]:
            # user didn't explicitly set it to False
            adjust_df = True

    res.cov_kwds['adjust_df'] = adjust_df

    # verify and set kwds, and calculate cov
    # TODO: this should be outsourced in a function so we can reuse it in
    #       other models
    # TODO: make it DRYer   repeated code for checking kwds
    if cov_type in ('HC0', 'HC1', 'HC2', 'HC3'):
        if kwds:
            raise ValueError('heteroscedasticity robust covariance '
                             'does not use keywords')
        res.cov_kwds['description'] = ('Standard Errors are heteroscedasticity '
                                       'robust ' + '(' + cov_type + ')')

        res.cov_params_default = getattr(self, 'cov_' + cov_type.upper(), None)
        if res.cov_params_default is None:
            # results classes that don't have cov_HCx attribute
            res.cov_params_default = sw.cov_white_simple(self,
                                                         use_correction=False)
    elif cov_type == 'HAC':
        maxlags = kwds['maxlags']   # required?, default in cov_hac_simple
        res.cov_kwds['maxlags'] = maxlags
        use_correction = kwds.get('use_correction', False)
        res.cov_kwds['use_correction'] = use_correction
        res.cov_kwds['description'] = ('Standard Errors are heteroscedasticity '
                                       'and autocorrelation robust (HAC) using %d lags and %s small '
                                       'sample correction') % (maxlags, ['without', 'with'][use_correction])

        res.cov_params_default = sw.cov_hac_simple(self, nlags=maxlags,
                                                   use_correction=use_correction)
    elif cov_type == 'cluster':
        # cluster robust standard errors, one- or two-way
        groups = kwds['groups']
        if not hasattr(groups, 'shape'):
            groups = np.asarray(groups).T
        res.cov_kwds['groups'] = groups
        use_correction = kwds.get('use_correction', True)
        res.cov_kwds['use_correction'] = use_correction
        if groups.ndim == 1:
            if adjust_df:
                # need to find number of groups
                # duplicate work
                self.n_groups = n_groups = len(np.unique(groups))
            res.cov_params_default = sw.cov_cluster(self, groups,
                                                    use_correction=use_correction)

        elif groups.ndim == 2:
            if adjust_df:
                # need to find number of groups
                # duplicate work
                n_groups0 = len(np.unique(groups[:, 0]))
                n_groups1 = len(np.unique(groups[:, 1]))
                self.n_groups = (n_groups0, n_groups1)
                n_groups = min(n_groups0, n_groups1)  # use for adjust_df

            # Note: sw.cov_cluster_2groups has 3 returns
            res.cov_params_default = sw.cov_cluster_2groups(self, groups,
                                                            use_correction=use_correction)[0]
        else:
            raise ValueError('only two groups are supported')
        res.cov_kwds['description'] = ('Standard Errors are robust to '
                                       'cluster correlation ' + '(' + cov_type + ')')

    elif cov_type == 'nw-panel':
        # cluster robust standard errors
        res.cov_kwds['time'] = time = kwds['time']
        # TODO: nlags is currently required
        # nlags = kwds.get('nlags', True)
        # res.cov_kwds['nlags'] = nlags
        # TODO: `nlags` or `maxlags`
        res.cov_kwds['maxlags'] = maxlags = kwds['maxlags']
        use_correction = kwds.get('use_correction', 'hac')
        res.cov_kwds['use_correction'] = use_correction
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        # TODO: clumsy time index in cov_nw_panel
        tt = (np.nonzero(np.diff(time) < 0)[0] + 1).tolist()
        groupidx = lzip([0] + tt, tt + [len(time)])
        self.n_groups = n_groups = len(groupidx)
        res.cov_params_default = sw.cov_nw_panel(self, maxlags, groupidx,
                                                 weights_func=weights_func,
                                                 use_correction=use_correction)
        res.cov_kwds['description'] = ('Standard Errors are robust to '
                                       'cluster correlation ' + '(' + cov_type + ')')
    elif cov_type == 'nw-groupsum':
        # Driscoll-Kraay standard errors
        res.cov_kwds['time'] = time = kwds['time']
        # TODO: nlags is currently required
        # nlags = kwds.get('nlags', True)
        # res.cov_kwds['nlags'] = nlags
        # TODO: `nlags` or `maxlags`
        res.cov_kwds['maxlags'] = maxlags = kwds['maxlags']
        use_correction = kwds.get('use_correction', 'cluster')
        res.cov_kwds['use_correction'] = use_correction
        weights_func = kwds.get('weights_func', sw.weights_bartlett)
        res.cov_kwds['weights_func'] = weights_func
        if adjust_df:
            # need to find number of groups
            tt = (np.nonzero(np.diff(time) < 0)[0] + 1)
            self.n_groups = n_groups = len(tt) + 1
        res.cov_params_default = sw.cov_nw_groupsum(self, maxlags, time,
                                                    weights_func=weights_func,
                                                    use_correction=use_correction)
        res.cov_kwds['description'] = (
            'Driscoll and Kraay Standard Errors are robust to '
            'cluster correlation ' + '(' + cov_type + ')')
    else:
        raise ValueError('cov_type not recognized. See docstring for '
                         'available options and spelling')

    # generic optional factor to scale covariance
    sc_factor = kwds.get('scaling_factor', None)
    res.cov_kwds['scaling_factor'] = sc_factor
    if sc_factor is not None:
        res.cov_params_default *= sc_factor

    if adjust_df:
        # Note: df_resid is used for scale and others, add new attribute
        res.df_resid_inference = n_groups - 1

    return res

0 comments on commit f48717d

Please sign in to comment.
You can’t perform that action at this time.