Merge commit 'v0.5.0-13-g8e07d34' into debian

* commit 'v0.5.0-13-g8e07d34':
  Backport PR statsmodels#1200: BLD: do not install *.pyx *.c MANIFEST.in
  Backport PR statsmodels#1157: Tst precision master
  Backport PR statsmodels#1149: BUG: Fix small data issues for ARIMA.
  Backport PR statsmodels#1125: REF/BUG: Some GLM cleanup. Used trimmed results in NegativeBinomial variance.
  Backport PR statsmodels#1124: BUG: Fix ARIMA prediction when fit without a trend.
  Backport PR statsmodels#1117: Update ex_arma2.py
  Backport PR statsmodels#1089: ENH: exp(poisson.logpmf()) for poisson better behaved.
  Backport PR statsmodels#1077: BUG: Allow 1d exog in ARMAX forecasting.
  Backport PR statsmodels#1075: BLD: Fix build issue on some versions of easy_install.
  Backport PR statsmodels#1071: Update setup.py to fix broken install on OSX
  Backport PR statsmodels#1057: COMPAT: Fix py3 caching for get_rdatasets.
  BUG: fix predict (was refactoring victim)
yarikoptic · Nov 29, 2013 · 985a57e · 985a57e
2 parents 9f2a17d + 8e07d34
commit 985a57e
Show file tree

Hide file tree

Showing 17 changed files with 197 additions and 54 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,4 +1,4 @@
-global-include *.csv *.py *.txt *.pyx *.c
+global-include *.csv *.py *.txt
 #scikits*.*
 include MANIFEST.in
 
@@ -44,4 +44,4 @@ include statsmodels/stats/libqsturng/LICENSE.txt
 include statsmodels/regression/tests/results/leverage_influence_ols_nostars.txt
 
 
-global-exclude *~ *.swp  *.pyc *.bak
+global-exclude *~ *.swp  *.pyc *.bak *.pyx
diff --git a/docs/source/gettingstarted.rst b/docs/source/gettingstarted.rst
@@ -26,7 +26,7 @@ provides labelled arrays of (potentially heterogenous) data, similar to the
 comma-separated values file to a ``DataFrame`` object.
 
 `patsy <https://github.com/pydata/patsy>`_ is a Python library for describing
-satistical models and building `Design Matrices
+statistical models and building `Design Matrices
 <http://en.wikipedia.org/wiki/Design_matrix>`_ using ``R``-like formulas.
 
 Data

diff --git a/examples/tsa/ex_arma2.py b/examples/tsa/ex_arma2.py
@@ -14,10 +14,10 @@
 # The conventions of the arma_generate function require that we specify a
 # 1 for the zero-lag of the AR and MA parameters and that the AR parameters
 # be negated.
-arparams = np.r_[1, -arparams]
-maparam = np.r_[1, maparams]
+ar = np.r_[1, -arparams]
+ma = np.r_[1, maparams]
 nobs = 250
-y = arma_generate_sample(arparams, maparams, nobs)
+y = arma_generate_sample(ar, ma, nobs)
 
 # Now, optionally, we can add some dates information. For this example,
 # we'll use a pandas time series.

diff --git a/setup.py b/setup.py
@@ -13,6 +13,11 @@
 import subprocess
 import re
 
+# temporarily redirect the matplotlib config directory: when imported,
+# matplotlib tests that directory for writability, which results in a
+# sandbox error in certain easy_install versions
+os.environ["MPLCONFIGDIR"] = "."
+
 # may need to work around setuptools bug by providing a fake Pyrex
 try:
     import Cython
@@ -436,12 +441,12 @@ def get_data_files():
             continue
         path = pjoin(root, i)
         if os.path.isdir(path):
-            data_files.update({relpath(path).replace(sep, ".") : ["*.csv",
+            data_files.update({relpath(path, start=curdir).replace(sep, ".") : ["*.csv",
                                                                   "*.dta"]})
     # add all the tests and results files
     for r, ds, fs in os.walk(pjoin(curdir, "statsmodels")):
         if r.endswith('results') and 'sandbox' not in r:
-            data_files.update({relpath(r).replace(sep, ".") : ["*.csv",
+            data_files.update({relpath(r, start=curdir).replace(sep, ".") : ["*.csv",
                                                                "*.txt"]})
 
     return data_files

diff --git a/statsmodels/base/data.py b/statsmodels/base/data.py
@@ -234,7 +234,14 @@ def _get_names(self, arr):
     def _get_yarr(self, endog):
         if data_util._is_structured_ndarray(endog):
             endog = data_util.struct_to_ndarray(endog)
-        return np.asarray(endog).squeeze()
+        endog = np.asarray(endog)
+        if len(endog) == 1: # never squeeze to a scalar
+            if endog.ndim == 1:
+                return endog
+            elif endog.ndim > 1:
+                return np.asarray([endog.squeeze()])
+
+        return endog.squeeze()
 
     def _get_xarr(self, exog):
         if data_util._is_structured_ndarray(exog):

diff --git a/statsmodels/datasets/utils.py b/statsmodels/datasets/utils.py
@@ -131,7 +131,9 @@ def _cache_it(data, cache_path):
     if sys.version_info[0] >= 3:
         # for some reason encode("zip") won't work for me in Python 3?
         import zlib
-        open(cache_path, "wb").write(zlib.compress(pickle.dumps(data)))
+        # use protocol 2 so can open with python 2.x if cached in 3.x
+        open(cache_path, "wb").write(zlib.compress(pickle.dumps(data,
+                                                                protocol=2)))
     else:
         open(cache_path, "wb").write(pickle.dumps(data).encode("zip"))
 
@@ -141,7 +143,8 @@ def _open_cache(cache_path):
         # Python 3 build
         import zlib
         data = zlib.decompress(open(cache_path, 'rb').read())
-        data = pickle.loads(data)
+        # return as a utf-8 encoded bytes object for cross-compat of the cache
+        data = pickle.loads(data).encode('utf-8')
     else:
         data = open(cache_path, 'rb').read().decode('zip')
         data = pickle.loads(data)
@@ -181,9 +184,9 @@ def _get_data(base_url, dataname, cache, extension="csv"):
         else:
             raise err
 
-    #Python 3, don't think there will be any unicode in r datasets
+    #Python 3, always decode as unicode
     if sys.version[0] == '3':  # pragma: no cover
-        data = data.decode('ascii', errors='strict')
+        data = data.decode('utf-8', errors='strict')
     return StringIO(data), from_cache
 
 

diff --git a/statsmodels/discrete/discrete_model.py b/statsmodels/discrete/discrete_model.py
@@ -840,7 +840,7 @@ def pdf(self, X):
         The parameter `X` is :math:`x_{i}\\beta` in the above formula.
         """
         y = self.endog
-        return stats.poisson.pmf(y, np.exp(X))
+        return np.exp(stats.poisson.logpmf(y, np.exp(X)))
 
     def loglike(self, params):
         """

diff --git a/statsmodels/genmod/families/family.py b/statsmodels/genmod/families/family.py
@@ -10,6 +10,7 @@
 from scipy.stats import ss
 import links as L
 import varfuncs as V
+FLOAT_EPS = np.finfo(float).eps
 
 class Family(object):
 
@@ -28,7 +29,6 @@ class Family(object):
 #TODO: change these class attributes, use valid somewhere...
     valid = [-np.inf, np.inf]
 
-    tol = 1.0e-05
     links = []
 
     def _setlink(self, link):
@@ -563,7 +563,7 @@ def _clean(self, x):
         possible that other families might need a check for validity of the
         domain.
         """
-        return np.clip(x, 1.0e-10, np.inf)
+        return np.clip(x, FLOAT_EPS, np.inf)
 
     def deviance(self, Y, mu, scale=1.):
         """
@@ -696,7 +696,7 @@ class Binomial(Family):
     endog for Binomial can be specified in one of three ways.
     """
 
-    links = [L.logit, L.probit, L.cauchy, L.log, L.cloglog]
+    links = [L.logit, L.probit, L.cauchy, L.log, L.cloglog, L.identity]
     variance = V.binary # this is not used below in an effort to include n
 
     def __init__(self, link=L.logit):  #, n=1.):
@@ -1102,7 +1102,7 @@ def _clean(self, x):
         possible that other families might need a check for validity of the
         domain.
         """
-        return np.clip(x, 1.0e-10, np.inf)
+        return np.clip(x, FLOAT_EPS, np.inf)
 
     def deviance(self, Y, mu, scale=1.):
         """

diff --git a/statsmodels/genmod/families/links.py b/statsmodels/genmod/families/links.py
@@ -4,6 +4,7 @@
 
 import numpy as np
 import scipy.stats
+FLOAT_EPS = np.finfo(float).eps
 
 #TODO: are the instance actually "aliases"
 # I used this terminology in varfuncs as well -ss
@@ -70,17 +71,15 @@ class Logit(Link):
     Notes
     -----
     call and derivative use a private method _clean to make trim p by
-    1e-10 so that p is in (0,1)
+    machine epsilon so that p is in (0,1)
 
     Alias of Logit:
     logit = Logit()
     """
 
-    tol = 1.0e-10
-
     def _clean(self, p):
         """
-        Clip logistic values to range (tol, 1-tol)
+        Clip logistic values to range (eps, 1-eps)
 
         Parameters
         -----------
@@ -92,7 +91,7 @@ def _clean(self, p):
         pclip : array
             Clipped probabilities
         """
-        return np.clip(p, Logit.tol, 1. - Logit.tol)
+        return np.clip(p, FLOAT_EPS, 1. - FLOAT_EPS)
 
     def __call__(self, p):
         """
@@ -310,13 +309,11 @@ class Log(Link):
     Notes
     -----
     call and derivative call a private method _clean to trim the data by
-    1e-10 so that p is in (0,1). log is an alias of Log.
+    machine epsilon so that p is in (0,1). log is an alias of Log.
     """
 
-    tol = 1.0e-10
-
     def _clean(self, x):
-        return np.clip(x, Logit.tol, np.inf)
+        return np.clip(x, FLOAT_EPS, np.inf)
 
     def __call__(self, p, **extra):
         """
@@ -599,13 +596,11 @@ class NegativeBinomial(object):
         values are usually assumed to be in (.01,2).
     '''
 
-    tol = 1.0e-10
-
     def __init__(self, alpha=1.):
         self.alpha = alpha
 
     def _clean(self, x):
-        return np.clip(x, NegativeBinomial.tol, np.inf)
+        return np.clip(x, FLOAT_EPS, np.inf)
 
     def __call__(self, x):
         '''

diff --git a/statsmodels/genmod/families/varfuncs.py b/statsmodels/genmod/families/varfuncs.py
@@ -5,6 +5,7 @@
 __docformat__ = 'restructuredtext'
 
 import numpy as np
+FLOAT_EPS = np.finfo(float).eps
 
 class VarianceFunction(object):
     """
@@ -147,16 +148,15 @@ class Binomial(object):
     Alias for Binomial:
     binary = Binomial()
 
-    A private method _clean trims the data by 1e-10 so that p is in (0,1)
+    A private method _clean trims the data by machine epsilon so that p is
+    in (0,1)
     """
 
-    tol = 1.0e-10
-
     def __init__(self, n=1):
         self.n = n
 
     def _clean(self, p):
-        return np.clip(p, Binomial.tol, 1 - Binomial.tol)
+        return np.clip(p, FLOAT_EPS, 1 - FLOAT_EPS)
 
     def __call__(self, mu):
         """
@@ -208,16 +208,15 @@ class NegativeBinomial(object):
     Alias for NegativeBinomial:
     nbinom = NegativeBinomial()
 
-    A private method _clean trims the data by 1e-10 so that p is in (0,inf)
+    A private method _clean trims the data by machine epsilon so that p is
+    in (0,inf)
     '''
 
-    tol = 1.0e-10
-
     def __init__(self, alpha=1.):
         self.alpha = alpha
 
     def _clean(self, p):
-        return np.clip(p, NegativeBinomial.tol, np.inf)
+        return np.clip(p, FLOAT_EPS, np.inf)
 
     def __call__(self, mu):
         """
@@ -234,7 +233,7 @@ def __call__(self, mu):
             variance = mu + alpha*mu**2
         """
         p = self._clean(mu)
-        return mu + self.alpha*mu**2
+        return p + self.alpha*p**2
 
 nbinom = NegativeBinomial()
 nbinom.__doc__ = """

diff --git a/statsmodels/regression/linear_model.py b/statsmodels/regression/linear_model.py
@@ -376,7 +376,11 @@ def __init__(self, endog, exog, weights=1., missing='none', hasconst=None):
         weights = np.array(weights)
         if weights.shape == ():
             weights = np.repeat(weights, len(endog))
-        weights = weights.squeeze()
+        # handle case that endog might be of len == 1
+        if len(weights) == 1:
+            weights = np.array([weights.squeeze()])
+        else:
+            weights = weights.squeeze()
         super(WLS, self).__init__(endog, exog, missing=missing,
                                   weights=weights, hasconst=hasconst)
         nobs = self.exog.shape[0]

diff --git a/statsmodels/sandbox/regression/gmm.py b/statsmodels/sandbox/regression/gmm.py
@@ -142,7 +142,7 @@ def fit(self):
         return lfit
 
     #copied from GLS, because I subclass currently LikelihoodModel and not GLS
-    def predict(self, exog, params=None):
+    def predict(self, params, exog=None):
         """
         Return linear predicted values from a design matrix.
 
@@ -161,6 +161,9 @@ def predict(self, exog, params=None):
         -----
         If the model as not yet been fit, params is not optional.
         """
+        if exog is None:
+            exog = self.exog
+        return np.dot(exog, params)
         #JP: this doesn't look correct for GLMAR
         #SS: it needs its own predict method
         if self._results is None and params is None:

diff --git a/statsmodels/sandbox/tests/test_pca.py b/statsmodels/sandbox/tests/test_pca.py
@@ -57,9 +57,9 @@ def test_pca_svd():
     xred_svd, factors_svd, evals_svd, evecs_svd = pcasvd(xf, keepdim=0)
     assert_array_almost_equal(evals_svd, evals, 14)
     msign = (evecs/evecs_svd)[0]
-    assert_array_almost_equal(msign*evecs_svd, evecs, 14)
-    assert_array_almost_equal(msign*factors_svd, factors, 13)
-    assert_array_almost_equal(xred_svd, xreduced, 14)
+    assert_array_almost_equal(msign*evecs_svd, evecs, 13)
+    assert_array_almost_equal(msign*factors_svd, factors, 12)
+    assert_array_almost_equal(xred_svd, xreduced, 13)
 
     pcares = pca(xf, keepdim=2)
     pcasvdres = pcasvd(xf, keepdim=2)

diff --git a/statsmodels/stats/tests/test_diagnostic.py b/statsmodels/stats/tests/test_diagnostic.py
@@ -554,10 +554,10 @@ def test_normality(self):
         lf2 = smsdia.lillifors(res.resid**2)
         lf3 = smsdia.lillifors(res.resid[:20])
 
-        compare_t_est(lf1, lillifors1, decimal=(15, 14))
-        compare_t_est(lf2, lillifors2, decimal=(15, 15)) #pvalue very small
+        compare_t_est(lf1, lillifors1, decimal=(14, 14))
+        compare_t_est(lf2, lillifors2, decimal=(14, 14)) #pvalue very small
         assert_approx_equal(lf2[1], lillifors2['pvalue'], significant=10)
-        compare_t_est(lf3, lillifors3, decimal=(15, 1))
+        compare_t_est(lf3, lillifors3, decimal=(14, 1))
         #R uses different approximation for pvalue in last case
 
         #> ad = ad.test(residuals(fm))