# Regression Examples

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.diagnostic as ssd
from pandas.tools.plotting import scatter_matrix
from statsmodels.graphics.factorplots import interaction_plot
from statsmodels.stats.diagnostic import het_breushpagan
from statsmodels.stats.diagnostic import het_goldfeldquandt
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [6]:
%matplotlib inline

# Regression with synthetic data

$$y = x'\beta + \epsilon$$

Generate data

In [7]:
# Seed the global NumPy generator so that a fresh "Restart & Run All"
# reproduces the same synthetic data set (and every result derived from it).
np.random.seed(0)

# 1000 observations of three uniform(0, 1) regressors, with a constant
# column added by sm.add_constant for the intercept.
X = np.random.rand(1000, 3)
X = sm.add_constant(X)

# Show the first five rows as a sanity check.
X[:5,:]

array([[ 1.        ,  0.49880953,  0.71528678,  0.04857041],
       [ 1.        ,  0.37130968,  0.83018949,  0.86065446],
       [ 1.        ,  0.79787531,  0.61129996,  0.01212496],
       [ 1.        ,  0.40289693,  0.63369986,  0.56813917],
       [ 1.        ,  0.41035286,  0.98087954,  0.71273889]])

In [None]:
# True coefficient vector (intercept and three slopes, all equal to 1) as a
# (4, 1) column.  A plain ndarray is used instead of np.matrix, which NumPy
# no longer recommends; X.dot(beta) then yields an ordinary (n, 1) array.
beta = np.ones((4, 1))

In [None]:
# Homoscedastic disturbance: a (1000, 1) column of standard-normal draws
# scaled by a single, constant standard deviation.
sd_epsilon = 1.0
epsilon = np.random.randn(1000, 1) * sd_epsilon

In [None]:
# Heteroscedastic disturbance: each standard-normal draw is scaled by
# (0.1 * X[:, 2]) ** 2, so the noise scale depends on column 2 of X.
# The product broadcasts to shape (1, 1000); transposing gives a column.
noise_scale = np.square(0.1 * X[:, 2])
eps_het = (noise_scale * np.random.randn(1, 1000)).transpose()

##  Generating outcome

* Use `epsilon` to generate data without heteroscedasticity (i.e., homoscedasticity)
* Use `eps_het` to generate data with heteroscedasticity

In [None]:
# Outcome = linear signal X.dot(beta) plus the heteroscedastic disturbance.
# The trailing comment holds the homoscedastic alternative to swap in.
Y = X.dot(beta) + eps_het # + epsilon

In [None]:
# Stack the outcome as the first column, followed by the columns of X.
all_raw = np.hstack((Y,X))

In [None]:
# Wrap the stacked array in a DataFrame with named columns so the formula
# interface can refer to them; the order matches np.hstack((Y, X)).
df = pd.DataFrame(all_raw, columns=['y', 'const', 'x1', 'x2', 'x3'])

In [None]:
# Pairwise scatter plots of every column in df (including 'const').
scatter_matrix(df, figsize=(15,15))

In [None]:
# Fit OLS of y on x1, x2 and x3 through the formula interface.
# The 'const' column of df is not referenced by the formula.
results = smf.ols('y ~ x1 + x2 + x3', data=df).fit()

In [None]:
# Display the regression summary for the fitted model.
results.summary2()

# Residuals: normality

In [None]:
# Keep only the 'student_resid' column of the outlier-test output.
resid_stud = results.outlier_test()['student_resid']

In [None]:
# Q-Q plot of the studentized residuals with a 45-degree reference line.
# The return value is bound to a throwaway name, presumably so the figure
# is not echoed a second time as the cell's output.
dev_null = sm.graphics.qqplot(resid_stud, line='45', fit=True)

# Residuals: heteroscedasticity

In [None]:
# Residuals against fitted values: a visual check for non-constant variance.
plt.scatter(results.fittedvalues, results.resid)

In [None]:
# Condition number reported by the fitted model.
results.condition_number

## Robust standard errors

If there is heteroscedasticity, the default (non-robust) standard errors are biased — often, though not always, too small — so hypothesis tests based on them are unreliable. Consequently, we compute heteroscedasticity-robust standard errors and display them.

In [None]:
# Re-compute the covariance of the estimates with a robust estimator
# (the library's default robust covariance type is used, as none is given),
# then display the summary built from those robust standard errors.
robust_results = results.get_robustcov_results()
robust_results.summary2()

Breusch-Pagan test returns: LM stat, LM p-value, F stat, F p-value

$$H_0: \text{no heteroscedasticity}$$

In [None]:
# Breusch-Pagan test: OLS residuals against the model's regressor matrix.
# NOTE(review): the function name is spelled exactly as it is imported at the
# top of the notebook; confirm the spelling against the installed statsmodels.
het_breushpagan(results.resid, results.model.exog)

In [None]:
# Goldfeld-Quandt test.  The function fits its own regressions on two
# sub-samples, so it must receive the endogenous variable (not the residuals
# of the full-sample fit) together with the regressors.
# idx=2 orders the observations by column 2 of exog (x2), the regressor that
# scales the simulated heteroscedastic disturbance eps_het; without ordering,
# the two sub-samples are random splits and the test has no power.
# Returns the F statistic, its p-value and the alternative that was tested.
het_goldfeldquandt(results.model.endog, results.model.exog, idx=2)

# Outliers

In [None]:
# Influence / outlier diagnostics for the fitted model.
# The original bare results.outlier_test() call is dropped: its value was
# discarded because it was not the last expression of the cell.  To view that
# table, evaluate results.outlier_test() on its own as a cell's last line.
x = results.get_influence()

# print() with a single argument behaves the same on Python 2 and Python 3.
print(x.summary_table())

In [None]:
# Display the hat_diag_factor attribute of the influence object `x`
# obtained from results.get_influence().
x.hat_diag_factor

# Influence

In [None]:
# Draw the influence plot on an explicitly sized axes; criterion="cooks" is
# passed through to influence_plot to select the point-size criterion.
fig, ax = plt.subplots(figsize=(12, 8))
fig = sm.graphics.influence_plot(results, criterion="cooks", ax=ax)

# Collinearity

In [None]:
# Gram matrix X'X (k x k).  The original X.dot(X.transpose()) built the
# n x n outer product instead, which has rank at most k and is therefore
# numerically singular whenever n > k.
XX = X.transpose().dot(X)

# Show both condition numbers: the original discarded the first because only
# a cell's last expression is echoed.  In the 2-norm, cond(X'X) == cond(X)**2.
np.linalg.cond(XX), np.linalg.cond(X)

In [None]:
# Variance inflation factor for every column of X (column 0 is the constant).
# str.format produces the same "index value" line as the original Python 2
# print statement and is valid on both Python 2.7 and Python 3.
for ix in range(X.shape[1]):
    print("{} {}".format(ix, variance_inflation_factor(X, ix)))