In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
def calculate_vifs(X):
    """Compute variance inflation factors for the columns of a design matrix.

    The column at index 0 is deliberately skipped; the callers in this
    notebook build ``X`` with ``sm.add_constant``, so that position is
    presumably the intercept column rather than a predictor.

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape``
        Design matrix passed straight to ``variance_inflation_factor``.

    Returns
    -------
    list
        One VIF per column from index 1 onwards, rounded to two decimals,
        in column order.
    """
    rounded_vifs = []
    for column_index in range(1, X.shape[1]):
        raw_vif = variance_inflation_factor(X, column_index)
        rounded_vifs.append(round(raw_vif, 2))
    return rounded_vifs

In [3]:
# Load the credit dataset (path is relative to the notebook's directory).
credit_data = pd.read_csv(filepath_or_buffer='../data/credit.csv')

In [4]:
# Predictors under study; their pairwise correlations are displayed as the
# cell output to give a first look at collinearity.
X_columns = ['Age', 'Limit', 'Rating']
X = credit_data.loc[:, X_columns]
X.corr()

Unnamed: 0,Age,Limit,Rating
Age,1.0,0.100888,0.103165
Limit,0.100888,1.0,0.99688
Rating,0.103165,0.99688,1.0


In [5]:
# Build the design matrix directly from credit_data rather than re-binding X
# from itself: the previous form failed when the cell was run a second time,
# because X was by then an ndarray with no .to_numpy() method.
# sm.add_constant adds the intercept column, so the predictors follow it.
X = sm.add_constant(credit_data[X_columns].to_numpy())

In [6]:
# Response variable for the credit regressions, as a NumPy array.
y = credit_data.loc[:, 'Balance'].to_numpy()

In [7]:
# OLS fit of Balance on an intercept plus Age, Limit and Rating.
linear_model_1 = sm.OLS(y, X)
results_1 = linear_model_1.fit()

# Only the coefficient table (second table of the summary) is shown.
coefficient_table_1 = results_1.summary().tables[1]
print(coefficient_table_1)
print(f'R-Squared: {results_1.rsquared}\n')

# One VIF per predictor; calculate_vifs skips the first (intercept) column,
# so its output lines up with X_columns.
vifs1 = calculate_vifs(X)
for column, vif in zip(X_columns, vifs1):
    print(f'Variance Inflation Factor for {column}: {vif}')

                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -259.5175     55.882     -4.644      0.000    -369.380    -149.655
x1            -2.3458      0.669     -3.508      0.001      -3.660      -1.031
x2             0.0190      0.063      0.302      0.763      -0.105       0.143
x3             2.3105      0.940      2.459      0.014       0.463       4.158
R-Squared: 0.7536015110570425

Variance Inflation Factor for Age: 1.01
Variance Inflation Factor for Limit: 160.59
Variance Inflation Factor for Rating: 160.67


The coefficient for Limit is not statistically significant (its 95% confidence interval contains zero) because of collinearity, as indicated by the large VIFs for Limit and Rating.

In [8]:
# OLS fit of Balance on an intercept plus Age and Limit: slicing off the last
# column of the design matrix removes Rating.
X_without_rating = X[:, :-1]
linear_model_2 = sm.OLS(y, X_without_rating)
results_2 = linear_model_2.fit()

coefficient_table_2 = results_2.summary().tables[1]
print(coefficient_table_2)
print(f'R-Squared: {results_2.rsquared}\n')

# VIFs for the reduced predictor set
vifs2 = calculate_vifs(X_without_rating)

for column, vif in zip(X_columns[:-1], vifs2):
    print(f'Variance Inflation Factor for {column}: {vif}')

                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -173.4109     43.828     -3.957      0.000    -259.576     -87.246
x1            -2.2915      0.672     -3.407      0.001      -3.614      -0.969
x2             0.1734      0.005     34.496      0.000       0.163       0.183
R-Squared: 0.7498386129717791

Variance Inflation Factor for Age: 1.01
Variance Inflation Factor for Limit: 1.01


Dropping Rating greatly reduced the VIF for Limit (from 160.59 to 1.01) without compromising the fit of the model (R-squared fell only from 0.754 to 0.750).

An alternative solution is to combine Rating and Limit into a single predictor.

In [21]:
# Fetch the Longley dataset via statsmodels' R-datasets helper and replace
# its original index with a default integer one (the old index is discarded).
longley_data = (
    sm.datasets.get_rdataset('longley')
    .data
    .reset_index(drop=True)
)

In [38]:
# Every column except the last is used as a predictor; the response is Employed.
# NOTE: this re-binds X_columns, X and y, which previously held the credit
# data, so the credit cells above must not be re-run after this one without
# first re-running the cells that define them.
longley_predictors = longley_data.iloc[:, :-1]
X_columns = longley_predictors.columns
X = sm.add_constant(longley_predictors.to_numpy())

y = longley_data.loc[:, 'Employed'].to_numpy()

In [40]:
# OLS fit of Employed on an intercept plus all Longley predictors.
longley_model = sm.OLS(y, X)
longley_results = longley_model.fit()

# Full regression summary followed by the unrounded R-squared.
longley_summary = longley_results.summary()
print(longley_summary)
print(f'R-Squared: {longley_results.rsquared}\n')

# One VIF per predictor; calculate_vifs skips the first (intercept) column.
longley_vifs = calculate_vifs(X)
for column, vif in zip(X_columns, longley_vifs):
    print(f'Variance Inflation Factor for {column}: {vif}')

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.995
Model:                            OLS   Adj. R-squared:                  0.992
Method:                 Least Squares   F-statistic:                     330.3
Date:                Sun, 10 Jan 2021   Prob (F-statistic):           4.98e-10
Time:                        00:42:48   Log-Likelihood:                0.90665
No. Observations:                  16   AIC:                             12.19
Df Residuals:                       9   BIC:                             17.59
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -3482.2586    890.420     -3.911      0.0

