In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

ModuleNotFoundError: No module named 'statsmodels'

In [None]:
def calculate_vifs(X):
    """Compute rounded variance inflation factors for the columns of ``X``.

    Column 0 is skipped; in this notebook ``X`` is passed after
    ``sm.add_constant``, so column 0 is expected to be the intercept
    (assumption about the caller -- confirm if used elsewhere).

    Parameters
    ----------
    X : 2-D array
        Design matrix; only ``X.shape[1]`` is read here, the matrix itself
        is handed to ``variance_inflation_factor``.

    Returns
    -------
    list
        One VIF per column index ``1 .. X.shape[1] - 1``, each rounded to
        two decimals. Empty when ``X`` has fewer than two columns.
    """
    vifs = []
    for idx in range(1, X.shape[1]):
        # Round each factor as soon as it is computed
        vifs.append(round(variance_inflation_factor(X, idx), 2))
    return vifs

In [None]:
# Read the credit data set; the path is relative to the working directory
credit_data = pd.read_csv(filepath_or_buffer='../data/credit.csv')

In [None]:
# Predictors to check for collinearity
X_columns = ['Age', 'Limit', 'Rating']

# Keep all rows, only the predictor columns
X = credit_data.loc[:, X_columns]

# Pairwise correlations between the predictors (displayed as the cell output)
X.corr()

In [None]:
# Build the design matrix from credit_data instead of from the previous value
# of X: the old form (X = X.to_numpy()) failed when the cell was run a second
# time, because X was already an ndarray by then. Re-running is now safe.
# add_constant adds the intercept column that the OLS fits below rely on.
X = sm.add_constant(credit_data[X_columns].to_numpy())

In [None]:
# Response vector: the Balance column as a NumPy array
y = credit_data.loc[:, 'Balance'].to_numpy()

In [None]:
# Ordinary least squares fit of Balance on Age, Limit and Rating
linear_model_1 = sm.OLS(y, X)
results_1 = linear_model_1.fit()

# Show the table at index 1 of the summary, then the R-squared
print(results_1.summary().tables[1])
print(f'R-Squared: {results_1.rsquared}\n')

# calculate_vifs skips column 0 of X, so the result lines up with X_columns
vifs1 = calculate_vifs(X)
for column, vif in zip(X_columns, vifs1):
    print(f'Variance Inflation Factor for {column}: {vif}')

The coefficient for Limit is not statistically significant: collinearity between Limit and Rating, indicated by their large VIFs, inflates its standard error and widens its confidence interval.

In [None]:
# Ordinary least squares fit of Balance on Age and Limit only:
# X[:, :-1] leaves out the last column of the design matrix (Rating)
linear_model_2 = sm.OLS(y, X[:, :-1])
results_2 = linear_model_2.fit()

# Show the table at index 1 of the summary, then the R-squared
print(results_2.summary().tables[1])
print(f'R-Squared: {results_2.rsquared}\n')

# VIFs for the reduced design matrix, labelled with the remaining predictors
vifs2 = calculate_vifs(X[:, :-1])
for column, vif in zip(X_columns[:-1], vifs2):
    print(f'Variance Inflation Factor for {column}: {vif}')

Dropping Rating greatly reduced the VIFs without compromising the fit of the model.

An alternative solution is to combine Rating and Limit into a single predictor.

In [None]:
# Fetch the 'longley' R data set and replace its index with a fresh 0..n-1 range
longley_data = (
    sm.datasets.get_rdataset('longley')
    .data
    .reset_index(drop=True)
)

In [None]:
# Every column except the last is treated as a predictor
# (assumes 'Employed', the response below, is the last column -- confirm)
X_columns = longley_data.columns[:-1]

# Predictor matrix with the intercept column added by add_constant
X = sm.add_constant(longley_data.iloc[:, :-1].to_numpy())

# Response vector
y = longley_data.loc[:, 'Employed'].to_numpy()

In [None]:
# Ordinary least squares fit of Employed on all remaining longley columns
longley_model = sm.OLS(y, X)
longley_results = longley_model.fit()

# Show the table at index 1 of the summary, then the R-squared
print(longley_results.summary().tables[1])
print(f'R-Squared: {longley_results.rsquared}\n')

# calculate_vifs skips column 0 of X, so the result lines up with X_columns
longley_vifs = calculate_vifs(X)
for column, vif in zip(X_columns, longley_vifs):
    print(f'Variance Inflation Factor for {column}: {vif}')