In [25]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [54]:
def calculate_vifs(X, has_constant=True, decimals=2):
    """Return the rounded variance inflation factor (VIF) of each predictor.

    Parameters
    ----------
    X : 2-D array-like
        Design matrix; it is passed unchanged to
        ``variance_inflation_factor`` and must expose ``shape[1]``.
    has_constant : bool, default True
        When True, column 0 is treated as the intercept column (the position
        ``sm.add_constant`` uses by default) and no VIF is reported for it.
        Pass False to report a VIF for every column.
    decimals : int, default 2
        Number of decimal places each VIF is rounded to.

    Returns
    -------
    list
        One rounded VIF per reported column, in column order.
    """
    # Skip the intercept: its VIF is not meaningful as a collinearity measure.
    first_predictor = 1 if has_constant else 0
    return [
        round(variance_inflation_factor(X, idx), decimals)
        for idx in range(first_predictor, X.shape[1])
    ]

In [66]:
# Load the credit data set; the path is relative to the notebook's directory.
credit_csv_path = '../data/credit.csv'
credit_data = pd.read_csv(credit_csv_path)

In [67]:
# Predictors under study; the correlation matrix is a first look at
# pairwise collinearity before fitting any model.
X_columns = ['Age', 'Limit', 'Rating']
X = credit_data.loc[:, X_columns]
X.corr()

Unnamed: 0,Age,Limit,Rating
Age,1.0,0.100888,0.103165
Limit,0.100888,1.0,0.99688
Rating,0.103165,0.99688,1.0


In [68]:
# Build the design matrix straight from the source frame so this cell is
# idempotent: reassigning X from itself (`X = X.to_numpy()`) fails on a second
# run because X is already an ndarray at that point.
# add_constant prepends the intercept column by default, so it sits at index 0.
X = sm.add_constant(credit_data[X_columns].to_numpy())

In [69]:
# Response variable (Balance) as a NumPy array.
y = credit_data.loc[:, 'Balance'].to_numpy()

In [78]:
# Full model: Balance regressed on Age, Limit and Rating (plus the intercept).
linear_model_1 = sm.OLS(y, X)
results_1 = linear_model_1.fit()
r_squared_1 = results_1.rsquared
print(f'R-Squared: {r_squared_1}\n')

# Pair each predictor name with its VIF by position, then report them.
vifs1 = calculate_vifs(X)
vif_by_column_1 = list(zip(X_columns, vifs1))
for column, vif in vif_by_column_1:
    print(f'Variance Inflation Factor for {column}: {vif}')

R-Squared: 0.7536015110570427

Variance Inflation Factor for Age: 1.01
Variance Inflation Factor for Limit: 160.59
Variance Inflation Factor for Rating: 160.67


The large VIFs for Limit and Rating (about 160 each) indicate collinearity between these two predictors, consistent with their pairwise correlation of 0.997 shown above.

In [85]:
# regression of Balance on Age and Limit
# (Rating is the last column of X / last entry of X_columns, so [:-1] drops it)
X_reduced = X[:, :-1]
reduced_columns = X_columns[:-1]

linear_model_2 = sm.OLS(endog=y, exog=X_reduced)
results_2 = linear_model_2.fit()
r_squared_2 = results_2.rsquared
print(f'R-Squared: {r_squared_2}\n')

# calculate and print VIFs for the same reduced design matrix used in the fit
vifs2 = calculate_vifs(X_reduced)

for column, vif in zip(reduced_columns, vifs2):
    print(f'Variance Inflation Factor for {column}: {vif}')
    


R-Squared: 0.7498386129717793

Variance Inflation Factor for Age: 1.01
Variance Inflation Factor for Limit: 1.01


Dropping Rating reduced the VIF for Limit from 160.59 to 1.01 without compromising the fit of the model: R-squared fell only from about 0.754 to about 0.750.