In [1]:
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Seed the global NumPy RNG so that a fresh "Restart & Run All" draws the same
# sample and therefore prints the same VIF table every time.
np.random.seed(42)

n_samples = 100

# Target correlation structure with two groups, (X1, X2) and (X3, X4):
# within-group correlations are 0.91 / 0.92, cross-group correlations are 0.15.
corr_matrix_updated = np.array([
    [1.00, 0.91, 0.15, 0.15],
    [0.91, 1.00, 0.15, 0.15],
    [0.15, 0.15, 1.00, 0.92],
    [0.15, 0.15, 0.92, 1.00],
])

mean = np.zeros(4)

# The diagonal is all ones, so the correlation matrix is passed directly as
# the covariance matrix of the zero-mean multivariate normal.
data_updated = np.random.multivariate_normal(mean, corr_matrix_updated, n_samples)
df_updated = pd.DataFrame(data_updated, columns=["X1", "X2", "X3", "X4"])

# One VIF per column: column i is treated as the response and the remaining
# columns as the regressors.
# NOTE(review): the design matrix is passed without an intercept column. The
# variables are generated with zero mean, so this is presumably intentional --
# confirm before reusing this cell on data that is not centred.
vif_data_updated = pd.DataFrame()
vif_data_updated["Variable"] = df_updated.columns
vif_data_updated["VIF"] = [
    variance_inflation_factor(df_updated.values, i) for i in range(df_updated.shape[1])
]
print("\nUpdated VIF Values:\n", vif_data_updated.round(2))


Updated VIF Values:
   Variable   VIF
0       X1  6.86
1       X2  6.94
2       X3  5.74
3       X4  5.78


In [2]:
import numpy as np
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Fix the global RNG state so the draws below (and the printed tables) repeat.
np.random.seed(42)

# Two independent standard-normal predictors, 1000 observations each.
X1 = np.random.normal(loc=0, scale=1, size=1000)
X2 = np.random.normal(loc=0, scale=1, size=1000)

# X3 is (almost) the sum of X1 and X2: the added noise has standard deviation
# 0.1 (i.e. variance 0.01), so X3 is nearly an exact linear combination.
X3 = X1 + X2 + np.random.normal(loc=0, scale=0.1, size=1000)

# A fourth predictor drawn independently of the other three.
X4 = np.random.normal(loc=0, scale=1, size=1000)

df = pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, X4=X4))

# Pairwise view first: the correlation matrix, rounded to two decimals.
print(
    "Pairwise Correlations (< 90%):\n",
    df.corr().round(2),
)

# Then the multivariate view: one VIF per column, computed on the raw design
# matrix (column position `col_idx` is the variable being explained).
vif_data = pd.DataFrame(
    {
        "Variable": df.columns,
        "VIF": [
            variance_inflation_factor(df.values, col_idx)
            for col_idx in range(df.shape[1])
        ],
    }
)
print("\nVIF Values:\n", vif_data.round(2))

Pairwise Correlations (< 90%):
       X1    X2    X3    X4
X1  1.00 -0.04  0.69 -0.01
X2 -0.04  1.00  0.70 -0.05
X3  0.69  0.70  1.00 -0.05
X4 -0.01 -0.05 -0.05  1.00

VIF Values:
   Variable     VIF
0       X1  100.72
1       X2  104.32
2       X3  196.11
3       X4    1.00
