In [1]:
# Load libraries
import yfinance as yf 
import pandas_datareader.data as web
import numpy as np
import pandas as pd

# Load VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
      
# Generate dataset from StockPricePrediction
stk_tickers = ['MSFT', 'IBM', 'GOOGL']
ccy_tickers = ['DEXJPUS', 'DEXUSUK']
idx_tickers = ['SP500', 'DJIA', 'VIXCLS']

# Ask for unadjusted OHLC explicitly: the ('Adj Close', ticker) lookups below
# need the 'Adj Close' column, which is not returned when prices are
# auto-adjusted, so do not rely on the library's default for auto_adjust.
stk_data = yf.download(stk_tickers, auto_adjust=False)
ccy_data = web.DataReader(ccy_tickers, 'fred')
idx_data = web.DataReader(idx_tickers, 'fred')

# Number of rows over which every log difference below is taken
return_period = 5

# Log of the MSFT adjusted close; computed once and reused for Y and X4
msft_log_price = np.log(stk_data.loc[:, ('Adj Close', 'MSFT')])

# Target: MSFT log difference over return_period rows, shifted back by
# return_period rows so each row holds the change that follows it
Y = msft_log_price.diff(return_period).shift(-return_period)
Y.name = Y.name[-1]+'_pred'

# Predictors 1: log differences of the other two stocks (ticker-only column names)
X1 = np.log(stk_data.loc[:, ('Adj Close', ('GOOGL', 'IBM'))]).diff(return_period)
X1.columns = X1.columns.droplevel()

# Predictors 2 and 3: log differences of the FRED currency and index series
X2 = np.log(ccy_data).diff(return_period)
X3 = np.log(idx_data).diff(return_period)

# Predictors 4: MSFT log differences over 1x, 3x, 6x and 12x return_period rows
X4 = pd.concat(
    [msft_log_price.diff(return_period * multiple) for multiple in (1, 3, 6, 12)],
    axis=1,
).dropna()
X4.columns = ['MSFT_DT', 'MSFT_3DT', 'MSFT_6DT', 'MSFT_12DT']

X = pd.concat([X1, X2, X3, X4], axis=1)

# Drop rows with any missing value, then keep every return_period-th row
# (presumably so the sampled return windows do not overlap -- confirm)
dataset = pd.concat([Y, X], axis=1).dropna().iloc[::return_period, :]
Y = dataset.loc[:, Y.name]
X = dataset.loc[:, X.columns]

# Create function to calculate VIF
def calc_vif(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the variance inflation factor (VIF) of every column of ``df``.

    Args:
        df: Table of candidate features, one column per feature. Its
            ``.values`` array is handed to ``variance_inflation_factor``
            unchanged; no intercept column is added here.
            NOTE(review): confirm that omitting a constant column is intended.

    Returns:
        A DataFrame with one row per column of ``df`` (same order, default
        integer index) and two columns: ``"Feature"`` (the column label) and
        ``"VIF"`` (its variance inflation factor, as float).
    """
    # Extract the array once instead of on every iteration.
    exog = df.values
    vifs = [variance_inflation_factor(exog, col) for col in range(exog.shape[1])]
    # Build the VIF column as float from the start; seeding it with integer 0
    # and writing floats cell by cell is an incompatible-dtype assignment.
    return pd.DataFrame(
        {"Feature": df.columns, "VIF": pd.Series(vifs, dtype="float64")}
    )

# Largest VIF a feature may have and still be kept
vif_threshold = 5

# Show the starting feature set
print("\n", "Initial Features:", X.columns, "\n")

# Repeatedly drop the single feature with the highest VIF, recomputing all
# VIFs after each removal, until none is above the threshold
violates = True
while violates:
    vif_results = calc_vif(X)
    print(vif_results.sort_values('VIF', ascending=False), "\n")
    max_vif = vif_results['VIF'].max()
    # The flag doubles as the loop condition: stay in the loop only while
    # the worst feature is still above the threshold
    violates = bool(max_vif > vif_threshold)
    if violates:
        drop_var = vif_results.at[vif_results['VIF'].idxmax(), 'Feature']
        X = X.drop(columns=drop_var)
        # Report which feature was just removed
        print("Removed Feature:", drop_var, "\n")

# Show the surviving feature set
print("Final Features:", X.columns)

[*********************100%%**********************]  3 of 3 completed

 Initial Features: Index(['GOOGL', 'IBM', 'DEXJPUS', 'DEXUSUK', 'SP500', 'DJIA', 'VIXCLS',
       'MSFT_DT', 'MSFT_3DT', 'MSFT_6DT', 'MSFT_12DT'],
      dtype='object') 

      Feature        VIF
4       SP500  23.467368
5        DJIA  16.505712
7     MSFT_DT   3.003782
9    MSFT_6DT   2.866652
6      VIXCLS   2.677917
0       GOOGL   2.466297
8    MSFT_3DT   2.260309
10  MSFT_12DT   2.128798
1         IBM   1.878810
3     DEXUSUK   1.768682
2     DEXJPUS   1.403838 

Removed Feature: SP500 

     Feature       VIF
4       DJIA  4.380594
8   MSFT_6DT  2.859912
5     VIXCLS  2.540801
6    MSFT_DT  2.459482
7   MSFT_3DT  2.247571
9  MSFT_12DT  2.126645
0      GOOGL  2.010295
1        IBM  1.850417
3    DEXUSUK  1.750317
2    DEXJPUS  1.403836 

Final Features: Index(['GOOGL', 'IBM', 'DEXJPUS', 'DEXUSUK', 'DJIA', 'VIXCLS', 'MSFT_DT',
       'MSFT_3DT', 'MSFT_6DT', 'MSFT_12DT'],
      dtype='object')
