In [1]:
import numpy as np
import pandas as pd
import warnings
import tensorflow as tf
from sklearn.linear_model import HuberRegressor
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
#from sklearn.inspection import plot_partial_dependence
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from pandas/sklearn.
warnings.filterwarnings('ignore')
# Directory prefix prepended to the input CSV file names below (absolute, machine-specific).
path = '/home/jhecy/ML/'

2024-04-29 11:56:23.384429: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data preparation

In [2]:
## Load the 94 firm characteristics dataset
data_ch = pd.read_csv(path + 'GKX_20201231.csv')
# DATE is stored as yyyymmdd; parse it and snap every observation to its month end.
data_ch['DATE'] = pd.to_datetime(data_ch['DATE'], format='%Y%m%d') + pd.offsets.MonthEnd(0)
# Keep the sample window, both endpoints included.
in_sample = data_ch['DATE'].between('1957-01-31', '2016-12-31')
data_ch = data_ch.loc[in_sample]
# Drop the columns that are not used as covariates further down.
cols = data_ch.columns.tolist()
unused_cols = {'permno', 'prc', 'SHROUT', 'mve0'}
cols_new = [name for name in cols if name not in unused_cols]
data_ch = data_ch[cols_new]

In [3]:
# Drop rows without an industry code and renumber the index from 0.
data_ch = data_ch.dropna(subset=['sic2']).reset_index(drop=True)
# One indicator column per distinct 'sic2' value.
dummies = pd.get_dummies(data_ch['sic2'], prefix='dum_')
# Swap the raw 'sic2' column for its indicator columns.
data_ch = pd.concat([data_ch.drop(columns='sic2'), dummies], axis=1)

In [4]:
# Firm characteristics are every retained column except the date, the return and 'sic2';
# their missing values are replaced with 0.
chas = [col for col in cols_new if col not in ('DATE', 'RET', 'sic2')]
data_ch[chas] = data_ch[chas].fillna(value=0)

In [5]:
## Load the raw file behind the 8 macroeconomic predictors
data_ma = pd.read_csv(path + 'PredictorData2023.csv')
# 'yyyymm' is parsed as year-month and snapped to the month end so it matches data_ch['DATE'].
data_ma['yyyymm'] = pd.to_datetime(data_ma['yyyymm'], format='%Y%m') + pd.offsets.MonthEnd(0)
# Same sample window as the firm-level panel, endpoints included.
in_window = data_ma['yyyymm'].between('1957-01-31', '2016-12-31')
data_ma = data_ma.loc[in_window].reset_index(drop=True)

In [6]:
# Build the 8 macroeconomic predictors and keep only them plus the date key
ma_predictors = ['dp', 'ep', 'bm', 'ntis', 'tbl', 'tms', 'dfy', 'svar']
# 'Index' is read as text containing commas; strip them before casting to float.
data_ma['Index'] = data_ma['Index'].str.replace(',', '').astype('float64')
data_ma = (
    data_ma
    .rename(columns={'b/m': 'bm'})
    .assign(
        dp=lambda d: np.log(d['D12'] / d['Index']),   # log(D12 / Index)
        ep=lambda d: np.log(d['E12'] / d['Index']),   # log(E12 / Index)
        tms=lambda d: d['lty'] - d['tbl'],            # lty minus tbl
        dfy=lambda d: d['BAA'] - d['AAA'],            # BAA minus AAA
    )
    .loc[:, ['yyyymm'] + ma_predictors]
)


In [7]:
# Construct the dataset including all covariates:
# every firm characteristic interacted with every macroeconomic predictor.
data_ma_long = pd.merge(data_ch['DATE'], data_ma, left_on='DATE', right_on='yyyymm', how='left').drop('yyyymm', axis=1)
# A left merge only keeps one row per firm-month if 'yyyymm' is unique in data_ma.
assert len(data_ma_long) == len(data_ch), "macro merge changed the number of rows"

# Build all interaction columns first and attach them in one concat. Inserting
# them one at a time fragments the frame and is much slower.
# .to_numpy() makes the product positional, which is what the row-for-row merge provides.
interactions = pd.DataFrame(
    {
        cha + '_' + predictor: data_ch[cha].to_numpy() * data_ma_long[predictor].to_numpy()
        for cha in chas
        for predictor in ma_predictors
    },
    index=data_ch.index,
)
# Dropping same-named columns first keeps the cell idempotent when it is re-run.
data_ch = pd.concat(
    [data_ch.drop(columns=list(interactions.columns), errors='ignore'), interactions],
    axis=1,
)
data = data_ch

In [9]:
## Slice the full covariate panel for one date window
def get_data_split(str, end):
    """Return ``(X, y)`` as NumPy arrays for rows of ``data`` with ``str <= DATE <= end``.

    ``X`` holds every column of the global ``data`` frame except 'RET' and 'DATE';
    ``y`` holds the 'RET' column. Both window endpoints are inclusive.
    Note that the first parameter shadows the builtin ``str`` inside this function.
    """
    window = data.loc[data['DATE'].between(str, end)]
    feature_cols = [col for col in data.columns if col not in ('RET', 'DATE')]
    return window[feature_cols].to_numpy(), window['RET'].to_numpy()

In [10]:
# Slice one date window, optionally restricted to the three 'ols3' covariates

def get_data_split1(str, end, model=None):
    """Return ``(X, y)`` for rows of ``data`` with ``str <= DATE <= end`` (inclusive).

    With ``model == 'ols3'`` only 'mvel1', 'bm' and 'mom1m' are used and the
    result stays in pandas form (DataFrame, Series). For any other ``model``
    every column except 'RET' and 'DATE' is used and both results are NumPy arrays.
    """
    in_window = (data['DATE'] >= str) & (data['DATE'] <= end)
    if model == 'ols3':
        covariates = ['mvel1', 'bm', 'mom1m']
        return data.loc[in_window, covariates], data.loc[in_window, 'RET']

    covariates = [col for col in data.columns if col not in ('RET', 'DATE')]
    window = data.loc[in_window]
    return window[covariates].to_numpy(), window['RET'].to_numpy()

In [11]:
# r2 function
def r2_score(y, yhat):
    """Return ``1 - sum((y - yhat)^2) / sum(y^2)``.

    The denominator is the raw sum of squares of ``y`` (it is not demeaned).

    Parameters
    ----------
    y, yhat : array-like
        Realised and predicted values. Both are converted to float arrays and
        flattened, so lists and column vectors of shape (n, 1) are accepted
        without accidental (n, n) broadcasting.

    Returns
    -------
    float
        The R^2 value; ``nan``/``inf`` if ``sum(y^2)`` is zero.
    """
    y = np.asarray(y, dtype=float).ravel()
    yhat = np.asarray(yhat, dtype=float).ravel()
    # np.sum is vectorised; the builtin sum iterates element by element in Python.
    return 1 - np.sum((y - yhat) ** 2) / np.sum(y ** 2)

In [12]:
# First and last dates of the training, validation and test windows.
train_str, train_end = '1957-01-31', '1974-12-31'
val_str, val_end = '1975-01-31', '1986-12-31'
test_str, test_end = '1987-01-31', '2016-12-31'

## RF

In [None]:
## Fitting RF with hyperparameters tuned by grid search

# The grid and the CV scheme do not depend on the window, so they are built once.
param_grid = {
    'max_depth': [1, 2, 3, 4, 5, 6],
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900],
    'max_features': [3, 5, 10, 20, 30, 40, 50]
}
cv = KFold(n_splits=5, shuffle=True, random_state=1)

# Out-of-sample predictions stacked across windows. It starts empty: a dummy
# leading element would shift every prediction by one position against y_test.
rf_oos = np.array([])
for i in range(0,1):
    # Window i: training sample, then 12 years of validation, then 1 year of test.
    # (`fit_str` rather than `str`, so the builtin is not shadowed.)
    fit_str = pd.to_datetime(train_str)
    end = pd.to_datetime(train_end) + pd.DateOffset(years=i)
    mid_str = end + pd.DateOffset(months=1)
    mid_end = end + pd.DateOffset(years=12)
    oos_str = mid_end + pd.DateOffset(months=1)
    oos_end = mid_end + pd.DateOffset(years=1)

    X_train, y_train = get_data_split(fit_str, end)
    X_val, y_val = get_data_split(mid_str, mid_end)
    X_test, y_test = get_data_split(oos_str, oos_end)

    # Standardise with moments estimated on the training sample only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_std = scaler.transform(X_train)
    X_val_std = scaler.transform(X_val)
    X_test_std = scaler.transform(X_test)

    # Grid search with shuffled 5-fold CV on the training sample.
    # NOTE(review): the validation window is not used to pick hyperparameters here — confirm that is intended.
    rf = RandomForestRegressor(random_state=1)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_std, y_train)

    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # Refit with the selected hyperparameters on training + validation combined
    # (previously only the validation rows were used, discarding the training sample).
    best_rf = RandomForestRegressor(**grid_search.best_params_, random_state=1)
    best_rf.fit(np.vstack((X_train_std, X_val_std)), np.concatenate((y_train, y_val)))

    # Score this window's predictions against this window's realised returns.
    y_test = y_test.ravel()
    y_pred = best_rf.predict(X_test_std).ravel()
    rf_oos_r2 = r2_score(y_test, y_pred)
    print(rf_oos_r2)

    rf_oos = np.concatenate((rf_oos, y_pred))

    ## save the result (append this window only, so earlier windows are not written twice)

    # with open('/home/jhecy/ML/rf-y_pred.csv', 'a') as f:
    #    np.savetxt(f, y_pred, delimiter=',')
    # with open('/home/jhecy/ML/rf-y_test.csv', 'a') as f:
    #    np.savetxt(f, y_test, delimiter=',')

In [None]:
## Fitting RF on training + validation data with fixed hyperparameters

# Out-of-sample predictions stacked across windows. It starts empty: a dummy
# leading element would shift every prediction by one position against y_test.
rf_oos = np.array([])
for i in range(0,1):
    # Window i: fit through val_end + i years, test on the following 12 months.
    # (`fit_str` rather than `str`, so the builtin is not shadowed.)
    fit_str = pd.to_datetime(train_str)
    end = pd.to_datetime(val_end) + pd.DateOffset(years=i)
    oos_str = end + pd.DateOffset(months=1)
    oos_end = end + pd.DateOffset(years=1)
    X_train, y_train = get_data_split(fit_str, end)
    X_test, y_test = get_data_split(oos_str, oos_end)

    # Standardise with moments estimated on the fitting sample only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_std = scaler.transform(X_train)
    X_test_std = scaler.transform(X_test)

    # Fit and predict on the standardised arrays; they used to be computed and then ignored.
    rf = RandomForestRegressor(max_features=6, random_state=1)
    rf.fit(X_train_std, y_train)

    # Score this window's predictions against this window's realised returns.
    y_test = y_test.ravel()
    y_pred = rf.predict(X_test_std).ravel()
    rf_oos_r2 = r2_score(y_test, y_pred)
    print(rf_oos_r2)

    rf_oos = np.concatenate((rf_oos, y_pred))

    # save the result (append this window only, so earlier windows are not written twice)
    # with open('/home/jhecy/ML/rf-y_pred.csv', 'a') as f:
    #    np.savetxt(f, y_pred, delimiter=',')
    # with open('/home/jhecy/ML/rf-y_test.csv', 'a') as f:
    #    np.savetxt(f, y_test, delimiter=',')

In [None]:
import csv

# Collect the first field of every row of the saved RF predictions.
# Values are kept as the strings the CSV reader yields.
y_pred = []
with open('/home/jhecy/ML/rf-y_pred.csv', newline='') as file:
    y_pred.extend(row[0] for row in csv.reader(file))

In [None]:
# Collect the first field of every row of the saved RF test targets (kept as strings).
y_test = []
with open('/home/jhecy/ML/rf-y_test.csv', newline='') as file:
    y_test.extend(row[0] for row in csv.reader(file))

### Variable importance

In [None]:
import matplotlib.pyplot as plt
# import seaborn as sns
import graphviz
import pydot
from IPython.display import Image

# `X` was never defined, and get_data_split returns bare NumPy arrays with no
# column labels. Rebuild the feature names with the same rule the split
# functions use: every column of `data` except 'RET' and 'DATE'.
feature_names = [col for col in data.columns if col not in ('RET', 'DATE')]

# Importances of the fitted forest `rf`, expressed in percent.
Importance = pd.DataFrame({'Importance': rf.feature_importances_ * 100}, index=feature_names)
Importance.sort_values('Importance', axis=0, ascending=True).plot(kind='barh', color='r', legend=False)
plt.xlabel('Variable Importance');

## PCR

In [None]:
## Fitting PCR with training data

# Out-of-sample predictions stacked across windows. It starts empty: a dummy
# leading element would shift every prediction by one position against y_test.
pcr_oos = np.array([])
for i in range(1):
    # Window i: fit through train_end + i years, test on the following 12 months.
    # (`fit_str` rather than `str`, so the builtin is not shadowed.)
    fit_str = pd.to_datetime(train_str)
    end = pd.to_datetime(train_end) + pd.DateOffset(years=i)
    oos_str = end + pd.DateOffset(months=1)
    oos_end = end + pd.DateOffset(years=1)
    X_train, y_train = get_data_split(fit_str, end)
    X_test, y_test = get_data_split(oos_str, oos_end)

    # Standardise with moments estimated on the fitting sample only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_std = scaler.transform(X_train)
    X_test_std = scaler.transform(X_test)

    # Project onto the first 15 principal components of the fitting sample.
    pca = PCA(n_components=15)
    pca.fit(X_train_std)
    X_train_pca = pca.transform(X_train_std)
    X_test_pca = pca.transform(X_test_std)

    # Linear regression of returns on the component scores.
    reg = LinearRegression()
    reg.fit(X_train_pca, y_train)

    # Score this window's predictions against this window's realised returns.
    y_test = y_test.ravel()
    y_pred = reg.predict(X_test_pca).ravel()
    pcr_oos_r2 = r2_score(y_test, y_pred)

    pcr_oos = np.concatenate((pcr_oos, y_pred))

    # save the result (append this window only, so earlier windows are not written twice)
    # with open('/home/jhecy/ML/pcr-y_pred.csv', 'a') as f:
    #    np.savetxt(f, y_pred, delimiter=',')

    # with open('/home/jhecy/ML/pcr-y_test.csv', 'a') as f:
    #    np.savetxt(f, y_test, delimiter=',')

In [None]:
import matplotlib.pyplot as plt

In [None]:
import csv

# Collect the first field of every row of the saved PCR test targets (kept as strings).
y_test = []
with open('/home/jhecy/ML/pcr-y_test.csv', newline='') as file:
    y_test.extend(row[0] for row in csv.reader(file))


In [None]:
# Collect the first field of every row of the saved PCR predictions (kept as strings).
y_pred = []
with open('/home/jhecy/ML/pcr-y_pred.csv', newline='') as file:
    y_pred.extend(row[0] for row in csv.reader(file))

In [None]:
# Plot the PCR out-of-sample R^2 year by year
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# The CSV readers return strings; convert to floats before doing arithmetic.
y_test = np.array(y_test).astype(float)
y_pred = np.array(y_pred).astype(float)

first_year = 1987
n_years = 24
# NOTE(review): assumes every test year contributes exactly this many rows — confirm against the saved files.
obs_per_year = 77122

# One R^2 per consecutive chunk, kept in file order so that element k belongs
# to year first_year + k (prepending each value reversed the series).
r2 = np.array([
    r2_score(y_test[k * obs_per_year:(k + 1) * obs_per_year],
             y_pred[k * obs_per_year:(k + 1) * obs_per_year])
    for k in range(n_years)
])

plt.figure(figsize=(5,3))
plt.title("PCR")
plt.xlabel('Year')
plt.ylabel(r'$R^2$')
plt.plot(np.arange(first_year, first_year + n_years), r2)
plt.tight_layout()

### Top-20 variable importance

In [None]:
from sklearn.inspection import permutation_importance
import pandas as pd

init_train_str = '1957-01-31'; init_train_end = '1974-12-31'
init_val_str = '1975-01-31'; init_val_end = '1986-12-31'
init_test_str = '1987-01-31'; init_test_end = '2016-12-31'
year_span = 21

# NOTE: this rebinds the global window names train_str / train_end to Timestamps for this fit.
train_str = pd.to_datetime(init_train_str)
train_end = pd.to_datetime(init_val_end)+pd.DateOffset(years=year_span)
# Test on the 12 months after the fitting sample, as the other cells do.
# (Setting both bounds to train_end + 1 year selected a single date.)
oos_str = train_end + pd.DateOffset(months=1)
oos_end = train_end + pd.DateOffset(years=1)
X_train, y_train = get_data_split(train_str, train_end)
X_test, y_test = get_data_split(oos_str, oos_end)

# Standardise with moments estimated on the fitting sample only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

# Project onto the first 15 principal components of the fitting sample.
pca = PCA(n_components=15)
pca.fit(X_train_std)
X_train_pca = pca.transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Linear regression of returns on the component scores.
reg = LinearRegression()
reg.fit(X_train_pca, y_train)
y_pred = reg.predict(X_test_pca)

# Score this window's predictions directly. Concatenating onto a `pcr_oos`
# left over from another cell and resizing compared stale values with y_test.
y_test = y_test.ravel()
pcr_oos = y_pred.ravel()
pcr_oos_r2 = r2_score(y_test, pcr_oos)


def _pcr_score(estimator, X_std, y):
    """Score a fitted regression on standardised covariates by routing them through `pca` first."""
    return r2_score(y, estimator.predict(pca.transform(X_std)))


# Permute the standardised original covariates, not the component scores, so
# each importance refers to one input variable. The estimator must be a fitted
# predictor: `pca` is a transformer and cannot score (X_train_pca, y_train).
results = permutation_importance(reg, X_train_std, y_train, scoring=_pcr_score, n_repeats=2, random_state=1)