In [1]:
## perform imports and set-up
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.interpolate import interp1d
from patsy.contrasts import Treatment

import torch
from torch.nn import Module
from torch import nn
# import torchvision
# from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD, Adam
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
# import torchvision.transforms as transforms

from torch.optim.swa_utils import AveragedModel, SWALR
from torch.optim.lr_scheduler import CosineAnnealingLR

from sklearn.decomposition import IncrementalPCA
from sklearn.cross_decomposition import PLSRegression

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.datasets import make_regression

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNet

import random
import csv

from datetime import datetime
from dateutil.relativedelta import relativedelta


%matplotlib inline
plt.style.use('ggplot') # emulate pretty r-style plots

In [2]:
def read_file(file_path, nrows=None):
    # Sample 100 rows of data to determine dtypes.
    df_test = pd.read_csv(file_path, nrows=100)
    float_cols = [c for c in df_test if df_test[c].dtype == "float64"]
    float16_cols = {c: np.float16 for c in float_cols}
    df = pd.read_csv(file_path, engine='c', dtype=float16_cols, nrows=nrows)
    return df

In [3]:
data = read_file('data_all_clean_rank_2.csv').iloc[:,1:]
data['DATE'] = pd.to_datetime(data['DATE'])
# data = read_file('data_manipulated.csv').iloc[:,1:]
# macropredictors = pd.read_csv("macropredictors.csv")
# data = read_file('./datashare/GKX_20201231.csv')
# macropredictors_raw = pd.read_excel('PredictorData2021.xlsx')

In [4]:
temp = np.array(["permno", "DATE", "RET", "SHROUT", "sic2", "mve0", "prc", "d_p", "e_p", "b_m", "ntis", "tbl", "tms", "dfy", "svar"])
character_names = np.setdiff1d(data.columns, temp)
# dp ep bm ntis tbl tms dfy svar
# macro-economic predictors
macropredictors_names = np.array(["d_p", "e_p", "b_m", "ntis", "tbl", "tms", "dfy", "svar"])

In [5]:
sic2, sic2_counts = np.unique(data["sic2"], return_counts=True)
def map_sic2(x):
    return np.where(sic2 == x)[0][0]
data["sic2"] = list(map(map_sic2, data["sic2"]))
sic2_onehot = pd.get_dummies(data["sic2"], prefix='sic2')
sic2_names = sic2_onehot.columns
# sic2_onehot = pd.get_dummies(data["sic2"])
data = pd.concat([data, sic2_onehot], axis=1)
# sic2_names = sic2_onehot.columns

In [6]:
# macro_character_names = []
# for item1 in macropredictors_names:
#     for item2 in character_names:
#         name_temp = item1 + '_' + item2
#         macro_character_names.append(name_temp)
#         temp_df = pd.DataFrame(data = {name_temp: data[item1].values * data[item2].values})
#         data = pd.concat([data, temp_df], axis=1)
# macro_character_names = np.array(macro_character_names)

In [6]:
# all_names = np.append(character_names, macro_character_names)
all_names = np.append(character_names, macropredictors_names)
all_names = np.append(all_names, sic2_names)

In [7]:
character_names.shape, macropredictors_names.shape, sic2_names.shape, all_names.shape # macro_character_names.shape, 

((94,), (8,), (75,), (177,))

In [8]:
def compute_r2(y_predict, y_test):
    y_predict = y_predict.reshape(-1)
    # y_test = y_test.reshape(-1)
    se = np.sum((y_predict - y_test)**2)
    denominator = np.sum(y_test**2)
    roos = 1 - (se/denominator)
    return roos

In [9]:
def compute_variable_importance(model, x_train, y_train, character_names, macropredictors_names, file_name):
    y_predict = model.predict(x_train)
    roos_all = compute_r2(y_predict, y_train)
    vi_array = np.array([])
    names_all = np.append(character_names, macropredictors_names)
    for item in names_all:
        x_temp = x_train.copy(deep=True)
        x_temp[item] = 0
        y_predict = model.predict(x_temp)
        roos_temp = compute_r2(y_predict, y_train)
        vi_array = np.append(vi_array, roos_all-roos_temp)
    print(vi_array)
    with open(file_name,'a') as fd:
        writer = csv.writer(fd)
        writer.writerow(vi_array)
        
    

In [10]:
train_start = datetime.strptime("1957-01-01", "%Y-%m-%d")
train_end = datetime.strptime("1974-12-31", "%Y-%m-%d")
val_start = datetime.strptime("1975-01-01", "%Y-%m-%d")
val_end = datetime.strptime("1986-12-31", "%Y-%m-%d")
test_start = datetime.strptime("1987-01-01", "%Y-%m-%d")
test_end = datetime.strptime("1987-12-31", "%Y-%m-%d")
filename = "OLS_result.csv"
filename_vi = 'OLS_result_vi.csv'
for i in range(30):
    train_index = (data['DATE'] >= train_start) & (data['DATE'] <= train_end)
    val_index = (data['DATE'] >= val_start) & (data['DATE'] <= val_end)
    test_index = (data['DATE'] >= test_start) & (data['DATE'] <= test_end)
    # print(data[train_index].head())
    # print(data[train_index].tail())
    
    # train model
    x_train = data[train_index][all_names]
    y_train = data[train_index]['RET']
    x_val = data[val_index][all_names]
    y_val = data[val_index]['RET']

    ols = make_pipeline(StandardScaler(), SGDRegressor(loss='huber', alpha=0))
    ols.fit(x_train, y_train)

    # test model
    x_test = data[test_index][all_names]
    y_test = data[test_index]['RET']
    y_predict = ols.predict(x_test)
    y_predict = y_predict.reshape(-1)
    df = pd.DataFrame({'predict':y_predict, 'real':y_test})
    df.to_csv(filename, mode='a', index=False, header=False)
    
    rms = np.sum((y_predict - y_test)**2)
    denominator = np.sum(y_test**2)
    roos = 1 - rms/denominator
    print('Accuracy:', roos)

    # variable importance computing 
    # compute_variable_importance(ols, x_train, y_train, character_names, macropredictors_names, filename_vi)

    train_end = train_end + relativedelta(years=1)
    val_start = val_start + relativedelta(years=1)
    val_end = val_end + relativedelta(years=1)
    test_start = test_start + relativedelta(years=1)
    test_end = test_end + relativedelta(years=1)

Accuracy: -0.2214302863525699
Accuracy: -0.3251253715273965
Accuracy: -0.2327493977497801
Accuracy: -0.1947883676798181
Accuracy: -0.030936779530139624
Accuracy: -0.154257110377086
Accuracy: -0.08045195998205545
Accuracy: -0.34344683103397733
Accuracy: -0.003773069858996392
Accuracy: 0.0016306467928800528
Accuracy: -0.016530001335859357
Accuracy: -0.008046433051978719
Accuracy: -0.033042100139295316
Accuracy: -0.04214776605533266
Accuracy: 0.01495769267539826
Accuracy: -0.0411485995051728
Accuracy: 0.05100336687453544
Accuracy: -0.038360091340191804
Accuracy: -0.01023547622923382
Accuracy: -0.025469480238515807
Accuracy: -0.04075293533023938
Accuracy: -0.20479412503073435
Accuracy: -0.0006296894660746677
Accuracy: -0.05438968396447441
Accuracy: -0.3106250029904538
Accuracy: -0.2003748700364849
Accuracy: -0.15634831579307917
Accuracy: -0.17198906793751867
Accuracy: -0.07971515046782707
Accuracy: -0.01699468614446409
