In [8]:
## perform imports and set-up
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns

from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.interpolate import interp1d
from patsy.contrasts import Treatment

import torch
from torch.nn import Module
from torch import nn
# import torchvision
# from torchvision.datasets import mnist
from torch.nn import CrossEntropyLoss
from torch.optim import SGD, Adam
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
# import torchvision.transforms as transforms

from torch.optim.swa_utils import AveragedModel, SWALR
from torch.optim.lr_scheduler import CosineAnnealingLR

import csv
import random

from datetime import datetime
from dateutil.relativedelta import relativedelta


%matplotlib inline
plt.style.use('ggplot') # emulate pretty r-style plots

In [9]:
def read_file(file_path, nrows=None):
    """Load a CSV file, storing floating-point columns as float32.

    The first 100 rows are read once to discover which columns pandas parses
    as float64; those columns are then read as float32 in the real pass.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.
    nrows : int, optional
        Maximum number of data rows to read. ``None`` (the default) reads the
        whole file.

    Returns
    -------
    pandas.DataFrame
        The loaded table.
    """
    # Sample 100 rows of data to determine dtypes.
    df_test = pd.read_csv(file_path, nrows=100)
    float32_cols = {
        c: np.float32 for c in df_test.columns if df_test[c].dtype == "float64"
    }
    # `nrows` used to be accepted but silently ignored; it is now forwarded.
    return pd.read_csv(file_path, engine='c', dtype=float32_cols, nrows=nrows)

In [10]:
# Load the cleaned panel, discard its first column, and parse DATE as datetimes.
data = read_file('data_all_clean_rank_2.csv', 1817034)
data = data.iloc[:, 1:]
data['DATE'] = pd.to_datetime(data['DATE'])
# Alternative inputs tried earlier (kept for reference):
# data = read_file('data_manipulated.csv').iloc[:,1:]
# macropredictors = pd.read_csv("macropredictors.csv")
# data = read_file('./datashare/GKX_20201231.csv')
# macropredictors_raw = pd.read_excel('PredictorData2021.xlsx')

In [11]:
# macro-economic predictors: dp ep bm ntis tbl tms dfy svar
macropredictors_names = np.array(["d_p", "e_p", "b_m", "ntis", "tbl", "tms", "dfy", "svar"])
# Columns that are not used as characteristics: the identifier/target/other
# columns listed here plus the macro predictors above.
temp = np.concatenate((
    np.array(["permno", "DATE", "RET", "SHROUT", "sic2", "mve0", "prc"]),
    macropredictors_names,
))
# Every remaining column of `data` (sorted, unique) is treated as a characteristic.
character_names = np.setdiff1d(data.columns, temp)

In [12]:
# Sorted distinct sic2 codes and how often each one occurs.
sic2, sic2_counts = np.unique(data["sic2"], return_counts=True)
def map_sic2(x):
    """Return the position of the single code `x` within the sorted `sic2` array."""
    return np.where(sic2 == x)[0][0]
# Replace every raw code by its position in `sic2`. Because `sic2` is sorted and
# contains every value of the column, a vectorised binary search yields the same
# indices as calling map_sic2 row by row, without the per-row Python overhead.
data["sic2"] = np.searchsorted(sic2, data["sic2"].to_numpy())

In [13]:
# Quick check of how many characteristic columns and macro predictors were
# selected; the recorded output for this cell is ((94,), (8,)).
character_names.shape, macropredictors_names.shape

((94,), (8,))

In [14]:
class MyDataset(Dataset):
  """Per-sample feature builder for the return-prediction networks.

  Each item is the concatenation of
    * a one-hot encoding of the sample's sic2 index (`num_sic2` entries),
    * the characteristic vector, and
    * the Kronecker product of the characteristic vector with the macro
      vector (characteristic-major: entry ``c * n_macro + m`` equals
      ``character[c] * macro[m]``),
  paired with the target value.

  Parameters
  ----------
  sic2_x : pandas object holding integer sic2 indices in ``[0, num_sic2)``.
  character_x : pandas object with one row of characteristics per sample.
  macro_x : pandas object with one row of macro predictors per sample.
  data_y : pandas object holding the targets.
  device : accepted for interface compatibility but ignored; all tensors are
      kept on the CPU (as in the original code) and the training/evaluation
      loops move each batch to the compute device themselves.
  num_sic2 : width of the one-hot sic2 encoding (default 75).
  """

  def __init__(self,sic2_x, character_x, macro_x, data_y, device=None, num_sic2=75):
    # Storage is always on the CPU, whatever `device` was requested.
    self.device = 'cpu'
    self.x_sic2 = torch.tensor(sic2_x.values,dtype=torch.int16,device=self.device)
    self.x_character = torch.tensor(character_x.values,dtype=torch.float32,device=self.device)
    self.x_macro = torch.tensor(macro_x.values,dtype=torch.float32,device=self.device)
    self.y_train = torch.tensor(data_y.values,dtype=torch.float32,device=self.device)
    # Identity matrix whose rows are the one-hot codes. Built once here rather
    # than on every __getitem__ call.
    self.sic2_onehot = torch.eye(num_sic2, device=self.device)

  def __len__(self):
    """Number of samples."""
    return len(self.y_train)

  def __getitem__(self,idx):
    """Return ``(features, target)`` for the single sample at integer `idx`."""
    sic2_point = self.sic2_onehot[self.x_sic2[idx]]
    character_point = self.x_character[idx,]
    multi_point = torch.kron(character_point, self.x_macro[idx,])
    # torch.cat allocates a fresh tensor, so the cached one-hot rows are never
    # exposed to callers.
    train_data = torch.cat((sic2_point, character_point, multi_point), 0)
    return train_data,self.y_train[idx]

In [15]:
class WrappedBatchNorm(nn.Module):
    """BatchNorm1d over the last (feature) axis.

    In training mode the input is passed to ``nn.BatchNorm1d`` unchanged.
    In eval mode, inputs with more than two dimensions (for example
    ``(batch, k, dim)``) have their last two axes swapped so that ``dim``
    sits on BatchNorm1d's channel axis, and are swapped back afterwards.
    Two-dimensional ``(batch, dim)`` inputs already have the features on the
    channel axis, so they are normalised directly in both modes.
    """

    def __init__(self, dim):
        super().__init__()
        self.batchnorm = nn.BatchNorm1d(dim)

    def forward(self, x):
        # A 2-D input must not be transposed: doing so would put the batch
        # axis where BatchNorm1d expects the features.
        if self.training or x.dim() == 2:
            return self.batchnorm(x)
        y = self.batchnorm(x.transpose(-1, -2))
        return y.transpose(-1, -2)

In [None]:
class LeNet5(Module):
    """Fully connected network: 921 inputs, hidden widths 32-16-8-4-2, one output.

    The first three hidden layers are each preceded by a WrappedBatchNorm;
    the narrower tail layers are not normalised.
    """

    def __init__(self):
        super().__init__()
        layers = []
        # Normalised blocks: batch norm -> linear -> ReLU.
        for fan_in, fan_out in ((921, 32), (32, 16), (16, 8)):
            layers += [WrappedBatchNorm(fan_in), nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Un-normalised tail blocks: linear -> ReLU.
        for fan_in, fan_out in ((8, 4), (4, 2)):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Scalar output head.
        layers.append(nn.Linear(2, 1))
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to `x`."""
        return self.main(x)

net = LeNet5()
print(net)

In [None]:
class LeNet4(Module):
    """Fully connected network: 921 inputs, hidden widths 32-16-8-4, one output.

    The first three hidden layers are each preceded by a WrappedBatchNorm;
    the last hidden layer is not normalised.
    """

    def __init__(self):
        super().__init__()
        layers = []
        # Normalised blocks: batch norm -> linear -> ReLU.
        for fan_in, fan_out in ((921, 32), (32, 16), (16, 8)):
            layers += [WrappedBatchNorm(fan_in), nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Un-normalised tail block followed by the scalar output head.
        layers += [nn.Linear(8, 4), nn.ReLU(), nn.Linear(4, 1)]
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to `x`."""
        return self.main(x)

net = LeNet4()
print(net)

In [None]:
class LeNet3(Module):
    """Fully connected network: 921 inputs, hidden widths 32-16-8, one output.

    Every hidden layer is preceded by a WrappedBatchNorm; the output layer
    is a plain linear map.
    """

    def __init__(self):
        super().__init__()
        layers = []
        # Normalised blocks: batch norm -> linear -> ReLU.
        for fan_in, fan_out in ((921, 32), (32, 16), (16, 8)):
            layers += [WrappedBatchNorm(fan_in), nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Scalar output head.
        layers.append(nn.Linear(8, 1))
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to `x`."""
        return self.main(x)

net = LeNet3()
print(net)

In [None]:
class LeNet2(Module):
    """Fully connected network: 921 inputs, hidden widths 32-16, one output.

    Both hidden layers and the output layer are preceded by a
    WrappedBatchNorm.
    """

    def __init__(self):
        super().__init__()
        layers = []
        # Normalised hidden blocks: batch norm -> linear -> ReLU.
        for fan_in, fan_out in ((921, 32), (32, 16)):
            layers += [WrappedBatchNorm(fan_in), nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Normalised scalar output head.
        layers += [WrappedBatchNorm(16), nn.Linear(16, 1)]
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to `x`."""
        return self.main(x)


net = LeNet2()
print(net)

In [None]:
class LeNet1(Module):
    """Fully connected network: 921 inputs, one hidden layer of 32 units, one output.

    Both the hidden layer and the output layer are preceded by a
    WrappedBatchNorm.
    """

    def __init__(self):
        super().__init__()
        hidden_block = [WrappedBatchNorm(921), nn.Linear(921, 32), nn.ReLU()]
        output_block = [WrappedBatchNorm(32), nn.Linear(32, 1)]
        self.main = nn.Sequential(*hidden_block, *output_block)

    def forward(self, x):
        """Apply the layer stack to `x`."""
        return self.main(x)


net = LeNet1()
print(net)

In [17]:
# Train
def traindata(device, model, epochs, optimizer, loss_function, l1_lambda, train_loader, valid_loader):
    """Train `model` with an L1 penalty and validation-based early stopping.

    Parameters
    ----------
    device : device each batch is moved to before the forward pass.
    model : module to optimise; called on a batch of inputs.
    epochs : maximum number of passes over `train_loader`.
    optimizer : optimiser already bound to `model.parameters()`.
    loss_function : callable ``(prediction, target) -> scalar loss``.
    l1_lambda : weight of the L1 penalty, taken over all model parameters.
    train_loader : sized iterable of ``(inputs, targets)`` batches.
    valid_loader : batches passed to `validation` from epoch 5 onwards.

    Returns
    -------
    The trained model (the same object that was passed in).

    Notes
    -----
    Starting at epoch 5 the validation loss is compared with the previous
    epoch's value. An epoch that does not improve by at least 1e-3 counts as
    a strike; five consecutive strikes stop training.
    """
    # Early stopping
    last_loss = float('inf')  # no validation loss seen yet
    patience = 5
    trigger_times = 0
    n_batches = len(train_loader)
    for epoch in range(1, epochs+1):
        model.train()
        for idx, (train_x, train_label) in enumerate(train_loader):
            train_x, train_label = train_x.to(device), train_label.to(device)

            optimizer.zero_grad()
            # Drop only the trailing output axis so a single-sample batch keeps
            # its batch axis and still matches the shape of `train_label`.
            predict_y = model(train_x).squeeze(-1)
            _loss = loss_function(predict_y, train_label)

            # l1 penalty term
            l1_norm = sum(p.abs().sum() for p in model.parameters())
            _loss = _loss + l1_lambda * l1_norm

            _loss.backward()
            optimizer.step()

            # Show progress on every 10th batch and on the final batch
            # (the last index is n_batches - 1, not n_batches).
            if idx % 10 == 0 or idx == n_batches - 1:
                print('[{}/{}, {}/{}] loss: {:.8}'.format(epoch, epochs, idx, n_batches, _loss.item()))

        # Early stopping
        if epoch >= 5:
            current_loss = validation(model, device, valid_loader, loss_function)
            print('The Current Loss:', current_loss)

            if current_loss > last_loss - 1e-3:
                trigger_times += 1
                print('Trigger Times:', trigger_times)

                if trigger_times >= patience:
                    print('Early stopping!\nStart to test process.')
                    return model

            else:
                print('trigger times: 0')
                trigger_times = 0

            last_loss = current_loss

    return model


def validation(model, device, valid_loader, loss_function):
    """Return the mean per-batch loss of `model` over `valid_loader`.

    The model is switched to eval mode and gradients are disabled. Each input
    batch gets an extra axis at position 1 before the forward pass, and the
    prediction is squeezed before being compared with the targets. The result
    is the sum of the batch losses divided by ``len(valid_loader)``.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for features, target in valid_loader:
            features = features.to(device).unsqueeze(1)
            target = target.to(device)
            prediction = model(features).squeeze()
            batch_losses.append(loss_function(prediction, target).item())

    return sum(batch_losses) / len(valid_loader)



def test(device, model, test_loader, filename=None, vi=False, ensemble = 10):
    model.eval()
    y_predict_all = np.array([])
    y_test_all = np.array([])
    with torch.no_grad():
        se = 0
        denominator = 0
        for idx, (x_test, y_test) in enumerate(test_loader):
            #######################################################
            x_test, y_test = x_test.to(device).unsqueeze(1), y_test.to(device)
            #######################################################
            y_test = y_test.cpu().numpy()
            y_predict = np.zeros(y_test.shape)
            for i in range(ensemble):
                y_predict = y_predict + model(x_test).detach().squeeze().cpu().numpy()
            # print(y_predict.shape)
            y_predict = y_predict/ensemble
            y_predict_all = np.append(y_predict_all, y_predict)
            y_test_all = np.append(y_test_all, y_test)
            se = se + np.sum((y_predict - y_test)**2)
            denominator = denominator + np.sum(y_test**2)
        roos = 1 - se/denominator
        if vi:
            print(roos)
            return roos
        else:
            df = pd.DataFrame({'predict':y_predict_all, 'real':y_test_all})
            df.to_csv(filename, mode='a', index=False, header=False)
            print('Accuracy:', roos)


def test_vi(device, model, test_loader, mask, filename, ensemble = 1):
    """Compute the drop in R² caused by zeroing out inputs, one mask row at a time.

    Each row of `mask` is multiplied element-wise with every input vector, so
    a zero in the row removes that input. The R² obtained under each row is
    compared with the unmasked R² returned by ``test(..., vi=True)``.

    Parameters
    ----------
    device : device each batch is moved to.
    model : module to evaluate; it is put in eval mode.
    test_loader : iterable of ``(inputs, targets)`` batches.
    mask : 2-D tensor with one row per masking scenario and one column per
        input feature. It must be on a device compatible with the inputs.
    filename : CSV file that one row of R² reductions is appended to.
    ensemble : number of forward passes averaged per batch.

    Returns
    -------
    numpy.ndarray with one R² value per mask row.
    """
    print("Start variable importance computing!")
    model.eval()
    with torch.no_grad():
        roos_all = test(device, model, test_loader, vi=True)
        # One accumulator per mask row (previously hard-coded as 102).
        n_masks = mask.shape[0]
        se_all = np.zeros(n_masks)
        denominator = 0
        for x_test, y_test in test_loader:
            x_test, y_test = x_test.to(device), y_test.to('cpu').numpy()
            # (batch, 1, features) * (n_masks, features) -> (batch, n_masks, features)
            x_test = x_test.unsqueeze(-2) * mask
            # Predictions are held as (n_masks, batch) so that `y_test`, of
            # shape (batch,), broadcasts across the mask rows.
            y_predict = np.zeros(x_test.shape[:2]).transpose(-1, -2)
            for _ in range(ensemble):
                temp = model(x_test).detach().cpu().numpy().squeeze(-1).transpose(-1, -2)
                y_predict = y_predict + temp
            y_predict = y_predict/ensemble
            se_all = se_all + np.sum((y_predict - y_test)**2, axis=1)
            denominator = denominator + np.sum(y_test**2)
        roos = 1 - se_all/denominator
        print(roos.shape)
        print(roos_all-roos)
        print("Variable importance computing stopping!")
        # The csv module expects files opened with newline='' so that it
        # controls the line endings itself.
        with open(filename, 'a', newline='') as fd:
            writer = csv.writer(fd)
            writer.writerow(roos_all-roos)
        return roos




In [18]:
def main(model, train_index, val_index, test_index, filename_vi, filename_roos, mask, lr = 1e-2, l1_lambda = 1e-3, ):
    """Train a fresh model on one train/validation split and score the test split.

    Parameters
    ----------
    model : zero-argument callable (e.g. a network class) returning the module
        to train.
    train_index, val_index, test_index : row selectors usable with
        ``data.loc[...]`` that pick the rows of each split.
    filename_vi, mask : accepted but not used here; the variable-importance
        step that consumed them is disabled.
    filename_roos : CSV path handed to `test` for the prediction output.
    lr : learning rate for Adam.
    l1_lambda : L1 penalty weight handed to `traindata`.
    """
    # Use the first GPU when one is available.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print('Device state:', device)

    # Fixed training configuration.
    epochs, batch_size, val_batch_size = 20, 10000, 100000
    loss_function = nn.MSELoss()

    model = model().to(device)
    optimizer = Adam(model.parameters(), lr=lr)

    def build_split(rows):
        # Slice the global `data` frame into the four inputs MyDataset expects.
        return MyDataset(
            data.loc[rows, "sic2"],
            data.loc[rows, character_names],
            data.loc[rows, macropredictors_names],
            data.loc[rows, "RET"],
            device,
        )

    trainset = build_split(train_index)
    validset = build_split(val_index)
    testset = build_split(test_index)

    shared_options = dict(num_workers=0, pin_memory=True)
    trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, **shared_options)
    validloader = DataLoader(validset, batch_size=val_batch_size, shuffle=True, **shared_options)
    testloader = DataLoader(testset, batch_size=val_batch_size, shuffle=False, **shared_options)

    # Fit, then evaluate on the held-out split.
    model = traindata(device, model, epochs, optimizer, loss_function, l1_lambda, trainloader, validloader)
    test(device, model, testloader, filename_roos)

    

In [None]:
# Initial sample split. The training window starts in 1957 and grows by one
# year per iteration; the validation and test windows roll forward by one year.
train_start = datetime.strptime("1957-01-01", "%Y-%m-%d")
train_end = datetime.strptime("1974-12-31", "%Y-%m-%d")
val_start = datetime.strptime("1975-01-01", "%Y-%m-%d")
val_end = datetime.strptime("1986-12-31", "%Y-%m-%d")
test_start = datetime.strptime("1987-01-01", "%Y-%m-%d")
test_end = datetime.strptime("1987-12-31", "%Y-%m-%d")
filename_roos = "L1_nn_result_27-28.csv"
filename_vi = "L1_nn_vi.csv"

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'

# Variable-importance mask: one row per characteristic followed by one row per
# macro predictor. A row holds 0 at every input that depends on its variable
# and 1 elsewhere. The column layout must match MyDataset.__getitem__:
#   [ one-hot sic2 | characteristics | kron(characteristics, macro) ]
# where torch.kron of two vectors is characteristic-major, i.e. interaction
# column c * n_macro + m holds character[c] * macro[m]. The previous mask
# assumed macro-major blocks (and a block width of 92 instead of 94), so it
# zeroed interaction columns belonging to other variables.
n_sic2 = 75  # width of the one-hot block built by MyDataset
n_char = len(character_names)
n_macro = len(macropredictors_names)

char_keep = 1 - torch.eye(n_char)    # row c drops characteristic c
macro_keep = 1 - torch.eye(n_macro)  # row m drops macro predictor m
char_rows = torch.cat((
    torch.ones(n_char, n_sic2),
    char_keep,
    # expand "drop characteristic c" to its n_macro interaction columns
    torch.kron(char_keep, torch.ones(1, n_macro)),
), 1)
macro_rows = torch.cat((
    torch.ones(n_macro, n_sic2 + n_char),
    # repeat "drop macro m" inside every characteristic's interaction group
    torch.kron(torch.ones(1, n_char), macro_keep),
), 1)
mask = torch.cat((char_rows, macro_rows), 0).to(device)

for i in range(30):
    print("=====================", i, "===================")
    train_index = (data['DATE'] >= train_start) & (data['DATE'] <= train_end)
    val_index = (data['DATE'] >= val_start) & (data['DATE'] <= val_end)
    test_index = (data['DATE'] >= test_start) & (data['DATE'] <= test_end)

    main(LeNet1, train_index, val_index, test_index, filename_vi, filename_roos, mask)

    # Advance every boundary except train_start by one year.
    train_end = train_end + relativedelta(years=1)
    val_start = val_start + relativedelta(years=1)
    val_end = val_end + relativedelta(years=1)
    test_start = test_start + relativedelta(years=1)
    test_end = test_end + relativedelta(years=1)