In [1]:
import numpy as np
import pandas as pd
import gc
from copy import copy
from tqdm import trange, tqdm
from datetime import datetime
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import StepLR
import AssetPricing._auto_pca as ap
from AssetPricing._auto_pca import StockDataset, Auto_PCA, custom_collate, negative_correlation_loss

In [2]:
# Location of the input panel (returns + characteristics). Kept in one named
# constant so the notebook can be pointed at another copy by editing one line.
DATA_PATH = "/Users/boningzhou/kernel ipca/data/cha1.csv"

# The first CSV column becomes the index; it is treated below as the date
# label and is expected to repeat once per stock (PERMNO) on that date.
data = pd.read_csv(DATA_PATH, index_col=0)
date = sorted(pd.unique(data.index))

# Wide return table: one row per date, one column per PERMNO.
# Missing (date, PERMNO) pairs are temporarily filled with 0.
ret = data.pivot(columns='PERMNO', values='RET').fillna(0).loc[date]

# Group the rows by date once instead of scanning the whole index with
# `data.loc[key]` for every date. `get_group` always returns a DataFrame,
# so a date that has a single row no longer degrades to a Series (which
# has no `.set_index` and would raise).
rows_by_date = data.groupby(level=0, sort=False)

characteristics = dict()
for key in tqdm(date):
    # Index by stock and drop the first remaining column
    # (presumably RET - confirm against the CSV layout).
    cha = rows_by_date.get_group(key).set_index('PERMNO').iloc[:, 1:]
    # Column-wise rank transform: rank / non-null count, rescaled to (-1, 1].
    cha = 2 * cha.rank() / cha.count() - 1
    # A return of 0 on this date means "missing or exactly zero" (see the
    # fillna(0) above); blank those stocks out and drop rows left with no
    # characteristic at all.
    rt = ret.loc[key, cha.index]
    cha.loc[rt[rt == 0].index] = np.nan
    cha = cha.dropna(how='all')

    characteristics[key] = cha

# Standardise returns within each date (row): zeros are treated as missing,
# the remaining values are demeaned and scaled by the row's standard
# deviation, and missing entries are set back to 0 afterwards.
ret[ret == 0] = np.nan
ret = ret.add(-ret.mean(axis=1), axis=0).divide(ret.std(axis=1), axis=0)
ret = ret.fillna(0)



ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [3]:
# Chronological split: the first N_TRAIN_DATES entries of the sorted `date`
# list are used for training and everything after them for validation.
# The cut-off lives in one constant so the four slices below cannot drift apart.
N_TRAIN_DATES = 400

date_train, date_val = date[:N_TRAIN_DATES], date[N_TRAIN_DATES:]

# `.iloc` makes the row slice explicitly positional, matching the list slices
# above (the rows of `ret` are ordered like `date`).
ret_train = ret.iloc[:N_TRAIN_DATES]
ret_val = ret.iloc[N_TRAIN_DATES:]

# Per-date characteristic frames restricted to each side of the split.
characteristics_train = {k: characteristics[k] for k in date_train}
characteristics_val = {k: characteristics[k] for k in date_val}

In [4]:
# Settings shared by the training and validation loaders: one dataset item
# per batch (presumably one date - confirm against StockDataset), the
# project's own collate function, and no shuffling so the original order
# of the dataset is preserved.
loader_options = dict(batch_size=1, collate_fn=custom_collate, shuffle=False)

# Wrap the per-date characteristic frames and the matching return rows.
dataset_train = StockDataset(characteristics_train, ret_train)
dataset_val = StockDataset(characteristics_val, ret_val)

dataloader_train = DataLoader(dataset_train, **loader_options)
dataloader_val = DataLoader(dataset_val, **loader_options)

In [5]:
# --- Run settings -----------------------------------------------------------
num_epoch = 100
train_losses = []          # intended to hold the training-loss history
criterion = nn.MSELoss()   # MSE criterion object; the training cell uses negative_correlation_loss

# --- Model ------------------------------------------------------------------
# Sizes handed to Auto_PCA (presumably layer widths, 36 inputs down to a
# single output - confirm against Auto_PCA).
layer_list = [36, 1024, 512, 1]
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Auto_PCA(layer_list, sparsity_strength=1e-8, sparse_layer=0)
print(device)
model = model.to(device)

# --- Optimisation -----------------------------------------------------------
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
# Multiply the learning rate by 0.1 after every 10 scheduler steps.
scheduler = StepLR(optimizer=optimizer, step_size=10, gamma=0.1)

cpu


In [6]:
# Train for `num_epoch` epochs; after each epoch step the LR scheduler and
# evaluate on the validation loader.
#
# Each loader batch is a pair (features_batch, labels_batch) of sequences;
# the optimiser takes one step per (features, labels) pair inside a batch.
for epoch in range(num_epoch):

    # ---------------- training pass ----------------
    model.train()
    training_loss = 0.0
    batch_count = 0
    for features_batch, labels_batch in tqdm(dataloader_train, desc="Batches", leave=False):
        batch_losses = 0.0

        for features, labels in zip(features_batch, labels_batch):
            features = features.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            # Drop singleton dimensions so predictions and labels line up.
            output = model(features).squeeze()
            labels = labels.squeeze()
            # Training objective = correlation loss + the model's sparsity penalty.
            loss = negative_correlation_loss(output, labels) + model.sparsity_penalty()
            loss.backward()
            optimizer.step()
            batch_losses += loss.item()

        training_loss += batch_losses
        batch_count += 1

    average_batch_loss = training_loss / batch_count
    # Keep the per-epoch history in the list created in the setup cell.
    train_losses.append(average_batch_loss)
    print("Epoch: {}, Training Loss: {}".format(epoch, average_batch_loss))
    scheduler.step()

    # ---------------- validation pass ----------------
    model.eval()
    val_loss = 0.0
    # Counted separately: the validation average must be taken over the
    # validation batches, not over the number of training batches.
    val_batch_count = 0
    with torch.no_grad():
        for features_batch, labels_batch in tqdm(dataloader_val, desc="Batches", leave=False):
            batch_losses = 0.0

            for features, labels in zip(features_batch, labels_batch):
                features = features.to(device)
                labels = labels.to(device)

                output = model(features).squeeze()
                labels = labels.squeeze()
                # Validation reports the correlation loss only (no sparsity penalty).
                loss = negative_correlation_loss(output, labels)
                batch_losses += loss.item()

            val_loss += batch_losses
            val_batch_count += 1

    # An empty validation loader yields NaN rather than a misleading 0.0.
    average_val_loss = val_loss / val_batch_count if val_batch_count else float("nan")
    print("Epoch: {}, VAL Loss: {}".format(epoch, average_val_loss))



Batches:   0%|          | 0/400 [00:00<?, ?it/s]

                                                          

Epoch: 0, Training Loss: 0.001970139154372583


                                                          

Epoch: 0, VAL Loss: 0.0


                                                          

Epoch: 1, Training Loss: 0.00034695424122012


                                                          

Epoch: 1, VAL Loss: 0.0


                                                          

Epoch: 2, Training Loss: 3.3971916981909598e-12


                                                          

Epoch: 2, VAL Loss: 0.0


                                                          

Epoch: 3, Training Loss: 9.045853354850514e-13


                                                          

Epoch: 3, VAL Loss: 0.0


                                                          

Epoch: 4, Training Loss: 5.122359336266644e-13


                                                          

Epoch: 4, VAL Loss: 0.0


                                                          

Epoch: 5, Training Loss: 3.545898390328768e-13


                                                          

Epoch: 5, VAL Loss: 0.0


                                                          

Epoch: 6, Training Loss: 2.5235001771629655e-13


                                                          

Epoch: 6, VAL Loss: 0.0


                                                          

KeyboardInterrupt: 