In [None]:
import pandas as pd
import numpy as np
import os
import os.path as op
import torch.nn as nn
from functools import partial

# Image dimensions in pixels, looked up by the integer key used in the file
# names below (e.g. 20 for the "20d" images).
# NOTE(review): the key is presumably the look-back window in days - confirm.
_WINDOW_KEYS = (5, 20, 60)
IMAGE_WIDTH = dict(zip(_WINDOW_KEYS, (15, 60, 180)))
IMAGE_HEIGHT = dict(zip(_WINDOW_KEYS, (32, 64, 96)))

# Construct CNN

In [None]:
class CNN(nn.Module):
    """Three convolutional blocks followed by dropout and a one-output linear head.

    Each block is Conv2d -> (optional) BatchNorm2d -> activation -> MaxPool2d and
    the channel count doubles from block to block. Only the first convolution
    uses the ``stride`` and ``dilation`` arguments; the other two use PyTorch's
    defaults.

    Args:
        num_class: Stored on the instance only; no layer depends on it (the
            head always emits one value per sample).
        initial_filter: Number of output channels of the first convolution.
        filter_sizes: (height, width) kernel shared by the three convolutions.
        maxpool_sizes: (height, width) kernel *and* stride of every max-pool.
        dilation: (height, width) dilation of the first convolution.
        stride: (height, width) stride of the first convolution.
        p: Dropout probability applied to the flattened features.
        BN: If true, ``forward`` applies the batch-norm layers.
        Activation: ``'LRELU'`` selects LeakyReLU(0.01); any other value ReLU.
        input_size: (height, width) of the single-channel input image. It is
            used only to size the linear layer; the default (64, 60) together
            with the other defaults yields the 46080 features that used to be
            hard-coded.

    Raises:
        ValueError: If ``input_size`` is too small to survive the three blocks.
    """

    def __init__(self,
                 num_class=2,
                 initial_filter=64,
                 filter_sizes=(5, 3),
                 maxpool_sizes=(2, 1),
                 dilation=(2, 1),
                 stride=(2, 1),
                 p=0.5,
                 BN=True,
                 Activation='LRELU',
                 input_size=(64, 60)
                 ):

        super(CNN, self).__init__()

        # Padding is derived from the kernel size alone (dilation is ignored).
        padding = (
            (filter_sizes[0] - 1) // 2,
            (filter_sizes[1] - 1) // 2
        )

        # bool: whether forward() applies batch normalisation
        self.BN = BN
        # str: activation selector
        self.Activation = Activation
        # int: kept for reference; not used by any layer
        self.num_class = num_class
        # float: dropout probability
        self.p = p

        # The attribute is called `lrelu` even when it holds a plain ReLU.
        if self.Activation == 'LRELU':
            self.lrelu = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        else:
            self.lrelu = nn.ReLU(inplace=True)

        # Layers are created in the same order as before so that a given torch
        # seed produces the same initial weights.
        self.conv1 = nn.Conv2d(in_channels=1,
                               out_channels=initial_filter,
                               kernel_size=filter_sizes,
                               padding=padding,
                               stride=stride,
                               dilation=dilation)
        self.bn1 = nn.BatchNorm2d(initial_filter)
        self.pool1 = nn.MaxPool2d(kernel_size=maxpool_sizes,
                                  stride=maxpool_sizes)

        self.conv2 = nn.Conv2d(in_channels=initial_filter,
                               out_channels=initial_filter * 2,
                               kernel_size=filter_sizes,
                               padding=padding)
        self.bn2 = nn.BatchNorm2d(initial_filter * 2)
        self.pool2 = nn.MaxPool2d(kernel_size=maxpool_sizes,
                                  stride=maxpool_sizes)

        self.conv3 = nn.Conv2d(in_channels=initial_filter * 2,
                               out_channels=initial_filter * 4,
                               kernel_size=filter_sizes,
                               padding=padding)
        self.bn3 = nn.BatchNorm2d(initial_filter * 4)
        self.pool3 = nn.MaxPool2d(kernel_size=maxpool_sizes,
                                  stride=maxpool_sizes)

        # Size the head from the actual geometry instead of a magic number.
        spatial = []
        for axis in range(2):
            length = self._out_len(input_size[axis], filter_sizes[axis],
                                   padding[axis], stride[axis], dilation[axis])
            length = self._out_len(length, maxpool_sizes[axis],
                                   stride=maxpool_sizes[axis])
            for _ in range(2):  # blocks 2 and 3: stride 1, dilation 1
                length = self._out_len(length, filter_sizes[axis], padding[axis])
                length = self._out_len(length, maxpool_sizes[axis],
                                       stride=maxpool_sizes[axis])
            spatial.append(length)
        if min(spatial) <= 0:
            raise ValueError(
                'input_size {} is too small for this architecture'.format(
                    tuple(input_size)))

        self.fc = nn.Linear(initial_filter * 4 * spatial[0] * spatial[1], 1)

        self.dropout = nn.Dropout(p=self.p)

        # Xavier initialisation of all learnable weight matrices
        nn.init.xavier_uniform_(self.conv1.weight)
        nn.init.xavier_uniform_(self.conv2.weight)
        nn.init.xavier_uniform_(self.conv3.weight)
        nn.init.xavier_uniform_(self.fc.weight)

    @staticmethod
    def _out_len(length, kernel, padding=0, stride=1, dilation=1):
        """Output length along one axis of a Conv2d / MaxPool2d (floor mode)."""
        return (length + 2 * padding - dilation * (kernel - 1) - 1) // stride + 1

    def _block(self, x, conv, bn, pool):
        """Run one conv -> (batch norm) -> activation -> max-pool block."""
        x = conv(x)
        if self.BN:
            x = bn(x)
        x = self.lrelu(x)
        return pool(x)

    def forward(self, x):
        """Map a batch of images (N, 1, H, W) to a tensor of shape (N, 1).

        No softmax/sigmoid is applied: the raw linear output is returned.
        """
        x = self._block(x, self.conv1, self.bn1, self.pool1)
        x = self._block(x, self.conv2, self.bn2, self.pool2)
        x = self._block(x, self.conv3, self.bn3, self.pool3)

        # Flatten everything except the batch axis.
        x = x.view(x.size(0), -1)

        # Dropout is applied to the input of the linear layer, not its output.
        x = self.dropout(x)
        x = self.fc(x)

        return x

In [None]:
# model_name is used in every checkpoint path and as the HDF key for all saved
# results, so changing it writes the outputs to a different location.
model_name ='regression_adjusted_return_normalized'
# Column of the label table that is used as the regression target.
target_label ='adj_return_normalized' 
# Other model_name values that have been used:
#'regression_return_normalized','regression_sharpe_normalized','regression_adjusted_return_normalized'
# Matching target_label values:
#'return_pred_normalized','sharpe_pred_normalized', 'adj_return_normalized'

model = CNN()
print(model)


import os
import torch

# Expose only the first physical GPU to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

use_gpu = torch.cuda.is_available()

# With a single visible GPU the only valid CUDA index is 0 (the previous
# "cuda:2" could never exist). `device` is now also defined on CPU-only machines.
device = torch.device("cuda:0" if use_gpu else "cpu")

if use_gpu:
    print('GPU')
    model = model.cuda()


In [None]:
# Show basic information of model
# Install a pip package in the current Jupyter kernel
#import sys
#!{sys.executable} -m pip install torchsummary

from torchsummary import summary

# Summarise the network for one single-channel image of the 20-key size
# (IMAGE_HEIGHT[20] x IMAGE_WIDTH[20] = 64 x 60).
summary(model, input_size=(1, IMAGE_HEIGHT[20], IMAGE_WIDTH[20]))

# Training hyperparameters

In [None]:

# Learning rate handed to the optimizer.
lr = 1e-5
# Mini-batch size for the training loader and for the validation/test loaders.
BATCH_SIZE = dict(train=128, test=128)
# Only relevant to the SGD optimizer, which is currently commented out.
momentum = 0.9
# Upper bound on the number of epochs per seed.
num_epochs = 50
# Early stopping: number of consecutive non-improving epochs tolerated.
patience = 2

# DataLoader options.
# pin_memory: use page-locked host memory so CPU -> GPU copies are faster.
# num_workers: number of sub-processes used to load batches.
pin_memory = True
num_workers = 2



# Load Data

In [None]:
# Label table and a second table read from length_Train.h5 (not used further
# in this notebook).
Label_Raw = pd.read_hdf('./Label_Raw_Addtional_Predictors.h5',key='data')
length_Train=pd.read_hdf('./length_Train.h5',key='data')

#Now all images are saved in 1993 file
year = 1993
# The image file is only ever read, so map it read-only: this cannot modify
# the data on disk and does not need write permission on the file.
Images = np.memmap(
            op.join("./img_data/monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_images.dat"), 
            dtype=np.uint8, mode='r').reshape(
                                        (-1, IMAGE_HEIGHT[20], IMAGE_WIDTH[20]))
        
# Replace the stored index by 0..n-1; these positions are later used as image ids.
Label_Raw = Label_Raw.reset_index().drop('index',axis=1) 
Label_Raw


In [None]:
# Number of images in the memory-mapped training array (length of its first axis).
len(Images)

In [None]:
# Regression target: take the `target_label` column and discard every row whose
# value is exactly 0 or missing.
_target_column = Label_Raw[target_label]
raw_labels = _target_column.where(_target_column != 0).dropna()

# Two-column frame in the layout CNNDataset expects:
# column 0 = row position in the label table ('img_name'), column 1 = target ('label').
annotations = (
    raw_labels
    .reset_index()
    .rename(columns={'index': 'img_name', target_label: 'label'})
)
annotations

In [None]:
import random
import torch

# Seed every random number generator the notebook can draw from.
# random.seed alone does not make the train/validation split reproducible:
# torch.utils.data.random_split and DataLoader shuffling use torch's generator.
SPLIT_SEED = 10
random.seed(SPLIT_SEED)
np.random.seed(SPLIT_SEED)
torch.manual_seed(SPLIT_SEED)

In [None]:
#Use Pytorch's DataLoader

In [None]:
from torch.utils.data import Dataset
import pandas as pd
import os
from PIL import Image
import torch

class CNNDataset(Dataset):
    """Dataset that pairs rows of an image array with their target values.

    Args:
        Images_map: Array indexed by image id along its first axis.
        annotations: DataFrame whose first column holds the image id and whose
            second column holds the target value.
        transform: Optional callable applied to the image tensor.
    """

    def __init__(self, Images_map ,annotations, transform=None):
        self.Images = Images_map
        self.transform = transform
        self.annotations = annotations

    def __len__(self):
        """Number of annotated samples."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(image, target)`` for the ``index``-th annotation row.

        Both tensors are float32. The image keeps a leading axis of length 1
        because it is selected with a one-element list.
        """
        # Read the two cells separately: taking the whole row would upcast the
        # integer image id to the row's common dtype.
        image_id = self.annotations.iloc[index, 0]
        target = self.annotations.iloc[index, 1]

        img = torch.from_numpy(self.Images[[image_id]]).float()
        y_label = torch.tensor(target).float()

        if self.transform is None:
            return (img, y_label)
        return (self.transform(img), y_label)

In [None]:
from torch.utils.data import DataLoader

# Wrap images + annotations, then split them 70% / 30% at random into a
# training set and a validation set.
dataset = CNNDataset(Images,annotations)
train_size = int(len(annotations) * 0.7)
val_size = len(annotations) - train_size
train_set, validation_set = torch.utils.data.random_split(
    dataset, [train_size, val_size])

# Only the training loader shuffles. The validation loader is stored under the
# key 'test' because the training loop calls its evaluation phase 'test'.
_loader_options = dict(num_workers=num_workers, pin_memory=pin_memory)
trainloader = DataLoader(dataset=train_set,
                         batch_size=BATCH_SIZE['train'],
                         shuffle=True,
                         **_loader_options)
valloader = DataLoader(dataset=validation_set,
                       batch_size=BATCH_SIZE['test'],
                       shuffle=False,
                       **_loader_options)

loaders = {'train': trainloader, 'test': valloader}

In [None]:
# Draw one mini-batch from the training loader and display its first ten images.
dataiter = iter(trainloader)
# Use the builtin next(): `.next()` is not part of the Python 3 iterator
# protocol and is not available on DataLoader iterators in recent PyTorch.
images, labels = next(dataiter)

from matplotlib.pyplot import imshow
from matplotlib.pyplot import show
for img in images[:10]:
    # img[0] drops the leading axis of length 1 so imshow receives a 2-D array.
    imshow(img[0])
    show()

# Training

In [None]:
import os
import re
import time
import scipy.stats

class AverageMeter(object):
    """Running weighted average of a stream of values.

    Attributes:
        val: most recent value passed to ``update``.
        sum: weighted sum of all values seen since the last ``reset``.
        count: total weight seen since the last ``reset``.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything seen so far."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count





In [None]:
# Loss function and optimizer for the model created above.
import torch.optim as optim

# Mean squared error between the network output and the regression target.
criterion = nn.MSELoss()

# Adam with learning rate `lr`; the SGD alternative is kept for reference.
#optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
optimizer = optim.Adam(params=model.parameters(), lr=lr)



In [None]:
#Train 5 models with different random seeds
for seed in range(0,5):
    # Per-seed log of losses and errors; it is stored inside every checkpoint.
    log = {
        'num_params': [],
        'train_loss': [],
        'train_error': [],
        'test_loss': [],
        'test_error': []
    }
    log_saver = log  # alias: both names refer to the same dict
    num_epochs = 50

    print('seed '+str(seed))
    torch.manual_seed(seed)

    name = model_name+'_'+str(seed)
    checkpoint_dir = './checkpoint_CNN/'+model_name
    # makedirs also creates the parent './checkpoint_CNN' if it is missing and
    # guarantees the directory exists before the final save below.
    os.makedirs(checkpoint_dir, exist_ok=True)

    since = time.time()
    steps = 0
    # Early-stopping state. It must be reset for every seed; infinity makes the
    # first epoch always count as an improvement.
    last_loss = float('inf')
    trigger_times = 0

    model = CNN()
    # Only GPU 0 is made visible earlier in the notebook, so index 0 is the
    # valid CUDA device.
    device = torch.device("cuda:0" if use_gpu else "cpu")
    if use_gpu:
        print('GPU')
        model = model.cuda()

    use_tpu=False
    if use_tpu:
      print('TPU')
      # Places network on the default TPU core
      model = model.to(device)

    #optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    optimizer = optim.Adam(model.parameters(), lr=lr)  #Adam
    number_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    log['num_params'].append(number_params)

    print(f'total parameters: {number_params}')

    #iterating over epochs
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))
        print('-' * 10)

        # 'test' here is the validation loader stored in loaders['test'].
        for phase in ['train', 'test']:

            loss_meter = AverageMeter()
            mean_meter = AverageMeter()
            mean_sq_meter = AverageMeter()

            is_train = phase == 'train'
            model.train(is_train)

            for i, data in enumerate(loaders[phase]):
                inputs, labels = data
                # scale the pixel values by 1/255
                inputs /= 255

                if use_gpu:
                    inputs = inputs.cuda()
                    labels = labels.cuda()

                optimizer.zero_grad()

                # Build the autograd graph only while training; evaluation
                # batches then need far less memory.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    loss = criterion(outputs.view(-1),
                                     labels)

                # Diagnostic: distribution of the outputs every 100 steps.
                if steps <10000 and steps % 100 ==0 :
                    display(
                        pd.Series( outputs.view(-1).detach().cpu().numpy()).describe()
                        )

                if is_train:
                    loss.backward()
                    optimizer.step()
                    steps += 1

                N = outputs.size(0)

                loss_meter.update(loss.data.item(), N)

                #metrics to compute R-squared
                mean_meter.update(labels.mean().item(),N)
                mean_sq_meter.update((labels**2).mean().item(),N)

            epoch_loss = loss_meter.avg
            # Variance of the labels: E[y^2] - (E[y])^2. The mean has to be
            # squared, otherwise the denominator is not a variance.
            mean_tss = mean_sq_meter.avg - mean_meter.avg ** 2
            # MSE / Var(y); one minus this value is used as R-squared later on.
            epoch_error = epoch_loss / mean_tss

            if phase == 'train':
                log_saver['train_loss'].append(epoch_loss)
                log_saver['train_error'].append(epoch_error)

            elif phase == 'test':

                log_saver['test_loss'].append(epoch_loss)
                log_saver['test_error'].append(epoch_error)

            print(
                    f'{phase} loss: {epoch_loss:.4f}; error: {epoch_error:.4f}'
            )
            print('sample count total:')
            print(i)
            print(loss_meter.count)

        if epoch % 100 == 0 or epoch == num_epochs - 1:
            print('Saving..')
            # The whole model object is stored (not just its state_dict),
            # because the evaluation cells read it back via state['net'].
            state = {'net': model, 'epoch': epoch, 'log': log_saver}
            torch.save(state,
                       checkpoint_dir+'/'+name+'.t7')

        #Early Stopping
        # epoch_loss still holds the value of the last phase, i.e. the
        # validation ('test') loss of this epoch.
        print('Cycle Completed for: '+phase+', total samples '+str(loss_meter.count))
        current_loss = epoch_loss
        if current_loss >= last_loss:
            trigger_times += 1
            print('Trigger Times:', trigger_times)
            if trigger_times >= patience:
                print('Early stopping!\nStart to test process.')
                break
        else:
            print('trigger times: 0')
            trigger_times = 0
        last_loss = current_loss

    time_elapsed = time.time() - since
    print(
        f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s'
    )

    #save final model:
    state = {'net': model, 'epoch': epoch, 'log': log_saver}
    torch.save(state,
               checkpoint_dir+'/'+name+'_final.t7')


In [None]:
# Plot the results (for the last model in the loop):

In [None]:

import matplotlib.pyplot as plt
import numpy as np
def plot(log, result_dir):
    """Plot training/test loss and error curves and save them as PNG files.

    Args:
        log: Mapping with the lists 'train_loss', 'test_loss', 'train_error'
            and 'test_error' (one entry per epoch).
        result_dir: Directory that receives 'loss.png' and 'error.png'. It is
            created, including missing parents, if it does not exist.
    """
    fontdict = {'size': 30}

    def get_fig(i, title):
        # `title` is currently not drawn (the plt.title call is disabled).
        fig = plt.figure(i, figsize=(20, 10))
        ax = fig.add_subplot(111)
        #plt.title(title, fontsize=30, y=1.04)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        return fig, ax

    fig1, ax1 = get_fig(1, 'Loss on Cifar10')
    fig2, ax2 = get_fig(2, 'Error on Cifar10')

    ax1.plot(log['train_loss'],'b', linewidth=3, label='training')
    ax1.plot(log['test_loss'],'r', linewidth=3, label='test')
    ax1.set_ylabel('loss',fontdict=fontdict)
    ax2.plot(log['train_error'],'b', linewidth=3, label='training')
    ax2.plot(log['test_error'],'r', linewidth=3, label='test')
    ax2.set_ylabel('error',fontdict=fontdict)

    for ax in [ax1, ax2]:
        ax.set_xlabel('Number of epochs', fontdict=fontdict)
        ax.legend(loc='upper right', fontsize=20)

    # makedirs copes with nested paths; os.path.join works whether or not
    # result_dir ends with a path separator.
    os.makedirs(result_dir, exist_ok=True)
    fig1.savefig(os.path.join(result_dir, 'loss.png'))
    fig2.savefig(os.path.join(result_dir, 'error.png'))


# `log` is the log of the last seed trained in the loop above.
plot(log, './alex_results/')
plt.show()

# Evaluating the results

# Load Test Data

In [None]:
# Label table of the test sample and a second table read from length_Test.h5
# (not used further in this notebook).
Label_Test=pd.read_hdf('./Label_Test_Addtional_Predictors.h5',key='data')

length_Test=pd.read_hdf('./length_Test.h5',key='data')   

year = 2000
# One image per label row. The file is only read, so it is mapped read-only:
# this cannot modify the data on disk and needs no write permission.
Images_Test = np.memmap(
                    op.join("./img_data/monthly_20d", f"20d_month_has_vb_[20]_ma_{year}_images.dat"), 
                    dtype=np.uint8, mode='r',order='C',
                    shape=(len(Label_Test), IMAGE_HEIGHT[20], IMAGE_WIDTH[20]))
# Replace the stored index by 0..n-1; these positions are later used as image ids.
Label_Test = Label_Test.reset_index().drop('index',axis=1) 
Label_Test.tail()

In [None]:
# (rows, columns) of the test label table; the row count is also the number of
# images mapped in Images_Test.
Label_Test.shape

In [None]:
# (number of images, image height, image width) of the memory-mapped test images.
Images_Test.shape

In [None]:
# Test targets: same filtering as for training - drop rows whose target is
# exactly 0 or missing.
_test_target = Label_Test[target_label]
raw_labels = _test_target.where(_test_target != 0).dropna()

# Two-column frame for CNNDataset: 'img_name' (row position) and 'label'.
annotations_test = (
    raw_labels
    .reset_index()
    .rename(columns={'index': 'img_name', target_label: 'label'})
)
annotations_test

In [None]:
# Test-set dataset and loader.
from torch.utils.data import DataLoader

dataset_test = CNNDataset(Images_Test,annotations_test)

# No shuffling: the predictions collected later must stay in the row order of
# annotations_test.
testloader = DataLoader(
    dataset=dataset_test,
    batch_size=BATCH_SIZE['test'],
    shuffle=False,
    num_workers=num_workers,
    pin_memory=pin_memory,
)



In [None]:
# Draw the first mini-batch from the test loader and display its first ten images.
dataiter = iter(testloader)
# Use the builtin next(): `.next()` is not part of the Python 3 iterator
# protocol and is not available on DataLoader iterators in recent PyTorch.
images, labels = next(dataiter)

from matplotlib.pyplot import imshow
from matplotlib.pyplot import show
for img in images[:10]:
    # img[0] drops the leading axis of length 1 so imshow receives a 2-D array.
    imshow(img[0])
    show()

# Generate Predictions

In [None]:
import torch

# Per-seed test loss and test error (MSE / Var(y)). The explicit dtype avoids
# relying on the version-dependent default dtype of an empty Series.
epoch_loss = pd.Series(dtype='float64')
epoch_accuracy = pd.Series(dtype='float64')
# seed -> Series of predictions in the row order of annotations_test
PREDICTIONS = {}


for seed in range(0,5):

    print('seed '+str(seed))
    torch.manual_seed(seed)
    name = model_name+'_'+str(seed)

    # NOTE(review): the checkpoint holds a pickled model object, and unpickling
    # can execute arbitrary code - only load checkpoints you created yourself.
    state = torch.load(
        './checkpoint_CNN/'+model_name+'/'+name+'_final.t7')

    model = state['net']

    number_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'total parameters: {number_params}')

    if use_gpu:
        model = model.cuda()

    loss_meter = AverageMeter()
    mean_meter = AverageMeter()
    mean_sq_meter = AverageMeter()

    model.eval()
    Predictions = []

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for i, data in enumerate(testloader):
            inputs, labels = data

            # scale the pixel values by 1/255, as during training
            inputs /= 255

            if use_gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()

            outputs = model(inputs)

            # .cpu() is a no-op for tensors that already live on the CPU.
            Predictions += list(outputs.detach().cpu().view(-1).numpy())

            loss = criterion(outputs.view(-1),
                             labels)

            N = outputs.size(0)

            loss_meter.update(loss.data.item(), N)

            #metric used to compute r-squared
            mean_meter.update(labels.mean().item(),N)
            mean_sq_meter.update((labels**2).mean().item(),N)
            del inputs, labels, outputs

    epoch_loss.loc[seed] = loss_meter.avg
    # Variance of the labels: E[y^2] - (E[y])^2 (the mean has to be squared).
    mean_tss = mean_sq_meter.avg - mean_meter.avg ** 2
    epoch_accuracy.loc[seed] = epoch_loss.loc[seed] / mean_tss
    PREDICTIONS[seed] = pd.Series(Predictions)


In [None]:
# One column of predictions per seed, plus a 'mean' column holding their
# row-wise average.
Predictions = pd.concat(PREDICTIONS, axis=1).assign(
    mean=lambda per_seed: per_seed.mean(axis=1)
)

In [None]:
# Sanity check: there must be exactly one prediction per annotated test sample,
# otherwise the index assignment in the next cell would misalign them.
assert len(Predictions) == len(annotations_test), "Unequal dimensions"

In [None]:
# Give the predictions the same index as annotations_test so the column
# assignment below aligns row by row.
Predictions.index=annotations_test.index

# Attach the seed-averaged prediction to each annotation, then map it back to
# the full label table through 'img_name' (the row position in Label_Test).
# Rows that were filtered out of annotations_test receive NaN.
annotations_test['prediction'] = Predictions['mean']
Label_Test['prediction']=annotations_test.set_index('img_name')['prediction']
# Number of label rows that received a prediction.
Label_Test['prediction'].count()

In [None]:
#Store results

os.makedirs('./output_summary/', exist_ok=True)

# Summary accumulates one row per model_name across runs of this notebook.
try:
    Summary=pd.read_hdf('./output_summary/Test Results.h5',key='df')
except (FileNotFoundError, KeyError):
    # First run: the file, or the 'df' key inside it, does not exist yet.
    # Any other failure (e.g. a corrupt file) is raised instead of silently
    # replacing the accumulated summary with an empty table.
    Summary = pd.DataFrame(columns=['test loss','test accuracy'])


# Averages over the seeds. 'test accuracy' is the mean of epoch_accuracy,
# i.e. MSE / Var(y), not a classification accuracy.
test_results = pd.Series({
            'test loss':epoch_loss.mean(),
            'test accuracy':epoch_accuracy.mean()})
Summary.loc[model_name] = test_results
Summary.to_hdf('./output_summary/Test Results.h5',key='df')


# Per-model outputs, each stored under the key `model_name`.
test_results.to_hdf('./output_summary/Test Results.h5',key=model_name)
epoch_loss.to_hdf('./output_summary/epoch_loss.h5',key=model_name)
epoch_accuracy.to_hdf('./output_summary/epoch_accuracy.h5',key=model_name)
Predictions.to_hdf('./output_summary/Predictions by seed.h5',key=model_name)
Label_Test.to_hdf('./output_summary/Predictions.h5',key=model_name)

In [None]:
# Reload the label table with predictions that was saved under `model_name`,
# so the portfolio cells below can run without repeating the inference.
Label_Test=pd.read_hdf('./output_summary/Predictions.h5',key=model_name)
Label_Test

# Portfolio Construction

In [None]:
# Columns of Label_Test used below: the model forecast and the return it is
# evaluated against.
pred_field = 'prediction'
return_field = 'Ret_month'

In [None]:
# Pivot each field into a panel with one row per Date and one column per StockID.
_by_date_stock = Label_Test.set_index(['Date','StockID'])
y_pred = _by_date_stock[pred_field].unstack()
return_next_per = _by_date_stock[return_field].unstack()
marketcap = _by_date_stock['MarketCap'].unstack()
vol = _by_date_stock['EWMA_vol'].unstack()

In [None]:
# Sort the stocks on every date into ten groups by model forecast
# (0 = lowest forecasts, 9 = highest) to monitor the model's performance.

# Cross-sectional rank per date and the largest rank on that date.
y_rank = y_pred.rank(axis=1)
y_count = y_rank.max(axis=1)

# Scale the ranks to (0, 10); subtracting 0.1 / count keeps the top-ranked
# stock just below 10 so that floor() puts it in group 9.
_rank_share = y_rank.div(y_count, axis=0)
_top_offset = 0.1 / y_count
y_normalize = _rank_share.sub(_top_offset, axis=0) * 10
decile_portfolio = np.floor(y_normalize)

In [None]:
# Returns of the ten decile portfolios under two weighting schemes:
# equal-weighted (EW) and market-capitalisation-weighted (VW).

Portfolio_Returns = {}
EW, VW = {}, {}

# Weights of the bottom (1) and top (10) decile, needed for the long-short book.
EW_w, VW_w = {}, {}

Count = {}
for decile in range(10):
    label = decile + 1
    members = (decile_portfolio == decile)

    # number of stocks in this decile on every date
    Count[decile] = members.sum(axis=1)

    # equal weight: plain average of the member returns
    EW[label] = return_next_per.where(members).mean(axis=1)

    # value weight: member market caps normalised to sum to one per date
    cap_weights = marketcap.where(members)
    cap_weights = cap_weights.div(cap_weights.sum(axis=1), axis=0)
    VW[label] = (cap_weights * return_next_per).sum(axis=1)

    if decile in (0, 9):
        EW_w[label] = ((members * 1).div(members.sum(axis=1), axis=0)).fillna(0)
        VW_w[label] = cap_weights.fillna(0)

# Long-short weights: long the top decile, short the bottom decile.
EW_w = EW_w[10] - EW_w[1]
VW_w = VW_w[10] - VW_w[1]

In [None]:
# Volatility-scaled portfolio (an experiment, not meant to be reported):
# stocks differ in volatility, so hold less of a stock the more volatile it is.

# Centre the rank score around zero (y_normalize / 10 - 0.5) and divide it by
# the stock's volatility.
y_vol_scale = (y_normalize / 10 - 0.5) / vol

# Rescale each date so the gross exposure (sum of absolute weights) equals
# that of the equal-weighted long-short portfolio.
_target_gross = EW_w.abs().sum(axis=1)
_raw_gross = y_vol_scale.abs().sum(axis=1)
mult = _target_gross / _raw_gross
W_vol_scale = y_vol_scale.mul(mult, axis=0)
Port_vol_scale = (W_vol_scale * return_next_per).sum(axis=1)


In [None]:


# Gather the return series of all three schemes in one dictionary.
Portfolio_Returns.update({
    'EW': EW,
    'VW': VW,
    'Vol Controlled': Port_vol_scale,
})

In [None]:
# Decile returns: one column per decile, stored under the key `model_name`.
for _scheme, _path in (
        ('EW', './output_summary/Portfolio_Returns_EW.h5'),
        ('VW', './output_summary/Portfolio_Returns_VW.h5'),
):
    pd.concat(Portfolio_Returns[_scheme], axis=1).to_hdf(_path, key=model_name)

# Return series of the volatility-scaled portfolio.
Portfolio_Returns['Vol Controlled'].to_hdf(
    './output_summary/Portfolio_Returns_Vol Controlled.h5', key=model_name)

# Long-short weights of the equal- and value-weighted portfolios.
EW_w.to_hdf('./output_summary/Equal_Weight_Portfolio.h5', key=model_name)
VW_w.to_hdf('./output_summary/Value_Weight_Portfolio.h5', key=model_name)


# Compute Performance of This Single Strategy 

In [None]:
from scipy.stats import t


def alpha_t(HL):
    """One-sided p-value of a t-test that the mean of ``HL`` is positive.

    Args:
        HL: Series of returns (here the high-minus-low portfolio).

    Returns:
        P(T > t_stat) for a Student-t variable with ``len(HL) - 1`` degrees of
        freedom, where ``t_stat = mean / (std / sqrt(n))``.
    """
    n_obs = len(HL)
    t_stat = HL.mean() / (HL.std() / np.sqrt(n_obs))
    # Survival function instead of 1 - cdf: for large t-statistics the cdf
    # rounds to 1.0 and the difference would collapse to exactly 0.
    return t.sf(t_stat, n_obs - 1)

In [None]:
def _decile_stats(ret):
    """Annualised mean return and Sharpe ratio per decile plus an 'H-L' row.

    Returns the statistics table and the high-minus-low (10 - 1) return series.
    """
    high_minus_low = ret[10] - ret[1]
    table = pd.concat({
        'Ret': ret.mean()*12,
        'SR': ret.mean()/ret.std()*(12**0.5)
    }, axis=1)
    table.loc['H-L'] = pd.Series({
        'Ret': high_minus_low.mean()*12,
        'SR': high_minus_low.mean()/high_minus_low.std()*(12**0.5)
    })
    return table, high_minus_low


def _turnover(weights, returns):
    """Average one-way turnover of a weight panel.

    The previous period's weights are drifted by that period's returns and
    renormalised; turnover is half the summed absolute difference to the new
    weights, averaged over dates.
    """
    lagged_weights = weights.shift()
    lagged_returns = returns.shift()
    drifted = lagged_weights * (1 + lagged_returns)
    scaling = (lagged_weights * lagged_returns).sum(axis=1) + 1
    drifted = drifted.div(scaling, axis=0)
    return (weights - drifted).abs().sum(axis=1).mean() / 2


# One-sided p-values (from alpha_t) for the mean of each high-minus-low return
# series. The explicit dtype avoids relying on the version-dependent default
# dtype of an empty Series.
rejection = pd.Series(dtype='float64')

#equal weighted portfolio
ret = pd.concat(Portfolio_Returns['EW'],axis=1)
Dec, HL = _decile_stats(ret)
rejection.loc['Equal-Weight'] = alpha_t(HL)
TO = _turnover(EW_w, return_next_per)

#value weighted portfolio
ret = pd.concat(Portfolio_Returns['VW'],axis=1)
Dec2, HL = _decile_stats(ret)
rejection.loc['Value-Weight'] = alpha_t(HL)
TO2 = _turnover(VW_w, return_next_per)

pd.concat({'Equal-Weight':Dec,
          'Value-weight':Dec2},
          axis=1).to_hdf('./output_summary/Portfolio Stats.h5',key=model_name)


In [None]:
# Save the p-values of the equal- and value-weighted high-minus-low portfolios
# under the key `model_name`.
rejection.to_hdf('./output_summary/rejection.h5',key=model_name)

In [None]:
# Turnover of the volatility-scaled portfolio: drift last period's weights by
# last period's returns, renormalise, and compare with the new weights.
_lagged_weights = W_vol_scale.shift()
_lagged_returns = return_next_per.shift()
portfolio_beginning = _lagged_weights * (1 + _lagged_returns)
scaling_factor = (_lagged_weights * _lagged_returns).sum(axis=1) + 1
portfolio_beginning = portfolio_beginning.div(scaling_factor, axis=0)

# Half the summed absolute weight change, averaged over dates.
TO3 = (W_vol_scale - portfolio_beginning).abs().sum(axis=1).mean() / 2

# Save the turnover of all three schemes together.
_turnover_by_scheme = pd.Series({
    'Equal-Weight': TO,
    'Value-weight': TO2,
    'Vol-Scale': TO3,
})
_turnover_by_scheme.to_hdf('./output_summary/Turnover.h5', key=model_name)

# Factor Loading

In [None]:
import datetime

def _read_monthly_factor_csv(csv_path):
    """Read a CSV whose first column holds YYYYMM integers.

    The index is replaced by datetimes on day 28 of the encoded year and month.
    """
    frame = pd.read_csv(csv_path, index_col=[0])
    frame.index = [
        datetime.datetime(int(np.floor(yyyymm / 100)), int(yyyymm % 100), 28)
        for yyyymm in frame.index
    ]
    return frame


mom = _read_monthly_factor_csv('./F-F_Momentum_Factor.csv')
reversal = _read_monthly_factor_csv('./F-F_ST_Reversal_Factor.csv')
ff5 = _read_monthly_factor_csv('./F-F_Research_Data_5_Factors_2x3.csv')

# Add the two extra factors as columns (aligned on the date index).
ff5['Momentum'] = mom
ff5['Reversal'] = reversal

# Divide every column by 100.
# NOTE(review): presumably the files quote returns in percent - confirm.
ff5 /= 100
# Move each observation to the last business day of its month and drop the
# 'RF' column.
ff5 = ff5.resample('BM').last().drop('RF', axis=1)

In [None]:
# Open question: does Grad-CAM show that the network responds most strongly to
# volatile price patterns? To check, build a low-volatility factor (stocks
# sorted on 1 / vol) that the CNN portfolio can be regressed on.
Factor = {}

# Dedicated names are used for the ranking so that the decile variables of the
# CNN forecast (y_rank, y_count, y_normalize, decile_portfolio) are not
# overwritten; re-running earlier portfolio cells therefore stays correct.
lowvol_rank = (1/vol).rank(axis=1)
lowvol_count = lowvol_rank.max(axis=1)
lowvol_normalize = lowvol_rank.div(lowvol_count,axis=0).sub(0.1/lowvol_count,axis=0) * 10
lowvol_decile = np.floor(lowvol_normalize)

EW_factor = {}
VW_factor = {}

for i in range(0,10):
        port = (lowvol_decile==i)
        
        #equal weight
        EW_factor[i+1]=return_next_per.where(port).mean(axis=1)
        
        #market cap weight
        marketcap_port = marketcap.where(port)
        marketcap_port = marketcap_port.div(marketcap_port.sum(axis=1),axis=0)
        
        VW_factor[i+1] = (marketcap_port * return_next_per).sum(axis=1)
            
# Factor return: top decile (highest 1 / vol) minus bottom decile.
Factor['LowVol'] = pd.concat({
    'EW':EW_factor[10] - EW_factor[1],
    'VW':VW_factor[10] - VW_factor[1],
},axis=1)


In [None]:
#Compute factor loadings of CNN model under both value weighted (VW) and equal weighted (EW) setting
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm


def _factor_loadings(data, regressors):
    """Regress ``data['Y']`` on a constant plus ``regressors`` by OLS.

    Returns one Series with the coefficients and, under '<name> t-stat', their
    t-statistics, sorted by label.
    """
    reg = OLS(endog=data['Y'],
              exog=sm.add_constant(data[regressors])
              ).fit()
    t_stats = reg.tvalues.rename(
        index={ind: ind+' t-stat' for ind in reg.tvalues.index})
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    return pd.concat([reg.params, t_stats]).sort_index()


Summary = {}
Factor['LowVol'].index = pd.to_datetime(Factor['LowVol'].index)

for w in ['VW','EW']:

    ret = pd.concat(Portfolio_Returns[w],axis=1)
    HL = ret[10] - ret[1]

    HL.index = pd.to_datetime(HL.index)

    # Work on a copy: ff5 itself is no longer modified, so the cell can be
    # re-run and each weighting scheme starts from the same factor table.
    factor_data = ff5.assign(Y=HL).dropna()
    summary_return = {}

    #cnn factor return ~ Fama-French 5 factors
    summary_return['Famma-French 5 Factor'] = _factor_loadings(
        factor_data, ['CMA','HML','Mkt-RF','RMW','SMB'])

    #cnn factor return ~ Momentum + Reversal
    summary_return['Momentum/Reversal'] = _factor_loadings(
        factor_data, ['Momentum','Reversal'])

    dataset_factor = factor_data.join(
                        pd.concat({
                                'LowVol':Factor['LowVol'][w],
                                },axis=1)
    ).dropna()

    #cnn factor return ~ LowVol
    summary_return['LowVol'] = _factor_loadings(dataset_factor, ['LowVol'])

    #cnn factor return ~ All Factors
    summary_return['All'] = _factor_loadings(
        dataset_factor,
        [col for col in dataset_factor.columns if col != 'Y'])

    Summary[w] = pd.concat(summary_return,axis=1)


Summary = pd.concat(Summary,axis=1)

# Put the intercept rows first. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
_const_rows = ['const','const t-stat']
_loadings_table = pd.concat([
    Summary.loc[_const_rows],
    Summary.drop(_const_rows)
])
_loadings_table.to_hdf('./output_summary/Factor_Loadings.h5',key=model_name)

_loadings_table

# Compare Performance of different strategies 
# [this part is used for ROBUSTNESS]

In [None]:
# Collect the training log stored in the final checkpoint of every seed.
_final_logs = []
for seed in range(0, 5):
    print('seed '+str(seed))
    name = model_name+'_'+str(seed)
    state = torch.load(
        './checkpoint_CNN/'+model_name+'/'+name+'_final.t7')
    _final_logs.append(state['log'])

# Validation metrics: last logged 'test_loss' / 'test_error' of each seed,
# averaged over the seeds.
loss_validation = np.mean([run['test_loss'][-1] for run in _final_logs])
accuracy_validation = np.mean([run['test_error'][-1] for run in _final_logs])

# Test-set metrics saved earlier under the key `model_name`.
test_results = pd.read_hdf('./output_summary/Test Results.h5',
                           key=model_name)


In [None]:
# Summary statistics; 'V' = validation sample, 'T' = test sample.
stats = {}
stats['Loss'] = pd.Series({'V': loss_validation,
                           'T': test_results.loc['test loss']})
# R-squared is one minus the stored error ratio.
stats['R_squared'] = pd.Series({'V': 1 - accuracy_validation,
                                'T': 1 - test_results.loc['test accuracy']})

# Correlation between the forecast and the realised return.
_forecast_vs_return = pd.concat([
    Label_Test[pred_field],
    Label_Test[return_field]], axis=1)
stats['Correlation'] = pd.Series({
    method: _forecast_vs_return.corr(method=method).iloc[0, 1]
    for method in ('pearson', 'spearman')
})

# High-minus-low return series per weighting scheme.
portfolio_return = {}
for _scheme in ('EW', 'VW'):
    ret = pd.concat(Portfolio_Returns[_scheme], axis=1)
    portfolio_return[_scheme] = ret[10] - ret[1]
portfolio_return['Vol Controlled'] = Portfolio_Returns['Vol Controlled']

# Annualised Sharpe ratio and volatility (factors 12**0.5).
stats['SR'] = pd.Series({
    col: series.mean() / series.std() * (12**0.5)
    for col, series in portfolio_return.items()
})
stats['Vol'] = pd.Series({
    col: series.std() * (12**0.5)
    for col, series in portfolio_return.items()
})


def _max_drawdown(series):
    """Largest drop of the cumulative-sum return path below its running maximum."""
    cumulative = series.cumsum()
    return (cumulative.cummax() - cumulative).max()


stats['Max Draw-Down / Vol'] = pd.Series({
    col: _max_drawdown(series)
    for col, series in portfolio_return.items()
}) / stats['Vol']
pd.concat(stats)

In [None]:

# Save all summary statistics as one Series under the key `model_name`.
pd.concat(stats).to_hdf('./output_summary/Robustness.h5',key=model_name)