In [None]:
from functions import *
from parameters import *
from functions2 import *

from sklearn.linear_model import Lasso
from sklearn.linear_model import lasso_path
import torch
import torch.nn.functional as F

from statsmodels.nonparametric.smoothers_lowess import lowess

from tqdm import tqdm
from IPython.display import display, HTML

In [None]:
# Load the data in two forms: a data-frame read from a CSV file (`d`) and a
# dictionary-like collection read from a pickle (`dd`, indexed by name below).
LOG( "Data (data-frame)" )
filename = "raw/data_ml.csv"
LOG( f"  Reading {filename} [20 seconds]" )
d = pd.read_csv(filename)
d['date'] = pd.to_datetime( d['date'] )

# The predictors are the keys of `signs` (defined outside this cell);
# the target is the 'R1M_Usd' column.
predictors = list( signs.keys() )
target = 'R1M_Usd'

LOG( "Data (list of matrices)" )
LOG( "  Reading data/data_ml.pickle" )
# NOTE(review): `load` comes from a star import; presumably it unpickles the
# file -- only use it on trusted files.
dd = load( "data/data_ml.pickle" )

# Deep learning: linear model
This is equivalent to the models above, but implemented with a neural network.
* Linear model
* Change the loss function
* Add mini-batches
* Add sign constraints

### Linear model; forecast the returns

In [None]:
LOG( "Training data" )
# In-sample rows: observations dated strictly before DATE1.
# The dates are compared as timestamps, in one vectorised operation, rather
# than converting every timestamp to a string in a Python loop.
# The mask is kept as a boolean numpy array.
i = ( d['date'] < pd.to_datetime(DATE1) ).to_numpy()
train = d[i].copy()
x = train[ predictors ]
# The target is the log-return, log(1+r)
y = np.log1p(train[ target ])

LOG( "Clean the data" )
# Drop the rows whose log-return is not finite (missing, or not defined)
i = np.isfinite(y)
x = x[i]
y = y[i]

# Missing predictors are replaced with 0.5
x = x.fillna(.5)

LOG( "Model" )

class Linear1(torch.nn.Module):
    """Linear regression written as a network: a single affine layer, k inputs to 1 output."""

    def __init__(self, k):
        """Create the affine layer; `k` is the number of input features."""
        super().__init__()
        self.linear = torch.nn.Linear(in_features=k, out_features=1)

    def forward(self, x):
        """Apply the affine layer to the last dimension of `x`."""
        return self.linear(x)

# Convert the training data to float32 tensors; the target is reshaped to a
# single column so that it has the same shape as the model output.
x = torch.tensor(x.values,               dtype=torch.float32)
y = torch.tensor(y.values.reshape(-1,1), dtype=torch.float32)

# Mean squared error, minimised with Adam (default settings).
# NOTE(review): no seed is set in this cell before the model is created.
model = Linear1(x.shape[1])
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

LOG( "Loop [2 minutes]" )
# Full-batch training: every iteration uses all the rows
N = 5000
for t in tqdm(range(N)):
    y_pred = model(x)
    loss = criterion(y_pred,y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# The first parameter of the model is the weight of the linear layer (the bias
# is the second one); flatten it to a vector, one coefficient per predictor.
beta = list( model.parameters() )[0]
beta = beta.detach().numpy().flatten()

r = None

LOG( "Data for the backtest" )
# Lagged log-returns (LAG is defined outside this cell); missing values are set to 0
trailing_log_returns = LAG( np.log1p( dd[ 'R1M_Usd' ] ) )
y = trailing_log_returns.fillna(0)

# Signal: linear combination of the predictors (missing values replaced with 0.5),
# using the fitted coefficients; set to NaN outside the universe
signal = np.zeros( shape = dd[ list(dd.keys())[0] ].shape )
for i,predictor in enumerate(predictors):
    signal += beta[i] * dd[predictor].fillna(.5)
signal = np.where( dd['universe'], 1, np.nan ) * signal
res = signal_backtest(signal, y, date=DATE1)

# Wealth curves: the first 6 rows of res['prices'], on a log scale;
# the vertical line marks DATE1
fig, ax = plt.subplots()
for i in range(6):
    ax.plot( res['dates'], res['prices'].iloc[i,:], color = quintile_colours[i] )
ax.set_yscale('log')
ax.axvline( pd.to_datetime(DATE1), color='black', linewidth=1 )
ax.set_title('Linear model (via pytorch)')
# Out-of-sample statistics of row 5 (the last curve plotted), in the top-left corner
ax.text(0.02, .97, f"μ={100*res['out-of-sample'].iloc[5,:]['CAGR']:.1f}%",                  horizontalalignment='left', verticalalignment='top', transform = ax.transAxes)
ax.text(0.02, .90, f"σ={100*res['out-of-sample'].iloc[5,:]['Annualized Volatility']:.1f}%", horizontalalignment='left', verticalalignment='top', transform = ax.transAxes)
ax.text(0.02, .83, f"IR={res['out-of-sample'].iloc[5,:]['Information Ratio']:.2f}",         horizontalalignment='left', verticalalignment='top', transform = ax.transAxes)
fig.savefig("plots/model1_linear_wealth.pdf")
plt.show()

# Displayed as the cell output
res['out-of-sample']

In [None]:
# Stack the three performance tables, each labelled with its period, and save
# them together with the model name and the number of epochs.
# `assign` returns labelled copies: the tables stored in `res` (one of which
# was displayed by the previous cell) are not modified.
period_labels = { 'performance': 'all', 'in-sample': 'in-sample', 'out-of-sample': 'out-of-sample' }
r = pd.concat( [ res[key].assign(period=label) for key, label in period_labels.items() ] )
r['model'] = 'linear (1)'
r['epochs'] = N
r.to_csv("results/model1_linear.csv")

### Linear model; data as id×date×signal array; forecast the returns

Same model as above, but the data is no longer stored as a 2-dimensional (id,date)×signal table: it is now a 3-dimensional id×date×signal array.

The model and the performance are similar but, for some reason, fitting the model is much more time-consuming.

In [None]:
LOG( "Training data (3-dimensional array)" )
# get_data_3 is defined outside this section; the model below treats x as an n×l×k array.
# NOTE(review): presumably y and universe are n×l arrays aligned with x, restricted
# to the dates before DATE1 -- confirm in get_data_3.
x, y, universe = get_data_3(date=DATE1, signs=signs)

In [None]:
class Linear2(torch.nn.Module):
    """Linear model for 3-dimensional inputs: one affine map, k inputs to 1 output, applied along the last axis."""

    def __init__(self, k):
        """Create the affine layer; `k` is the size of the last dimension of the input."""
        super().__init__()
        self.linear = torch.nn.Linear(in_features=k, out_features=1)

    def forward(self, x):
        """Map an n×l×k input to an n×l output."""
        # The linear layer acts on the last dimension (n×l×k -> n×l×1);
        # the trailing dimension of size 1 is then dropped.
        return self.linear(x)[:, :, 0]

# Add a trailing dimension of size 1 to y and universe (the training loop
# removes it again with [:,:,0]), then convert everything to float32 tensors.
# NOTE(review): this overwrites x, y and universe, so the cell cannot be
# re-run without first re-running the cell that calls get_data_3.
universe = universe.reshape( y.shape[0], y.shape[1], 1 )
y = y.reshape( y.shape[0], y.shape[1], 1 )
x = torch.tensor(x, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)
universe = torch.tensor(universe, dtype=torch.float32)

In [None]:
# Linear model on the 3-dimensional data: mean squared error, Adam with default settings
model = Linear2(x.shape[2])
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

LOG( "Loop [LONG: 50 minutes for 5000 epochs]" )
N = 5000
# Loss at each epoch; entries stay NaN if the loop stops early
losses = np.nan * np.zeros(N)
pbar = tqdm(range(N))
for t in pbar:
    # The predictions are multiplied by the universe indicator; the targets are not.
    # NOTE(review): this assumes y is already 0 outside the universe -- confirm in get_data_3.
    # The mean in MSELoss is taken over all the n×l cells, not only those in the universe.
    y_pred = model(x) * universe[:,:,0]
    loss = criterion(y_pred,y[:,:,0])
    losses[t] = loss.item()
    pbar.set_description( f"Loss={loss.item():.5f}" )
    # Stop as soon as the loss is NaN or infinite
    if not np.isfinite( loss.item() ):
        LOG( f"{t} PROBLEM" )
        break
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Loss as a function of the epoch.
# NOTE(review): the x values start at 0, which a log-scale axis cannot show,
# so the first epoch is not visible on the plot.
fig, ax = plt.subplots()
ax.plot( losses )
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_xscale('log')
fig.savefig("plots/model2_linear_3d_loss.pdf")
plt.show()

pd.Series(losses).to_csv("results/model2_linear_3d_loss.csv")

In [None]:
# Backtest the strategy built from the fitted coefficients

# Weight of the linear layer (first parameter of the model), as a flat vector
beta = next( model.parameters() ).detach().numpy().flatten()

r = None

LOG( "Data for the backtest" )
# Lagged log-returns, with missing values set to 0
trailing_log_returns = LAG( np.log1p( dd[ 'R1M_Usd' ] ) )
y = trailing_log_returns.fillna(0)

# Signal: linear combination of the predictors (missing values replaced with 0.5),
# set to NaN outside the universe
first_key = list(dd.keys())[0]
signal = np.zeros( shape = dd[first_key].shape )
for i,predictor in enumerate(predictors):
    signal += beta[i] * dd[predictor].fillna(.5)
signal = np.where( dd['universe'], 1, np.nan ) * signal
res = signal_backtest(signal, y, date=DATE1)

# Wealth curves (first 6 rows of res['prices']), log scale, vertical line at DATE1
fig, ax = plt.subplots()
for i in range(6):
    ax.plot( res['dates'], res['prices'].iloc[i,:], color = quintile_colours[i] )
ax.set_yscale('log')
ax.axvline( pd.to_datetime(DATE1), color='black', linewidth=1 )
ax.set_title('Linear model (via pytorch)')

# Out-of-sample statistics of row 5 (the last curve plotted), in the top-left corner
top_oos = res['out-of-sample'].iloc[5,:]
captions = [
    ( .97, f"μ={100*top_oos['CAGR']:.1f}%" ),
    ( .90, f"σ={100*top_oos['Annualized Volatility']:.1f}%" ),
    ( .83, f"IR={top_oos['Information Ratio']:.2f}" ),
]
for y_pos, caption in captions:
    ax.text(0.02, y_pos, caption, horizontalalignment='left', verticalalignment='top', transform = ax.transAxes)
fig.savefig("plots/model2_linear_3d_wealth.pdf")
plt.show()

# Displayed as the cell output
res['out-of-sample']

In [None]:
# Stack the three performance tables, each labelled with its period, and save
# them together with the model name and the number of epochs.
# `assign` returns labelled copies: the tables stored in `res` (one of which
# was displayed by the previous cell) are not modified.
period_labels = { 'performance': 'all', 'in-sample': 'in-sample', 'out-of-sample': 'out-of-sample' }
r = pd.concat( [ res[key].assign(period=label) for key, label in period_labels.items() ] )
r['model'] = 'linear (2)'
r['epochs'] = N
r.to_csv("results/model2_linear_3d.csv")

### Simple, nonlinear model

This does not really work: the optimization often remains stuck. 
Restarting the optimization several (10?) times eventually gives something acceptable.

In [None]:
class NonLinear2(torch.nn.Module):
    """Small feed-forward network (k -> 16 -> 4 -> 1, ReLU activations) applied along the last axis of a 3-dimensional input."""

    def __init__(self, k):
        """Create the three layers; `k` is the size of the last dimension of the input."""
        super().__init__()
        # The layers are created in this order (fc1, fc2, fc3)
        self.fc1 = torch.nn.Linear(in_features=k, out_features=16)
        self.fc2 = torch.nn.Linear(in_features=16, out_features=4)
        self.fc3 = torch.nn.Linear(in_features=4, out_features=1)

    def forward(self, x):
        """Map an n×l×k input to an n×l output."""
        # Every layer acts on the last dimension
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        # n×l×1 -> n×l
        return self.fc3(hidden)[:, :, 0]

# Fix the seed (set_seed is defined outside this cell) before the model is created
set_seed(1)
    
# In-sample data, as 3-dimensional arrays
x, y, universe = get_data_3(date=DATE1, signs=signs)

# Add a trailing dimension of size 1 to y and universe (removed again with
# [:,:,0] in the loop), then convert everything to float32 tensors
universe = universe.reshape( y.shape[0], y.shape[1], 1 )
y = y.reshape( y.shape[0], y.shape[1], 1 )
x = torch.tensor(x, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)
universe = torch.tensor(universe, dtype=torch.float32)

# Mean squared error, minimised with Adam (default settings)
model = NonLinear2(x.shape[2])
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# NOTE(review): the message mentions 5000 epochs, but N is set to 100 below
LOG( "Loop [LONG: 20 minutes for 5000 epochs]" )
N = 100    # Early stopping is needed, and multiple restarts. Mini-batches may fix those convergence and overfitting problems
# Loss at each epoch; entries stay NaN if the loop stops early
losses = np.nan * np.zeros(N)
pbar = tqdm(range(N))
for t in pbar:
    # The predictions are multiplied by the universe indicator; the targets are not.
    # NOTE(review): this assumes y is already 0 outside the universe -- confirm in get_data_3.
    y_pred = model(x) * universe[:,:,0]
    loss = criterion(y_pred,y[:,:,0])
    losses[t] = loss.item()
    pbar.set_description( f"Loss={loss.item():.5f}" )
    # Stop as soon as the loss is NaN or infinite
    if not np.isfinite( loss.item() ):
        LOG( f"{t} PROBLEM" )
        break
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Loss as a function of the epoch.
# NOTE(review): the x values start at 0, which a log-scale axis cannot show,
# so the first epoch is not visible on the plot.
fig, ax = plt.subplots()
ax.plot( losses )
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_xscale('log')
fig.savefig("plots/model2_nonlinear_3d_loss.pdf")
plt.show()

pd.Series(losses).to_csv("results/model2_nonlinear_3d_loss.csv")

In [None]:
# Predictors for the whole sample (all=True). Only x is needed here: the
# returns used by the backtest are rebuilt from `dd` below.
x, y, universe = get_data_3(all=True, signs=signs)
x = torch.tensor(x, dtype=torch.float32)

# Inference only: no gradient is needed, so none is tracked
with torch.no_grad():
    signal = model(x).numpy()

# Lagged log-returns, with missing values set to 0
trailing_log_returns = LAG( np.log1p( dd[ 'R1M_Usd' ] ) )
y = trailing_log_returns.fillna(0)

assert signal.shape == y.shape
# Keep the signal only inside the universe (NaN elsewhere), as in the backtests
# of the linear models: the network returns a value for every cell.
# NOTE(review): assumes signal_backtest does not apply this mask itself -- if it
# does, this line has no effect.
signal = np.where( dd['universe'], 1, np.nan ) * signal
signal = pd.DataFrame( signal, index = y.index, columns = y.columns )

res = signal_backtest(signal, y, date=DATE1)

# Wealth curves (first 6 rows of res['prices']), log scale, vertical line at DATE1
fig, ax = plt.subplots()
for i in range(6):
    ax.plot( res['dates'], res['prices'].iloc[i,:], color = quintile_colours[i] )
ax.axvline( pd.to_datetime(DATE1), color='black', linewidth=1 )
ax.set_yscale('log')
ax.set_title('Non-linear')
# Out-of-sample statistics of row 5 (the last curve plotted), in the top-left corner
ax.text(0.02, .97, f"μ={100*res['out-of-sample'].iloc[5,:]['CAGR']:.1f}%",                  horizontalalignment='left', verticalalignment='top', transform = ax.transAxes)
ax.text(0.02, .90, f"σ={100*res['out-of-sample'].iloc[5,:]['Annualized Volatility']:.1f}%", horizontalalignment='left', verticalalignment='top', transform = ax.transAxes)
ax.text(0.02, .83, f"IR={res['out-of-sample'].iloc[5,:]['Information Ratio']:.2f}",         horizontalalignment='left', verticalalignment='top', transform = ax.transAxes)
fig.savefig("plots/model2_nonlinear_3d_wealth.pdf")
plt.show()

# Displayed as the cell output
res['out-of-sample']

In [None]:
# Stack the three performance tables, each labelled with its period, and save
# them together with the model name and the number of epochs.
# `assign` returns labelled copies: the tables stored in `res` (one of which
# was displayed by the previous cell) are not modified.
period_labels = { 'performance': 'all', 'in-sample': 'in-sample', 'out-of-sample': 'out-of-sample' }
r = pd.concat( [ res[key].assign(period=label) for key, label in period_labels.items() ] )
r['model'] = 'nonlinear'
r['epochs'] = N
r.to_csv("results/model2_nonlinear_3d.csv")

In [None]:
# Final log message: reached only if all the cells above ran
LOG( "Done." )