In [None]:
from functions import *
from functions2 import *
from parameters import *

from sklearn.linear_model import Lasso
from sklearn.linear_model import lasso_path
import torch
import torch.nn.functional as F

from statsmodels.nonparametric.smoothers_lowess import lowess

from tqdm import tqdm
from IPython.display import display, HTML

In [None]:
# --- Load the two on-disk representations of the dataset ---------------------

# 1. The data-frame read from the CSV file, with `date` converted to datetimes
LOG( "Data (data-frame)" )
filename = "raw/data_ml.csv"
LOG( f"  Reading {filename} [20 seconds]" )
d = pd.read_csv(filename).assign( date = lambda df: pd.to_datetime( df['date'] ) )

# Predictor names, in the order in which `signs` lists its keys
predictors = [ name for name in signs.keys() ]

# 2. The pickled version ("list of matrices", per the log message);
#    it is indexed by column name further down, e.g. dd['R1M_Usd']
LOG( "Data (list of matrices)" )
LOG( "  Reading data/data_ml.pickle" )
dd = load( "data/data_ml.pickle" )

# Deep learning: nonlinear model

In [None]:
class NonLinear6(torch.nn.Module):
    """Small feed-forward network turning predictors into long-only portfolio weights.

    Architecture: k → 16 → 4 → 1, with a ReLU after each of the two hidden layers.
    The scalar score of every (stock, date) pair is exponentiated, masked by the
    universe and normalised over the first axis, so that the weights of each
    date are non-negative and sum up to 1.
    """
    def __init__(self,k):
        """k: number of predictors, i.e., size of the last dimension of the input."""
        super().__init__()
        self.fc1 = torch.nn.Linear(k,16)
        self.fc2 = torch.nn.Linear(16,4)
        self.fc3 = torch.nn.Linear(4,1)
    def forward(self,xs):
        """Compute the portfolio weights.

        xs: pair (x, universe), where x is n×l×k (predictors) and universe is
            n×l×1 (positive where the stock can be held on that date, 0 elsewhere).
        Returns an n×l tensor of weights; a date with an empty universe gets
        all-zero weights.
        """
        x, universe = xs
        # x is n×l×k; the linear layers are applied on the last dimension
        y = F.relu( self.fc1(x) )
        y = F.relu( self.fc2(y) )
        y = self.fc3(y)      # n×l×1

        # Scores outside the universe are set to -inf *before* the exponential:
        # otherwise, a large score there would give exp(.)=inf and inf*0=NaN.
        y = y.masked_fill( ~(universe > 0), float('-inf') )

        # Subtract the largest in-universe score of each date before exponentiating:
        # the normalised weights are unchanged (up to the 1e-16 guard below),
        # but exp() can no longer overflow and turn the weights into inf/inf=NaN.
        shift = y.detach().max(dim=0, keepdim=True).values
        shift = torch.where( torch.isfinite(shift), shift, torch.zeros_like(shift) )  # empty universe: max is -inf

        p = (y - shift).exp()  # Use a softplus instead of an exponential?
        p = p * universe
        p = p[:,:,0]           # n×l
        p = p / ( 1e-16 + p.sum(axis=0) )  # portfolio weights: positive, sum up to 1 for each date
        return p

In [None]:
LOG( "[LONG] 25 minutes for 10,000 epochs" )

# Training data, as returned by get_data_3 for date=DATE1
# (presumably the observations up to DATE1 -- TODO confirm against get_data_3)
x, y, universe = get_data_3(date=DATE1, signs=signs, target=target)

# y and universe get a trailing dimension of size 1, to line up with the
# n×l×1 output of the network; everything becomes a float32 tensor
n_rows, n_cols = y.shape[0], y.shape[1]
x        = torch.tensor( x, dtype=torch.float32 )
y        = torch.tensor( y.reshape( n_rows, n_cols, 1 ), dtype=torch.float32 )
universe = torch.tensor( universe.reshape( n_rows, n_cols, 1 ), dtype=torch.float32 )

# One input unit per predictor (last dimension of x)
model = NonLinear6( x.shape[2] )

optimizer = torch.optim.Adam(model.parameters())
N = 100                     # number of optimisation steps (NOTE(review): the log message above mentions 10,000 epochs)
IRs = np.full(N, np.nan)    # IR of each step; stays NaN for the steps never reached
pbar = tqdm(range(N))
for t in pbar:

    # x is id×date×feature
    ## Take half the stocks at random
    i = np.random.choice( x.shape[0], x.shape[0] // 2, replace=False )
    ## Take a 3-year period, at random
    j = np.random.choice( x.shape[1] - 36 )
    j = np.arange( j, j+36 )

    # Restrict the data to the sampled stocks and dates
    x_batch        = x[i][:,j]
    universe_batch = universe[i][:,j]
    y_batch        = y[i][:,j][:,:,0]

    # Information ratio of the portfolio on that sub-sample
    w = model( (x_batch, universe_batch) )
    ratio_returns = ( w * y_batch.expm1() ).sum(axis=0)    # y already contains the forward returns
    log_returns = ratio_returns.log1p()
    IR = log_returns.mean() / log_returns.std()
    loss = -IR

    # Record the IR and show the running average in the progress bar
    IRs[t] = IR.item()
    pbar.set_description( f"IR={np.nanmean(IRs):.3f}" )

    # Stop, without updating the weights, as soon as the loss is not finite
    if not np.isfinite( loss.item() ):
        LOG( f"{t} PROBLEM" )
        break

    # Gradient step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

LOG( "DONE" )

## Add the final IR, on the whole sample
# This is a pure evaluation: autograd is switched off, so that no computation
# graph is built (and kept alive through `w`) for the whole id×date panel.
with torch.no_grad():
    w = model( (x,universe) )
    ratio_returns = w * y[:,:,0].expm1()     # y already contains the forward returns
    ratio_returns = ratio_returns.sum(axis=0)
    log_returns = ratio_returns.log1p()
    IR = log_returns.mean() / log_returns.std()
IR = IR.item()

## The performance we recorded is very noisy: it is on different periods...
## Plot it, with a lowess smoother, and add the IR on the whole sample (orange cross).
# `lowess` comes from the import cell at the top of the notebook: no need to re-import it here.
epochs = 1 + np.arange(len(IRs))   # 1-based: the horizontal axis is on a log scale
fig, ax = plt.subplots()
ax.scatter( epochs, IRs )
r = lowess( IRs, epochs )          # columns: epoch, smoothed IR
ax.plot( r[:,0], r[:,1], color = 'black', linewidth=5 )
ax.scatter( 1+len(IRs), IR, color = 'tab:orange', marker='x', s=200, linewidth=5)
ax.set_xlabel("Epoch")
ax.set_ylabel("IR")
ax.set_xscale('log')
fig.savefig("plots/model6_nonlinear_IR_loss.pdf")
plt.show()

# Keep the raw per-step values as well
pd.Series(IRs).to_csv("results/model6_nonlinear_IR_loss.csv")

In [None]:
## Wealth curves
##
## These are the quintile portfolios built from the score,
## not the strategy actually learned (which provides actual weights).
##

# Data returned by get_data_3 for all=True (presumably the whole sample -- TODO confirm),
# reshaped and converted to tensors in the same way as the training data
x, y, universe = get_data_3(all=True, signs=signs, target='R1M_Usd')
n_rows, n_cols = y.shape[0], y.shape[1]
x        = torch.tensor( x, dtype=torch.float32 )
y        = torch.tensor( y.reshape( n_rows, n_cols, 1 ), dtype=torch.float32 )
universe = torch.tensor( universe.reshape( n_rows, n_cols, 1 ), dtype=torch.float32 )

# Output of the model, as a plain numpy array
signal = model( (x,universe) ).detach().numpy()

# Lagged log-returns; `y` is the same table, with missing values replaced with 0
trailing_log_returns = LAG( np.log1p( dd[ 'R1M_Usd' ] ) )
y = trailing_log_returns.fillna(0)

# Give the signal the same labels as the returns
assert signal.shape == y.shape
signal = pd.DataFrame( signal, index = y.index, columns = y.columns )

res = signal_backtest(signal, y, date=DATE1)

fig, ax = plt.subplots()
for i in range(6):
    ax.plot( res['dates'], res['prices'].iloc[i,:], color = quintile_colours[i] )
ax.axvline( pd.to_datetime(DATE1), color='black', linewidth=1 )
ax.set_yscale('log')
ax.set_title('Maximizing the IR (non-linear, signal)')

# Annotate with the out-of-sample statistics of the sixth portfolio (row 5)
oos = res['out-of-sample'].iloc[5,:]
annotations = [
    ( .97, f"μ={100*oos['CAGR']:.1f}%" ),
    ( .90, f"σ={100*oos['Annualized Volatility']:.1f}%" ),
    ( .83, f"IR={oos['Information Ratio']:.2f}" ),
]
for height, label in annotations:
    ax.text(0.02, height, label, horizontalalignment='left', verticalalignment='top', transform = ax.transAxes)
fig.savefig("plots/model6_nonlinear_IR_wealth.pdf")
plt.show()

res['out-of-sample']

In [None]:
# Label each performance table with its period (in place), stack the three
# tables, add the model name and the number of epochs, and save the result.
period_labels = {
    'performance':   'all',
    'in-sample':     'in-sample',
    'out-of-sample': 'out-of-sample',
}
for key, period in period_labels.items():
    res[key]['period'] = period
r = pd.concat( [ res[key] for key in period_labels ] ).assign( model = 'IR nonlinear', epochs = N )
r.to_csv("results/model6_nonlinear_IR.csv")

In [None]:
# Backtest the strategy actually learned

trailing_returns = np.expm1(trailing_log_returns)
r = compute_portfolio_returns( signal, trailing_returns )
# Log-price = cumulated log-returns.
# "cumsum" is not the exact inverse of "diff" -- it discards the first value, 1: put it back
p = replace_last_leading_NaN_with_1( np.exp( cumsum_na(r) ) )
after_DATE1 = r.index > DATE1
s = analyze_returns( r[ after_DATE1 ], as_df = True )

fig, ax = plt.subplots()
# First five portfolios of the signal backtest, for reference...
for i in range(5):
    ax.plot( res['dates'], res['prices'].iloc[i,:], color = quintile_colours[i] )
# ... and the learned strategy, in black
ax.plot( p.index, p, color='black' )
ax.axvline( pd.to_datetime(DATE1), color='black', linewidth=1 )
ax.set_yscale('log')
ax.set_title('Maximizing the IR (weights)')

# Annotate with the statistics computed after DATE1 (first row of `s`)
stats = s.iloc[0,:]
annotations = [
    ( .97, f"μ={100*stats['CAGR']:.1f}%" ),
    ( .90, f"σ={100*stats['Annualized Volatility']:.1f}%" ),
    ( .83, f"IR={stats['Information Ratio']:.2f}" ),
]
for height, label in annotations:
    ax.text(0.02, height, label, horizontalalignment='left', verticalalignment='top', transform = ax.transAxes)
fig.savefig("plots/model6_nonlinear_IR_wealth_weights.pdf")
plt.show()

s

In [None]:
# Describe the run in extra columns (added to `s` in place), then save
run_description = {
    'period':    'out-of-sample',
    'model':     'IR nonlinear',
    'portfolio': 'Weights',
    'epochs':    N,
}
for column, value in run_description.items():
    s[column] = value
s.to_csv("results/model6_nonlinear_IR_weights.csv")

In [None]:
# Try to understand what the model actually learned

def plot_copula(x,y, ax=None, title=None, unif=True, cmap='Blues'):
    """Contour plot of a Gaussian kernel density estimate of the pairs (x, y).

    Parameters
    ----------
    x, y : array-like of numbers, of the same length
        Paired observations; a pair is discarded if either value is not finite.
    ax : matplotlib Axes, optional
        Where to draw. If None, a 4×4 figure is created and shown.
    title : str, optional
        Title of the plot.
    unif : bool
        If True, both variables go through `uniformize` and the plot covers
        the unit square; otherwise, the raw values are used and the plot
        covers the range of the data.
    cmap : colormap of the filled contours.

    Returns
    -------
    None
    """
    # Accept any array-like (lists, Series, arrays), and only keep the complete pairs
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    keep = np.isfinite(x) & np.isfinite(y)
    x = x[keep]
    y = y[keep]

    if unif:
        x = uniformize(x)
        y = uniformize(y)
        xmin, xmax, ymin, ymax = 0,1, 0,1
    else:
        xmin, xmax = np.min(x), np.max(x)
        ymin, ymax = np.min(y), np.max(y)

    ax_was_None = ax is None
    if ax_was_None:
        fig, ax = plt.subplots( figsize = (4,4) )

    # Evaluate the density estimate on a 100×100 grid covering the plot area
    xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([xx.ravel(), yy.ravel()])
    kernel = scipy.stats.gaussian_kde( np.vstack([x,y]) )
    f = np.reshape( kernel(positions).T, xx.shape )

    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)
    ax.contourf(xx, yy, f, cmap=cmap)     # filled contours...
    ax.contour(xx, yy, f, colors='k')     # ... with black contour lines on top
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    if title is not None:
        ax.set_title(title)
    if ax_was_None:
        plt.show()
        
def remove_empty_axes(axs):
    """Switch off ("axis('off')") every Axes of `axs` on which nothing was drawn.

    axs: array of Axes (it must have a `flatten` method).
    An Axes counts as empty if it has no lines, no collections and no data.
    """
    for panel in axs.flatten():
        if panel.lines or panel.collections or panel.has_data():
            continue
        panel.axis('off')

In [None]:
LOG( "Copula densities [VERY LONG: 1.5 hours]")
# One panel per predictor: joint density of the predictor (horizontal axis)
# and of the signal, uniformised column by column (vertical axis)
n_predictors = x.shape[2]
nr, nc = mfrow(n_predictors, aspect=1)
fig, axs = plt.subplots( nr, nc, figsize=(29.7/1.5,21/1.5) )
panels = axs.flatten()
for i in tqdm(range(n_predictors)):
    a = np.where( universe[:,:,0], x[:,:,i], np.nan )   # predictor i, NaN where the universe is 0
    b = np.apply_along_axis( uniformize, 0, signal.values )
    ax = panels[i]
    plot_copula( a.flatten(), b.flatten(), ax = ax )
    ax.set_title( predictors[i] )
remove_empty_axes(axs)
fig.tight_layout()
fig.subplots_adjust(hspace=.2, wspace=.05)
fig.savefig('plots/model6_nonlinear_copulas_all.pdf')
plt.show()

In [None]:
# Individual plots: same densities, one small PDF file per predictor
for i in tqdm(range(x.shape[2])):
    a = np.where( universe[:,:,0], x[:,:,i], np.nan )   # predictor i, NaN where the universe is 0
    b = np.apply_along_axis( uniformize, 0, signal.values )
    fig, ax = plt.subplots(figsize=(3,3))
    plot_copula( a.flatten(), b.flatten(), ax = ax )
    ax.set_title( predictors[i] )
    fig.tight_layout()
    fig.savefig(f'plots/model6_nonlinear_copulas_{predictors[i]}.pdf')
    plt.close(fig)   # the figure is only needed on disk

In [None]:
# One panel per predictor: average of the (uniformised) signal in each bin of the predictor.
# NOTE(review): the output file is called "median_per_quantile", but the aggregation is a mean.
#
# Abandoned idea (it was disabled in the code): some of the variables just measure
# size, and could be normalized by subtracting the first predictor and uniformizing
# the difference. It is not really possible with the data we have: we can only see
# something for FCF -- for most other variables, the effect of the MCap is overpowering.
n_predictors = x.shape[2]
nr, nc = mfrow(n_predictors, aspect=1)
fig, axs = plt.subplots( nr, nc, figsize=(29.7/1.5,21/1.5) )
panels = axs.flatten()
for i in tqdm(range(n_predictors)):
    a = np.where( universe[:,:,0], x[:,:,i], np.nan )   # predictor i, NaN where the universe is 0
    a = np.floor( a * 20 * .9999 )                      # bin number (0 to 19 if the predictor is in [0,1] -- TODO confirm)
    b = np.apply_along_axis( uniformize, 0, signal.values.copy() )
    c = pd.DataFrame( { 'quantile': a.flatten(), 'value': b.flatten() } )
    c = c.pivot_table(values='value', columns = 'quantile', aggfunc='mean')
    bins, averages = c.columns, c.values.flatten()

    ax = panels[i]
    ax.plot( bins, averages )
    ax.scatter( bins, averages )
    ax.axes.xaxis.set_visible(False)
    ax.axes.yaxis.set_visible(False)
    ax.set_title( predictors[i] )
remove_empty_axes(axs)
fig.tight_layout()
fig.subplots_adjust(hspace=.2, wspace=.05)
fig.savefig('plots/model6_nonlinear_median_per_quantile_all.pdf')
plt.show()

LOG( "Individual plots" )
# Same curves as in the grid, one small PDF file per predictor
for i in tqdm(range(x.shape[2])):

    a = np.where( universe[:,:,0], x[:,:,i], np.nan )   # predictor i, NaN where the universe is 0
    a = np.floor( a * 20 * .9999 )                      # bin number
    b = np.apply_along_axis( uniformize, 0, signal.values.copy() )
    c = pd.DataFrame( { 'quantile': a.flatten(), 'value': b.flatten() } )
    c = c.pivot_table(values='value', columns = 'quantile', aggfunc='mean')
    bins, averages = c.columns, c.values.flatten()

    fig, ax = plt.subplots(figsize=(3,3))
    ax.plot( bins, averages )
    ax.scatter( bins, averages )
    ax.axes.xaxis.set_visible(False)
    ax.axes.yaxis.set_visible(False)
    ax.set_title( predictors[i] )
    fig.tight_layout()
    fig.savefig(f'plots/model6_nonlinear_median_per_quantile_{predictors[i]}.pdf')
    plt.close(fig)   # the figure is only needed on disk

In [None]:
# One panel per predictor: quartiles of the (uniformised) signal in each bin of the
# predictor -- median in black, inter-quartile band in blue
n_predictors = x.shape[2]
nr, nc = mfrow(n_predictors)
fig, axs = plt.subplots( nr, nc, figsize=(20,20) )
panels = axs.flatten()
for i in tqdm(range(n_predictors)):
    ax = panels[i]

    a = np.where( universe[:,:,0], x[:,:,i], np.nan )   # predictor i, NaN where the universe is 0
    a = np.floor( a * 20 * .9999 )                      # bin number
    b = np.apply_along_axis( uniformize, 0, signal.values.copy() )
    c = pd.DataFrame( { 'quantile': a.flatten(), 'value': b.flatten() } )

    # One single-row table per quartile, with one column per bin
    q25, q50, q75 = (
        c.pivot_table(values='value', columns = 'quantile', aggfunc=quartile)
        for quartile in (
            lambda v: np.percentile(v, 25),
            lambda v: np.percentile(v, 50),
            lambda v: np.percentile(v, 75),
        )
    )
    bins   = q25.columns
    lower  = q25.values.flatten()
    median = q50.values.flatten()
    upper  = q75.values.flatten()

    ax.fill_between( bins, lower, upper, color='lightblue')
    ax.plot(bins, lower, color='tab:blue')
    ax.plot(bins, upper, color='tab:blue')
    ax.plot(bins, median, marker='o', color='black')
    ax.axes.xaxis.set_visible(False)
    ax.axes.yaxis.set_visible(False)
    ax.set_title( predictors[i] )
remove_empty_axes(axs)
fig.tight_layout()
fig.subplots_adjust(hspace=.2, wspace=.05)
fig.savefig('plots/model6_nonlinear_quartiles_per_quantile_all.pdf')
plt.show()

In [None]:
    # Quartiles of the (uniformised) signal per bin, for ONE predictor, shown inline (nothing is saved).
    #
    # This cell used to rely on `i` being left over from the loop of the previous
    # cell: it only worked if that cell had just been run. The index is now set
    # explicitly, to the value that loop ends with (the last predictor);
    # change it to look at another predictor.
    i = x.shape[2] - 1

    a = np.where( universe[:,:,0], x[:,:,i], np.nan )   # predictor i, NaN where the universe is 0
    b = signal.values.copy()
    a = np.floor( a * 20 * .9999 )                      # bin number
    b = np.apply_along_axis( uniformize, 0, b )
    a = a.flatten()
    b = b.flatten()
    c = pd.DataFrame( { 'quantile': a, 'value': b } )
    # One single-row table per quartile, with one column per bin
    c1 = c.pivot_table(values='value', columns = 'quantile', aggfunc=lambda x: np.percentile(x, 25))
    c2 = c.pivot_table(values='value', columns = 'quantile', aggfunc=lambda x: np.percentile(x, 50))
    c3 = c.pivot_table(values='value', columns = 'quantile', aggfunc=lambda x: np.percentile(x, 75))
    c0 = c1.columns
    c1 = c1.values.flatten()
    c2 = c2.values.flatten()
    c3 = c3.values.flatten()
    fig, ax = plt.subplots()
    ax.fill_between( c0, c1, c3, color='lightblue')      # inter-quartile band
    ax.plot(c0, c1, marker='o', color='tab:blue')
    ax.plot(c0, c3, marker='o', color='tab:blue')
    ax.plot(c0, c2, marker='o', color='black')           # median
    ax.axes.xaxis.set_visible(False)
    ax.axes.yaxis.set_visible(False)    
    ax.set_title( predictors[i] )
    plt.show()

In [None]:
# Predictor (horizontal axis) versus uniformised signal (vertical axis), first as a
# jittered, almost transparent scatter plot, then as a hexagonal-bin plot.
# For each kind of plot: one grid with all the predictors, then one file per predictor.
for which in ['scatter', 'hexbin']:
    LOG( which )
    as_scatter = which == 'scatter'
    n_predictors = x.shape[2]

    # --- Grid with all the predictors ---
    nr, nc = mfrow(n_predictors, aspect=1)
    fig, axs = plt.subplots( nr, nc, figsize=(29.7/1.5,21/1.5) )
    panels = axs.flatten()
    for i in tqdm(range(n_predictors)):
        a = np.where( universe[:,:,0], x[:,:,i], np.nan ).flatten()   # predictor i, NaN where the universe is 0
        b = np.apply_along_axis( uniformize, 0, signal.values.copy() ).flatten()
        ax = panels[i]
        if as_scatter:
            n = len(a)
            # Small uniform jitter in both directions, drawn for a first, then for b
            ax.scatter(
                a + .01 * np.random.uniform(-1,1,n),
                b + .01 * np.random.uniform(-1,1,n),
                alpha=1/255,
                s=10
            )
        else:
            ax.hexbin( a, b, gridsize=20, cmap='Blues' )
        ax.axes.xaxis.set_visible(False)
        ax.axes.yaxis.set_visible(False)
        ax.set_title( predictors[i] )
    remove_empty_axes(axs)
    fig.tight_layout()
    fig.subplots_adjust(hspace=.22, wspace=.05)
    if as_scatter:
        LOG( f"{which}: PNG file" )
        fig.savefig(f'plots/model6_nonlinear_{which}_all.png', facecolor='white', transparent=False)  # The PDF file would be too large...
    else:
        LOG( f"{which}: PDF file" )
        fig.savefig(f'plots/model6_nonlinear_{which}_all.pdf')
    LOG( "Plot" )
    plt.show()

    # --- One file per predictor ---
    LOG( f"{which}: Individual plots" )
    for i in tqdm(range(n_predictors)):

        a = np.where( universe[:,:,0], x[:,:,i], np.nan ).flatten()
        b = np.apply_along_axis( uniformize, 0, signal.values.copy() ).flatten()
        if as_scatter:
            # Large figure, saved as a bitmap
            fig, ax = plt.subplots(figsize=(19.20,10.80))
            n = len(a)
            ax.scatter(
                a + .01 * np.random.uniform(-1,1,n),
                b + .01 * np.random.uniform(-1,1,n),
                alpha=1/255, s=200
            )
            ax.set_xlim(0,1)
            ax.set_ylim(0,1)
        else:
            fig, ax = plt.subplots(figsize=(3,3))
            ax.hexbin( a, b, gridsize=20, cmap='Blues' )
        ax.axes.xaxis.set_visible(False)
        ax.axes.yaxis.set_visible(False)
        ax.set_title( predictors[i] )
        fig.tight_layout()
        if as_scatter:
            fig.savefig(f'plots/model6_nonlinear_{which}_{predictors[i]}.png', facecolor='white', transparent = False)
        else:
            fig.savefig(f'plots/model6_nonlinear_{which}_{predictors[i]}.pdf')
        plt.close(fig)   # the figure is only needed on disk

In [None]:
# Final log line: the notebook ran to the end
LOG( "Done." )