In [1]:
import torch
import torch.nn as nn
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random as rand

torch.set_default_dtype(torch.float64)

In [2]:
print(torch.__version__)

1.5.0


In [None]:
# Integrity check of the raw dataset: count NaNs, zero prices and cells whose
# feature vector does not have the expected length.

# NOTE: read_pickle can execute arbitrary code; only load trusted files.
data = pd.read_pickle('final_data')

print('Number of Nan in data : ' + str(data.isna().sum().sum()))

# Every cell is expected to hold the 11 features documented in the embedding
# legend of this notebook; addEncoding enforces the same width.
expected_len = 11

num_zeros = 0
weird_dim = 0
for i in data.index:
    for j in data.columns:
        cell = data.loc[i,j]
        # Entries 0..3 are high/low/open/close; a zero there is suspicious.
        # np.asarray makes the comparison element-wise even if the cell is a list.
        num_zeros += np.count_nonzero(np.asarray(cell[0:4])==0)
        # Inspect the current cell (the previous version always looked at data.iloc[0,0]).
        if len(cell)!=expected_len:
            weird_dim += 1
print('Number of unwanted 0 in data : ' + str(num_zeros))
print('Number of data with weird sizes : ' + str(weird_dim))

print('Shape of data : ' + str(data.shape))

In [None]:
# Preview of the raw data
data

In [3]:
def toNumpy(sliced):
    '''Convert a slice of the dataframe into a numpy array.

    Every row of ``sliced`` is turned into a Python list with ``tolist()`` and the
    resulting list of rows is handed to ``np.array``.
    '''
    rows = [sliced.iloc[pos].tolist() for pos in range(len(sliced.index))]
    return np.array(rows)
    

def getSlice(data, num_row, start_index):
    '''Return ``num_row`` consecutive rows of ``data`` as a numpy array.

    The rows are taken positionally, from ``start_index`` (inclusive) to
    ``start_index + num_row`` (exclusive), and converted with ``toNumpy``.

    Raises:
        Exception: if ``num_row`` is greater than or equal to the number of rows
            in ``data``, or if ``num_row`` is zero or negative.
    '''
    total_rows = data.shape[0]

    if num_row >= total_rows:
        raise Exception('num_row must not be greater than the number of row in data : ' + str(total_rows))
    if num_row<=0:
        raise Exception('no 0 for num_row plz')

    stop_index = start_index+num_row
    window = data.iloc[start_index:stop_index]
    return toNumpy(window)
        
        
def addEncoding(data_list, num_row):
    '''Fill in the positional (feature 9) and stock (feature 10) encodings in place.

    ``data_list`` must have shape ``(num_row, 477, 11)``. Feature 9 of every cell
    must still hold the placeholder -1.111 and feature 10 the placeholder -2.222;
    they are overwritten with ``(date_index+1)/num_row`` and ``(stock_index+1)/477``.

    The array is modified in place and also returned. Cells are visited date by
    date, stock by stock; if a placeholder check fails part-way through, the cells
    visited before it have already been overwritten.

    Raises:
        Exception: on a wrong shape, or when a placeholder has already been replaced.
    '''
    expected_shape = (num_row,477,11)
    if data_list.shape != expected_shape:
        raise Exception('data_list does not have the right shape : ' + str(data_list.shape))

    # np.ndindex walks (date, stock) pairs in row-major order, like two nested loops.
    for date_idx, stock_idx in np.ndindex(data_list.shape[0], data_list.shape[1]):

        if data_list[date_idx,stock_idx,9] != -1.111:
            raise Exception('the positional encoding is already set somehow?')
        data_list[date_idx,stock_idx,9] = (date_idx+1)/num_row

        if data_list[date_idx,stock_idx,10] != -2.222:
            raise Exception('the stock encoding is already set somehow?')
        data_list[date_idx,stock_idx,10] = (stock_idx+1)/477

    return data_list
    

def addClassificationTokens(data_list):
    '''Prepend one all-zero row of classification tokens along axis 0.

    The token row has the same trailing dimensions as ``data_list`` (one empty
    token per stock, e.g. ``(1, 477, 11)`` for the full dataset), so the function
    is no longer tied to exactly 477 stocks and 11 features.

    Args:
        data_list: array-like of shape ``(dates, stocks, features)``.

    Returns:
        A new array of shape ``(dates + 1, stocks, features)`` whose first entry
        along axis 0 is all zeros.
    '''
    data_list = np.asarray(data_list)
    token_row = np.zeros((1,) + tuple(data_list.shape[1:]))
    return np.concatenate((token_row, data_list), axis=0)


In [4]:
def getRegResults(data, start, num_days):
    '''Build the regression targets: open-to-close change of the next real days.

    For every stock (column) the rows from ``start`` onward are scanned and the
    first ``num_days`` rows whose feature 6 (the real/fake flag) equals 1 are kept.
    For each kept row the target is ``close/open - 1`` (feature 3 over feature 2).

    Args:
        data: dataframe whose cells hold the per-stock feature vectors.
        start: positional index of the first row to scan.
        num_days: number of real days to collect per stock.

    Returns:
        numpy array of shape ``(num_days, number_of_stocks, 1)``.

    Raises:
        IndexError: if a stock has fewer than ``num_days`` real rows within the
            ``num_days*10`` rows scanned from ``start``.
    '''
    # Scan a window ten times longer than needed so rows not flagged as real can be skipped.
    sliced = toNumpy(data.iloc[start:start+(num_days*10)])

    results = []

    for s in range(sliced.shape[1]):

        stock = []
        col = sliced[:,s,:]

        for row in col:
            if len(stock)>=num_days:
                break
            if row[6]==1:
                daily_change = (row[3]/row[2]) - 1
                stock.append([daily_change])

        # Fail with an explicit message instead of running off the end of the window.
        if len(stock)<num_days:
            raise IndexError('stock column ' + str(s) + ' has only ' + str(len(stock))
                             + ' real rows in the window starting at row ' + str(start)
                             + ', ' + str(num_days) + ' are required')

        results.append(stock)

    # (stock, day, 1) -> (day, stock, 1)
    return np.transpose(np.array(results), (1,0,2))


def getClass(array):
    '''Classify a sequence of daily variations by its mean.

    Returns 0 (bullish) when the mean is above 0.003, 1 (bearish) when it is
    below -0.003, and 2 (stable) otherwise. No separate "volatile" class is
    produced.
    '''
    threshold = 0.003
    average = array.mean()

    if average > threshold:
        return 0 #bullish
    if average < -threshold:
        return 1 #bearish
    return 2 #stable
    
    
    
def getClsResults(data, num_days):
    '''Derive the classification targets from the regression targets.

    ``data`` is indexed as ``data[day, stock, 0]``. For every stock, the first
    ``num_days`` values are passed to ``getClass`` and the returned class ids are
    collected in a 1-D numpy array with one entry per stock.

    Raises:
        Exception: if ``num_days`` exceeds the number of days available in ``data``.
    '''
    if num_days>data.shape[0]:
        raise Exception('the regression must be done on more days than the classification')

    # Keep only the classification horizon and drop the trailing feature axis.
    horizon = data[0:num_days,:,0]

    labels = [getClass(horizon[:,stock_idx]) for stock_idx in range(horizon.shape[1])]
    return np.array(labels)

In [54]:
# Smoke test of getRegResults: 5 real days starting at row 0.
# The recorded output below shows shape (5, 477, 1) and dtype float64.
a = getRegResults(data, 0,5)
print(a.shape)
print(a.dtype)


(5, 477, 1)
float64


In [55]:
# Smoke test of getClsResults on the first 3 days of the regression targets `a`.
# The recorded output below shows an integer array with one class id per stock, shape (477,).
b = getClsResults(a, 3)
print(b.dtype)
b.shape

int32


(477,)

In [5]:

def flatten(sliced):
    '''Collapse the first two axes: (date, stock, feature) -> (date*stock, feature).'''
    n_dates, n_stocks, n_features = sliced.shape[0], sliced.shape[1], sliced.shape[2]
    return sliced.reshape(n_dates*n_stocks, n_features)


def getBatches(data, num_row, num_batch, start = -1):
    '''Assemble ``num_batch`` training samples and their targets as tensors.

    Each sample covers ``num_row`` consecutive dates for all stocks. With the
    default ``start=-1`` every sample gets its own random start row; any other
    value pins the start row, so all samples of the batch are identical.

    Per sample:
      * input: ``getSlice`` -> ``addEncoding`` -> ``addClassificationTokens`` -> ``flatten``
      * regression target: ``getRegResults`` for 5 days starting right after the input window
      * classification target: ``getClsResults`` on the first 3 of those days

    Returns:
        (tensor_batch, tensor_reg, tensor_cls), each with the batch on axis 1.
        For the 477-stock dataset the recorded shapes are
        ``((num_row+1)*477, num_batch, 11)``, ``(5*477, num_batch, 1)`` and
        ``(477, num_batch)``; ``tensor_cls`` is converted to long.

    NOTE(review): the random start only leaves 5 rows after the input window,
    while getRegResults needs 5 rows flagged as real -- confirm this cannot run
    past the end of ``data``.
    '''
    inputs = []
    reg_targets = []
    cls_targets = []

    for _ in range(num_batch):

        # Draw a fresh window unless the caller pinned the start row.
        start_index = rand.randint(0, data.shape[0] - num_row-5) if start == -1 else start

        window = getSlice(data, num_row,start_index)
        reg_target = getRegResults(data, start_index+num_row, 5)
        cls_target = getClsResults(reg_target,3)

        encoded = addEncoding(window, num_row)
        inputs.append(flatten(addClassificationTokens(encoded)))
        reg_targets.append(flatten(reg_target))
        cls_targets.append(cls_target)

    # Stack the samples, then move the batch axis from position 0 to position 1.
    stacked_inputs = np.transpose(np.array(inputs),(1,0,2))
    stacked_reg = np.transpose(np.array(reg_targets), (1,0,2))
    stacked_cls = np.transpose(np.array(cls_targets), (1,0))

    tensor_batch = torch.Tensor(stacked_inputs)
    tensor_reg = torch.Tensor(stacked_reg)
    tensor_cls = torch.Tensor(stacked_cls).long()

    return tensor_batch, tensor_reg, tensor_cls

In [None]:
# Manual cross-check on raw data: rows 60..62 of column 305.
# For each row, print whether feature 6 equals 1 and the close/open - 1 change
# (feature 3 over feature 2), then the list of changes and their mean.
# NOTE(review): presumably compared by eye with getBatches(data, 10, 2, 50),
# whose targets start at row 50+10 = 60 -- confirm.
t = data.iloc[60:60+3,305]
print(t.name)
mean = []
for i in t.index:
    mean.append(t[i][3]/t[i][2] - 1)
    print(str(t[i][6]==1) + '    ' + str(t[i][3]/t[i][2] - 1))
print(mean)
print(np.mean(mean))

In [None]:
# Deterministic batch: 2 samples of 10 rows, both starting at row 50 (start is pinned).
# a = inputs, b = regression targets, c = classification targets.
a,b,c = getBatches(data, 10,2, 50)
print(a.shape)
print(b.shape)
print(c.shape)

In [None]:
# Inspect the targets from the batch above.
print(b.dtype)
print(b.shape)
# Row 477*2+305 of the flattened (day*stock) axis: third target day, stock 305
# (assuming 477 stocks per day, as in flatten's row order -- TODO confirm).
print(b[477*2+305].numpy())

print(c.dtype)
print(c.shape)
# Class ids of stock 305, one per sample in the batch.
c[305].numpy()

In [3]:
# Load the dataset used by every cell of this notebook.
# NOTE: read_pickle can execute arbitrary code; only load trusted files.
data = pd.read_pickle('final_data')

# Legend of the per-cell feature vector. The string below is the last expression
# of the cell, so the notebook displays it as the cell output.
'''order of the embedding : 
0 : high
1 : low
2 : open
3 : close
4 : volume
5 : adj close
6 : Real Or Fake Value?
7 : year Positional Encoding
8 : price change compared to yesterday
9 : positional encoding
10 : stock encoding
'''

'order of the embedding : \n0 : high\n1 : low\n2 : open\n3 : close\n4 : volume\n5 : adj close\n6 : Real Or Fake Value?\n7 : year Positional Encoding\n8 : price change compared to yesterday\n9 : positional encoding\n10 : stock encoding\n'

In [10]:
data.iloc[0:10,45:50]

Unnamed: 0,^TNX,^VIX,A,AAP,AAPL
2004-08-30,"[4.2230000495910645, 4.188000202178955, 4.2230...","[15.579999923706055, 15.300000190734865, 15.35...","[15.40772533416748, 15.064377784729006, 15.379...","[25.13333320617676, 24.79999923706055, 24.9133...","[2.4800000190734863, 2.4257142543792725, 2.428..."
2004-08-31,"[4.184000015258788, 4.0960001945495605, 4.1820...","[15.850000381469727, 15.279999732971193, 15.64...","[15.021459579467773, 14.320457458496096, 15.0,...","[24.979999542236328, 24.6200008392334, 24.9799...","[2.4964284896850586, 2.4285714626312256, 2.433..."
2004-09-01,"[4.1599998474121085, 4.081999778747559, 4.0939...","[15.390000343322756, 14.720000267028807, 15.39...","[14.98569393157959, 14.470672607421875, 14.613...","[24.913333892822266, 24.63999938964844, 24.706...","[2.570714235305786, 2.442142963409424, 2.45000..."
2004-09-02,"[4.196000099182129, 4.116000175476073, 4.13199...","[15.050000190734865, 14.18000030517578, 14.970...","[15.236051559448242, 14.570815086364744, 14.80...","[24.893333435058594, 24.64666748046875, 24.893...","[2.557857036590576, 2.4878571033477783, 2.5357..."
2004-09-03,"[4.297999858856201, 4.165999889373778, 4.18200...","[14.380000114440918, 13.789999961853027, 14.35...","[15.135908126831055, 14.484978675842285, 15.12...","[25.02666664123535, 24.65999984741211, 24.7399...","[2.5657143592834477, 2.5007143020629883, 2.500..."
2004-09-05,"[4.297999858856201, 4.165999889373778, 4.18200...","[14.380000114440918, 13.789999961853027, 14.35...","[15.135908126831055, 14.484978675842285, 15.12...","[25.02666664123535, 24.65999984741211, 24.7399...","[2.5657143592834477, 2.5007143020629883, 2.500..."
2004-09-06,"[4.297999858856201, 4.165999889373778, 4.18200...","[14.380000114440918, 13.789999961853027, 14.35...","[15.135908126831055, 14.484978675842285, 15.12...","[25.02666664123535, 24.65999984741211, 24.7399...","[2.5657143592834477, 2.5007143020629883, 2.500..."
2004-09-07,"[4.293000221252441, 4.244999885559082, 4.28100...","[14.640000343322756, 14.029999732971193, 14.52...","[14.97138786315918, 14.692418098449707, 14.771...","[25.29333305358887, 24.96666717529297, 24.9666...","[2.5850000381469727, 2.5164284706115723, 2.528..."
2004-09-08,"[4.287000179290772, 4.1550002098083505, 4.2649...","[14.3100004196167, 13.880000114440918, 14.1599...","[14.66380500793457, 14.406294822692873, 14.542...","[25.26666641235352, 25.03333282470703, 25.0933...","[2.61214280128479, 2.548571348190308, 2.549999..."
2004-09-09,"[4.203999996185304, 4.13100004196167, 4.133999...","[14.40999984741211, 13.699999809265135, 14.119...","[15.22889804840088, 14.53505039215088, 14.5493...","[25.20000076293945, 24.479999542236328, 25.066...","[2.5928571224212646, 2.5199999809265137, 2.578..."


In [None]:
'''
getSlice works fine - tested and compared with the dataframe directly => its good
time to make model

'''

In [7]:
class FeedForwardNetwork(nn.Module):
    '''Two-layer MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.'''

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super().__init__()

        # Built in application order; positions 0 and 3 carry the weights.
        stages = [
            nn.Linear(input_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, output_size),
            nn.Dropout(dropout_rate),
        ]
        self.seq = nn.Sequential(*stages)

    def forward(self, x):
        '''Apply the MLP to the last dimension of ``x``.'''
        out = self.seq(x)
        return out

In [8]:
class EncoderLayer(nn.Module):
    '''One transformer encoder block.

    Self-attention and a feed-forward network, each followed by a residual
    connection and a LayerNorm applied after the addition.

    Args:
        input_size: width of the token representation (embed_dim of the attention).
        num_heads: number of attention heads.
        dropout_rate: dropout used inside the attention, on its output and in the FFN.
    '''

    def __init__(self, input_size, num_heads, dropout_rate):
        super(EncoderLayer, self).__init__()

        self.attention = nn.MultiheadAttention(embed_dim=input_size, num_heads=num_heads , dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.norm0 = nn.LayerNorm(input_size, eps=1e-6)

        self.ffn = FeedForwardNetwork(input_size, input_size*4 ,input_size, dropout_rate)
        self.norm1 = nn.LayerNorm(input_size, eps=1e-6)

    def forward(self, x):
        '''Return the transformed sequence; the output has the same shape as ``x``.'''

        #possibility here : add q,k,v linear layers for each stock independantly?

        # The attention map is never used, so ask the module not to return it.
        y, _ = self.attention(x, x, x, need_weights=False)
        y = self.dropout(y)

        # Residual connection + norm around the attention.
        y = self.norm0(y+x)

        # Residual connection + norm around the feed-forward network.
        z = self.ffn(y)
        z = self.norm1(z+y)

        return z

In [9]:
class EmbeddingLayer(nn.Module):
    '''MLP that lifts the raw feature vector to the encoder width.

    Layer widths: input_size -> int((input_size+encoder_size)/1.5)
    -> 2*encoder_size -> 2*encoder_size -> encoder_size, with GELU and dropout
    after every linear layer and a LayerNorm before the third linear layer.
    '''

    def __init__(self, input_size, encoder_size, dropout_rate ):
        super().__init__()

        bottleneck = int((input_size+encoder_size)/1.5)
        wide = encoder_size*2

        self.embed = nn.Sequential(
            # input -> bottleneck
            nn.Linear(input_size, bottleneck),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            # bottleneck -> wide, then normalise
            nn.Linear(bottleneck, wide),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.LayerNorm(wide, eps=1e-6),
            # wide -> wide
            nn.Linear(wide, wide),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            # wide -> encoder width
            nn.Linear(wide, encoder_size),
            nn.GELU(),
            nn.Dropout(dropout_rate)
        )

    def forward(self, x):
        '''Embed the last dimension of ``x``.'''
        embedded = self.embed(x)
        return embedded

In [10]:
class ClassificationLayer(nn.Module):
    '''Classification head producing one raw score (logit) per class.

    The output is fed to nn.CrossEntropyLoss in this notebook, which expects
    unnormalised scores. The head therefore ends with a plain linear layer; the
    former trailing ReLU clamped every negative logit to zero.

    Args:
        encoder_size: width of the incoming token representation.
        number_classes: number of output classes.
        dropout_rate: dropout applied after the first hidden layer.
    '''
    def __init__(self, encoder_size, number_classes, dropout_rate):
        super(ClassificationLayer, self).__init__()

        self.cls = nn.Sequential(
            nn.Linear(encoder_size, encoder_size),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(encoder_size, encoder_size),
            nn.GELU(),
            # Final projection: raw logits, no activation.
            nn.Linear(encoder_size, number_classes)
        )
    def forward(self,x):
        '''Return the logits, shape ``x.shape[:-1] + (number_classes,)``.'''
        return self.cls(x)


In [11]:
class RegressionLayer(nn.Module):
    '''Regression head mapping each token to a single scalar.

    Linear -> Tanh -> Dropout -> Linear -> Softsign -> Linear(encoder_size, 1).
    '''
    def __init__(self, encoder_size, dropout_rate):
        super().__init__()

        hidden_stages = [
            nn.Linear(encoder_size, encoder_size),
            nn.Tanh(),
            nn.Dropout(dropout_rate),
            nn.Linear(encoder_size, encoder_size),
            nn.Softsign(),
        ]
        # Final projection to one unbounded output value per token.
        output_stage = nn.Linear(encoder_size, 1)

        self.reg = nn.Sequential(*hidden_stages, output_stage)

    def forward(self,x):
        '''Return the prediction, shape ``x.shape[:-1] + (1,)``.'''
        prediction = self.reg(x)
        return prediction


In [12]:
class Encoder(nn.Module):
    '''Embedding + stack of encoder layers + classification and regression heads.

    The input sequence is laid out as blocks of ``num_stocks`` tokens: the first
    block holds the classification tokens, the following blocks hold one date each.

    Args:
        input_size: number of raw features per token.
        encoder_size: width of the token representation inside the encoder.
        num_encoders: number of stacked EncoderLayer blocks.
        num_heads: attention heads per block.
        dropout_rate: dropout used by the embedding and the encoder blocks.
        num_stocks: tokens per block (default 477, the dataset width).
        num_reg_days: number of date blocks fed to the regression head (default 5).
    '''

    # Class-level defaults, matching the constants previously hard-coded in forward().
    num_stocks = 477
    num_reg_days = 5

    def __init__(self, input_size, encoder_size, num_encoders, num_heads, dropout_rate,
                 num_stocks=477, num_reg_days=5):
        super(Encoder,self).__init__()

        self.num_stocks = num_stocks
        self.num_reg_days = num_reg_days

        self.embed = EmbeddingLayer(input_size, encoder_size, dropout_rate)

        self.norm0 = nn.LayerNorm(encoder_size, eps=1e-6)

        encoders = [EncoderLayer(encoder_size, num_heads, dropout_rate)
                    for _ in range(num_encoders)]
        self.layers = nn.ModuleList(encoders)

        # Both heads keep their fixed 0.1 dropout. The regression head now follows
        # encoder_size instead of a hard-coded 64, so other widths work too.
        self.cls_layer = ClassificationLayer(encoder_size,3, 0.1)
        self.reg_layer = RegressionLayer(encoder_size,0.1)

    def forward(self, x):
        '''Run the model on ``x`` of shape (tokens, batch, input_size).

        Returns:
            (reg, cls): ``reg`` has shape (num_stocks*num_reg_days, batch, 1) and
            ``cls`` has shape (num_stocks, batch, 3).
        '''

        y = self.embed(x)
        y = self.norm0(y)

        for layer in self.layers:
            y = layer(y)

        # First block of tokens: the classification tokens.
        cls = self.cls_layer(y.narrow(0,0,self.num_stocks))
        # Next num_reg_days blocks: the tokens used for the regression output.
        reg = self.reg_layer(y.narrow(0,self.num_stocks,self.num_stocks*self.num_reg_days))

        return reg, cls
            

In [13]:
'''Training procedure'''

# Instantiate the model: 11 raw features per token, 64-wide encoder,
# 3 encoder layers with 4 attention heads each, dropout 0.1.
model = Encoder(input_size=11, encoder_size=64, num_encoders=3, num_heads=4, dropout_rate=0.1)

In [14]:
print(model)

Encoder(
  (embed): EmbeddingLayer(
    (embed): Sequential(
      (0): Linear(in_features=11, out_features=50, bias=True)
      (1): GELU()
      (2): Dropout(p=0.1, inplace=False)
      (3): Linear(in_features=50, out_features=128, bias=True)
      (4): GELU()
      (5): Dropout(p=0.1, inplace=False)
      (6): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      (7): Linear(in_features=128, out_features=128, bias=True)
      (8): GELU()
      (9): Dropout(p=0.1, inplace=False)
      (10): Linear(in_features=128, out_features=64, bias=True)
      (11): GELU()
      (12): Dropout(p=0.1, inplace=False)
    )
  )
  (norm0): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
  (layers): ModuleList(
    (0): EncoderLayer(
      (attention): MultiheadAttention(
        (out_proj): Linear(in_features=64, out_features=64, bias=True)
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (norm0): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
      (ffn): FeedForwardNetwo

In [15]:
# Total number of parameters in the model ...
total_params = sum(p.numel() for p in model.parameters())
print(total_params)
# ... and the number of those that are trainable (requires_grad is True).
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(total_params)

199132
199132


In [16]:
# Draw one random batch (2 samples of 8 dates) to try the model on.
# The recorded shape is (4293, 2, 11), i.e. (8+1)*477 tokens per sample.
b,true_reg,true_cls = getBatches(data, num_row=8, num_batch=2)

print(b.shape)

torch.Size([4293, 2, 11])


False

In [17]:
# Forward pass on the sample batch; the model returns (regression, classification) outputs.
pred_reg, pred_cls = model(b)

0
0
0


In [18]:
# Sanity checks before computing losses: print shape and requires_grad of
# predictions and targets, then assert that they are compatible.
print(str(pred_reg.shape) + str(pred_reg.requires_grad))
print(str(true_reg.shape)+ str(true_reg.requires_grad))
print(str(pred_cls.shape) + str(pred_cls.requires_grad))
print(str(true_cls.shape)+ str(true_cls.requires_grad))
# Regression output and target must match exactly.
assert pred_reg.shape==true_reg.shape
# Classification output has an extra trailing class axis, so compare only the first two axes.
assert (pred_cls.shape[0]==true_cls.shape[0] and pred_cls.shape[1]==true_cls.shape[1])
# Predictions must be part of the autograd graph, targets must not.
assert pred_cls.requires_grad==pred_reg.requires_grad==True
assert true_cls.requires_grad==true_reg.requires_grad==False

torch.Size([2385, 2, 1])True
torch.Size([2385, 2, 1])False
torch.Size([477, 2, 3])True
torch.Size([477, 2])False


In [19]:
# Combined objective on the sample batch: MSE of the regression head plus the
# cross-entropy of the classification head averaged over the samples.
reg_loss = nn.MSELoss()
cls_loss = nn.CrossEntropyLoss()

loss_output = reg_loss(pred_reg, true_reg)

# pred_cls is (stock, sample, class) and true_cls is (stock, sample):
# score each sample separately and add its share to the total.
for i in range(pred_cls.size(1)):
    cls_loss_output = cls_loss(pred_cls.select(1, i), true_cls.select(1, i))
    loss_output = loss_output + (1/pred_cls.shape[1])*cls_loss_output

loss_output

tensor(1.1305, grad_fn=<AddBackward0>)

In [24]:
num_step = 2

# Loss history, one entry per optimisation step.
loss_values = []
cls_loss_values = []
reg_loss_values = []

reg_loss = nn.MSELoss()
cls_loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), weight_decay=0.001)

for step in range(num_step):
    print('step {}'.format(step))
    batch, true_reg, true_cls = getBatches(data, num_row=8, num_batch=2)

    # Reset the accumulated gradients, then run the forward pass.
    optimizer.zero_grad()
    pred_reg, pred_cls = model(batch)

    # Predictions and targets must line up; only predictions carry gradients.
    assert pred_reg.shape == true_reg.shape
    assert pred_cls.shape[0] == true_cls.shape[0]
    assert pred_cls.shape[1] == true_cls.shape[1]
    assert pred_cls.requires_grad and pred_reg.requires_grad
    assert not (true_cls.requires_grad or true_reg.requires_grad)

    # Regression term.
    loss_output = reg_loss(pred_reg, true_reg)
    reg_loss_values.append(loss_output.item())

    # Classification term: one cross-entropy per sample, averaged into the total.
    cls_loss_value_batch = []
    for sample in range(pred_cls.size(1)):
        cls_loss_output = cls_loss(pred_cls.select(1, sample), true_cls.select(1, sample))
        cls_loss_value_batch.append(cls_loss_output.item())
        loss_output = loss_output + (1/pred_cls.shape[1])*cls_loss_output

    cls_loss_values.append(np.mean(cls_loss_value_batch))
    loss_values.append(loss_output.item())

    # Backpropagate the combined loss and update the weights.
    loss_output.backward()
    optimizer.step()

step 0
0
0
0
step 1
0
0
0
