In [None]:
!pip install yfinance
!pip install transformers

In [37]:
import yfinance as yf
import datetime
import numpy as np
import pandas as pd
from pylab import mpl, plt
import math, time
import itertools
from datetime import datetime
from operator import itemgetter
from tqdm import tqdm
from math import sqrt
import torch
import torch.nn as nn
from torch.autograd import Variable
from transformers import RobertaTokenizer, RobertaModel


In [5]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [6]:
import wandb
wandb.login()
wandb.init(project="stock_prediction")

[34m[1mwandb[0m: Currently logged in as: [33mvisriv[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Hyperparams

In [33]:
no_of_days_to_lookforward = 1
no_of_days_to_lookback = 5
up_threshold = 0.015
down_threshold = -0.015
max_text_per_iter = 100
batch_size = 1
MAX_LEN = 1000
num_epochs = 20

### Get stocks data for last N days

In [8]:
stock_symbols = [ 'XOM']
no_of_days = 4*365

EXPORT_DATA_FOLDER = './data/'

# Set the start and end dates for the data 
# here matching it with dates of news text available
start = datetime.strptime('2019/01/04', '%Y/%m/%d')
end = datetime.strptime('2023/01/04', '%Y/%m/%d')


# start = datetime.datetime.now() - datetime.timedelta(days=no_of_days)
# end = datetime.datetime.now()

for symbol in stock_symbols:
    # Download the historical price and volume data using yfinance
    data_raw = yf.download(symbol, start=start, end=end)

    # Normalize features by percent of changes between today and yesterday
    pct_change_open = data_raw['Open'].pct_change().fillna(0)
    pct_change_high = data_raw['High'].pct_change().fillna(0)
    pct_change_high_over_open = (data_raw['High']-data_raw['Open'])/data_raw['Open']
    pct_change_low = data_raw['Low'].pct_change().fillna(0)
    pct_change_low_over_open = (data_raw['Low']-data_raw['Open'])/data_raw['Open']
    pct_change_close = data_raw['Close'].pct_change().fillna(0)
    pct_change_close_over_open = (data_raw['Close']-data_raw['Open'])/data_raw['Open']
    pct_change_adjclose = data_raw['Adj Close'].pct_change().fillna(0)
    pct_change_adjclose_over_open = (data_raw['Adj Close']-data_raw['Open'])/data_raw['Open']
    pct_change_volume = data_raw['Volume'].pct_change().fillna(0)

    # Prepare labels: 2 means the close price of tomorow is higher than today's close price; 1 is down; 0 means the movement is between up_threshold and down_threshold
    label = np.where(pct_change_close > up_threshold, 2, np.where(pct_change_close < down_threshold, 1, 0))[1:]
    label = np.append(label, 0)

    # Construct a data_norm data frame
    data_norm = pd.DataFrame({'Open_norm':pct_change_open,
                              'High_norm':pct_change_high,
                              'Low_norm': pct_change_low,
                              'Close_norm':pct_change_close,
                              'Volume_norm':pct_change_volume,
                              'High-Open_norm':pct_change_high_over_open,
                              'Low-Open_norm':pct_change_low_over_open,
                              'Close-Open_norm':pct_change_close_over_open,
                              'Label_2up1down':label})

    # Normalize by min-max normalization after the pct normalization
    data_norm['Open_norm'] = data_norm['Open_norm'].apply(lambda x: (x - data_norm['Open_norm'].min()) / (data_norm['Open_norm'].max() - data_norm['Open_norm'].min()))
    data_norm['High_norm'] = data_norm['High_norm'].apply(lambda x: (x - data_norm['High_norm'].min()) / (data_norm['High_norm'].max() - data_norm['High_norm'].min()))
    data_norm['Low_norm'] = data_norm['Low_norm'].apply(lambda x: (x - data_norm['Low_norm'].min()) / (data_norm['Low_norm'].max() - data_norm['Low_norm'].min()))
    data_norm['Close_norm'] = data_norm['Close_norm'].apply(lambda x: (x - data_norm['Close_norm'].min()) / (data_norm['Close_norm'].max() - data_norm['Close_norm'].min()))
    data_norm['Volume_norm'] = data_norm['Volume_norm'].apply(lambda x: (x - data_norm['Volume_norm'].min()) / (data_norm['Volume_norm'].max() - data_norm['Volume_norm'].min()))
    data_norm['High-Open_norm'] = data_norm['High-Open_norm'].apply(lambda x: (x - data_norm['High-Open_norm'].min()) / (data_norm['High-Open_norm'].max() - data_norm['High-Open_norm'].min()))
    data_norm['Low-Open_norm'] = data_norm['Low-Open_norm'].apply(lambda x: (x - data_norm['Low-Open_norm'].min()) / (data_norm['Low-Open_norm'].max() - data_norm['Low-Open_norm'].min()))
    data_norm['Close-Open_norm'] = data_norm['Close-Open_norm'].apply(lambda x: (x - data_norm['Close-Open_norm'].min()) / (data_norm['Close-Open_norm'].max() - data_norm['Close-Open_norm'].min()))

    # Remove the first and the last row, becuase of NAN values
    data_raw = data_raw.iloc[1:-1]
    data_norm = data_norm.iloc[1:-1]

    data_raw.to_csv(EXPORT_DATA_FOLDER+symbol+'_raw_data.csv', index=True)
    data_norm.to_csv(EXPORT_DATA_FOLDER+symbol+'_norm_data.csv', index=True)



[*********************100%***********************]  1 of 1 completed


## TODO 

In [None]:
'''
(2023-06-05)
cuda support check
//read textual data into correct shape
hyperparam tuning: number of neurons: tune to right number of neurons in FC in model
//max_text_per_iter -> code in dataloader to maintain the size 

(2023-06-05)
cuda check
roberta encoder fix
multi label - how to create target label?

'''

## Prep textual data

In [9]:
text_data_df = pd.read_csv('./data/XOM_20200401_20230401_medium.csv', sep= ',', header= 0)
text_data_df = text_data_df[['Date', 'News']]


text_data_df = text_data_df.groupby('Date')['News'].apply('$$$###'.join)

text_data_df.index = pd.to_datetime(text_data_df.index, dayfirst=True)
# text_data_df


In [10]:
all_train_df = data_norm.join(text_data_df, how = 'inner')
all_train_df

Unnamed: 0_level_0,Open_norm,High_norm,Low_norm,Close_norm,Volume_norm,High-Open_norm,Low-Open_norm,Close-Open_norm,Label_2up1down,News
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-04-01,0.424588,0.325630,0.445426,0.444210,0.187485,0.493448,0.887338,0.568367,2,"Global Polymers Market, By Type (Thermoplastic..."
2020-04-02,0.778879,0.795313,0.736065,0.797702,0.445139,0.821628,0.826839,0.730773,1,European Morning Briefing: U.S. Jobs Report Ey...
2020-04-03,0.853804,0.435964,0.626517,0.372487,0.160943,0.174091,0.435487,0.093004,2,Nordic Morning Briefing: Services PMI Data in ...
2020-04-06,0.427455,0.266519,0.635065,0.619722,0.182685,0.235477,0.869836,0.546103,2,圖表 Texas Takes Two Punches -- Oil Shock and O...
2020-04-07,0.876632,0.689534,0.761913,0.567103,0.283049,0.096776,0.676091,0.194020,2,Exxon Cuts Capital Spending by 30% in Response...
...,...,...,...,...,...,...,...,...,...,...
2022-12-23,0.542767,0.435239,0.669969,0.596883,0.188873,0.165444,0.996267,0.555196,0,Energy Transfer's Gulf Run gas pipeline gets U...
2022-12-27,0.662985,0.484486,0.633508,0.546500,0.238297,0.108558,0.967843,0.504141,1,"Butadiene Market, By Application (Polybutadien..."
2022-12-28,0.600374,0.392759,0.523834,0.424789,0.195329,0.008094,0.853954,0.357091,0,Exxon sues EU in move to block new windfall ta...
2022-12-29,0.508146,0.400534,0.562507,0.521098,0.222456,0.158359,1.000000,0.522474,0,Global Car Care Products Market 2023-2027 Publ...


In [34]:
all_train = all_train_df.values

window_size = no_of_days_to_lookback

X_numerical_train = []
y_train = []
X_text_train = []
X_text_train_curr = []


for i in range(window_size, len(all_train) - no_of_days_to_lookforward + 1):
    X_numerical_train.append(all_train[i-window_size: i, :-2])
    
    # split and append sequence of text
    curr_seq = all_train[i-window_size: i, -1]
    for j in range(window_size):
        split_curr_seq = curr_seq[window_size - 1 -j].split('$$$###')
        X_text_train_curr = X_text_train_curr + split_curr_seq
    
    if len(X_text_train_curr) > max_text_per_iter:
        X_text_train_curr = X_text_train_curr[:100]
    
    X_text_train.append(X_text_train_curr)
        
    # target labels
    y_train.append(all_train[i:i+no_of_days_to_lookforward, -2])

X_numerical_train, y_train = np.array(X_numerical_train).astype(np.float16), np.array(y_train).astype(np.int32)
print(type(X_numerical_train))
print(type(y_train))

X_numerical_train = torch.from_numpy(X_numerical_train).type(torch.Tensor)
y_train = torch.from_numpy(y_train).long()



<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [35]:
print(len(X_numerical_train))
print(len(X_text_train))
print(len(y_train))

689
689
689


In [None]:
print(len(X_text_train))
print(len(X_text_train[2]))
print(X_text_train[2])

## Data loader

In [44]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)

class SiameseDataloader(Dataset):
    
    def __init__(self, X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer):
        self.X_numerical_train = X_numerical_train
        self.X_text_train = X_text_train
        self.MAX_LEN = MAX_LEN
        self.tokenizer = tokenizer
        
    def __getitem__(self, index):

        
        input_ids = []
        attention_masks = []
        token_type_ids = []
        input_seq = []

        for sent in X_text_train[index]:
            encoded_sent = self.tokenizer.encode_plus(
                text=sent,
                add_special_tokens=True,        # Add `[CLS]` and `[SEP]` special tokens
                max_length=self.MAX_LEN,             # Choose max length to truncate/pad
                pad_to_max_length=True,         # Pad sentence to max length 
                #return_attention_mask=True      # Return attention mask
                return_token_type_ids=True
                )
            input_ids.append(encoded_sent.get('input_ids'))
            attention_masks.append(encoded_sent.get('attention_mask'))
            token_type_ids.append(encoded_sent.get('token_type_ids'))

        # Convert lists to tensors
        input_ids = torch.tensor(input_ids)
        attention_masks = torch.tensor(attention_masks)
        token_type_ids = torch.tensor(token_type_ids)


        return {
            'x_numerical': X_numerical_train[index],
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(attention_masks, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(y_train[index], dtype=torch.long)
        }
    
    
    
    

    def __len__(self):
        return len(self.X_numerical_train)

In [45]:
train_set = SiameseDataloader(X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)


## Build model


In [40]:



class SiameseModel(nn.Module):
    def __init__(self, input_dim1, input_dim2, 
                 hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4,
                 num_layers1, num_layers2, output_dim1, output_dim2):
        super(SiameseModel, self).__init__()
        self.input_dim1 = input_dim1
        self.input_dim2 = input_dim2
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.hidden_dim3 = hidden_dim3
        self.hidden_dim4 = hidden_dim4
        self.num_layers1 = num_layers1
        self.num_layers2 = num_layers2
        self.output_dim1 = output_dim1
        self.output_dim2 = output_dim2
        
        

        self.roberta = RobertaModel.from_pretrained("roberta-large")
        
        
        self.lstm1 = nn.LSTM(input_dim1, hidden_dim1, num_layers1, batch_first=True)
        self.lstm2 = nn.LSTM(input_dim2, hidden_dim2, num_layers2, batch_first=True)


        self.fc1 = nn.Linear(hidden_dim1, output_dim1)
        self.fc2 = nn.Linear(hidden_dim2, output_dim2)
        self.fc3 = nn.Linear(output_dim1+output_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, hidden_dim4)
        self.fc5 = nn.Linear(hidden_dim4, 3)
        
        
        
    def forward(self, x1, ids, masks, token_type_ids):
        #left tower with numerical features
        
        h_10 = Variable(torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1))
        c_10 = Variable(torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1))
        ula1, (h_out1, _) = self.lstm1(x1, (h_10, c_10))
        h_out1 = h_out1.view(-1, self.hidden_dim1)
        out1 = self.fc1(h_out1)
        
        
        

        e2 = torch.zeros()
        # right tower with roberta on textual features  
        #TODO
        for k in range(ids.shape[1]):
            seq_ids = ids[:,k,:]
            seq_masks = masks[:,k,:]
            seq_token_type_ids = token_type_ids[:,k,:]
            
            
            e2k = self.roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
            e2k = e2k[0][:, 0, :]
    
        
        
        h_20 = Variable(torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2))
        c_20 = Variable(torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2))
        ula2, (h_out2, _) = self.lstm2(x2, (h_20, c_20))
        h_out2 = h_out2.view(-1, self.hidden_dim2)
        out2 = self.fc2(h_out2)
        
        
        
        
        # siamese merging layers
        
#         output = torch.cat((out1, out2),1)
#         output = F.relu(self.fc3(output))
#         output = F.relu(self.fc4(output))
#         output = self.fc5(output)
#         return output
    
#TODO : correct these values
model = SiameseModel(input_dim1 = 8, input_dim2 = 1024, 
                 hidden_dim1 = 20, hidden_dim2 = 768, hidden_dim3 = 128, hidden_dim4 = 64,
                 num_layers1 = 1, num_layers2 = 1, output_dim1 = 10, output_dim2 = 256)


    
    


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
print(model)
print(len(list(model.parameters())))
# for i in range(len(list(model.parameters()))):
#     print(list(model.parameters())[i].size())


## Train model

In [41]:
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)
loss_arr = np.zeros(num_epochs)

In [47]:
roberta = RobertaModel.from_pretrained("roberta-large")
for epoch in range(num_epochs):

    for idx, data in tqdm(enumerate(train_loader, 0)):
        x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        ids = data['ids'].to(device, dtype = torch.long)
        masks = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        # debugging roberta encoder
        '''
        debug start here
        '''
        for k in range(2):#ids.shape[1]):
            seq_ids = ids[:,k,:]
            seq_masks = masks[:,k,:]
            seq_token_type_ids = token_type_ids[:,k,:]


            e2k = roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
            print(type(e2k))
            e2k1 = e2k[0][:, 0, :]
            print(type(e2k1))
            print((e2k1.shape))

            print(type(e2k1))
        
        '''
        debug end here
        '''
        
    #     print(ids.shape)
    #     print(masks.shape)
    #     print(token_type_ids.shape)
    
        y_pred = model(x_numerical, ids, masks, token_type_ids)
        
        if idx > 1:
            break
    
        loss = criterion(y_pred, y_train)
        if epoch % 10 == 0 and epoch !=0:
            print("Epoch ", t, "CELoss: ", loss.item())
        loss_arr[epoch] = loss.item()
        wandb.log({'celoss': loss.item().avg, 'epoch': epoch, 'batch_id': batch_id})

        # Zero out gradient, else they will accumulate between epochs
        optimiser.zero_grad()

        # Backward pass
        loss.backward()

        # Update parameters
        optimiser.step()
    
        
        
        

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  'ids': torch.tensor(input_ids, dtype=torch.long),
  'mask': torch.tensor(attention_masks, dtype=torch.long),
  'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
  'targets': torch.tensor(y_train[index], dtype=torch.long)


<class 'transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions'>
<class 'torch.Tensor'>
torch.Size([1, 1024])
<class 'torch.Tensor'>
<class 'transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions'>


0it [00:09, ?it/s]

<class 'torch.Tensor'>
torch.Size([1, 1024])
<class 'torch.Tensor'>





TypeError: zeros() received an invalid combination of arguments - got (), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of SymInts size, *, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


In [None]:
plt.plot(loss_arr, label="Training loss")
plt.legend()
plt.show()

In [None]:

'''