In [None]:
!pip install yfinance
!pip install transformers


In [36]:
import yfinance as yf
import datetime
import numpy as np
import pandas as pd
from pylab import mpl, plt
plt.style.use('seaborn')
# mpl.rcParams['font.family'] = 'serif'
%matplotlib inline

from pandas import datetime
import math, time
import itertools
import datetime
from operator import itemgetter

from math import sqrt
import torch
import torch.nn as nn
from torch.autograd import Variable

# Set hyperparameters
stock_symbols = [ 'XOM']
no_of_days = 4*365
up_threshold = 0.015
down_threshold = -0.015
EXPORT_DATA_FOLDER = './data/'

# Set the start and end dates for the data
start = datetime.strptime('04-01-2019', '%m/%d/%y ')
end = datetime.strptime('04-01-2023', '%m/%d/%y ')


# start = datetime.datetime.now() - datetime.timedelta(days=no_of_days)
# end = datetime.datetime.now()

for symbol in stock_symbols:
    # Download the historical price and volume data using yfinance
    data_raw = yf.download(symbol, start=start, end=end)

    # Normalize features by percent of changes between today and yesterday
    pct_change_open = data_raw['Open'].pct_change().fillna(0)
    pct_change_high = data_raw['High'].pct_change().fillna(0)
    pct_change_high_over_open = (data_raw['High']-data_raw['Open'])/data_raw['Open']
    pct_change_low = data_raw['Low'].pct_change().fillna(0)
    pct_change_low_over_open = (data_raw['Low']-data_raw['Open'])/data_raw['Open']
    pct_change_close = data_raw['Close'].pct_change().fillna(0)
    pct_change_close_over_open = (data_raw['Close']-data_raw['Open'])/data_raw['Open']
    pct_change_adjclose = data_raw['Adj Close'].pct_change().fillna(0)
    pct_change_adjclose_over_open = (data_raw['Adj Close']-data_raw['Open'])/data_raw['Open']
    pct_change_volume = data_raw['Volume'].pct_change().fillna(0)

    # Prepare labels: 2 means the close price of tomorow is higher than today's close price; 1 is down; 0 means the movement is between up_threshold and down_threshold
    label = np.where(pct_change_close > up_threshold, 2, np.where(pct_change_close < down_threshold, 1, 0))[1:]
    label = np.append(label, 0)

    # Construct a data_norm data frame
    data_norm = pd.DataFrame({'Open_norm':pct_change_open,
                              'High_norm':pct_change_high,
                              'Low_norm': pct_change_low,
                              'Close_norm':pct_change_close,
                              'Volume_norm':pct_change_volume,
                              'High-Open_norm':pct_change_high_over_open,
                              'Low-Open_norm':pct_change_low_over_open,
                              'Close-Open_norm':pct_change_close_over_open,
                              'Label_2up1down':label})

    # Normalize by min-max normalization after the pct normalization
    data_norm['Open_norm'] = data_norm['Open_norm'].apply(lambda x: (x - data_norm['Open_norm'].min()) / (data_norm['Open_norm'].max() - data_norm['Open_norm'].min()))
    data_norm['High_norm'] = data_norm['High_norm'].apply(lambda x: (x - data_norm['High_norm'].min()) / (data_norm['High_norm'].max() - data_norm['High_norm'].min()))
    data_norm['Low_norm'] = data_norm['Low_norm'].apply(lambda x: (x - data_norm['Low_norm'].min()) / (data_norm['Low_norm'].max() - data_norm['Low_norm'].min()))
    data_norm['Close_norm'] = data_norm['Close_norm'].apply(lambda x: (x - data_norm['Close_norm'].min()) / (data_norm['Close_norm'].max() - data_norm['Close_norm'].min()))
    data_norm['Volume_norm'] = data_norm['Volume_norm'].apply(lambda x: (x - data_norm['Volume_norm'].min()) / (data_norm['Volume_norm'].max() - data_norm['Volume_norm'].min()))
    data_norm['High-Open_norm'] = data_norm['High-Open_norm'].apply(lambda x: (x - data_norm['High-Open_norm'].min()) / (data_norm['High-Open_norm'].max() - data_norm['High-Open_norm'].min()))
    data_norm['Low-Open_norm'] = data_norm['Low-Open_norm'].apply(lambda x: (x - data_norm['Low-Open_norm'].min()) / (data_norm['Low-Open_norm'].max() - data_norm['Low-Open_norm'].min()))
    data_norm['Close-Open_norm'] = data_norm['Close-Open_norm'].apply(lambda x: (x - data_norm['Close-Open_norm'].min()) / (data_norm['Close-Open_norm'].max() - data_norm['Close-Open_norm'].min()))

    # Remove the first and the last row, becuase of NAN values
    data_raw = data_raw.iloc[1:-1]
    data_norm = data_norm.iloc[1:-1]

    data_raw.to_csv(EXPORT_DATA_FOLDER+symbol+'_raw_data.csv', index=True)
    data_norm.to_csv(EXPORT_DATA_FOLDER+symbol+'_norm_data.csv', index=True)



  plt.style.use('seaborn')
  from pandas import datetime
  from .autonotebook import tqdm as notebook_tqdm


[*********************100%***********************]  1 of 1 completed


In [92]:
data_norm.index

DatetimeIndex(['2021-06-08', '2021-06-09', '2021-06-10', '2021-06-11',
               '2021-06-14', '2021-06-15', '2021-06-16', '2021-06-17',
               '2021-06-18', '2021-06-21',
               ...
               '2023-05-18', '2023-05-19', '2023-05-22', '2023-05-23',
               '2023-05-24', '2023-05-25', '2023-05-26', '2023-05-30',
               '2023-05-31', '2023-06-01'],
              dtype='datetime64[ns]', name='Date', length=500, freq=None)

## TODO (2023-06-05)



cuda support check
read textual data into correct shape
hyperparam tuning: number of neurons: tune to right number of neurons in FC in model
max_text_per_iter -> code in dataloader to maintain the size 


## Prep textual data

In [96]:
text_data_df = pd.read_csv('./data/XOM_20200401_20230401_medium.csv', sep= ',', header= 0)
text_data_df = text_data_df[['Date', 'News']]


text_data_df = text_data_df.groupby('Date')['News'].apply('$$$###'.join)

text_data_df.index = pd.to_datetime(text_data_df.index, dayfirst=True)
text_data_df


Date
2021-01-01    Tomato processor's accrued production costs fa...
2022-01-01    Industrial Alcohol Market Forecasts to 2028 – ...
2023-01-01    Global Cumene Market Research Report 2022-2032...
2020-10-01    Press Release: SBM Offshore awarded contracts ...
2021-10-01    JPMorgan 's Own Employee Travel Numbers Now He...
                                    ...                        
2021-08-09    Climate Change Is a ‘Hammer Hitting Us on the ...
2022-08-09    KASE - Trading in common shares US30231G1022 (...
2020-09-09    TAP Clouds Italy’s LNG Import Plans The immine...
2021-09-09    Storm's Fallout Cripples U.S. Oil Output --- S...
2022-09-09    Thermoplastic Elastomer Market Forecasts to 20...
Name: News, Length: 1065, dtype: object

In [102]:
all_train_df = data_norm.join(text_data_df, how = 'inner')
all_train_df

Unnamed: 0_level_0,Open_norm,High_norm,Low_norm,Close_norm,Volume_norm,High-Open_norm,Low-Open_norm,Close-Open_norm,Label_2up1down,News
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-06-08,0.624690,0.576438,0.482223,0.675290,0.490718,0.240756,0.785477,0.583539,0,"Blowing Agent Market by Type (HC, HFC, HCFC), ..."
2021-06-09,0.757691,0.598956,0.716982,0.610093,0.229087,0.180812,0.878255,0.490038,0,Global Asphalt Market 2021-2025 Published By: ...
2021-06-10,0.724752,0.550617,0.533313,0.562715,0.210127,0.087882,0.660152,0.373188,0,Global Benzene Market 2021-2025 Published By: ...
2021-06-11,0.524817,0.385896,0.512483,0.486898,0.118679,0.043161,0.780528,0.374858,0,Keystone Illustrates Pipelines' Hurdles The fa...
2021-06-14,0.509796,0.431470,0.456017,0.540300,0.216318,0.116407,0.813746,0.460184,2,C-Suite Strategies (A Special Report): Managem...
...,...,...,...,...,...,...,...,...,...,...
2023-03-27,0.865728,0.678916,0.752635,0.704917,0.258925,0.250995,0.874040,0.582445,0,"Exxon Eyes Staggered, But Larger, Rovuma LNG S..."
2023-03-28,0.656699,0.567402,0.619880,0.638819,0.150382,0.316675,0.962312,0.633900,2,PDF China National Chemical Corporation Ltd. ...
2023-03-29,0.805350,0.591039,0.692672,0.671698,0.314097,0.153800,0.916154,0.566586,0,Global Polyolefin Market 2023-2027 Published B...
2023-03-30,0.714846,0.511150,0.599019,0.585574,0.170996,0.002758,0.850549,0.488014,0,30 March The high-yielding shares powering thi...


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting regex!=2019.12.17
  Downloading regex-2023.6.3-cp39-cp39-macosx_1

In [103]:
all_train = all_train_df.values

window_size = 5

X_numerical_train = []
y_train = []
X_text_train = []

for i in range(window_size, len(all_train)):
    X_numerical_train.append(all_train[i-window_size: i, :-2])
    X_text_train.append(all_train[i-window_size: i, -1])
    
    y_train.append(all_train[i, -2]) #TODO
    
X_numerical_train, y_train = np.array(X_numerical_train), np.array(y_train)

X_numerical_train = torch.from_numpy(X_numerical_train).type(torch.Tensor)
y_train = torch.from_numpy(y_train).long()



TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [107]:
type(all_train[0: 5, :-2])

numpy.ndarray

## Data loader

In [77]:
from torch.utils.data import Dataset
class SiameseDataloader(Dataset):
    
    def __init__(self, X_numerical_train, y_train, X_text_train):
        
        pass
        
    def __getitem__(self, index):

        return (X_train[index], text_train[index]), y_train[index]

    def __len__(self):
        return len(self.X_numerical_train)

## Build model


In [74]:


from transformers import RobertaTokenizer, RobertaModel

class SiameseModel(nn.Module):
    def __init__(self, input_dim1, input_dim2, 
                 hidden_dim1, hidden_dim2, hidden_dim3, 
                 num_layers1, num_layers2, output_dim1, output_dim2):
        super(SiameseModel, self).__init__()
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        self.text_encoder = RobertaModel.from_pretrained('roberta-large')
        
        
        self.lstm1 = nn.LSTM(input_dim1, hidden_dim1, num_layers1, batch_first=True)
        self.lstm2 = nn.LSTM(input_dim2, hidden_dim2, num_layers2, batch_first=True)


        self.fc1 = nn.Linear(hidden_dim1, output_dim1)
        self.fc2 = nn.Linear(hidden_dim2, output_dim2)
        self.fc3 = nn.Linear(output_dim1+output_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, 3)
        
        
        
    def forward(self, x1, x2):
        #left tower with numerical features
        h10 = torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1).requires_grad_()
        c10 = torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1).requires_grad_()
        out1, (h1n, c1n) = self.lstm1(x1, (h10.detach(), c10.detach()))
        out1 = self.fc1(out1[:, -1, :]) 
        
        
        # right tower with roberta on textual features
        encoded_input = self.tokenizer(x2, return_tensors='pt')
        e2 = self.text_encoder(**encoded_input)
        
        h20 = torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2).requires_grad_()
        c20 = torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2).requires_grad_()
        out2, (hn, cn) = self.lstm2(x2, (h20.detach(), c20.detach()))
        out2 = self.fc2(out2[:, -1, :]) 
        
        
        

        
        output = torch.cat((out1, out2),1)
        output = F.relu(self.fc3(output))
        output = self.fc4(output)
        return output
    
#TODO : correct these values
model = SiameseModel(input_dim1 = 8, input_dim2 = 1024, 
                 hidden_dim1 = 10, hidden_dim2 = 800, hidden_dim3 = 500, 
                 num_layers1 = 1, num_layers2 = 1, output_dim1 = 50, output_dim2 = 50)


    
    


Downloading (…)olve/main/vocab.json: 100%|███| 899k/899k [00:00<00:00, 1.06MB/s]
Downloading (…)olve/main/merges.txt: 100%|████| 456k/456k [00:00<00:00, 621kB/s]
Downloading (…)lve/main/config.json: 100%|██████| 482/482 [00:00<00:00, 193kB/s]
Downloading pytorch_model.bin: 100%|███████| 1.43G/1.43G [06:27<00:00, 3.68MB/s]
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSeq

In [75]:
print(model)
print(len(list(model.parameters())))
for i in range(len(list(model.parameters()))):
    print(list(model.parameters())[i].size())


SiameseModel(
  (text_encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm

torch.Size([1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([4096, 1024])
torch.Size([4096])
torch.Size([1024, 4096])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([4096, 1024])
torch.Size([4096])
torch.Size([1024, 4096])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024, 1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([1024])
torch.Size([4096, 1024])
torch.Size([4096])
torch.Size([1024, 4096])

## Train model

In [69]:

criterion = nn.CrossEntropyLoss()

optimiser = torch.optim.Adam(model.parameters(), lr=0.01)

num_epochs = 20
loss_arr = np.zeros(num_epochs)

for t in range(num_epochs):
    
    for batch_idx, (x1, x2, y_train) in enumerate(train_loader):
        
        # Forward pass
        y_train_pred = model(x1, x2)


        loss = criterion(y_train_pred, y_train)
        if t % 10 == 0 and t !=0:
            print("Epoch ", t, "CELoss: ", loss.item())
        loss_arr[t] = loss.item()

        # Zero out gradient, else they will accumulate between epochs
        optimiser.zero_grad()

        # Backward pass
        loss.backward()

        # Update parameters
        optimiser.step()
    
    

    



   
        
        
        



Epoch  10 MSE:  0.8407482504844666
Epoch  20 MSE:  0.8068487048149109
Epoch  30 MSE:  0.7411683797836304
Epoch  40 MSE:  0.7207826375961304
Epoch  50 MSE:  0.47958889603614807
Epoch  60 MSE:  0.29257601499557495
Epoch  70 MSE:  0.14471979439258575
Epoch  80 MSE:  0.045132964849472046
Epoch  90 MSE:  0.01147476676851511
Epoch  100 MSE:  0.0039354609325528145
Epoch  110 MSE:  0.001934159197844565
Epoch  120 MSE:  0.0012813452631235123
Epoch  130 MSE:  0.0009837034158408642
Epoch  140 MSE:  0.0008080229163169861
Epoch  150 MSE:  0.0006962246261537075
Epoch  160 MSE:  0.0006137943710200489
Epoch  170 MSE:  0.0005495513323694468
Epoch  180 MSE:  0.0004968825378455222
Epoch  190 MSE:  0.00045257911551743746
Epoch  200 MSE:  0.0004146856372244656
Epoch  210 MSE:  0.00038184275035746396
Epoch  220 MSE:  0.0003531062975525856
Epoch  230 MSE:  0.0003277502255514264
Epoch  240 MSE:  0.0003052429819945246
Epoch  250 MSE:  0.0002851351164281368
Epoch  260 MSE:  0.000267079594777897
Epoch  270 MSE: 

In [None]:
plt.plot(loss_arr, label="Training loss")
plt.legend()
plt.show()

In [None]:

'''