In [1]:
!pip install yfinance
!pip install transformers
!pip install wandb
!pip install beautifulsoup4


Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m117.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.7 MB/s[0m eta [36m0:00

In [2]:
import yfinance as yf
import datetime
import numpy as np
import pandas as pd
from pylab import mpl, plt
import math, time
import itertools
from datetime import datetime
from operator import itemgetter
from tqdm import tqdm
from math import sqrt
import torch
import torch.nn as nn
from torch.autograd import Variable
from transformers import RobertaTokenizer, RobertaModel
import torch.nn.functional as F

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
import wandb
wandb.login()
wandb.init(project="stock_prediction")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvisriv[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Hyperparams

In [5]:
no_of_days_to_lookforward = 1
no_of_days_to_lookback = 5
up_threshold = 0.015
down_threshold = -0.015
max_text_per_iter = 20
batch_size = 8
MAX_LEN = 10
num_epochs = 200

### Get stocks data for last N days

In [6]:
!mkdir data


In [7]:
stock_symbols = [ 'XOM']
no_of_days = 4*365

EXPORT_DATA_FOLDER = './data/'
# !mkdir data
# Set the start and end dates for the data
# here matching it with dates of news text available
train_start = datetime.strptime('2020/01/04', '%Y/%m/%d')
train_end = datetime.strptime('2022/09/30', '%Y/%m/%d')
test_start = datetime.strptime('2022/10/01', '%Y/%m/%d')
test_end = datetime.strptime('2023/01/04', '%Y/%m/%d')


# start = datetime.datetime.now() - datetime.timedelta(days=no_of_days)
# end = datetime.datetime.now()

# Get training data
for symbol in stock_symbols:
    # Download the historical price and volume data using yfinance
    train_data_raw = yf.download(symbol, start=train_start, end=train_end)

    # Normalize features by percent of changes between today and yesterday
    pct_change_open = train_data_raw['Open'].pct_change().fillna(0)
    pct_change_high = train_data_raw['High'].pct_change().fillna(0)
    pct_change_high_over_open = (train_data_raw['High']-train_data_raw['Open'])/train_data_raw['Open']
    pct_change_low = train_data_raw['Low'].pct_change().fillna(0)
    pct_change_low_over_open = (train_data_raw['Low']-train_data_raw['Open'])/train_data_raw['Open']
    pct_change_close = train_data_raw['Close'].pct_change().fillna(0)
    pct_change_close_over_open = (train_data_raw['Close']-train_data_raw['Open'])/train_data_raw['Open']
    pct_change_adjclose = train_data_raw['Adj Close'].pct_change().fillna(0)
    pct_change_adjclose_over_open = (train_data_raw['Adj Close']-train_data_raw['Open'])/train_data_raw['Open']
    pct_change_volume = train_data_raw['Volume'].pct_change().fillna(0)

    # Prepare labels: 2 means the close price of tomorow is higher than today's close price; 1 is down; 0 means the movement is between up_threshold and down_threshold
    label = np.where(pct_change_close > up_threshold, 2, np.where(pct_change_close < down_threshold, 1, 0))[1:]
    label = np.append(label, 0)

    # Construct a train_data_norm data frame
    train_data_norm = pd.DataFrame({'Open_norm':pct_change_open,
                              'High_norm':pct_change_high,
                              'Low_norm': pct_change_low,
                              'Close_norm':pct_change_close,
                              'Volume_norm':pct_change_volume,
                              'High-Open_norm':pct_change_high_over_open,
                              'Low-Open_norm':pct_change_low_over_open,
                              'Close-Open_norm':pct_change_close_over_open,
                              'Label_2up1down':label})

    # Normalize by min-max normalization after the pct normalization
    train_data_norm['Open_norm'] = train_data_norm['Open_norm'].apply(lambda x: (x - train_data_norm['Open_norm'].min()) / (train_data_norm['Open_norm'].max() - train_data_norm['Open_norm'].min()))
    train_data_norm['High_norm'] = train_data_norm['High_norm'].apply(lambda x: (x - train_data_norm['High_norm'].min()) / (train_data_norm['High_norm'].max() - train_data_norm['High_norm'].min()))
    train_data_norm['Low_norm'] = train_data_norm['Low_norm'].apply(lambda x: (x - train_data_norm['Low_norm'].min()) / (train_data_norm['Low_norm'].max() - train_data_norm['Low_norm'].min()))
    train_data_norm['Close_norm'] = train_data_norm['Close_norm'].apply(lambda x: (x - train_data_norm['Close_norm'].min()) / (train_data_norm['Close_norm'].max() - train_data_norm['Close_norm'].min()))
    train_data_norm['Volume_norm'] = train_data_norm['Volume_norm'].apply(lambda x: (x - train_data_norm['Volume_norm'].min()) / (train_data_norm['Volume_norm'].max() - train_data_norm['Volume_norm'].min()))
    train_data_norm['High-Open_norm'] = train_data_norm['High-Open_norm'].apply(lambda x: (x - train_data_norm['High-Open_norm'].min()) / (train_data_norm['High-Open_norm'].max() - train_data_norm['High-Open_norm'].min()))
    train_data_norm['Low-Open_norm'] = train_data_norm['Low-Open_norm'].apply(lambda x: (x - train_data_norm['Low-Open_norm'].min()) / (train_data_norm['Low-Open_norm'].max() - train_data_norm['Low-Open_norm'].min()))
    train_data_norm['Close-Open_norm'] = train_data_norm['Close-Open_norm'].apply(lambda x: (x - train_data_norm['Close-Open_norm'].min()) / (train_data_norm['Close-Open_norm'].max() - train_data_norm['Close-Open_norm'].min()))

    # Remove the first and the last row, becuase of NAN values
    train_data_raw = train_data_raw.iloc[1:-1]
    train_data_norm = train_data_norm.iloc[1:-1]

    train_data_raw.to_csv(EXPORT_DATA_FOLDER+symbol+'train_raw_data.csv', index=True)
    train_data_norm.to_csv(EXPORT_DATA_FOLDER+symbol+'train_norm_data.csv', index=True)


# Get test data
for symbol in stock_symbols:
    # Download the historical price and volume data using yfinance
    test_data_raw = yf.download(symbol, start=test_start, end=test_end)

    # Normalize features by percent of changes between today and yesterday
    pct_change_open = test_data_raw['Open'].pct_change().fillna(0)
    pct_change_high = test_data_raw['High'].pct_change().fillna(0)
    pct_change_high_over_open = (test_data_raw['High']-test_data_raw['Open'])/test_data_raw['Open']
    pct_change_low = test_data_raw['Low'].pct_change().fillna(0)
    pct_change_low_over_open = (test_data_raw['Low']-test_data_raw['Open'])/test_data_raw['Open']
    pct_change_close = test_data_raw['Close'].pct_change().fillna(0)
    pct_change_close_over_open = (test_data_raw['Close']-test_data_raw['Open'])/test_data_raw['Open']
    pct_change_adjclose = test_data_raw['Adj Close'].pct_change().fillna(0)
    pct_change_adjclose_over_open = (test_data_raw['Adj Close']-test_data_raw['Open'])/test_data_raw['Open']
    pct_change_volume = test_data_raw['Volume'].pct_change().fillna(0)

    # Prepare labels: 2 means the close price of tomorow is higher than today's close price; 1 is down; 0 means the movement is between up_threshold and down_threshold
    label = np.where(pct_change_close > up_threshold, 2, np.where(pct_change_close < down_threshold, 1, 0))[1:]
    label = np.append(label, 0)

    # Construct a test_data_norm data frame
    test_data_norm = pd.DataFrame({'Open_norm':pct_change_open,
                              'High_norm':pct_change_high,
                              'Low_norm': pct_change_low,
                              'Close_norm':pct_change_close,
                              'Volume_norm':pct_change_volume,
                              'High-Open_norm':pct_change_high_over_open,
                              'Low-Open_norm':pct_change_low_over_open,
                              'Close-Open_norm':pct_change_close_over_open,
                              'Label_2up1down':label})

    # Normalize by min-max normalization after the pct normalization
    test_data_norm['Open_norm'] = test_data_norm['Open_norm'].apply(lambda x: (x - test_data_norm['Open_norm'].min()) / (test_data_norm['Open_norm'].max() - test_data_norm['Open_norm'].min()))
    test_data_norm['High_norm'] = test_data_norm['High_norm'].apply(lambda x: (x - test_data_norm['High_norm'].min()) / (test_data_norm['High_norm'].max() - test_data_norm['High_norm'].min()))
    test_data_norm['Low_norm'] = test_data_norm['Low_norm'].apply(lambda x: (x - test_data_norm['Low_norm'].min()) / (test_data_norm['Low_norm'].max() - test_data_norm['Low_norm'].min()))
    test_data_norm['Close_norm'] = test_data_norm['Close_norm'].apply(lambda x: (x - test_data_norm['Close_norm'].min()) / (test_data_norm['Close_norm'].max() - test_data_norm['Close_norm'].min()))
    test_data_norm['Volume_norm'] = test_data_norm['Volume_norm'].apply(lambda x: (x - test_data_norm['Volume_norm'].min()) / (test_data_norm['Volume_norm'].max() - test_data_norm['Volume_norm'].min()))
    test_data_norm['High-Open_norm'] = test_data_norm['High-Open_norm'].apply(lambda x: (x - test_data_norm['High-Open_norm'].min()) / (test_data_norm['High-Open_norm'].max() - test_data_norm['High-Open_norm'].min()))
    test_data_norm['Low-Open_norm'] = test_data_norm['Low-Open_norm'].apply(lambda x: (x - test_data_norm['Low-Open_norm'].min()) / (test_data_norm['Low-Open_norm'].max() - test_data_norm['Low-Open_norm'].min()))
    test_data_norm['Close-Open_norm'] = test_data_norm['Close-Open_norm'].apply(lambda x: (x - test_data_norm['Close-Open_norm'].min()) / (test_data_norm['Close-Open_norm'].max() - test_data_norm['Close-Open_norm'].min()))

    # Remove the first and the last row, becuase of NAN values
    test_data_raw = test_data_raw.iloc[1:-1]
    test_data_norm = test_data_norm.iloc[1:-1]

    test_data_raw.to_csv(EXPORT_DATA_FOLDER+symbol+'test_raw_data.csv', index=True)
    test_data_norm.to_csv(EXPORT_DATA_FOLDER+symbol+'test_norm_data.csv', index=True)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


## TODO

In [None]:
'''
(2023-06-05)
cuda support check
//read textual data into correct shape
hyperparam tuning: number of neurons: tune to right number of neurons in FC in model
//max_text_per_iter -> code in dataloader to maintain the size

(2023-06-07)
cuda check
roberta encoder fix
multi label - how to create target label?

'''

## Prep textual data

### Crawl textual news data from internet

In [None]:
# Import libraries
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Parameters
n = 3 #the # of article headlines displayed per ticker
tickers = ['AAPL', 'TSLA', 'AMZN']



# Get Data
finviz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    url = finviz_url + ticker
    req = Request(url=url,
                  headers={'user-agent': 'Mozilla/5.0',
                                   'referer': 'https://...'})
    resp = urlopen(req)
    html = BeautifulSoup(resp, features="lxml")
    news_table = html.find(id='news-table')
    news_tables[ticker] = news_table

try:
    for ticker in tickers:
        df = news_tables[ticker]
        df_tr = df.findAll('tr')

        print ('\n')
        print ('Recent News Headlines for {}: '.format(ticker))

        for i, table_row in enumerate(df_tr):
            a_text = table_row.a.text
            td_text = table_row.td.text
            td_text = td_text.strip()
            print(a_text,'(',td_text,')')
            if i == n-1:
                break
except KeyError:
    pass


# Iterate through the news
parsed_news = []
for file_name, news_table in news_tables.items():
    for x in news_table.findAll('tr'):
        text = x.a.get_text()
        date_scrape = x.td.text.split()

        if len(date_scrape) == 1:
            time = date_scrape[0]

        else:
            date = date_scrape[0]
            time = date_scrape[1]

        ticker = file_name.split('_')[0]

        parsed_news.append([ticker, date, time, text])



### Read downloaded data from saved files

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
text_data_df = pd.read_csv('/content/drive/MyDrive/machine_learning/projects/xai-seq/data/XOM_20200401_20230401_medium.csv',
                           sep= ',',
                           header= 0,
                           engine='python',
                           on_bad_lines = 'skip')
text_data_df = text_data_df[['Date', 'News']]


text_data_df = text_data_df.groupby('Date')['News'].apply('$$$###'.join)

text_data_df.index = pd.to_datetime(text_data_df.index, dayfirst=True)
# text_data_df


In [10]:
all_train_df = train_data_norm.join(text_data_df, how = 'inner')
all_test_df = test_data_norm.join(text_data_df, how = 'inner')

print(all_train_df.index.min())
print(all_train_df.index.max())
print(all_test_df.index.min())
print(all_test_df.index.max())

2020-04-01 00:00:00
2022-09-28 00:00:00
2022-10-04 00:00:00
2022-12-30 00:00:00


### Merge textual and numerical data


In [11]:
all_train = all_train_df.values

window_size = no_of_days_to_lookback

X_numerical_train = []
y_train = []
X_text_train = []
X_text_train_curr = []





for i in range(window_size, len(all_train) - no_of_days_to_lookforward + 1):
    X_numerical_train.append(all_train[i-window_size: i, :-2])

    # split and append sequence of text
    curr_seq = all_train[i-window_size: i, -1]
    for j in range(window_size):
        split_curr_seq = curr_seq[window_size - 1 -j].split('$$$###')
        X_text_train_curr = X_text_train_curr + split_curr_seq

    if len(X_text_train_curr) > max_text_per_iter:
        X_text_train_curr = X_text_train_curr[:max_text_per_iter]

    X_text_train.append(X_text_train_curr)

    # target labels
    y_train.append(all_train[i:i+no_of_days_to_lookforward, -2])

X_numerical_train, y_train = np.array(X_numerical_train).astype(np.float16), np.array(y_train).astype(np.int32)
print(type(X_numerical_train))
print(type(y_train))

X_numerical_train = torch.from_numpy(X_numerical_train).type(torch.Tensor)
y_train = torch.from_numpy(y_train).long()

print(len(X_numerical_train))
print(len(X_text_train))
print(len(y_train))
print(X_numerical_train.shape)

print(len(X_text_train))
print(len(X_text_train[2]))
# print(X_text_train[2])

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
624
624
624
torch.Size([624, 5, 8])
624
20


In [12]:
all_test = all_test_df.values


X_numerical_test = []
y_test = []
X_text_test = []
X_text_test_curr = []

for i in range(window_size, len(all_test) - no_of_days_to_lookforward + 1):
    X_numerical_test.append(all_test[i-window_size: i, :-2])

    # split and append sequence of text (in reverse order to add the latest news first)
    curr_seq = all_test[i-window_size: i, -1]
    for j in range(window_size):
        split_curr_seq = curr_seq[window_size - 1 -j].split('$$$###')
        X_text_test_curr = X_text_test_curr + split_curr_seq

    if len(X_text_test_curr) > max_text_per_iter:
        X_text_test_curr = X_text_test_curr[:max_text_per_iter]

    X_text_test.append(X_text_test_curr)

    # target labels
    y_test.append(all_test[i:i+no_of_days_to_lookforward, -2])

X_numerical_test, y_test = np.array(X_numerical_test).astype(np.float16), np.array(y_test).astype(np.int32)
print(type(X_numerical_test))
print(type(y_test))

X_numerical_test = torch.from_numpy(X_numerical_test).type(torch.Tensor)
y_test = torch.from_numpy(y_test).long()

print(len(X_numerical_test))
print(len(X_text_test))
print(len(y_test))
print(X_numerical_test.shape)

print(len(X_text_test))
print(len(X_text_test[2]))
# print(X_text_test[2])

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
57
57
57
torch.Size([57, 5, 8])
57
20


## Data loader

In [13]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)

class SiameseDataloader(Dataset):

    def __init__(self, X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer):
        self.X_numerical_train = X_numerical_train
        self.X_text_train = X_text_train
        self.MAX_LEN = MAX_LEN
        self.tokenizer = tokenizer

    def __getitem__(self, index):


        input_ids = []
        attention_masks = []
        token_type_ids = []
        input_seq = []

        for sent in X_text_train[index]:
            encoded_sent = self.tokenizer.encode_plus(
                text=sent,
                add_special_tokens=True,        # Add `[CLS]` and `[SEP]` special tokens
                max_length=self.MAX_LEN,             # Choose max length to truncate/pad
                pad_to_max_length=True,         # Pad sentence to max length
                #return_attention_mask=True      # Return attention mask
                return_token_type_ids=True
                )
            input_ids.append(encoded_sent.get('input_ids'))
            attention_masks.append(encoded_sent.get('attention_mask'))
            token_type_ids.append(encoded_sent.get('token_type_ids'))

        # Convert lists to tensors
        input_ids = torch.tensor(input_ids)
        attention_masks = torch.tensor(attention_masks)
        token_type_ids = torch.tensor(token_type_ids)


        return {
            'x_numerical': X_numerical_train[index],
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(attention_masks, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(y_train[index], dtype=torch.long)
        }





    def __len__(self):
        return len(self.X_numerical_train)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

In [14]:
train_set = SiameseDataloader(X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False)


test_set = SiameseDataloader(X_numerical_test, y_test, X_text_test, MAX_LEN, tokenizer)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)


## Build model


In [20]:


import torch.nn as nn

class SiameseModel(nn.Module):
    def __init__(self, input_dim1, input_dim2,
                 hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4,
                 num_layers1, num_layers2, output_dim1, output_dim2):
        super(SiameseModel, self).__init__()
        self.input_dim1 = input_dim1
        self.input_dim2 = input_dim2
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.hidden_dim3 = hidden_dim3
        self.hidden_dim4 = hidden_dim4
        self.num_layers1 = num_layers1
        self.num_layers2 = num_layers2
        self.output_dim1 = output_dim1
        self.output_dim2 = output_dim2



        self.roberta = RobertaModel.from_pretrained("roberta-large").to(device)


        self.lstm1 = nn.LSTM(input_dim1, hidden_dim1, num_layers1, batch_first=True)
        self.lstm2 = nn.LSTM(input_dim2, hidden_dim2, num_layers2, batch_first=True)


        self.fc1 = nn.Linear(hidden_dim1, output_dim1)
        self.fc2 = nn.Linear(hidden_dim2, output_dim2)
        self.fc3 = nn.Linear(output_dim1, hidden_dim3)
        # self.fc3 = nn.Linear(output_dim1+output_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, hidden_dim4)
        self.fc5 = nn.Linear(hidden_dim4, 3)
        self.lrelu = nn.LeakyReLU(0.1)



    def forward(self, x1, ids, masks, token_type_ids):
        #left tower with numerical features
        h_10 = Variable(torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1)).to(device)
        c_10 = Variable(torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1)).to(device)
        ula1, (h_out1, _) = self.lstm1(x1, (h_10, c_10))
        h_out1 = h_out1.view(-1, self.hidden_dim1)
        out1 = self.fc1(h_out1)




        # right tower with roberta on textual features
        #TODO
        # batch_size_here = ids.shape[0]
        # e2 = torch.zeros(batch_size_here, max_text_per_iter,1024).to(device)

        # for k in range(ids.shape[1]):
        #     seq_ids = ids[:,k,:]
        #     seq_masks = masks[:,k,:]
        #     seq_token_type_ids = token_type_ids[:,k,:]


        #     e2k = self.roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
        #     # print(e2.shape)
        #     # print(e2k[1].shape)
        #     # first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
        #     # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
        #     e2k1 = e2k[0][:, 0, :]
        #     e2[:,k,:] = e2k1


        # print(e2.shape)
        # h_20 = Variable(torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2)).to(device)
        # c_20 = Variable(torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2)).to(device)
        # ula2, (h_out2, _) = self.lstm2(e2, (h_20, c_20))
        # h_out2 = h_out2.view(-1, self.hidden_dim2)
        # out2 = self.fc2(h_out2)




        # siamese merging layers

        output = out1
        # output = torch.cat((out1, out2),1)
        # output = F.relu(self.fc3(output))
        # output = F.relu(self.fc4(output))

        output = self.lrelu(self.fc3(output))
        output = self.lrelu(self.fc4(output))
        output = self.fc5(output)
        return output

#TODO : correct these values
model = SiameseModel(input_dim1 = 8, input_dim2 = 1024,
                 hidden_dim1 = 20, hidden_dim2 = 768, hidden_dim3 = 128, hidden_dim4 = 64,
                 num_layers1 = 1, num_layers2 = 1, output_dim1 = 10, output_dim2 = 256).to(device)






Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
print(model)
print(len(list(model.parameters())))
# for i in range(len(list(model.parameters()))):
#     print(list(model.parameters())[i].size())


SiameseModel(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerN

In [None]:
def plot_grad_flow(named_parameters):
    ave_grads = []
    layers = []
    for n, p in named_parameters:
        if(p.requires_grad) and ("bias" not in n):
            layers.append(n)
            ave_grads.append(p.grad.abs().mean())
    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(xmin=0, xmax=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)

## Train model

In [18]:
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)
loss_arr = np.zeros(num_epochs)

In [19]:
# roberta = RobertaModel.from_pretrained("roberta-large").to(device)
train_loss_record = []
for epoch in range(num_epochs):
    train_loss = []
    train_loss_sum = []
    train_correct = 0
    train_total = 0

    for idx, data in tqdm(enumerate(train_loader, 0)):
        x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        ids = data['ids'].to(device, dtype = torch.long)
        masks = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        # debugging roberta encoder and second lstm
        '''
        debug starts here
        '''
        # if idx > 1:
        #     break
        # batch_size_here = data['ids'].shape[0]
        # print('batch_size_here:', batch_size_here)
        # e2 = torch.zeros(batch_size_here, max_text_per_iter, 1024)
        # print('ids shape:', ids.shape)

        # for k in range(ids.shape[1]):  #number of sentences in sequence = max_text_per_iter
        #     print('k:', k)
        #     seq_ids = ids[:,k,:].to(device)
        #     seq_masks = masks[:,k,:].to(device)
        #     seq_token_type_ids = token_type_ids[:,k,:].to(device)


        #     e2k = roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
        #     print(e2.shape)
        #     print(e2k[1].shape)
        #     #first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
        #     # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
        #     e2k1 = e2k[0][:, 0, :]
        #     e2[:,k,:] = e2k1




        # lstm2 = nn.LSTM(1024, 768, 1, batch_first=True)
        # fc2 = nn.Linear(768, 256)

        # h_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # c_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # ula2, (h_out2, _) = lstm2(e2, (h_20, c_20))
        # h_out2 = h_out2.view(-1, 768)
        # out2 = fc2(h_out2)


    #     print(ids.shape)
    #     print(masks.shape)
    #     print(token_type_ids.shape)

        # print(out2)



        '''
        debug ends here
        '''



        y_pred = model(x_numerical, ids, masks, token_type_ids)
        print('y_pred:', y_pred)
        _, pred_label = torch.max(y_pred.data, 1)

        loss = criterion(y_pred, targets.reshape(-1))

         # Zero out gradient, else they will accumulate between epochs
        optimiser.zero_grad()

        # Backward pass
        loss.backward()

        # Update parameters
        optimiser.step()



        train_loss.append(loss.data.cpu())
        train_loss_sum.append(loss.data.cpu())


        wandb.log({'avg train loss in this batch': loss.item(), 'epoch': epoch, 'batch_id': idx})
        # Get accuracy
        train_total += targets.reshape(-1).size(0)
        train_correct += (pred_label == targets.reshape(-1)).sum()
    train_accuracy = 100 * train_correct / train_total

    # Record at every epoch
    print('Train Loss at epoch {}: {}\n'.format(epoch, np.mean(train_loss_sum)))
    train_loss_record.append(np.mean(train_loss_sum))
    wandb.log({'avg train loss in this epoch': np.mean(train_loss_sum), 'epoch': epoch})
    wandb.log({'train accuracy in this epoch': train_accuracy, 'epoch': epoch})






    # evaluate on test set every epoch
    test_loss = []
    test_loss_sum = []

    correct = 0
    total = 0

    for idx, data in tqdm(enumerate(test_loader, 0)):
        test_x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        test_ids = data['ids'].to(device, dtype = torch.long)
        test_masks = data['mask'].to(device, dtype = torch.long)
        test_token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        test_targets = data['targets'].to(device, dtype = torch.long)

        y_pred = model(test_x_numerical, test_ids, test_masks, test_token_type_ids)
        _, pred_label = torch.max(y_pred.data, 1)

#         print('y_pred:', y_pred)
        tloss = criterion(y_pred, test_targets.reshape(-1))

        test_loss.append(tloss.data.cpu())
        test_loss_sum.append(tloss.data.cpu())

        wandb.log({'avg test loss in this batch': tloss.item(), 'epoch': epoch, 'batch_id': idx})

        # Get accuracy
        total += test_targets.reshape(-1).size(0)
        correct += (pred_label == test_targets.reshape(-1)).sum()
    accuracy = 100 * correct / total

    # Record at every epoch
    print('test Loss at epoch {}: {}\n'.format(epoch, np.mean(test_loss_sum)))
    wandb.log({'avg test loss in this epoch': np.mean(test_loss_sum), 'epoch': epoch})
    wandb.log({'test accuracy in this epoch': accuracy, 'epoch': epoch})



    plot_grad_flow(model.named_parameters()) # version 1








0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  'ids': torch.tensor(input_ids, dtype=torch.long),
  'mask': torch.tensor(attention_masks, dtype=torch.long),
  'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
  'targets': torch.tensor(y_train[index], dtype=torch.long)
0it [00:00, ?it/s]


NameError: ignored

In [None]:
# evaluate on test set every epoch
test_loss = []
test_loss_sum = []

correct = 0
total = 0

for idx, data in tqdm(enumerate(test_loader, 0)):
    test_x_numerical = data['x_numerical'].to(device, dtype = torch.float)
    test_ids = data['ids'].to(device, dtype = torch.long)
    test_masks = data['mask'].to(device, dtype = torch.long)
    test_token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    test_targets = data['targets'].to(device, dtype = torch.long)

    y_pred = model(test_x_numerical, test_ids, test_masks, test_token_type_ids)
    _, pred_label = torch.max(y_pred.data, 1)

#         print('y_pred:', y_pred)
    tloss = criterion(y_pred, test_targets.reshape(-1))

    test_loss.append(tloss.data.cpu())
    test_loss_sum.append(tloss.data.cpu())

    wandb.log({'avg test loss in this batch': tloss.item(), 'epoch': epoch, 'batch_id': idx})

    # Get accuracy
    total += test_targets.reshape(-1).size(0)
    correct += (pred_label == test_targets.reshape(-1)).sum()
accuracy = 100 * correct / total

In [None]:
print(e2k[0].shape)
print(e2k[1].shape)
print(e2k)

In [None]:
run_id = str(1)
!mkdir output/$run_id
output_model_file = 'roberta_stock_pred.bin'
output_vocab_file = './output' + str(run_id)

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)


In [None]:
plt.plot(loss_arr, label="Training loss")
plt.legend()
plt.show()

In [None]:

'''