In [1]:
import wandb
wandb.login()
wandb.init(project="stock_prediction")

<wandb.sdk.wandb_run.Run at 0x11251be20>

In [2]:
import yfinance as yf
import datetime
import numpy as np
import pandas as pd
from pylab import mpl, plt
import math, time
import itertools
from datetime import datetime
from operator import itemgetter
from tqdm import tqdm
from math import sqrt
import torch
import torch.nn as nn
from torch.autograd import Variable
from transformers import RobertaTokenizer, RobertaModel
import torch.nn.functional as F

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
no_of_days_to_lookforward = 1
no_of_days_to_lookback = 5
up_threshold = 0.015
down_threshold = -0.015
max_text_per_iter = 20
batch_size = 8
MAX_LEN = 10
num_epochs = 20

In [5]:
stock_symbols = [ 'XOM']
no_of_days = 4*365

EXPORT_DATA_FOLDER = './data/'

# Set the start and end dates for the data 
# here matching it with dates of news text available
train_start = datetime.strptime('2020/01/04', '%Y/%m/%d')
train_end = datetime.strptime('2022/09/30', '%Y/%m/%d')
test_start = datetime.strptime('2022/10/01', '%Y/%m/%d')
test_end = datetime.strptime('2023/01/04', '%Y/%m/%d')


# start = datetime.datetime.now() - datetime.timedelta(days=no_of_days)
# end = datetime.datetime.now()

# Get training data
for symbol in stock_symbols:
    # Download the historical price and volume data using yfinance
    train_data_raw = yf.download(symbol, start=train_start, end=train_end)

    # Normalize features by percent of changes between today and yesterday
    pct_change_open = train_data_raw['Open'].pct_change().fillna(0)
    pct_change_high = train_data_raw['High'].pct_change().fillna(0)
    pct_change_high_over_open = (train_data_raw['High']-train_data_raw['Open'])/train_data_raw['Open']
    pct_change_low = train_data_raw['Low'].pct_change().fillna(0)
    pct_change_low_over_open = (train_data_raw['Low']-train_data_raw['Open'])/train_data_raw['Open']
    pct_change_close = train_data_raw['Close'].pct_change().fillna(0)
    pct_change_close_over_open = (train_data_raw['Close']-train_data_raw['Open'])/train_data_raw['Open']
    pct_change_adjclose = train_data_raw['Adj Close'].pct_change().fillna(0)
    pct_change_adjclose_over_open = (train_data_raw['Adj Close']-train_data_raw['Open'])/train_data_raw['Open']
    pct_change_volume = train_data_raw['Volume'].pct_change().fillna(0)

    # Prepare labels: 2 means the close price of tomorow is higher than today's close price; 1 is down; 0 means the movement is between up_threshold and down_threshold
    label = np.where(pct_change_close > up_threshold, 2, np.where(pct_change_close < down_threshold, 1, 0))[1:]
    label = np.append(label, 0)

    # Construct a train_data_norm data frame
    train_data_norm = pd.DataFrame({'Open_norm':pct_change_open,
                              'High_norm':pct_change_high,
                              'Low_norm': pct_change_low,
                              'Close_norm':pct_change_close,
                              'Volume_norm':pct_change_volume,
                              'High-Open_norm':pct_change_high_over_open,
                              'Low-Open_norm':pct_change_low_over_open,
                              'Close-Open_norm':pct_change_close_over_open,
                              'Label_2up1down':label})

    # Normalize by min-max normalization after the pct normalization
    train_data_norm['Open_norm'] = train_data_norm['Open_norm'].apply(lambda x: (x - train_data_norm['Open_norm'].min()) / (train_data_norm['Open_norm'].max() - train_data_norm['Open_norm'].min()))
    train_data_norm['High_norm'] = train_data_norm['High_norm'].apply(lambda x: (x - train_data_norm['High_norm'].min()) / (train_data_norm['High_norm'].max() - train_data_norm['High_norm'].min()))
    train_data_norm['Low_norm'] = train_data_norm['Low_norm'].apply(lambda x: (x - train_data_norm['Low_norm'].min()) / (train_data_norm['Low_norm'].max() - train_data_norm['Low_norm'].min()))
    train_data_norm['Close_norm'] = train_data_norm['Close_norm'].apply(lambda x: (x - train_data_norm['Close_norm'].min()) / (train_data_norm['Close_norm'].max() - train_data_norm['Close_norm'].min()))
    train_data_norm['Volume_norm'] = train_data_norm['Volume_norm'].apply(lambda x: (x - train_data_norm['Volume_norm'].min()) / (train_data_norm['Volume_norm'].max() - train_data_norm['Volume_norm'].min()))
    train_data_norm['High-Open_norm'] = train_data_norm['High-Open_norm'].apply(lambda x: (x - train_data_norm['High-Open_norm'].min()) / (train_data_norm['High-Open_norm'].max() - train_data_norm['High-Open_norm'].min()))
    train_data_norm['Low-Open_norm'] = train_data_norm['Low-Open_norm'].apply(lambda x: (x - train_data_norm['Low-Open_norm'].min()) / (train_data_norm['Low-Open_norm'].max() - train_data_norm['Low-Open_norm'].min()))
    train_data_norm['Close-Open_norm'] = train_data_norm['Close-Open_norm'].apply(lambda x: (x - train_data_norm['Close-Open_norm'].min()) / (train_data_norm['Close-Open_norm'].max() - train_data_norm['Close-Open_norm'].min()))

    # Remove the first and the last row, becuase of NAN values
    train_data_raw = train_data_raw.iloc[1:-1]
    train_data_norm = train_data_norm.iloc[1:-1]

    train_data_raw.to_csv(EXPORT_DATA_FOLDER+symbol+'train_raw_data.csv', index=True)
    train_data_norm.to_csv(EXPORT_DATA_FOLDER+symbol+'train_norm_data.csv', index=True)
    
    
# Get test data
for symbol in stock_symbols:
    # Download the historical price and volume data using yfinance
    test_data_raw = yf.download(symbol, start=test_start, end=test_end)

    # Normalize features by percent of changes between today and yesterday
    pct_change_open = test_data_raw['Open'].pct_change().fillna(0)
    pct_change_high = test_data_raw['High'].pct_change().fillna(0)
    pct_change_high_over_open = (test_data_raw['High']-test_data_raw['Open'])/test_data_raw['Open']
    pct_change_low = test_data_raw['Low'].pct_change().fillna(0)
    pct_change_low_over_open = (test_data_raw['Low']-test_data_raw['Open'])/test_data_raw['Open']
    pct_change_close = test_data_raw['Close'].pct_change().fillna(0)
    pct_change_close_over_open = (test_data_raw['Close']-test_data_raw['Open'])/test_data_raw['Open']
    pct_change_adjclose = test_data_raw['Adj Close'].pct_change().fillna(0)
    pct_change_adjclose_over_open = (test_data_raw['Adj Close']-test_data_raw['Open'])/test_data_raw['Open']
    pct_change_volume = test_data_raw['Volume'].pct_change().fillna(0)

    # Prepare labels: 2 means the close price of tomorow is higher than today's close price; 1 is down; 0 means the movement is between up_threshold and down_threshold
    label = np.where(pct_change_close > up_threshold, 2, np.where(pct_change_close < down_threshold, 1, 0))[1:]
    label = np.append(label, 0)

    # Construct a test_data_norm data frame
    test_data_norm = pd.DataFrame({'Open_norm':pct_change_open,
                              'High_norm':pct_change_high,
                              'Low_norm': pct_change_low,
                              'Close_norm':pct_change_close,
                              'Volume_norm':pct_change_volume,
                              'High-Open_norm':pct_change_high_over_open,
                              'Low-Open_norm':pct_change_low_over_open,
                              'Close-Open_norm':pct_change_close_over_open,
                              'Label_2up1down':label})

    # Normalize by min-max normalization after the pct normalization
    test_data_norm['Open_norm'] = test_data_norm['Open_norm'].apply(lambda x: (x - test_data_norm['Open_norm'].min()) / (test_data_norm['Open_norm'].max() - test_data_norm['Open_norm'].min()))
    test_data_norm['High_norm'] = test_data_norm['High_norm'].apply(lambda x: (x - test_data_norm['High_norm'].min()) / (test_data_norm['High_norm'].max() - test_data_norm['High_norm'].min()))
    test_data_norm['Low_norm'] = test_data_norm['Low_norm'].apply(lambda x: (x - test_data_norm['Low_norm'].min()) / (test_data_norm['Low_norm'].max() - test_data_norm['Low_norm'].min()))
    test_data_norm['Close_norm'] = test_data_norm['Close_norm'].apply(lambda x: (x - test_data_norm['Close_norm'].min()) / (test_data_norm['Close_norm'].max() - test_data_norm['Close_norm'].min()))
    test_data_norm['Volume_norm'] = test_data_norm['Volume_norm'].apply(lambda x: (x - test_data_norm['Volume_norm'].min()) / (test_data_norm['Volume_norm'].max() - test_data_norm['Volume_norm'].min()))
    test_data_norm['High-Open_norm'] = test_data_norm['High-Open_norm'].apply(lambda x: (x - test_data_norm['High-Open_norm'].min()) / (test_data_norm['High-Open_norm'].max() - test_data_norm['High-Open_norm'].min()))
    test_data_norm['Low-Open_norm'] = test_data_norm['Low-Open_norm'].apply(lambda x: (x - test_data_norm['Low-Open_norm'].min()) / (test_data_norm['Low-Open_norm'].max() - test_data_norm['Low-Open_norm'].min()))
    test_data_norm['Close-Open_norm'] = test_data_norm['Close-Open_norm'].apply(lambda x: (x - test_data_norm['Close-Open_norm'].min()) / (test_data_norm['Close-Open_norm'].max() - test_data_norm['Close-Open_norm'].min()))

    # Remove the first and the last row, becuase of NAN values
    test_data_raw = test_data_raw.iloc[1:-1]
    test_data_norm = test_data_norm.iloc[1:-1]

    test_data_raw.to_csv(EXPORT_DATA_FOLDER+symbol+'test_raw_data.csv', index=True)
    test_data_norm.to_csv(EXPORT_DATA_FOLDER+symbol+'test_norm_data.csv', index=True)

In [6]:
text_data_df = pd.read_csv('./data/XOM_20200401_20230401_medium.csv', sep= ',', header= 0)
text_data_df = text_data_df[['Date', 'News']]


text_data_df = text_data_df.groupby('Date')['News'].apply('$$$###'.join)

text_data_df.index = pd.to_datetime(text_data_df.index, dayfirst=True)
# text_data_df

In [7]:
all_train_df = train_data_norm.join(text_data_df, how = 'inner')
all_test_df = test_data_norm.join(text_data_df, how = 'inner')

print(all_train_df.index.min())
print(all_train_df.index.max())
print(all_test_df.index.min())
print(all_test_df.index.max())

In [8]:
all_train = all_train_df.values

window_size = no_of_days_to_lookback

X_numerical_train = []
y_train = []
X_text_train = []
X_text_train_curr = []





for i in range(window_size, len(all_train) - no_of_days_to_lookforward + 1):
    X_numerical_train.append(all_train[i-window_size: i, :-2])
    
    # split and append sequence of text
    curr_seq = all_train[i-window_size: i, -1]
    for j in range(window_size):
        split_curr_seq = curr_seq[window_size - 1 -j].split('$$$###')
        X_text_train_curr = X_text_train_curr + split_curr_seq
    
    if len(X_text_train_curr) > max_text_per_iter:
        X_text_train_curr = X_text_train_curr[:max_text_per_iter]
    
    X_text_train.append(X_text_train_curr)
        
    # target labels
    y_train.append(all_train[i:i+no_of_days_to_lookforward, -2])

X_numerical_train, y_train = np.array(X_numerical_train).astype(np.float16), np.array(y_train).astype(np.int32)
print(type(X_numerical_train))
print(type(y_train))

X_numerical_train = torch.from_numpy(X_numerical_train).type(torch.Tensor)
y_train = torch.from_numpy(y_train).long()

print(len(X_numerical_train))
print(len(X_text_train))
print(len(y_train))
print(X_numerical_train.shape)

print(len(X_text_train))
print(len(X_text_train[2]))
# print(X_text_train[2])

In [9]:
all_test = all_test_df.values


X_numerical_test = []
y_test = []
X_text_test = []
X_text_test_curr = []

for i in range(window_size, len(all_test) - no_of_days_to_lookforward + 1):
    X_numerical_test.append(all_test[i-window_size: i, :-2])
    
    # split and append sequence of text (in reverse order to add the latest news first)
    curr_seq = all_test[i-window_size: i, -1]
    for j in range(window_size):
        split_curr_seq = curr_seq[window_size - 1 -j].split('$$$###')
        X_text_test_curr = X_text_test_curr + split_curr_seq
    
    if len(X_text_test_curr) > max_text_per_iter:
        X_text_test_curr = X_text_test_curr[:max_text_per_iter]
    
    X_text_test.append(X_text_test_curr)
        
    # target labels
    y_test.append(all_test[i:i+no_of_days_to_lookforward, -2])

X_numerical_test, y_test = np.array(X_numerical_test).astype(np.float16), np.array(y_test).astype(np.int32)
print(type(X_numerical_test))
print(type(y_test))

X_numerical_test = torch.from_numpy(X_numerical_test).type(torch.Tensor)
y_test = torch.from_numpy(y_test).long()

print(len(X_numerical_test))
print(len(X_text_test))
print(len(y_test))
print(X_numerical_test.shape)

print(len(X_text_test))
print(len(X_text_test[2]))
# print(X_text_test[2])

In [10]:
X_numerical_train[0]

tensor([[0.4246, 0.3257, 0.4453, 0.4441, 0.1932, 0.4934, 0.8872, 0.5684],
        [0.7788, 0.7954, 0.7358, 0.7979, 0.5088, 0.8218, 0.8267, 0.7310],
        [0.8540, 0.4360, 0.6265, 0.3726, 0.1608, 0.1741, 0.4355, 0.0930],
        [0.4275, 0.2666, 0.6353, 0.6196, 0.1874, 0.2355, 0.8696, 0.5459],
        [0.8765, 0.6895, 0.7617, 0.5669, 0.3101, 0.0968, 0.6763, 0.1940]])

In [11]:
len(X_text_train[0])
X_text_train[0]

['Exxon Cuts Capital Spending by 30% in Response to Coronavirus; The largest portion of the cuts will be in the Permian Basin, a key U.S. oil... Exxon Mobil Corp. said Tuesday that it would cut its 2020 capital spending by 30% as global demand for oil is sapped by the coronavirus. The largest portion of the $10 billion in cuts will be in the Permian Basin, the largest U.S. oil field ...',
 'Wall Street’s Rally Fades This briefing is no longer updating. Read the latest developments in the coronavirus outbreak here. The Treasury is trying to expand the small-business loans program.',
 'Coronavirus update: 1.39 million cases globally, 79,091 dead; Wuhan, China, reports zero deaths for first time since January; ExxonMobil... ExxonMobil cuts 2020 capex budget by 30% to combat oversupply and weak demand, while USA parent Gannett adopts poison pill The Chinese city of Wuhan reported zero deaths Tuesday from the coronavirus that causes COVID-19 for the first time ...',
 'ExxonMobil Reduces 202

In [12]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)

class SiameseDataloader(Dataset):
    
    def __init__(self, X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer):
        self.X_numerical_train = X_numerical_train
        self.X_text_train = X_text_train
        self.MAX_LEN = MAX_LEN
        self.tokenizer = tokenizer
        
    def __getitem__(self, index):

        
        input_ids = []
        attention_masks = []
        token_type_ids = []
        input_seq = []

        for sent in X_text_train[index]:
            encoded_sent = self.tokenizer.encode_plus(
                text=sent,
                add_special_tokens=True,        # Add `[CLS]` and `[SEP]` special tokens
                max_length=self.MAX_LEN,             # Choose max length to truncate/pad
                pad_to_max_length=True,         # Pad sentence to max length 
                #return_attention_mask=True      # Return attention mask
                return_token_type_ids=True
                )
            input_ids.append(encoded_sent.get('input_ids'))
            attention_masks.append(encoded_sent.get('attention_mask'))
            token_type_ids.append(encoded_sent.get('token_type_ids'))

        # Convert lists to tensors
        input_ids = torch.tensor(input_ids)
        attention_masks = torch.tensor(attention_masks)
        token_type_ids = torch.tensor(token_type_ids)


        return {
            'x_numerical': X_numerical_train[index],
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(attention_masks, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(y_train[index], dtype=torch.long)
        }
    
    
    
    

    def __len__(self):
        return len(self.X_numerical_train)

In [13]:
train_set = SiameseDataloader(X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)


test_set = SiameseDataloader(X_numerical_test, y_test, X_text_test, MAX_LEN, tokenizer)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [14]:
for idx, data in tqdm(enumerate(train_loader, 0)):
    if idx > 1:
        break
        
    
    x_numerical = data['x_numerical'].to(device, dtype = torch.float)
    ids = data['ids'].to(device, dtype = torch.long)
    masks = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.long)
    

In [15]:
print(x_numerical.shape)
x_numerical

tensor([[[0.6504, 0.5669, 0.6841, 0.5850, 0.2462, 0.3333, 0.9990, 0.5942],
         [0.6992, 0.4353, 0.6587, 0.5420, 0.1996, 0.0776, 0.9468, 0.4761],
         [0.6108, 0.4353, 0.5005, 0.4270, 0.2791, 0.0438, 0.7686, 0.3164],
         [0.4272, 0.3818, 0.5210, 0.5122, 0.2263, 0.3665, 1.0000, 0.6069],
         [0.7075, 0.4756, 0.6240, 0.5225, 0.3088, 0.1785, 0.8672, 0.4436]],

        [[0.6362, 0.3625, 0.5742, 0.4460, 0.2111, 0.0000, 0.8896, 0.4282],
         [0.5767, 0.4607, 0.5884, 0.5176, 0.2417, 0.1074, 0.9478, 0.4736],
         [0.6455, 0.4663, 0.6265, 0.5396, 0.2803, 0.0553, 0.9404, 0.4409],
         [0.5693, 0.3777, 0.4717, 0.3982, 0.4419, 0.0000, 0.7886, 0.3022],
         [0.4878, 0.4260, 0.5649, 0.4915, 0.1412, 0.2593, 0.9790, 0.4521]],

        [[0.4551, 0.3179, 0.4626, 0.4565, 0.1854, 0.1486, 0.9238, 0.5156],
         [0.6396, 0.4521, 0.6226, 0.5234, 0.2983, 0.0793, 0.9209, 0.4661],
         [0.6548, 0.4875, 0.6294, 0.5581, 0.2102, 0.0492, 0.9009, 0.4473],
         [0.6235, 0.5

In [16]:
print(ids.shape)
ids

tensor([[[    0,  9089, 43903,  ...,    30,   389,     2],
         [    0, 28216,   852,  ...,   274,  4216,     2],
         [    0, 15228,   261,  ...,   112,     4,     2],
         ...,
         [    0, 14773,  7458,  ...,  1782,   814,     2],
         [    0, 34543,   195,  ...,  5090,   389,     2],
         [    0,   698,    35,  ...,   189,   236,     2]],

        [[    0,  9089, 43903,  ...,    30,   389,     2],
         [    0, 28216,   852,  ...,   274,  4216,     2],
         [    0, 15228,   261,  ...,   112,     4,     2],
         ...,
         [    0, 14773,  7458,  ...,  1782,   814,     2],
         [    0, 34543,   195,  ...,  5090,   389,     2],
         [    0,   698,    35,  ...,   189,   236,     2]],

        [[    0,  9089, 43903,  ...,    30,   389,     2],
         [    0, 28216,   852,  ...,   274,  4216,     2],
         [    0, 15228,   261,  ...,   112,     4,     2],
         ...,
         [    0, 14773,  7458,  ...,  1782,   814,     2],
         [

In [17]:
print(targets.shape)
targets

tensor([[0],
        [1],
        [1],
        [2],
        [0],
        [2],
        [2],
        [1]])

In [18]:



class SiameseModel(nn.Module):
    def __init__(self, input_dim1, input_dim2, 
                 hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4,
                 num_layers1, num_layers2, output_dim1, output_dim2):
        super(SiameseModel, self).__init__()
        self.input_dim1 = input_dim1
        self.input_dim2 = input_dim2
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.hidden_dim3 = hidden_dim3
        self.hidden_dim4 = hidden_dim4
        self.num_layers1 = num_layers1
        self.num_layers2 = num_layers2
        self.output_dim1 = output_dim1
        self.output_dim2 = output_dim2
        
        

#         self.roberta = RobertaModel.from_pretrained("roberta-large").to(device)
        
        
#         self.lstm1 = nn.LSTM(input_dim1, hidden_dim1, num_layers1, batch_first=True)
#         self.lstm2 = nn.LSTM(input_dim2, hidden_dim2, num_layers2, batch_first=True)


        self.fc1 = nn.Linear(hidden_dim1, output_dim1)
        self.fc2 = nn.Linear(hidden_dim2, output_dim2)
        self.fc3 = nn.Linear(input_dim1, hidden_dim3)
#         self.fc3 = nn.Linear(output_dim1+output_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, hidden_dim4)
        self.fc5 = nn.Linear(hidden_dim4, 3)
        
        
        
    def forward(self, x1, ids, masks, token_type_ids):
        #left tower with numerical features
#         h_10 = Variable(torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1)).to(device)
#         c_10 = Variable(torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1)).to(device)
#         ula1, (h_out1, _) = self.lstm1(x1, (h_10, c_10))
#         h_out1 = h_out1.view(-1, self.hidden_dim1)
#         out1 = self.fc1(h_out1)
        
        
        

        # right tower with roberta on textual features  
        #TODO
#         batch_size_here = ids.shape[0]
#         e2 = torch.zeros(batch_size_here, max_text_per_iter,1024).to(device)
        
#         for k in range(ids.shape[1]):
#             seq_ids = ids[:,k,:]
#             seq_masks = masks[:,k,:]
#             seq_token_type_ids = token_type_ids[:,k,:]


#             e2k = self.roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
#             # print(e2.shape)
#             # print(e2k[1].shape)
#             #first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
#             # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
#             e2k1 = e2k[0][:, 0, :]  
#             e2[:,k,:] = e2k1
    
    
#         print('e2 shape: ', e2.shape)        
#         h_20 = Variable(torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2)).to(device)
#         c_20 = Variable(torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2)).to(device)
#         ula2, (h_out2, _) = self.lstm2(e2, (h_20, c_20))
#         h_out2 = h_out2.view(-1, self.hidden_dim2)
#         out2 = self.fc2(h_out2)
        
        
        
        
        # siamese merging layers
        x1 = torch.squeeze(x1[:, 0, :]) #x1[:, 0, :].squeeze()
        print('shape of x1 after squeeze:', x1.shape)
#         output = torch.cat((out1, out2),1)
#         output = F.relu(self.fc3(output))
        output = F.relu(self.fc3(x1))
        output = F.relu(self.fc4(output))
        output = self.fc5(output)
        return output
    
#TODO : correct these values
model = SiameseModel(input_dim1 = 8, input_dim2 = 1024, 
                 hidden_dim1 = 20, hidden_dim2 = 768, hidden_dim3 = 128, hidden_dim4 = 64,
                 num_layers1 = 1, num_layers2 = 1, output_dim1 = 10, output_dim2 = 256).to(device)


    
    

In [19]:
print(model)
print(len(list(model.parameters())))
# for i in range(len(list(model.parameters()))):
#     print(list(model.parameters())[i].size())

In [20]:
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)
loss_arr = np.zeros(num_epochs)

In [21]:
# roberta = RobertaModel.from_pretrained("roberta-large").to(device)
train_loss_record = []
for epoch in range(num_epochs):
    train_loss = []
    train_loss_sum = []
    
    for idx, data in tqdm(enumerate(train_loader, 0)):
        x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        ids = data['ids'].to(device, dtype = torch.long)
        masks = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        # debugging roberta encoder and second lstm
        '''
        debug starts here
        '''
        # if idx > 1:
        #     break
        # batch_size_here = data['ids'].shape[0]
        # print('batch_size_here:', batch_size_here)
        # e2 = torch.zeros(batch_size_here, max_text_per_iter, 1024)
        # print('ids shape:', ids.shape)
        
        # for k in range(ids.shape[1]):  #number of sentences in sequence = max_text_per_iter
        #     print('k:', k)
        #     seq_ids = ids[:,k,:].to(device)
        #     seq_masks = masks[:,k,:].to(device)
        #     seq_token_type_ids = token_type_ids[:,k,:].to(device)


        #     e2k = roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
        #     print(e2.shape)
        #     print(e2k[1].shape)
        #     #first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
        #     # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
        #     e2k1 = e2k[0][:, 0, :]  
        #     e2[:,k,:] = e2k1
    
    
        

        # lstm2 = nn.LSTM(1024, 768, 1, batch_first=True)
        # fc2 = nn.Linear(768, 256)

        # h_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # c_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # ula2, (h_out2, _) = lstm2(e2, (h_20, c_20))
        # h_out2 = h_out2.view(-1, 768)
        # out2 = fc2(h_out2)
        

    #     print(ids.shape)
    #     print(masks.shape)
    #     print(token_type_ids.shape)
    
        # print(out2)


        
        '''
        debug ends here
        '''
        

    
        y_pred = model(x_numerical, ids, masks, token_type_ids)
        print('y_pred:', y_pred)
        print('target:', targets)
        loss = criterion(y_pred, targets.reshape(-1))
        
         # Zero out gradient, else they will accumulate between epochs
        optimiser.zero_grad()

        # Backward pass
        loss.backward()

        # Update parameters
        optimiser.step()
        
        
        
        train_loss.append(loss.data.cpu())
        train_loss_sum.append(loss.data.cpu())
        
        if epoch % 10 == 0 and epoch !=0:
            print("Epoch ", epoch, "CELoss: ", loss.item())   

        wandb.log({'avg train loss in this batch': loss.item(), 'epoch': epoch, 'batch_id': idx})

    # Record at every epoch
    print('Train Loss at epoch {}: {}\n'.format(epoch, np.mean(train_loss_sum)))
    train_loss_record.append(np.mean(train_loss_sum))
    wandb.log({'avg train loss in this epoch': np.mean(train_loss_sum), 'epoch': epoch})
    
    
    
    
    
    
    # evaluate on test set every epoch
    tloss = []
    test_loss_sum = []
    
    correct = 0
    total = 0
    
    for idx, data in tqdm(enumerate(test_loader, 0)):
        test_x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        test_ids = data['ids'].to(device, dtype = torch.long)
        test_masks = data['mask'].to(device, dtype = torch.long)
        test_token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        test_targets = data['targets'].to(device, dtype = torch.long)
        
        y_pred = model(test_x_numerical, test_ids, test_masks, test_token_type_ids)
        _, pred_label = torch.max(y_pred.data, 1)

#         print('y_pred:', y_pred)
        test_loss = criterion(y_pred, test_targets.reshape(-1))
    
        tloss.append(test_loss.data.cpu())
        test_loss_sum.append(test_loss.data.cpu()) 

        wandb.log({'avg test loss in this batch': test_loss.item(), 'epoch': epoch, 'batch_id': idx})
        
        # Get accuracy
        total += test_targets.reshape(-1).size(0)
        correct += (pred_label == test_targets.reshape(-1)).sum()
    accuracy = 100 * correct / total

    # Record at every epoch
    print('test Loss at epoch {}: {}\n'.format(epoch, np.mean(test_loss_sum)))
    wandb.log({'avg test loss in this epoch': np.mean(test_loss_sum), 'epoch': epoch})
    wandb.log({'test accuracy in this epoch': accuracy, 'epoch': epoch})
    
    
    


            
    
        
        
        

In [22]:
# roberta = RobertaModel.from_pretrained("roberta-large").to(device)
train_loss_record = []
for epoch in range(num_epochs):
    train_loss = []
    train_loss_sum = []
    
    for idx, data in tqdm(enumerate(train_loader, 0)):
        x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        ids = data['ids'].to(device, dtype = torch.long)
        masks = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        # debugging roberta encoder and second lstm
        '''
        debug starts here
        '''
        # if idx > 1:
        #     break
        # batch_size_here = data['ids'].shape[0]
        # print('batch_size_here:', batch_size_here)
        # e2 = torch.zeros(batch_size_here, max_text_per_iter, 1024)
        # print('ids shape:', ids.shape)
        
        # for k in range(ids.shape[1]):  #number of sentences in sequence = max_text_per_iter
        #     print('k:', k)
        #     seq_ids = ids[:,k,:].to(device)
        #     seq_masks = masks[:,k,:].to(device)
        #     seq_token_type_ids = token_type_ids[:,k,:].to(device)


        #     e2k = roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
        #     print(e2.shape)
        #     print(e2k[1].shape)
        #     #first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
        #     # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
        #     e2k1 = e2k[0][:, 0, :]  
        #     e2[:,k,:] = e2k1
    
    
        

        # lstm2 = nn.LSTM(1024, 768, 1, batch_first=True)
        # fc2 = nn.Linear(768, 256)

        # h_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # c_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # ula2, (h_out2, _) = lstm2(e2, (h_20, c_20))
        # h_out2 = h_out2.view(-1, 768)
        # out2 = fc2(h_out2)
        

    #     print(ids.shape)
    #     print(masks.shape)
    #     print(token_type_ids.shape)
    
        # print(out2)


        
        '''
        debug ends here
        '''
        print('shape of x1 = x_numerical : ', x_numerical.shape)    
        y_pred = model(x_numerical, ids, masks, token_type_ids)
        print('y_pred:', y_pred)
        print('target:', targets)
        loss = criterion(y_pred, targets.reshape(-1))
        
         # Zero out gradient, else they will accumulate between epochs
        optimiser.zero_grad()

        # Backward pass
        loss.backward()

        # Update parameters
        optimiser.step()
        
        
        
        train_loss.append(loss.data.cpu())
        train_loss_sum.append(loss.data.cpu())
        
        if epoch % 10 == 0 and epoch !=0:
            print("Epoch ", epoch, "CELoss: ", loss.item())   

        wandb.log({'avg train loss in this batch': loss.item(), 'epoch': epoch, 'batch_id': idx})

    # Record at every epoch
    print('Train Loss at epoch {}: {}\n'.format(epoch, np.mean(train_loss_sum)))
    train_loss_record.append(np.mean(train_loss_sum))
    wandb.log({'avg train loss in this epoch': np.mean(train_loss_sum), 'epoch': epoch})
    
    
    
    
    
    
    # evaluate on test set every epoch
    tloss = []
    test_loss_sum = []
    
    correct = 0
    total = 0
    
    for idx, data in tqdm(enumerate(test_loader, 0)):
        test_x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        test_ids = data['ids'].to(device, dtype = torch.long)
        test_masks = data['mask'].to(device, dtype = torch.long)
        test_token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        test_targets = data['targets'].to(device, dtype = torch.long)
        
        y_pred = model(test_x_numerical, test_ids, test_masks, test_token_type_ids)
        _, pred_label = torch.max(y_pred.data, 1)

#         print('y_pred:', y_pred)
        test_loss = criterion(y_pred, test_targets.reshape(-1))
    
        tloss.append(test_loss.data.cpu())
        test_loss_sum.append(test_loss.data.cpu()) 

        wandb.log({'avg test loss in this batch': test_loss.item(), 'epoch': epoch, 'batch_id': idx})
        
        # Get accuracy
        total += test_targets.reshape(-1).size(0)
        correct += (pred_label == test_targets.reshape(-1)).sum()
    accuracy = 100 * correct / total

    # Record at every epoch
    print('test Loss at epoch {}: {}\n'.format(epoch, np.mean(test_loss_sum)))
    wandb.log({'avg test loss in this epoch': np.mean(test_loss_sum), 'epoch': epoch})
    wandb.log({'test accuracy in this epoch': accuracy, 'epoch': epoch})
    
    
    


            
    
        
        
        

In [23]:
# roberta = RobertaModel.from_pretrained("roberta-large").to(device)
train_loss_record = []
for epoch in range(num_epochs):
    train_loss = []
    train_loss_sum = []
    
    for idx, data in tqdm(enumerate(train_loader, 0)):
        x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        ids = data['ids'].to(device, dtype = torch.long)
        masks = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        # debugging roberta encoder and second lstm
        '''
        debug starts here
        '''
        # if idx > 1:
        #     break
        # batch_size_here = data['ids'].shape[0]
        # print('batch_size_here:', batch_size_here)
        # e2 = torch.zeros(batch_size_here, max_text_per_iter, 1024)
        # print('ids shape:', ids.shape)
        
        # for k in range(ids.shape[1]):  #number of sentences in sequence = max_text_per_iter
        #     print('k:', k)
        #     seq_ids = ids[:,k,:].to(device)
        #     seq_masks = masks[:,k,:].to(device)
        #     seq_token_type_ids = token_type_ids[:,k,:].to(device)


        #     e2k = roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
        #     print(e2.shape)
        #     print(e2k[1].shape)
        #     #first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
        #     # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
        #     e2k1 = e2k[0][:, 0, :]  
        #     e2[:,k,:] = e2k1
    
    
        

        # lstm2 = nn.LSTM(1024, 768, 1, batch_first=True)
        # fc2 = nn.Linear(768, 256)

        # h_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # c_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # ula2, (h_out2, _) = lstm2(e2, (h_20, c_20))
        # h_out2 = h_out2.view(-1, 768)
        # out2 = fc2(h_out2)
        

    #     print(ids.shape)
    #     print(masks.shape)
    #     print(token_type_ids.shape)
    
        # print(out2)


        
        '''
        debug ends here
        '''
        print('shape of x1 = x_numerical : ', x_numerical.shape)    
        y_pred = model(x_numerical, ids, masks, token_type_ids)
        print('y_pred:', y_pred)
        print('target:', targets)
        loss = criterion(y_pred, targets.reshape(-1))
        
         # Zero out gradient, else they will accumulate between epochs
        optimiser.zero_grad()

        # Backward pass
        loss.backward()

        # Update parameters
        optimiser.step()
        
        
        
        train_loss.append(loss.data.cpu())
        train_loss_sum.append(loss.data.cpu())
        
        if epoch % 10 == 0 and epoch !=0:
            print("Epoch ", epoch, "CELoss: ", loss.item())   

        wandb.log({'avg train loss in this batch': loss.item(), 'epoch': epoch, 'batch_id': idx})

    # Record at every epoch
    print('Train Loss at epoch {}: {}\n'.format(epoch, np.mean(train_loss_sum)))
    train_loss_record.append(np.mean(train_loss_sum))
    wandb.log({'avg train loss in this epoch': np.mean(train_loss_sum), 'epoch': epoch})
    
    
    
    
    
    
    # evaluate on test set every epoch
    print('starting testing..')
    tloss = []
    test_loss_sum = []
    
    correct = 0
    total = 0
    
    for idx, data in tqdm(enumerate(test_loader, 0)):
        test_x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        test_ids = data['ids'].to(device, dtype = torch.long)
        test_masks = data['mask'].to(device, dtype = torch.long)
        test_token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        test_targets = data['targets'].to(device, dtype = torch.long)
        
        print('shape of x1 = test_x_numerical : ', test_x_numerical.shape)    
        y_pred = model(test_x_numerical, test_ids, test_masks, test_token_type_ids)
        _, pred_label = torch.max(y_pred.data, 1)

#         print('y_pred:', y_pred)
        test_loss = criterion(y_pred, test_targets.reshape(-1))
    
        tloss.append(test_loss.data.cpu())
        test_loss_sum.append(test_loss.data.cpu()) 

        wandb.log({'avg test loss in this batch': test_loss.item(), 'epoch': epoch, 'batch_id': idx})
        
        # Get accuracy
        total += test_targets.reshape(-1).size(0)
        correct += (pred_label == test_targets.reshape(-1)).sum()
    accuracy = 100 * correct / total

    # Record at every epoch
    print('test Loss at epoch {}: {}\n'.format(epoch, np.mean(test_loss_sum)))
    wandb.log({'avg test loss in this epoch': np.mean(test_loss_sum), 'epoch': epoch})
    wandb.log({'test accuracy in this epoch': accuracy, 'epoch': epoch})
    
    
    


            
    
        
        
        

In [24]:



class SiameseModel(nn.Module):
    def __init__(self, input_dim1, input_dim2, 
                 hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4,
                 num_layers1, num_layers2, output_dim1, output_dim2):
        super(SiameseModel, self).__init__()
        self.input_dim1 = input_dim1
        self.input_dim2 = input_dim2
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.hidden_dim3 = hidden_dim3
        self.hidden_dim4 = hidden_dim4
        self.num_layers1 = num_layers1
        self.num_layers2 = num_layers2
        self.output_dim1 = output_dim1
        self.output_dim2 = output_dim2
        
        

#         self.roberta = RobertaModel.from_pretrained("roberta-large").to(device)
        
        
#         self.lstm1 = nn.LSTM(input_dim1, hidden_dim1, num_layers1, batch_first=True)
#         self.lstm2 = nn.LSTM(input_dim2, hidden_dim2, num_layers2, batch_first=True)


        self.fc1 = nn.Linear(hidden_dim1, output_dim1)
        self.fc2 = nn.Linear(hidden_dim2, output_dim2)
        self.fc3 = nn.Linear(input_dim1, hidden_dim3)
#         self.fc3 = nn.Linear(output_dim1+output_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, hidden_dim4)
        self.fc5 = nn.Linear(hidden_dim4, 3)
        
        
        
    def forward(self, x1, ids, masks, token_type_ids):
        #left tower with numerical features
#         h_10 = Variable(torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1)).to(device)
#         c_10 = Variable(torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1)).to(device)
#         ula1, (h_out1, _) = self.lstm1(x1, (h_10, c_10))
#         h_out1 = h_out1.view(-1, self.hidden_dim1)
#         out1 = self.fc1(h_out1)
        
        
        

        # right tower with roberta on textual features  
        #TODO
#         batch_size_here = ids.shape[0]
#         e2 = torch.zeros(batch_size_here, max_text_per_iter,1024).to(device)
        
#         for k in range(ids.shape[1]):
#             seq_ids = ids[:,k,:]
#             seq_masks = masks[:,k,:]
#             seq_token_type_ids = token_type_ids[:,k,:]


#             e2k = self.roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
#             # print(e2.shape)
#             # print(e2k[1].shape)
#             #first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
#             # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
#             e2k1 = e2k[0][:, 0, :]  
#             e2[:,k,:] = e2k1
    
    
#         print('e2 shape: ', e2.shape)        
#         h_20 = Variable(torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2)).to(device)
#         c_20 = Variable(torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2)).to(device)
#         ula2, (h_out2, _) = self.lstm2(e2, (h_20, c_20))
#         h_out2 = h_out2.view(-1, self.hidden_dim2)
#         out2 = self.fc2(h_out2)
        
        
        
        
        # siamese merging layers
        x1 = torch.squeeze(x1[:, 0, :], 1) #x1[:, 0, :].squeeze()
        print('shape of x1 after squeeze:', x1.shape)
#         output = torch.cat((out1, out2),1)
#         output = F.relu(self.fc3(output))
        output = F.relu(self.fc3(x1))
        output = F.relu(self.fc4(output))
        output = self.fc5(output)
        return output
    
#TODO : correct these values
model = SiameseModel(input_dim1 = 8, input_dim2 = 1024, 
                 hidden_dim1 = 20, hidden_dim2 = 768, hidden_dim3 = 128, hidden_dim4 = 64,
                 num_layers1 = 1, num_layers2 = 1, output_dim1 = 10, output_dim2 = 256).to(device)


    
    

In [25]:
print(model)
print(len(list(model.parameters())))
# for i in range(len(list(model.parameters()))):
#     print(list(model.parameters())[i].size())

In [26]:
wandb.init(project="stock_prediction")

<wandb.sdk.wandb_run.Run at 0x12aa8d3a0>

In [27]:
# roberta = RobertaModel.from_pretrained("roberta-large").to(device)
train_loss_record = []
for epoch in range(num_epochs):
    train_loss = []
    train_loss_sum = []
    
    for idx, data in tqdm(enumerate(train_loader, 0)):
        x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        ids = data['ids'].to(device, dtype = torch.long)
        masks = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        # debugging roberta encoder and second lstm
        '''
        debug starts here
        '''
        # if idx > 1:
        #     break
        # batch_size_here = data['ids'].shape[0]
        # print('batch_size_here:', batch_size_here)
        # e2 = torch.zeros(batch_size_here, max_text_per_iter, 1024)
        # print('ids shape:', ids.shape)
        
        # for k in range(ids.shape[1]):  #number of sentences in sequence = max_text_per_iter
        #     print('k:', k)
        #     seq_ids = ids[:,k,:].to(device)
        #     seq_masks = masks[:,k,:].to(device)
        #     seq_token_type_ids = token_type_ids[:,k,:].to(device)


        #     e2k = roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
        #     print(e2.shape)
        #     print(e2k[1].shape)
        #     #first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
        #     # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
        #     e2k1 = e2k[0][:, 0, :]  
        #     e2[:,k,:] = e2k1
    
    
        

        # lstm2 = nn.LSTM(1024, 768, 1, batch_first=True)
        # fc2 = nn.Linear(768, 256)

        # h_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # c_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # ula2, (h_out2, _) = lstm2(e2, (h_20, c_20))
        # h_out2 = h_out2.view(-1, 768)
        # out2 = fc2(h_out2)
        

    #     print(ids.shape)
    #     print(masks.shape)
    #     print(token_type_ids.shape)
    
        # print(out2)


        
        '''
        debug ends here
        '''
        print('shape of x1 = x_numerical : ', x_numerical.shape)    
        y_pred = model(x_numerical, ids, masks, token_type_ids)
        print('y_pred:', y_pred)
        print('target:', targets)
        loss = criterion(y_pred, targets.reshape(-1))
        
         # Zero out gradient, else they will accumulate between epochs
        optimiser.zero_grad()

        # Backward pass
        loss.backward()

        # Update parameters
        optimiser.step()
        
        
        
        train_loss.append(loss.data.cpu())
        train_loss_sum.append(loss.data.cpu())
        
        if epoch % 10 == 0 and epoch !=0:
            print("Epoch ", epoch, "CELoss: ", loss.item())   

        wandb.log({'avg train loss in this batch': loss.item(), 'epoch': epoch, 'batch_id': idx})

    # Record at every epoch
    print('Train Loss at epoch {}: {}\n'.format(epoch, np.mean(train_loss_sum)))
    train_loss_record.append(np.mean(train_loss_sum))
    wandb.log({'avg train loss in this epoch': np.mean(train_loss_sum), 'epoch': epoch})
    
    
    
    
    
    
    # evaluate on test set every epoch
    print('starting testing..')
    tloss = []
    test_loss_sum = []
    
    correct = 0
    total = 0
    
    for idx, data in tqdm(enumerate(test_loader, 0)):
        test_x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        test_ids = data['ids'].to(device, dtype = torch.long)
        test_masks = data['mask'].to(device, dtype = torch.long)
        test_token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        test_targets = data['targets'].to(device, dtype = torch.long)
        
        print('shape of x1 = test_x_numerical : ', test_x_numerical.shape)    
        y_pred = model(test_x_numerical, test_ids, test_masks, test_token_type_ids)
        _, pred_label = torch.max(y_pred.data, 1)

#         print('y_pred:', y_pred)
        test_loss = criterion(y_pred, test_targets.reshape(-1))
    
        tloss.append(test_loss.data.cpu())
        test_loss_sum.append(test_loss.data.cpu()) 

        wandb.log({'avg test loss in this batch': test_loss.item(), 'epoch': epoch, 'batch_id': idx})
        
        # Get accuracy
        total += test_targets.reshape(-1).size(0)
        correct += (pred_label == test_targets.reshape(-1)).sum()
    accuracy = 100 * correct / total

    # Record at every epoch
    print('test Loss at epoch {}: {}\n'.format(epoch, np.mean(test_loss_sum)))
    wandb.log({'avg test loss in this epoch': np.mean(test_loss_sum), 'epoch': epoch})
    wandb.log({'test accuracy in this epoch': accuracy, 'epoch': epoch})
    
    
    


            
    
        
        
        

In [28]:
# roberta = RobertaModel.from_pretrained("roberta-large").to(device)
train_loss_record = []
for epoch in range(num_epochs):
    train_loss = []
    train_loss_sum = []
    
    for idx, data in tqdm(enumerate(train_loader, 0)):
        x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        ids = data['ids'].to(device, dtype = torch.long)
        masks = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        # debugging roberta encoder and second lstm
        '''
        debug starts here
        '''
        # if idx > 1:
        #     break
        # batch_size_here = data['ids'].shape[0]
        # print('batch_size_here:', batch_size_here)
        # e2 = torch.zeros(batch_size_here, max_text_per_iter, 1024)
        # print('ids shape:', ids.shape)
        
        # for k in range(ids.shape[1]):  #number of sentences in sequence = max_text_per_iter
        #     print('k:', k)
        #     seq_ids = ids[:,k,:].to(device)
        #     seq_masks = masks[:,k,:].to(device)
        #     seq_token_type_ids = token_type_ids[:,k,:].to(device)


        #     e2k = roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
        #     print(e2.shape)
        #     print(e2k[1].shape)
        #     #first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
        #     # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
        #     e2k1 = e2k[0][:, 0, :]  
        #     e2[:,k,:] = e2k1
    
    
        

        # lstm2 = nn.LSTM(1024, 768, 1, batch_first=True)
        # fc2 = nn.Linear(768, 256)

        # h_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # c_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # ula2, (h_out2, _) = lstm2(e2, (h_20, c_20))
        # h_out2 = h_out2.view(-1, 768)
        # out2 = fc2(h_out2)
        

    #     print(ids.shape)
    #     print(masks.shape)
    #     print(token_type_ids.shape)
    
        # print(out2)


        
        '''
        debug ends here
        '''
        print('shape of x1 = x_numerical : ', x_numerical.shape)    
        y_pred = model(x_numerical, ids, masks, token_type_ids)
        print('y_pred:', y_pred)
        print('target:', targets)
        loss = criterion(y_pred, targets.reshape(-1))
        
         # Zero out gradient, else they will accumulate between epochs
        optimiser.zero_grad()

        # Backward pass
        loss.backward()

        # Update parameters
        optimiser.step()
        
        
        
        train_loss.append(loss.data.cpu())
        train_loss_sum.append(loss.data.cpu())
        
        if epoch % 10 == 0 and epoch !=0:
            print("Epoch ", epoch, "CELoss: ", loss.item())   

        wandb.log({'avg train loss in this batch': loss.item(), 'epoch': epoch, 'batch_id': idx})

    # Record at every epoch
    print('Train Loss at epoch {}: {}\n'.format(epoch, np.mean(train_loss_sum)))
    train_loss_record.append(np.mean(train_loss_sum))
    wandb.log({'avg train loss in this epoch': np.mean(train_loss_sum), 'epoch': epoch})
    
    
    
    
    
    
    # evaluate on test set every epoch
    print('starting testing..')
    tloss = []
    test_loss_sum = []
    
    correct = 0
    total = 0
    
    for idx, data in tqdm(enumerate(test_loader, 0)):
        test_x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        test_ids = data['ids'].to(device, dtype = torch.long)
        test_masks = data['mask'].to(device, dtype = torch.long)
        test_token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        test_targets = data['targets'].to(device, dtype = torch.long)
        
        print('shape of x1 = test_x_numerical : ', test_x_numerical.shape)    
        y_pred = model(test_x_numerical, test_ids, test_masks, test_token_type_ids)
        _, pred_label = torch.max(y_pred.data, 1)

#         print('y_pred:', y_pred)
        test_loss = criterion(y_pred, test_targets.reshape(-1))
    
        tloss.append(test_loss.data.cpu())
        test_loss_sum.append(test_loss.data.cpu()) 

        wandb.log({'avg test loss in this batch': test_loss.item(), 'epoch': epoch, 'batch_id': idx})
        
        # Get accuracy
        
        total += test_targets.reshape(-1).size(0)
        correct += (pred_label == test_targets.reshape(-1)).sum()
        print('in this epoch, total, correct:', total, correct)
    accuracy = 100 * correct / total

    # Record at every epoch
    print('test Loss at epoch {}: {}\n'.format(epoch, np.mean(test_loss_sum)))
    wandb.log({'avg test loss in this epoch': np.mean(test_loss_sum), 'epoch': epoch})
    wandb.log({'test accuracy in this epoch': accuracy, 'epoch': epoch})
    
    
    


            
    
        
        
        

In [29]:
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)
loss_arr = np.zeros(num_epochs)

In [30]:
wandb.init(project="stock_prediction")

<wandb.sdk.wandb_run.Run at 0x12aaa9220>

0,1
avg test loss in this batch,▃▁▄▅▃▅▁▃▄▅▃▆▅▄▁▃▁▅▆▆▅▆▁▃▅▃▃▄▃▃▅▅▃▁▇▁▅▅▁█
avg test loss in this epoch,█▇▇█▇▇█▇▇▆▂▇▇▁█▇▇▁▇▁▇▁▁▁▇▇▇▇▇▇▁▇▂▁▇▇▁▇▇▇
avg train loss in this batch,▅▃▅▁▆▆▆▃▆▄▅▆▆▆▃█▆▃▃▅▅█▃▅▆▄▄▄▃▅▆▇▆▆▅▃█▄▆▃
avg train loss in this epoch,▁▁▁▆▃▆▁▃▃▆▃▆▃▃█▃▃▃▆▃▆▆▃▁▆▃▆▃▆▆▃▃▃█▁▃▆▆▃▆
batch_id,▄▁▂▃▁▂▃▄▂▃▄▂▃▄▅▃▄▅▂▃▅▆▃▄▆▃▄▅▆▄▅▆▄▅▆▇▅▆▇█
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test accuracy in this epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
avg test loss in this batch,1.13943
avg test loss in this epoch,1.10414
avg train loss in this batch,1.1101
avg train loss in this epoch,1.11512
batch_id,7.0
epoch,19.0
test accuracy in this epoch,33.33333


In [31]:
# roberta = RobertaModel.from_pretrained("roberta-large").to(device)
train_loss_record = []
for epoch in range(num_epochs):
    train_loss = []
    train_loss_sum = []
    
    for idx, data in tqdm(enumerate(train_loader, 0)):
        x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        ids = data['ids'].to(device, dtype = torch.long)
        masks = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        # debugging roberta encoder and second lstm
        '''
        debug starts here
        '''
        # if idx > 1:
        #     break
        # batch_size_here = data['ids'].shape[0]
        # print('batch_size_here:', batch_size_here)
        # e2 = torch.zeros(batch_size_here, max_text_per_iter, 1024)
        # print('ids shape:', ids.shape)
        
        # for k in range(ids.shape[1]):  #number of sentences in sequence = max_text_per_iter
        #     print('k:', k)
        #     seq_ids = ids[:,k,:].to(device)
        #     seq_masks = masks[:,k,:].to(device)
        #     seq_token_type_ids = token_type_ids[:,k,:].to(device)


        #     e2k = roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
        #     print(e2.shape)
        #     print(e2k[1].shape)
        #     #first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
        #     # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
        #     e2k1 = e2k[0][:, 0, :]  
        #     e2[:,k,:] = e2k1
    
    
        

        # lstm2 = nn.LSTM(1024, 768, 1, batch_first=True)
        # fc2 = nn.Linear(768, 256)

        # h_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # c_20 = Variable(torch.zeros(1, e2.size(0), 768))
        # ula2, (h_out2, _) = lstm2(e2, (h_20, c_20))
        # h_out2 = h_out2.view(-1, 768)
        # out2 = fc2(h_out2)
        

    #     print(ids.shape)
    #     print(masks.shape)
    #     print(token_type_ids.shape)
    
        # print(out2)


        
        '''
        debug ends here
        '''
        print('shape of x1 = x_numerical : ', x_numerical.shape)    
        y_pred = model(x_numerical, ids, masks, token_type_ids)
        print('y_pred:', y_pred)
        print('target:', targets)
        loss = criterion(y_pred, targets.reshape(-1))
        
         # Zero out gradient, else they will accumulate between epochs
        optimiser.zero_grad()

        # Backward pass
        loss.backward()

        # Update parameters
        optimiser.step()
        
        
        
        train_loss.append(loss.data.cpu())
        train_loss_sum.append(loss.data.cpu())
        
        if epoch % 10 == 0 and epoch !=0:
            print("Epoch ", epoch, "CELoss: ", loss.item())   

        wandb.log({'avg train loss in this batch': loss.item(), 'epoch': epoch, 'batch_id': idx})

    # Record at every epoch
    print('Train Loss at epoch {}: {}\n'.format(epoch, np.mean(train_loss_sum)))
    train_loss_record.append(np.mean(train_loss_sum))
    wandb.log({'avg train loss in this epoch': np.mean(train_loss_sum), 'epoch': epoch})
    
    
    
    
    
    
    # evaluate on test set every epoch
    print('starting testing..')
    tloss = []
    test_loss_sum = []
    
    correct = 0
    total = 0
    
    for idx, data in tqdm(enumerate(test_loader, 0)):
        test_x_numerical = data['x_numerical'].to(device, dtype = torch.float)
        test_ids = data['ids'].to(device, dtype = torch.long)
        test_masks = data['mask'].to(device, dtype = torch.long)
        test_token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        test_targets = data['targets'].to(device, dtype = torch.long)
        
        print('shape of x1 = test_x_numerical : ', test_x_numerical.shape)    
        y_pred = model(test_x_numerical, test_ids, test_masks, test_token_type_ids)
        _, pred_label = torch.max(y_pred.data, 1)

#         print('y_pred:', y_pred)
        test_loss = criterion(y_pred, test_targets.reshape(-1))
    
        tloss.append(test_loss.data.cpu())
        test_loss_sum.append(test_loss.data.cpu()) 

        wandb.log({'avg test loss in this batch': test_loss.item(), 'epoch': epoch, 'batch_id': idx})
        
        # Get accuracy
        
        total += test_targets.reshape(-1).size(0)
        correct += (pred_label == test_targets.reshape(-1)).sum()
        print('in this epoch, total, correct:', total, correct)
    accuracy = 100 * correct / total

    # Record at every epoch
    print('test Loss at epoch {}: {}\n'.format(epoch, np.mean(test_loss_sum)))
    wandb.log({'avg test loss in this epoch': np.mean(test_loss_sum), 'epoch': epoch})
    wandb.log({'test accuracy in this epoch': accuracy, 'epoch': epoch})
    
    
    


            
    
        
        
        

In [32]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)

class SiameseDataloader(Dataset):
    
    def __init__(self, X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer):
        self.X_numerical_train = X_numerical_train
        self.X_text_train = X_text_train
        self.MAX_LEN = MAX_LEN
        self.tokenizer = tokenizer
        self.y_train = y_train
        
    def __getitem__(self, index):

        
        input_ids = []
        attention_masks = []
        token_type_ids = []
        input_seq = []

        for sent in self.X_text_train[index]:
            encoded_sent = self.tokenizer.encode_plus(
                text=sent,
                add_special_tokens=True,        # Add `[CLS]` and `[SEP]` special tokens
                max_length=self.MAX_LEN,             # Choose max length to truncate/pad
                pad_to_max_length=True,         # Pad sentence to max length 
                #return_attention_mask=True      # Return attention mask
                return_token_type_ids=True
                )
            input_ids.append(encoded_sent.get('input_ids'))
            attention_masks.append(encoded_sent.get('attention_mask'))
            token_type_ids.append(encoded_sent.get('token_type_ids'))

        # Convert lists to tensors
        input_ids = torch.tensor(input_ids)
        attention_masks = torch.tensor(attention_masks)
        token_type_ids = torch.tensor(token_type_ids)


        return {
            'x_numerical': aelf.X_numerical_train[index],
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(attention_masks, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.y_train[index], dtype=torch.long)
        }
    
    
    
    

    def __len__(self):
        return len(self.X_numerical_train)

In [33]:
train_set = SiameseDataloader(X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)


test_set = SiameseDataloader(X_numerical_test, y_test, X_text_test, MAX_LEN, tokenizer)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [34]:
for idx, data in tqdm(enumerate(train_loader, 0)):
    if idx > 1:
        break
        
    
    x_numerical = data['x_numerical'].to(device, dtype = torch.float)
    ids = data['ids'].to(device, dtype = torch.long)
    masks = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.long)
    

In [35]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)

class SiameseDataloader(Dataset):
    
    def __init__(self, X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer):
        self.X_numerical_train = X_numerical_train
        self.X_text_train = X_text_train
        self.MAX_LEN = MAX_LEN
        self.tokenizer = tokenizer
        self.y_train = y_train
        
    def __getitem__(index):

        
        input_ids = []
        attention_masks = []
        token_type_ids = []
        input_seq = []

        for sent in self.X_text_train[index]:
            encoded_sent = self.tokenizer.encode_plus(
                text=sent,
                add_special_tokens=True,        # Add `[CLS]` and `[SEP]` special tokens
                max_length=self.MAX_LEN,             # Choose max length to truncate/pad
                pad_to_max_length=True,         # Pad sentence to max length 
                #return_attention_mask=True      # Return attention mask
                return_token_type_ids=True
                )
            input_ids.append(encoded_sent.get('input_ids'))
            attention_masks.append(encoded_sent.get('attention_mask'))
            token_type_ids.append(encoded_sent.get('token_type_ids'))

        # Convert lists to tensors
        input_ids = torch.tensor(input_ids)
        attention_masks = torch.tensor(attention_masks)
        token_type_ids = torch.tensor(token_type_ids)


        return {
            'x_numerical': aelf.X_numerical_train[index],
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(attention_masks, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.y_train[index], dtype=torch.long)
        }
    
    
    
    

    def __len__(self):
        return len(self.X_numerical_train)

In [36]:
train_set = SiameseDataloader(X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)


test_set = SiameseDataloader(X_numerical_test, y_test, X_text_test, MAX_LEN, tokenizer)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [37]:
train_set = SiameseDataloader(X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)


test_set = SiameseDataloader(X_numerical_test, y_test, X_text_test, MAX_LEN, tokenizer)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [38]:
for idx, data in tqdm(enumerate(train_loader, 0)):
    if idx > 1:
        break
        
    
    x_numerical = data['x_numerical'].to(device, dtype = torch.float)
    ids = data['ids'].to(device, dtype = torch.long)
    masks = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.long)
    

In [39]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)

class SiameseDataloader(Dataset):
    
    def __init__(self, X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer):
        self.X_numerical_train = X_numerical_train
        self.X_text_train = X_text_train
        self.MAX_LEN = MAX_LEN
        self.tokenizer = tokenizer
        
    def __getitem__(self, index):

        
        input_ids = []
        attention_masks = []
        token_type_ids = []
        input_seq = []

        for sent in X_text_train[index]:
            encoded_sent = self.tokenizer.encode_plus(
                text=sent,
                add_special_tokens=True,        # Add `[CLS]` and `[SEP]` special tokens
                max_length=self.MAX_LEN,             # Choose max length to truncate/pad
                pad_to_max_length=True,         # Pad sentence to max length 
                #return_attention_mask=True      # Return attention mask
                return_token_type_ids=True
                )
            input_ids.append(encoded_sent.get('input_ids'))
            attention_masks.append(encoded_sent.get('attention_mask'))
            token_type_ids.append(encoded_sent.get('token_type_ids'))

        # Convert lists to tensors
        input_ids = torch.tensor(input_ids)
        attention_masks = torch.tensor(attention_masks)
        token_type_ids = torch.tensor(token_type_ids)


        return {
            'x_numerical': X_numerical_train[index],
            'ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(attention_masks, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(y_train[index], dtype=torch.long)
        }
    
    
    
    

    def __len__(self):
        return len(self.X_numerical_train)

In [40]:
train_set = SiameseDataloader(X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)


test_set = SiameseDataloader(X_numerical_test, y_test, X_text_test, MAX_LEN, tokenizer)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [41]:
for idx, data in tqdm(enumerate(train_loader, 0)):
    if idx > 1:
        break
        
    
    x_numerical = data['x_numerical'].to(device, dtype = torch.float)
    ids = data['ids'].to(device, dtype = torch.long)
    masks = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.long)
    

In [42]:
print(x_numerical.shape)
x_numerical

tensor([[[0.7056, 0.4321, 0.5952, 0.4241, 0.2539, 0.0468, 0.7715, 0.3079],
         [0.3989, 0.2382, 0.3931, 0.3906, 0.3120, 0.1181, 0.8105, 0.4438],
         [0.7271, 0.6226, 0.7925, 0.6406, 0.2598, 0.1996, 0.9521, 0.4407],
         [0.5898, 0.3608, 0.5508, 0.4673, 0.1748, 0.0541, 0.9111, 0.3813],
         [0.5610, 0.4919, 0.5488, 0.5229, 0.3037, 0.2698, 0.9233, 0.4604]],

        [[0.5996, 0.4429, 0.6064, 0.5288, 0.2595, 0.0425, 0.9399, 0.4458],
         [0.6074, 0.4851, 0.6113, 0.5342, 0.2245, 0.1251, 0.9795, 0.4658],
         [0.5903, 0.4114, 0.5547, 0.5298, 0.2646, 0.0901, 0.9443, 0.5078],
         [0.5503, 0.3628, 0.4619, 0.3594, 0.2693, 0.0499, 0.8105, 0.3350],
         [0.5088, 0.3552, 0.5752, 0.4983, 0.1159, 0.0986, 0.9790, 0.4609]],

        [[0.9736, 1.0000, 0.9937, 0.9990, 0.7095, 0.4316, 0.9209, 0.5410],
         [0.6455, 0.3428, 0.5498, 0.4832, 0.1078, 0.1014, 0.7695, 0.4150],
         [0.5815, 0.3745, 0.5830, 0.4495, 0.1290, 0.0080, 0.8062, 0.3406],
         [0.4460, 0.3

In [43]:
train_set = SiameseDataloader(X_numerical_train, y_train, X_text_train, MAX_LEN, tokenizer)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False)


test_set = SiameseDataloader(X_numerical_test, y_test, X_text_test, MAX_LEN, tokenizer)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In [44]:
for idx, data in tqdm(enumerate(train_loader, 0)):
    if idx > 1:
        break
        
    
    x_numerical = data['x_numerical'].to(device, dtype = torch.float)
    ids = data['ids'].to(device, dtype = torch.long)
    masks = data['mask'].to(device, dtype = torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.long)
    

In [45]:
print(x_numerical.shape)
x_numerical

tensor([[[0.4482, 0.3164, 0.5532, 0.4597, 0.2559, 0.1724, 0.7974, 0.3616],
         [0.3933, 0.1324, 0.3101, 0.3062, 0.2703, 0.0145, 0.6846, 0.3652],
         [0.5269, 0.3479, 0.5117, 0.3589, 0.2173, 0.0000, 0.6943, 0.2324],
         [0.5186, 0.7314, 0.6587, 0.9082, 0.3604, 0.8525, 0.9980, 1.0000],
         [0.6636, 0.3513, 0.6284, 0.3013, 0.2277, 0.4827, 0.9590, 0.5181]],

        [[0.3933, 0.1324, 0.3101, 0.3062, 0.2703, 0.0145, 0.6846, 0.3652],
         [0.5269, 0.3479, 0.5117, 0.3589, 0.2173, 0.0000, 0.6943, 0.2324],
         [0.5186, 0.7314, 0.6587, 0.9082, 0.3604, 0.8525, 0.9980, 1.0000],
         [0.6636, 0.3513, 0.6284, 0.3013, 0.2277, 0.4827, 0.9590, 0.5181],
         [0.5435, 0.3472, 0.4666, 0.4692, 0.2900, 0.4233, 0.8477, 0.5400]],

        [[0.5269, 0.3479, 0.5117, 0.3589, 0.2173, 0.0000, 0.6943, 0.2324],
         [0.5186, 0.7314, 0.6587, 0.9082, 0.3604, 0.8525, 0.9980, 1.0000],
         [0.6636, 0.3513, 0.6284, 0.3013, 0.2277, 0.4827, 0.9590, 0.5181],
         [0.5435, 0.3

In [46]:
print(ids.shape)
ids

tensor([[[    0,  9089, 43903,  ...,    30,   389,     2],
         [    0, 28216,   852,  ...,   274,  4216,     2],
         [    0, 15228,   261,  ...,   112,     4,     2],
         ...,
         [    0, 14773,  7458,  ...,  1782,   814,     2],
         [    0, 34543,   195,  ...,  5090,   389,     2],
         [    0,   698,    35,  ...,   189,   236,     2]],

        [[    0,  9089, 43903,  ...,    30,   389,     2],
         [    0, 28216,   852,  ...,   274,  4216,     2],
         [    0, 15228,   261,  ...,   112,     4,     2],
         ...,
         [    0, 14773,  7458,  ...,  1782,   814,     2],
         [    0, 34543,   195,  ...,  5090,   389,     2],
         [    0,   698,    35,  ...,   189,   236,     2]],

        [[    0,  9089, 43903,  ...,    30,   389,     2],
         [    0, 28216,   852,  ...,   274,  4216,     2],
         [    0, 15228,   261,  ...,   112,     4,     2],
         ...,
         [    0, 14773,  7458,  ...,  1782,   814,     2],
         [

In [47]:
print(targets.shape)
targets

tensor([[2],
        [2],
        [0],
        [0],
        [2],
        [2],
        [1],
        [1]])

In [48]:



class SiameseModel(nn.Module):
    def __init__(self, input_dim1, input_dim2, 
                 hidden_dim1, hidden_dim2, hidden_dim3, hidden_dim4,
                 num_layers1, num_layers2, output_dim1, output_dim2):
        super(SiameseModel, self).__init__()
        self.input_dim1 = input_dim1
        self.input_dim2 = input_dim2
        self.hidden_dim1 = hidden_dim1
        self.hidden_dim2 = hidden_dim2
        self.hidden_dim3 = hidden_dim3
        self.hidden_dim4 = hidden_dim4
        self.num_layers1 = num_layers1
        self.num_layers2 = num_layers2
        self.output_dim1 = output_dim1
        self.output_dim2 = output_dim2
        
        

#         self.roberta = RobertaModel.from_pretrained("roberta-large").to(device)
        
        
#         self.lstm1 = nn.LSTM(input_dim1, hidden_dim1, num_layers1, batch_first=True)
#         self.lstm2 = nn.LSTM(input_dim2, hidden_dim2, num_layers2, batch_first=True)


        self.fc1 = nn.Linear(hidden_dim1, output_dim1)
        self.fc2 = nn.Linear(hidden_dim2, output_dim2)
        self.fc3 = nn.Linear(input_dim1, hidden_dim3)
#         self.fc3 = nn.Linear(output_dim1+output_dim2, hidden_dim3)
        self.fc4 = nn.Linear(hidden_dim3, hidden_dim4)
        self.fc5 = nn.Linear(hidden_dim4, 3)
        
        
        
    def forward(self, x1, ids, masks, token_type_ids):
        #left tower with numerical features
#         h_10 = Variable(torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1)).to(device)
#         c_10 = Variable(torch.zeros(self.num_layers1, x1.size(0), self.hidden_dim1)).to(device)
#         ula1, (h_out1, _) = self.lstm1(x1, (h_10, c_10))
#         h_out1 = h_out1.view(-1, self.hidden_dim1)
#         out1 = self.fc1(h_out1)
        
        
        

        # right tower with roberta on textual features  
        #TODO
#         batch_size_here = ids.shape[0]
#         e2 = torch.zeros(batch_size_here, max_text_per_iter,1024).to(device)
        
#         for k in range(ids.shape[1]):
#             seq_ids = ids[:,k,:]
#             seq_masks = masks[:,k,:]
#             seq_token_type_ids = token_type_ids[:,k,:]


#             e2k = self.roberta(input_ids= seq_ids, attention_mask=seq_masks, token_type_ids=seq_token_type_ids)
#             # print(e2.shape)
#             # print(e2k[1].shape)
#             #first 0 is for last_hidden_state: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaModel.forward.returns:~:text=transformers.modeling_outputs.-,BaseModelOutputWithPoolingAndCrossAttentions%20or%20tuple(torch.FloatTensor),-A%20transformers.modeling_outputs
#             # the shape of e2k[0] is (batch_size, sequence_length (<=MAX_LEN), hidden_size (=1024))
#             e2k1 = e2k[0][:, 0, :]  
#             e2[:,k,:] = e2k1
    
    
#         print('e2 shape: ', e2.shape)        
#         h_20 = Variable(torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2)).to(device)
#         c_20 = Variable(torch.zeros(self.num_layers2, e2.size(0), self.hidden_dim2)).to(device)
#         ula2, (h_out2, _) = self.lstm2(e2, (h_20, c_20))
#         h_out2 = h_out2.view(-1, self.hidden_dim2)
#         out2 = self.fc2(h_out2)
        
        
        
        
        # siamese merging layers
        x1 = torch.squeeze(x1[:, 0, :], 1) #x1[:, 0, :].squeeze()
        print('shape of x1 after squeeze:', x1.shape)
#         output = torch.cat((out1, out2),1)
#         output = F.relu(self.fc3(output))
        output = F.relu(self.fc3(x1))
        output = F.relu(self.fc4(output))
        output = self.fc5(output)
        return output
    
#TODO : correct these values
model = SiameseModel(input_dim1 = 8, input_dim2 = 1024, 
                 hidden_dim1 = 20, hidden_dim2 = 768, hidden_dim3 = 128, hidden_dim4 = 64,
                 num_layers1 = 1, num_layers2 = 1, output_dim1 = 10, output_dim2 = 256).to(device)


    
    

In [49]:
print(model)
print(len(list(model.parameters())))
# for i in range(len(list(model.parameters()))):
#     print(list(model.parameters())[i].size())

In [50]:
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.01)
loss_arr = np.zeros(num_epochs)

In [51]:
wandb.init(project="stock_prediction")