In [1]:
import json
import pandas as pd
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
import torch.optim as optim
from tqdm import tqdm, trange
from transformers import AutoModel, AutoTokenizer
from torch.optim.lr_scheduler import MultiplicativeLR
import matplotlib.pyplot as plt
import os

## Device Setting & data directory

In [2]:
path = os.getcwd()
data_dir = os.path.dirname(path)
device = torch.device('cuda:0')
batch_size = 64

## Tokenizer

In [3]:
# Load_transformer
print('Loading transformers...')
transformer_tag = "activebus/BERT-DK_rest"
tokenizer = AutoTokenizer.from_pretrained(transformer_tag)
transformer = AutoModel.from_pretrained(transformer_tag, add_pooling_layer=False)
transformer.to(device)


Loading transformers...


Some weights of the model checkpoint at activebus/BERT-DK_rest were not used when initializing BertModel: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

## Load Data 

In [4]:
def load_data(split_name='train'):
    print('Loading {} data...'.format(split_name))
    df = pd.read_csv(data_dir + f'/{split_name}.csv')
    return df
df = load_data('Combined_News_DJIA')
stock_df = load_data('upload_DJIA_table')
df['price'] = stock_df.Close

train_df = df[:int(len(df) * 0.8)]
valid_df = df[int(len(df) * 0.8):]
num_train_batches = (len(train_df) + batch_size - 1) // batch_size
num_valid_batches = (len(valid_df) + batch_size - 1) // batch_size
# news = pd.read_csv("../Combined_News_DJIA.csv")

print("Loaded")
# train_df

Loading Combined_News_DJIA data...
Loading upload_DJIA_table data...
Loaded


## Model 
Pre_trained_BERT Finished <br>
Need Linear & LSTM


In [5]:
class Model(nn.Module):
    def __init__(self, hidden_size, num_linear=1):
        super(Model, self).__init__()
        self.pooler = nn.Sequential(
            nn.Linear(768, 768),
            nn.Tanh(),
        )

        self.layers = nn.ModuleList([nn.Linear(256, hidden_size)])
        self.layers.extend([nn.Linear(hidden_size, hidden_size) for i in range(num_linear)])
        self.dropout = nn.Dropout(p=0.5)

        self.output = nn.Linear(256, 1)
        self.timesteps = 60
        
        self.LayerNorm = nn.LayerNorm([64,60,769])
        
#         Add LSTM HERE
        self.LSTM = nn.LSTM(input_size=769,hidden_size=256,
                           num_layers=1,batch_first=True)
#        End LSTM HERE
    def forward(self, text_embeddings, other_inputs):
        pooled_output = self.pooler(text_embeddings)  # (batch_size, 768)
        h = torch.cat([pooled_output, other_inputs*0.00001], 1)  # (batch_size, 768 + 1)
        
        temp = torch.tensor([],dtype=torch.float32).to(device)
        temp = torch.cat((temp,h[0:self.timesteps,:]),0)

        temp_size = list(temp.shape)
        temp = temp.reshape(1,temp_size[0],temp_size[1])

        for timestep in range(1,batch_size):
            temp = torch.cat((temp, h[timestep:timestep+self.timesteps,:].reshape(1,self.timesteps,769)), 0)

        h = temp
        h = self.LayerNorm(h)

        h, (hn, cn) = self.LSTM(h, None)
        h = h[:,-1,:]

#         End LSTM HERE
        h = self.dropout(h)
        o = self.output(h)

        return o

## Train hyperparameter

In [6]:
num_epochs = 10
lr = 0.00002
criterion = nn.BCEWithLogitsLoss()

hidden_size = 128
num_linear = 1

records = []

In [7]:
def compute_predictions(outputs):
    logits = torch.sigmoid(outputs)
    threshold = 0.5
    predictions = torch.zeros(len(logits),1).to(device)
    for i in range(len(logits)):
        if logits[i] < threshold:
            predictions[i] = 0
        else:
            predictions[i] = 1
    return predictions

# Brute concatenate TopK News
def make_input_batch(i_batch, df, batch_size):
    rows_trains = df[min(i_batch*batch_size , len(df)-2*batch_size): min((i_batch+2)*batch_size-1 , len(df)-1)]
    rows_lables = df[min((i_batch+1)*batch_size , len(df)-batch_size ): min((i_batch+2)*batch_size , len(df))]
    
    text = rows_trains[rows_trains.columns.difference(['price', 'Label', 'Date'])]
    text = text.apply(lambda x :' '.join(x.astype(str)),1).tolist()
    text_inputs = tokenizer(text, padding='max_length', truncation=True, return_tensors="pt")
    text_inputs = {k : v.to(device) for k, v in text_inputs.items()}
    with torch.no_grad():
        text_embeddings = transformer(**text_inputs, return_dict=True).last_hidden_state[:,0,:]
    other_inputs = torch.tensor([rows_trains.price.tolist()], dtype=torch.float32).to(device) # (batch_size, 1)
    other_inputs = torch.transpose(other_inputs, 0, 1)
    train_labels = torch.tensor([rows_lables.Label.tolist()],dtype=torch.float32).to(device)  
    train_labels = torch.transpose(train_labels, 0, 1) # (batch_size, 1)
    #print(text_embeddings.shape)
    #print(other_inputs.shape)
    #print(train_labels.shape)
    return text_embeddings, other_inputs, train_labels

## Training

In [8]:
# save path
config = f'hid-{hidden_size}-numlin-{num_linear}'
print(config)
save_path = data_dir + '/results/' + config

# model design
model = Model(hidden_size=hidden_size, num_linear=num_linear)
# if os.path.isfile(save_path + '/model.pt'):
#     model.load_state_dict(torch.load(save_path + '/model.pt'))
model.to(device)

# Learning Rate
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
lmbda = lambda epoch: 0.95
scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)
total_acc = 0
print(total_acc)
losses, acc_train, acc_valid = [], [], []
print("Start Training")
for epoch in range(num_epochs):
    # train
    running_loss = 0.0
    total_acc = 0
    model.train()

    # shuffle train data
    train_df = train_df.sample(frac=1, random_state=epoch).reset_index(drop=True)

    for i_batch in trange(num_train_batches, desc='ep {:2d}'.format(epoch + 1)):
        text_embeddings, other_inputs, train_labels = make_input_batch(i_batch, train_df, batch_size)

        # train step
        optimizer.zero_grad()
        outputs = model(text_embeddings, other_inputs)
        
        loss = criterion(outputs, train_labels)
        loss.backward()
        optimizer.step()

        # record
        losses.append(loss.cpu().detach().item())
        running_loss += losses[-1]
        predictions = compute_predictions(outputs)
        total_acc += (predictions == train_labels).sum().item()

    print(' loss: %.6f,  train acc: %.6f' % (running_loss / len(train_df), total_acc / len(train_df)))
    acc_train.append(total_acc / len(train_df))
    #scheduler.step()

    # validate
    if epoch % 1 == 0:
        model.eval()
        total_acc = 0
        ## need to find valid data
        with torch.no_grad():
            for i_batch in trange(num_valid_batches, desc='valid'):
                text_embeddings, other_inputs, valid_labels = make_input_batch(i_batch, valid_df, batch_size)
                outputs = model(text_embeddings, other_inputs)
                predictions = compute_predictions(outputs)
                total_acc += (predictions == valid_labels).sum().item()

        print('valid acc', total_acc / len(valid_df))
        acc_valid.append(total_acc / len(valid_df))

print('Finished Training')

if not os.path.isdir(save_path):
    os.mkdir(save_path)
torch.save(model.state_dict(), save_path + '/model.pt')

record = json.dumps({
    'losses' : losses, 'acc_train' : acc_train, 'acc_valid' : acc_valid, 
    'num_epochs' : num_epochs, 'batch_size' : batch_size, 'lr' : lr, 'hidden_size':hidden_size, 'num_linear':num_linear,
    'transformer_tag' : transformer_tag
}, sort_keys=True, indent=4)
records.append(record)
with open(save_path + f'/record-{num_epochs}.json', 'w') as f:
    f.write(record)

hid-128-numlin-1


ep  1:   0%|                                                                                    | 0/25 [00:00<?, ?it/s]

0
Start Training


ep  1: 100%|███████████████████████████████████████████████████████████████████████████| 25/25 [02:11<00:00,  5.25s/it]
valid:   0%|                                                                                     | 0/7 [00:00<?, ?it/s]

 loss: 0.011072,  train acc: 0.482087


valid: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:38<00:00,  5.49s/it]
ep  2:   0%|                                                                                    | 0/25 [00:00<?, ?it/s]

valid acc 0.5477386934673367


ep  2: 100%|███████████████████████████████████████████████████████████████████████████| 25/25 [02:06<00:00,  5.06s/it]
valid:   0%|                                                                                     | 0/7 [00:00<?, ?it/s]

 loss: 0.010994,  train acc: 0.485229


valid: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:36<00:00,  5.25s/it]
ep  3:   0%|                                                                                    | 0/25 [00:00<?, ?it/s]

valid acc 0.5552763819095478


ep  3: 100%|███████████████████████████████████████████████████████████████████████████| 25/25 [02:05<00:00,  5.03s/it]
valid:   0%|                                                                                     | 0/7 [00:00<?, ?it/s]

 loss: 0.010875,  train acc: 0.521056


valid: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:35<00:00,  5.10s/it]
ep  4:   0%|                                                                                    | 0/25 [00:00<?, ?it/s]

valid acc 0.542713567839196


ep  4: 100%|███████████████████████████████████████████████████████████████████████████| 25/25 [02:05<00:00,  5.03s/it]
valid:   0%|                                                                                     | 0/7 [00:00<?, ?it/s]

 loss: 0.010938,  train acc: 0.512885


valid: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:34<00:00,  4.99s/it]
ep  5:   0%|                                                                                    | 0/25 [00:00<?, ?it/s]

valid acc 0.5804020100502513


ep  5: 100%|███████████████████████████████████████████████████████████████████████████| 25/25 [02:04<00:00,  4.99s/it]
valid:   0%|                                                                                     | 0/7 [00:00<?, ?it/s]

 loss: 0.010860,  train acc: 0.530484


valid: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:34<00:00,  4.97s/it]
ep  6:   0%|                                                                                    | 0/25 [00:00<?, ?it/s]

valid acc 0.535175879396985


ep  6: 100%|███████████████████████████████████████████████████████████████████████████| 25/25 [02:05<00:00,  5.04s/it]
valid:   0%|                                                                                     | 0/7 [00:00<?, ?it/s]

 loss: 0.010911,  train acc: 0.519170


valid: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:35<00:00,  5.10s/it]
ep  7:   0%|                                                                                    | 0/25 [00:00<?, ?it/s]

valid acc 0.5678391959798995


ep  7: 100%|███████████████████████████████████████████████████████████████████████████| 25/25 [02:06<00:00,  5.07s/it]
valid:   0%|                                                                                     | 0/7 [00:00<?, ?it/s]

 loss: 0.010914,  train acc: 0.527341


valid: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:35<00:00,  5.05s/it]
ep  8:   0%|                                                                                    | 0/25 [00:00<?, ?it/s]

valid acc 0.5778894472361809


ep  8: 100%|███████████████████████████████████████████████████████████████████████████| 25/25 [02:08<00:00,  5.14s/it]
valid:   0%|                                                                                     | 0/7 [00:00<?, ?it/s]

 loss: 0.010953,  train acc: 0.503457


valid: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:36<00:00,  5.18s/it]
ep  9:   0%|                                                                                    | 0/25 [00:00<?, ?it/s]

valid acc 0.5678391959798995


ep  9: 100%|███████████████████████████████████████████████████████████████████████████| 25/25 [02:12<00:00,  5.32s/it]
valid:   0%|                                                                                     | 0/7 [00:00<?, ?it/s]

 loss: 0.010906,  train acc: 0.521684


valid: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:39<00:00,  5.58s/it]
ep 10:   0%|                                                                                    | 0/25 [00:00<?, ?it/s]

valid acc 0.5728643216080402


ep 10: 100%|███████████████████████████████████████████████████████████████████████████| 25/25 [02:19<00:00,  5.59s/it]
valid:   0%|                                                                                     | 0/7 [00:00<?, ?it/s]

 loss: 0.010866,  train acc: 0.534884


valid: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:51<00:00,  7.40s/it]

valid acc 0.5778894472361809
Finished Training





## Record

In [9]:
records_p = data_dir + f'/results/hid-128-numlin-1/record-{num_epochs}.json'

with open(records_p, 'r') as f:
    records = json.loads(f.read())

records = [json.loads(record) for record in records]
records = {(r['hidden_size'], r['num_linear']) : r for r in records}
#records = {(records['hidden_size'], records['num_linear'])}
#hidden_sizes = list(records)[0][0]
#num_linears = list(records)[0][1]

fig, axs = plt.subplots(len(hidden_sizes), len(num_linears), sharex=True, sharey=True, figsize=(12,8))
#fig, axs = plt.subplots(len(hidden_sizes), len(num_linears), sharex=True, sharey=True, figsize=(12,8))
axs = axs if len(hidden_sizes) == 1 and len(num_linears) == 1 else axs[ihs, inl]
for ihs, hidden_size in enumerate(hidden_sizes):
    for inl, num_linear in enumerate(num_linears):
        r = records[(hidden_size, num_linear)]
        axs.plot([i for i in range(num_epochs)], r['acc_train'])
        axs.plot([i for i in range(0, num_epochs, 3)], r['acc_valid'], 'tab:orange')
        axs.set_title('hs={}, nl={} ({:.1f})'.format(hidden_size, num_linear, r['acc_valid'][-1]*100))

# for ax in axs.flat:
#     ax.set(xlabel='epochs', ylabel='accuracy')
#     # Hide x labels and tick labels for top plots and y ticks for right plots.
#     ax.label_outer()

fig.savefig(data_dir + f'/results/plt-{num_epochs}.png')
fig.show()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)