In [1]:
import json
import pandas as pd
import torch
from torch import nn
import torch.optim as optim
from tqdm import tqdm, trange
from transformers import AutoModel, AutoTokenizer
from torch.optim.lr_scheduler import MultiplicativeLR
import matplotlib.pyplot as plt
import os

## Device Setting & data directory

In [2]:
path = os.getcwd()
data_dir = os.path.dirname(path)
device = torch.device('cuda:0')
batch_size = 64

## Tokenizer

In [3]:
# Load_transformer
print('Loading transformers...')
transformer_tag = "activebus/BERT-DK_rest"
tokenizer = AutoTokenizer.from_pretrained(transformer_tag)
transformer = AutoModel.from_pretrained(transformer_tag, add_pooling_layer=False)
transformer.to(device)


Loading transformers...


Some weights of the model checkpoint at activebus/BERT-DK_rest were not used when initializing BertModel: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

## Load Data 

In [4]:
def load_data(split_name='train'):
    print('Loading {} data...'.format(split_name))
    df = pd.read_csv(data_dir + f'/{split_name}.csv')
    return df
df = load_data('Combined_News_DJIA')
stock_df = load_data('upload_DJIA_table')
df['price'] = stock_df.Close

train_df = df[:int(len(df) * 0.9)]
valid_df = df[int(len(df) * 0.9):]
num_train_batches = (len(train_df) + batch_size - 1) // batch_size
num_valid_batches = (len(valid_df) + batch_size - 1) // batch_size
# news = pd.read_csv("../Combined_News_DJIA.csv")

print("Loaded")
# train_df

Loading Combined_News_DJIA data...
Loading upload_DJIA_table data...
Loaded


## Model 
Pre_trained_BERT Finished <br>
Need Linear & LSTM


In [37]:
class Model(nn.Module):
    def __init__(self, hidden_size, num_linear=1):
        super(Model, self).__init__()
        self.pooler = nn.Sequential(
            nn.Linear(768, 768),
            nn.Tanh(),
        )
        self.layers = nn.ModuleList([nn.Linear(768 + 1, hidden_size)])
        self.layers.extend([nn.Linear(hidden_size, hidden_size) for i in range(num_linear)])
        self.dropout = nn.Dropout(p=0.5)
        self.output = nn.Linear(hidden_size, 1)
    def forward(self, text_embeddings, other_inputs):
        pooled_output = self.pooler(text_embeddings)  # (batch_size, 768)
        h = torch.cat([pooled_output, other_inputs], 1)  # (batch_size, 768 + 1)
#         Add LSTM HERE
# 
# 
        for layer in self.layers:
            h = nn.functional.leaky_relu(layer(h))  # (batch_size, hidden_size)
            h = self.dropout(h)
        o = self.output(h)  # (batch_size, 1)
        return o

## Train hyperparameter

In [6]:
num_epochs = 10
lr = 0.01
criterion = nn.BCEWithLogitsLoss()

hidden_size = 128
num_linear = 1

records = []

In [7]:
def compute_predictions(outputs):
    logits = torch.sigmoid(outputs)
    threshold = 0.5
    predictions = torch.zeros(len(logits),1).to(device)
    for i in range(len(logits)):
        if logits[i] < threshold:
            predictions[i] = 0
        else:
            predictions[i] = 1
    return predictions

# Brute concatenate TopK News
def make_input_batch(i_batch, df, batch_size):
    rows = df[i_batch* batch_size : min((i_batch+1) * batch_size, len(df))]
    text = rows[rows.columns.difference(['price', 'Label', 'Date'])]
    text = text.apply(lambda x :' '.join(x.astype(str)),1).tolist()
    text_inputs = tokenizer(text, padding='max_length', truncation=True, return_tensors="pt")
    text_inputs = {k : v.to(device) for k, v in text_inputs.items()}
    with torch.no_grad():
        text_embeddings = transformer(**text_inputs, return_dict=True).last_hidden_state[:,0,:]
    other_inputs = torch.tensor([rows.price.tolist()], dtype=torch.float32).to(device) # (batch_size, 1)
    other_inputs = torch.transpose(other_inputs, 0, 1)
    train_labels = torch.tensor([rows.Label.tolist()],dtype=torch.float32).to(device)  
    train_labels = torch.transpose(train_labels, 0, 1) # (batch_size, 1)
    return text_embeddings, other_inputs, train_labels

## Training

In [38]:
# save path
config = f'hid-{hidden_size}-numlin-{num_linear}'
print(config)
save_path = data_dir + '/results/' + config

# model design
model = Model(hidden_size=hidden_size, num_linear=num_linear)
# if os.path.isfile(save_path + '/model.pt'):
#     model.load_state_dict(torch.load(save_path + '/model.pt'))
model.to(device)

# Learning Rate
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
lmbda = lambda epoch: 0.95
scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)
total_acc = 0
print(total_acc)
losses, acc_train, acc_valid = [], [], []
print("Start Training")
for epoch in range(num_epochs):
    # train
    running_loss = 0.0
    total_acc = 0
    model.train()

    # shuffle train data
    train_df = train_df.sample(frac=1, random_state=epoch).reset_index(drop=True)

    for i_batch in trange(num_train_batches, desc='ep {:2d}'.format(epoch + 1)):
        text_embeddings, other_inputs, train_labels = make_input_batch(i_batch, train_df, batch_size)

        # train step
        optimizer.zero_grad()
        outputs = model(text_embeddings, other_inputs)
        
        loss = criterion(outputs, train_labels)
        loss.backward()
        optimizer.step()

        # record
        losses.append(loss.cpu().detach().item())
        running_loss += losses[-1]
        predictions = compute_predictions(outputs)
        total_acc += (predictions == train_labels).sum().item()

    print(' loss: %.6f,  train acc: %.6f' % (running_loss / len(train_df), total_acc / len(train_df)))
    acc_train.append(total_acc / len(train_df))
    scheduler.step()

    # validate
    if epoch % 3 == 0:
        model.eval()
        total_acc = 0
        ## need to find valid data
        with torch.no_grad():
            for i_batch in trange(num_valid_batches, desc='valid'):
                text_embeddings, other_inputs, valid_labels = make_input_batch(i_batch, valid_df, batch_size)
                outputs = model(text_embeddings, other_inputs)
                predictions = compute_predictions(outputs)
                total_acc += (predictions == valid_labels).sum().item()

        print('valid acc', total_acc / len(valid_df))
        acc_valid.append(total_acc / len(valid_df))

print('Finished Training')

if not os.path.isdir(save_path):
    os.mkdir(save_path)
torch.save(model.state_dict(), save_path + '/model.pt')

record = json.dumps({
    'losses' : losses, 'acc_train' : acc_train, 'acc_valid' : acc_valid, 
    'num_epochs' : num_epochs, 'batch_size' : batch_size, 'lr' : lr, 'hidden_size':hidden_size, 'num_linear':num_linear,
    'transformer_tag' : transformer_tag
}, sort_keys=True, indent=4)
records.append(record)
with open(save_path + f'/record-{num_epochs}.json', 'w') as f:
    f.write(record)

ep  1:   0%|          | 0/28 [00:00<?, ?it/s]

hid-128-numlin-1
0
Start Training


ep  1:   4%|▎         | 1/28 [00:03<01:43,  3.82s/it]

tensor([[ 0.0528,  0.1368,  0.3317,  ..., -0.1397,  0.2129,  0.0703],
        [ 0.0669,  0.0609,  0.3438,  ..., -0.2793,  0.2101,  0.0338],
        [-0.0527,  0.0516,  0.0574,  ..., -0.1062,  0.2186,  0.2217],
        ...,
        [-0.1044,  0.1968,  0.3347,  ..., -0.3156,  0.2320,  0.2724],
        [-0.0208,  0.0211,  0.0596,  ..., -0.2565,  0.2270,  0.2994],
        [ 0.0378,  0.1423,  0.1913,  ..., -0.3356,  0.3413,  0.3159]],
       device='cuda:0')
tensor([[ 0.2056, -0.1079,  0.3447,  ..., -0.2117, -0.2746,  0.1264],
        [ 0.2381, -0.0581,  0.2658,  ..., -0.2303, -0.3174, -0.0706],
        [ 0.2331, -0.1598,  0.3548,  ..., -0.3596, -0.2658, -0.1136],
        ...,
        [ 0.1292, -0.0400,  0.1758,  ..., -0.1401, -0.3659,  0.0030],
        [ 0.2233, -0.0077,  0.2368,  ..., -0.2051, -0.2201, -0.0569],
        [ 0.1848, -0.0899,  0.2260,  ..., -0.3027, -0.2684, -0.0993]],
       device='cuda:0', grad_fn=<TanhBackward>)
tensor(32.7403, device='cuda:0',
       grad_fn=<BinaryCross

ep  1:   7%|▋         | 2/28 [00:07<01:39,  3.83s/it]

tensor([[ 0.1594,  0.1761,  0.2423,  ..., -0.1152,  0.2215,  0.2846],
        [ 0.0288,  0.1082,  0.2337,  ..., -0.2565,  0.1103,  0.1016],
        [ 0.0245,  0.1386,  0.2352,  ..., -0.1961,  0.1942,  0.0708],
        ...,
        [ 0.0473,  0.1723,  0.2108,  ..., -0.3635,  0.2265,  0.2330],
        [ 0.0869,  0.2110,  0.3642,  ..., -0.2941,  0.1863,  0.0749],
        [ 0.0328,  0.0576,  0.0036,  ..., -0.3382,  0.3109,  0.2312]],
       device='cuda:0')
tensor([[ 0.2145,  0.0138,  0.2159,  ..., -0.1874, -0.3421, -0.0052],
        [ 0.1664, -0.1602,  0.3561,  ..., -0.3209, -0.2472,  0.0517],
        [ 0.1601, -0.1517,  0.2886,  ..., -0.2874, -0.3364,  0.0330],
        ...,
        [ 0.1187, -0.0902,  0.3227,  ..., -0.2334, -0.3442, -0.0601],
        [ 0.2408, -0.0955,  0.2090,  ..., -0.2491, -0.2936,  0.0224],
        [ 0.2438,  0.0500,  0.2413,  ..., -0.1353, -0.3078, -0.0842]],
       device='cuda:0', grad_fn=<TanhBackward>)
tensor(14244.6260, device='cuda:0',
       grad_fn=<BinaryCr

ep  1:  11%|█         | 3/28 [00:11<01:36,  3.84s/it]

tensor([[-0.1467,  0.1756,  0.0402,  ..., -0.2999,  0.2433,  0.2089],
        [ 0.0563,  0.1418,  0.1762,  ..., -0.2989,  0.1550,  0.0309],
        [ 0.0502,  0.0938,  0.1868,  ..., -0.2943,  0.2382,  0.0758],
        ...,
        [-0.0173,  0.1722,  0.1765,  ..., -0.2861,  0.3367,  0.1425],
        [-0.0705,  0.1652,  0.3381,  ..., -0.2891,  0.1791,  0.1993],
        [ 0.1203,  0.0419,  0.4789,  ..., -0.0901,  0.0998,  0.1019]],
       device='cuda:0')
tensor([[ 0.1485, -0.0433,  0.3751,  ..., -0.3541, -0.3148, -0.1854],
        [ 0.1732, -0.0780,  0.3524,  ..., -0.2841, -0.2535,  0.1884],
        [ 0.1508, -0.0391,  0.2202,  ..., -0.2506, -0.3051,  0.0177],
        ...,
        [ 0.1559, -0.0663,  0.3251,  ..., -0.2631, -0.2227,  0.0214],
        [ 0.1890,  0.0935,  0.1782,  ..., -0.2004, -0.2612, -0.0571],
        [ 0.1932, -0.0474,  0.1989,  ..., -0.2249, -0.2516,  0.0493]],
       device='cuda:0', grad_fn=<TanhBackward>)
tensor(75547624., device='cuda:0',
       grad_fn=<BinaryCro

ep  1:  14%|█▍        | 4/28 [00:15<01:32,  3.84s/it]

tensor([[ 0.0231,  0.0695,  0.3479,  ..., -0.2291,  0.1403,  0.0182],
        [-0.0620,  0.2309, -0.0333,  ..., -0.4138,  0.3357,  0.1346],
        [-0.0225,  0.1090,  0.2486,  ..., -0.2101,  0.1970, -0.0440],
        ...,
        [ 0.0351, -0.0560,  0.3379,  ..., -0.0970,  0.0276,  0.1317],
        [ 0.0344,  0.0331,  0.2794,  ..., -0.2025,  0.1657,  0.1602],
        [ 0.1264,  0.0237,  0.3267,  ..., -0.1974,  0.1912,  0.1050]],
       device='cuda:0')
tensor([[ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        ...,
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.]], device='cuda:0',
       grad_fn=<TanhBackward>)
tensor(2.0270e+15, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


ep  1:  18%|█▊        | 5/28 [00:19<01:28,  3.84s/it]

tensor([[-0.0736,  0.1307,  0.2086,  ..., -0.4036,  0.2623,  0.1591],
        [ 0.0649,  0.1284,  0.0900,  ..., -0.2708,  0.2072,  0.1988],
        [ 0.0283,  0.1098,  0.2476,  ..., -0.2765,  0.2070,  0.0937],
        ...,
        [-0.0223,  0.1586,  0.1503,  ..., -0.3481,  0.3078,  0.1897],
        [-0.0507,  0.2048,  0.0374,  ..., -0.2438,  0.3101,  0.0294],
        [ 0.3009, -0.3155,  0.4283,  ...,  0.2845,  0.1838,  0.3308]],
       device='cuda:0')
tensor([[ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        ...,
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.]], device='cuda:0',
       grad_fn=<TanhBackward>)
tensor(7.0649e+29, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


ep  1:  21%|██▏       | 6/28 [00:23<01:24,  3.84s/it]

tensor([[ 0.1222, -0.0125, -0.0975,  ..., -0.1493,  0.3086,  0.3336],
        [ 0.0158,  0.0928, -0.0326,  ..., -0.3675,  0.2282,  0.2922],
        [ 0.1206, -0.0533,  0.1103,  ..., -0.0772,  0.3654,  0.2117],
        ...,
        [ 0.3317,  0.2781, -0.2040,  ..., -0.0864,  0.2765,  0.5427],
        [ 0.2506, -0.0282,  0.2501,  ..., -0.2563,  0.2408,  0.0518],
        [ 0.0643,  0.0836,  0.0956,  ..., -0.2507,  0.1690,  0.2622]],
       device='cuda:0')
tensor([[ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        ...,
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.],
        [ 1., -1.,  1.,  ...,  1., -1., -1.]], device='cuda:0',
       grad_fn=<TanhBackward>)
tensor(nan, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


ep  1:  21%|██▏       | 6/28 [00:26<01:38,  4.48s/it]


KeyboardInterrupt: 

## Record

In [None]:
records_p = data_dir + f'results/records-{num_epochs}.json'

with open(records_p, 'r') as f:
    records = json.loads(f.read())
records = [json.loads(record) for record in records]
records = {(r['hidden_size'], r['num_linear']) : r for r in records}

fig, axs = plt.subplots(len(hidden_sizes), len(num_linears), sharex=True, sharey=True, figsize=(12,8))
axs = axs if len(hidden_sizes) == 1 and len(num_linears) == 1 else axs[ihs, inl]
for ihs, hidden_size in enumerate(hidden_sizes):
    for inl, num_linear in enumerate(num_linears):
        r = records[(hidden_size, num_linear)]
        axs.plot([i for i in range(num_epochs)], r['acc_train'])
        axs.plot([i for i in range(0, num_epochs, 3)], r['acc_valid'], 'tab:orange')
        axs.set_title('hs={}, nl={} ({:.1f})'.format(hidden_size, num_linear, r['acc_valid'][-1]*100))

# for ax in axs.flat:
#     ax.set(xlabel='epochs', ylabel='accuracy')
#     # Hide x labels and tick labels for top plots and y ticks for right plots.
#     ax.label_outer()

fig.savefig(data_dir + f'results/plt-{num_epochs}.png')
fig.show()