In [1]:
from __future__ import division
from __future__ import print_function

import datetime
from glob import glob
import numpy as np
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import pandas as pd
import pickle
import random
from sklearn.metrics import (
    classification_report,
    matthews_corrcoef,
    roc_auc_score,
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
)
import time
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/mansf-stocknet")
from tqdm import tqdm

from dataloader import MixDataset
from model import MANSF
from utils import build_wiki_relation, accuracy

class Args:
    def __init__(self):
        self.no_cuda = False
        self.fastmode = False
        self.sparse = False
        self.seed = 14
        self.epochs = 50
        self.batch_size = 8
        self.lr = 5e-4
        self.weight_decay = 5e-4
        self.hidden = 64
        self.nb_heads = 8
        self.dropout = 0.38
        self.alpha = 0.2
        self.patience = 100
        self.window = 5
        self.max_tweet_num = 5
        self.max_tweet_len = 30
        self.text_ft_dim = 384

args = Args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)


def gen_dataset(dataframe_dict: dict, start_date: str, end_date: str):
    """generate data with different date range"""
    for name in dataframe_dict.keys():
#         dataframe_dict[name] = (
#             dataframe_dict[name].drop_duplicates().reset_index(drop=True)
#         )
        dataframe_dict[name] = dataframe_dict[name][
            (dataframe_dict[name]["date"].notnull())
            & (dataframe_dict[name]["date"] >= start_date)
            & (dataframe_dict[name]["date"] <= end_date)
        ].reset_index(drop=True)
    prices, tweets = dataframe_dict["price"], dataframe_dict["tweet"]
    prices = prices.sort_values("date").reset_index(drop=True)
    tweets = (
        tweets.groupby(["stock", "date"], as_index=False)
        .agg({"text": "\n".join})
        .fillna("")
        .reset_index(drop=True)
    )
    mix = (
        pd.merge(prices, tweets, on=["stock", "date"], how="left")
        .fillna("")
        .reset_index(drop=True)
    )
    mix_pv = pd.pivot(mix, index="date", columns="stock").reset_index()
    mix_pv["text"] = mix_pv["text"].fillna("")
    for col in ["movement_perc", "high", "low", "close"]:
        mix_pv[col] = mix_pv[col].fillna(0.0)
    return mix_pv


stock_dir = "stocknet-dataset/price/preprocessed"
tweet_dir = "stocknet-dataset/tweet_preprocessed"


rels_dir = "Temporal_Relational_Stock_Ranking/data/relation/wikidata"
market_names = ["NASDAQ", "NYSE"]

args.stock_list = sorted(
    list(
        set(
            [path.split("/")[-1].replace(".txt", "") for path in glob(stock_dir + "/*")]
        )
        & set([path.split("/")[-1].split('.')[0] for path in glob(tweet_dir + "/*")])
    )
)
args.n_stock = len(args.stock_list)  # the number of stocks
n_day = 5  # the backward-looking window T
n_tweet = 5  # max num of tweets per day, I suppose 1 tweet per stock per day
n_price_feat = 3  # price feature dim  (normalized high/low/close)
n_tweet_feat = 384  # text embedding dim

prices, tweets = pd.DataFrame(), pd.DataFrame()
for stock in tqdm(args.stock_list):
    _p = pd.read_table(os.path.join(stock_dir, f"{stock}.txt"), header=None)
    _t = pd.read_json(os.path.join(tweet_dir, f"{stock}.json"), orient="records", lines=True)
    
    _t['text'] = [' '.join(map(str, l)) for l in _t['text']]
    
    _p["stock"], _t["stock"] = stock, stock
    prices, tweets = pd.concat([prices, _p]), pd.concat([tweets, _t])

prices.columns = [
    "date",
    "movement_perc",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "stock",
]
prices = prices.drop(["open", "volume"], axis=1)
tweets["date"] = tweets.created_at.apply(lambda x: x.date())
tweets.date = tweets.date.astype(str)

mix_pv_train = gen_dataset(
    dataframe_dict={"price": prices, "tweet": tweets},
    start_date="2014-01-01",
    end_date="2015-06-30",
)
mix_pv_val = gen_dataset(
    dataframe_dict={"price": prices, "tweet": tweets},
    start_date="2015-07-01",
    end_date="2015-12-31",
)
mix_pv_test = gen_dataset(
    dataframe_dict={"price": prices, "tweet": tweets},
    start_date="2016-01-01",
    end_date="2016-03-31",
)

# preprocess relation data & load
adj = build_wiki_relation(rels_dir, market_names, args.stock_list)
edge_index = torch.index_select(torch.nonzero(adj).t(), 0, torch.tensor([0, 1]))
edge_type = torch.index_select(torch.nonzero(adj).t(), 0, torch.tensor([2])).squeeze(0)

model = MANSF(
    nfeat=64,
    nhid=args.hidden,
    nrel=adj.shape[2],
    nclass=2,
    dropout=args.dropout,
    nheads=args.nb_heads,
    alpha=args.alpha,
    stock_num=args.n_stock,
    text_ft_dim=args.text_ft_dim,
)
# model = nn.DataParallel(model, device_ids=[0, 1])
if args.cuda:
    model.cuda()
    edge_index = edge_index.type(torch.LongTensor).cuda()
    edge_type = edge_type.type(torch.LongTensor).cuda()
    # adj = adj.type(torch.LongTensor).cuda()
    args.device = "cuda"

trainset = MixDataset(
    mode="train",
    data=mix_pv_train,
    window_num=args.window,
    max_tweet_num=args.max_tweet_num,
    max_tweet_len=args.max_tweet_len,
    stock_list=args.stock_list,
)
valset = MixDataset(
    mode="val",
    data=mix_pv_val,
    window_num=args.window,
    max_tweet_num=args.max_tweet_num,
    max_tweet_len=args.max_tweet_len,
    stock_list=args.stock_list,
)
testset = MixDataset(
    mode="test",
    data=mix_pv_test,
    window_num=args.window,
    max_tweet_num=args.max_tweet_num,
    max_tweet_len=args.max_tweet_len,
    stock_list=args.stock_list,
)
trainsampler = RandomSampler(trainset)
valsampler = RandomSampler(valset)
testsampler = RandomSampler(testset)
trainloader = DataLoader(
    trainset, sampler=trainsampler, batch_size=args.batch_size, drop_last=True
)
valloader = DataLoader(
    valset, sampler=valsampler, batch_size=args.batch_size, drop_last=True
)
testloader = DataLoader(
    testset, sampler=testsampler, batch_size=args.batch_size, drop_last=True
)
print(len(trainset), len(valset), len(testset))

# train(args, model, trainloader, valloader, edge_index, edge_type)
# print("Optimization Finished!")
# results = test_dict()
# print(results)


100%|███████████████████████████████████████████| 87/87 [00:15<00:00,  5.67it/s]


['CHTR', 'MDT', 'RDS-B', 'BRK-A', 'AGFS', 'BABA']
['unknown', 'unknown', 'unknown', 'unknown', 'unknown']
#tickers selected: (87, 2)
#paths selected: 114
#connection items: 1108
#tickers aligned: 68
#valid paths: 57
connections count: 732 ratio: 0.09671026555687673
(87, 87, 58)
376 128 61


In [4]:
model.train()
optimizer = optim.Adam(
    model.parameters(), lr=args.lr, weight_decay=args.weight_decay
)
loss_fn = nn.BCELoss().cuda()
for i, data in enumerate(tqdm(trainloader, desc="Training")):
    text_input, price_input, label_input = data
    break


Collect from all stocks: 100%|██████████| 86/86 [00:03<00:00, 22.24it/s]
Collect from all stocks: 100%|██████████| 86/86 [00:03<00:00, 22.21it/s]
Collect from all stocks: 100%|██████████| 86/86 [00:03<00:00, 23.01it/s]
Collect from all stocks: 100%|██████████| 86/86 [00:03<00:00, 22.53it/s]
Collect from all stocks: 100%|██████████| 86/86 [00:03<00:00, 23.33it/s]
Collect from all stocks: 100%|██████████| 86/86 [00:03<00:00, 22.55it/s]
Collect from all stocks: 100%|██████████| 86/86 [00:03<00:00, 22.00it/s]
Collect from all stocks: 100%|██████████| 86/86 [00:03<00:00, 22.25it/s]
Training:   0%|          | 0/47 [00:30<?, ?it/s]


In [5]:
nfeat=64
nhid=args.hidden
nrel=adj.shape[2]
nclass=2
dropout=args.dropout
nheads=args.nb_heads
alpha=args.alpha
stock_num=args.n_stock
text_ft_dim=args.text_ft_dim
# layer func
from torch_geometric.nn import RGATConv
class gru(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(gru, self).__init__()
        self.gru1 = nn.GRU(
            input_size=input_size, hidden_size=hidden_size, batch_first=True
        )

    def forward(self, inputs):
        full, last = self.gru1(inputs)
        return full, last


class attn(nn.Module):
    def __init__(self, in_shape, out_shape):
        super(attn, self).__init__()
        self.W1 = nn.Linear(in_shape, out_shape)
        self.W2 = nn.Linear(in_shape, out_shape)
        self.V = nn.Linear(in_shape, 1)

    def forward(self, full, last):
        score = self.V(torch.tanh(self.W1(last) + self.W2(full)))
        attention_weights = F.softmax(score, dim=1)
        context_vector = attention_weights * full
        context_vector = torch.sum(context_vector, dim=1)
        return context_vector

# layers 
grup = [gru(3, nhid).cuda() for _ in range(stock_num)]
attnp = [attn(nhid, nhid).cuda() for _ in range(stock_num)]
tweet_gru = [gru(text_ft_dim, nhid).cuda() for _ in range(stock_num)]
grut = [gru(nhid, nhid).cuda() for _ in range(stock_num)]
attn_tweet = [attn(nhid, nhid).cuda() for _ in range(stock_num)]
attnt = [attn(nhid, nhid).cuda() for _ in range(stock_num)]
bilinear = [nn.Bilinear(nhid, nhid, nhid).cuda() for _ in range(stock_num)]
linear_x = nn.Linear(nhid, 2).cuda()
dropout_ratio = dropout
attentions = RGATConv(nfeat, nhid, nrel, dropout=dropout_ratio, alpha=alpha, concat=True).cuda()
out_att = RGATConv(nhid, nclass, nrel, dropout=dropout_ratio, alpha=alpha, concat=False).cuda()
out_classify = nn.Softmax(dim=1).cuda()


In [6]:
li = []
batch_size = text_input.size(0)
num_tw, tw_ft_dim = text_input.size(3), text_input.size(4)
num_d = price_input.size(2)
pr_ft = price_input.size(3)
num_stocks = price_input.size(1)

for i in range(num_stocks):  # n_stock
    # price data
    x = grup[i](
        price_input[:, i, :, :].reshape((batch_size, num_d, pr_ft)).float()
    )  # [[(b, 5, 64), (1, b, 64)], [(b, 5, 64), (1, b, 64)], ,,,]
    x = (x[0], x[1].reshape((batch_size, 1, nhid)))
    x = attnp[i](*x).reshape((batch_size, nhid))  # (b, 64)
    # x = layer_normp[i](x).reshape(batch_size, 64)
    han_li1 = []
    for j in range(num_d):  # n_day
        # tweet of each day
        y = tweet_gru[i](
            text_input[:, i, j, :, :].reshape((batch_size, num_tw, tw_ft_dim))
        )  # [(b, num_tw, emb_dim), (1, b, emb_dim),...]
        y = (y[0], y[1].reshape((batch_size, 1, nhid)))
        y = attn_tweet[i](*y).reshape((batch_size, nhid))  # (b, 64)
        han_li1.append(y)
    # tweets in window days
    # news_vector = torch.Tensor((batch_size, num_d, 64))
    news_vector = torch.cat(han_li1)  # (b * 5, 64)
    text = grut[i](
        news_vector.reshape(batch_size, num_d, nhid)
    )  # [(b, 5, 64), (1, b, 64),...]
    text = (text[0], text[1].reshape((batch_size, 1, nhid)))
    text = attnt[i](*text).reshape((batch_size, nhid))  # (b, 64)
    # tweet X price
    combined = torch.tanh(
        bilinear[i](text, x).reshape((batch_size, nhid))
    )  # (b, 64)
    li.append(combined.reshape(batch_size, nhid))

ft_vec = torch.cat(li).reshape((batch_size, num_stocks, nhid))  # (b, n_stock, 64)
out_1 = torch.tanh(linear_x(ft_vec))  # (b, n_stock, 2)
x = F.dropout(ft_vec, dropout_ratio)  # (b, n_stock, 64)

x, att1 = attentions(x[0], edge_index, edge_type, return_attention_weights=True)  # (n_stock, 64)
x = F.dropout(x, dropout_ratio)  # (n_stock, 64)
x, att2 = out_att(x, edge_index, edge_type, return_attention_weights=True)
x = F.elu(x)  # (n_stock, 2)
res = out_classify(x + out_1)  # (n_stock, 2)


In [18]:
att2[1].size(), edge_index.size()
att2[1]

tensor([[0.0974],
        [0.0861],
        [0.0876],
        [0.0814],
        [0.0809],
        [0.0639],
        [0.0629],
        [0.0560],
        [0.0609],
        [0.0549],
        [1.0000],
        [0.0461],
        [0.0449],
        [0.0388],
        [0.0544],
        [0.1344],
        [0.0551],
        [0.0656],
        [0.1377],
        [0.1391],
        [0.0372],
        [0.0329],
        [0.0433],
        [0.0454],
        [0.0403],
        [0.0462],
        [0.1174],
        [0.0514],
        [0.0661],
        [0.0299],
        [0.0331],
        [0.0551],
        [1.0000],
        [0.1259],
        [0.0816],
        [0.1120],
        [0.1105],
        [0.0828],
        [0.1134],
        [0.0620],
        [0.0545],
        [0.1230],
        [0.0729],
        [0.1087],
        [0.0783],
        [0.0860],
        [0.1075],
        [0.0834],
        [0.0656],
        [0.0610],
        [0.0619],
        [0.0582],
        [0.0536],
        [0.0547],
        [0.2636],
        [0

In [67]:
from torch_geometric.nn import global_mean_pool


class BatchRGAT(torch.nn.Module):
    def __init__(self, nfeat, nhid, nrel, nclass, dropout, alpha):
        super(BatchRGAT, self).__init__()
        torch.manual_seed(999)
        self.conv1 = RGATConv(
            nfeat, nhid, nrel, dropout=dropout, alpha=alpha, concat=True
        )
        self.conv2 = RGATConv(
            nhid, nclass, nrel, dropout=dropout, alpha=alpha, concat=False
        )
        self.dropout_ratio = dropout
        self.out_classify = nn.Softmax(dim=1)
    
    def forward(self, x, out_1, edge_index, edge_type, batch):
        x = self.conv1(x, edge_index, edge_type)
        x = F.dropout(x, self.dropout_ratio)
        x = self.conv2(x, edge_index, edge_type)
        x = F.elu(x)
        x = global_mean_pool(x, batch)
        x = F.dropout(x, p=0.5, training=self.training)
        return self.out_classify(x + out_1)


batch_rgat = BatchRGAT(nfeat, nhid, nrel, nclass, dropout=dropout_ratio, alpha=alpha).cuda()


In [56]:
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

middle_loader = DataLoader([
    Data(x=x[i], edge_index=edge_index, edge_type=edge_type) 
    for i in range(batch_size)
], batch_size=batch_size)

In [86]:
# for d in tqdm(middle_loader):
#     g_out = batch_rgat(d.x, out_1, d.edge_index, d.edge_type, d.batch)
d.x.size()

torch.Size([688, 64])

In [None]:
def train(args, model, trainloader, valloader, edge_index, edge_type):
    model.train()
    optimizer = optim.Adam(
        model.parameters(), lr=args.lr, weight_decay=args.weight_decay
    )
    loss_fn = nn.BCELoss().cuda()
    for epoch in tqdm(range(args.epochs), desc="Epochs"):
        loss_train = 0.0
        metrics = {k: [] for k in ["accuracy", "roc_auc", "f1", "recall", "precision"]}
        for i, data in enumerate(tqdm(trainloader, desc="Training")):
            text_tensor, price_tensor, label_tensor = data
            optimizer.zero_grad()
            # forward pass
            output = model(text_tensor, price_tensor, edge_index, edge_type)
            _output, _label = (
                output.argmax(axis=1).unsqueeze(1).float(),
                label_tensor.squeeze(0).float(),
            )
            _loss_train = Variable(loss_fn(_output, _label), requires_grad=True)
            metrics["accuracy"].append(accuracy_score(_output.cpu(), _label.cpu()))
            metrics["roc_auc"].append(roc_auc_score(_output.cpu(), _label.cpu()))
            metrics["f1"].append(f1_score(_output.cpu(), _label.cpu()))
            metrics["recall"].append(recall_score(_output.cpu(), _label.cpu()))
            metrics["precision"].append(precision_score(_output.cpu(), _label.cpu()))
            # backward
            _loss_train.backward()
            optimizer.step()
            loss_train += _loss_train.mean().item()
            torch.cuda.empty_cache()
        # 每 epoch 計算分類準確率
        # trainset
        print("[Epoch %s] Train Loss: %.3f \n" % (epoch, loss_train))
        # _, labelss, metrics = get_predictions(clf, trainloader, compute_acc=True, compute_loss=False)
        print(
            "    Train F1: %.3f \n" % (np.mean(metrics["f1"])),
            "    Train Recall: %.3f \n" % (np.mean(metrics["recall"])),
            "    Train Precision: %.3f \n" % (np.mean(metrics["precision"])),
            "    Train ROC AUC: %.3f \n " % (np.mean(metrics["roc_auc"])),
            "    Train Accuracy: %.3f \n " % (np.mean(metrics["accuracy"])),
        )
        # valset
        _, val_labelss, val_metrics, loss_val = get_predictions(
            model, valloader, edge_index, edge_type, validation=True
        )
        print("[Epoch %s] Val Loss: %.3f \n" % (epoch, loss_val))
        print(
            "    Val F1: %.3f \n" % (np.mean(val_metrics["f1"])),
            "    Val Recall: %.3f \n" % (np.mean(val_metrics["recall"])),
            "    Val Precision: %.3f \n" % (np.mean(val_metrics["precision"])),
            "    Val ROC AUC: %.3f \n " % (np.mean(val_metrics["roc_auc"])),
            "    Val Accuracy: %.3f \n " % (np.mean(val_metrics["accuracy"])),
        )
        writer.add_scalars(
            "Training vs. Validation Loss",
            {"Training": loss_train, "Validation": loss_val},
            epoch * len(trainloader) + i,
        )
        for name in metrics:
            writer.add_scalars(
                f"Training vs. Validation {name.upper()}",
                {
                    "Training": np.mean(metrics[name]),
                    "Validation": np.mean(val_metrics[name]),
                },
                epoch * len(trainloader) + i,
            )
        torch.save(model.state_dict(), "finetuned/mansf-stocknet.pth")
        writer.flush()

