In [63]:
import os
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch
import torchsort
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [2]:
folder_path = os.readlink("data/symlink")
stock = {}

for idx, path in tqdm(enumerate(glob(f"{folder_path}/*.csv"))):
    curr_df = pd.read_csv(path)
    stock_name = path.split("/")[-1].split(".")[0]

    if len(curr_df):
        stock[stock_name] = curr_df

1087it [00:04, 247.38it/s]


In [3]:
subset = stock["2330"]

In [25]:
from typing import Union


def sharpe_ratio(ret: Union[pd.Series, np.ndarray], rf: float) -> float:
    mean = ret.mean()
    std = ret.std()

    if std == 0:
        return 0

    return (mean - rf) / std


def get_next_n_day_sharpe(sequence: pd.Series, window: int):
    sharpe_ratios = [None] * len(sequence)

    for i in range(len(sequence) - window):
        next_window_ret = sequence[i + 1 : i + window + 1]
        sharpe_ratios[i] = sharpe_ratio(next_window_ret, 0)

    return pd.Series(sharpe_ratios, index=sequence.index)


def get_next_n_day_avgret(sequence: pd.Series, window: int):
    avg_rets = [None] * len(sequence)

    for i in range(len(sequence) - window):
        next_window_ret = sequence[i + 1 : i + window + 1]

        if window > 1:
            avg_rets[i] = next_window_ret.mean()
        else:
            avg_rets[i] = next_window_ret.values[0]

    return pd.Series(avg_rets).fillna(0)

In [None]:
subset["ret"] = subset["收盤價"].pct_change().fillna(0)
subset["date"] = pd.to_datetime(subset["date"])
subset["label"] = get_next_n_day_avgret(subset["ret"], 1)

In [28]:
subset.head()

Unnamed: 0,date,證券代號,開盤價,最高價,最低價,收盤價,成交股數,adj_close,ret,label
0,2007-04-23,2330,68.0,69.2,67.9,68.6,50260857.0,76.149633,0.0,0.017493
1,2007-04-24,2330,68.8,70.0,68.6,69.8,61860515.0,77.481696,0.017493,-0.007163
2,2007-04-25,2330,69.4,69.7,68.7,69.3,26492554.0,76.92667,-0.007163,0.008658
3,2007-04-26,2330,70.4,70.5,69.7,69.9,58301539.0,77.592702,0.008658,-0.012876
4,2007-04-27,2330,69.9,70.0,68.7,69.0,36622773.0,76.593654,-0.012876,-0.004348


In [29]:
# 250d train, 125d valid, 20d test
train_scale, val_scale, test_scale = 250, 20, 125
tot_scale = train_scale + val_scale + test_scale
train, val, test = [], [], []

In [38]:
feature_names = ["開盤價", "最高價", "最低價", "收盤價", "成交股數"]
label_name = ["label"]

In [157]:
first_train = subset.iloc[:train_scale][feature_names + label_name]
window = 20
window_data = []

for i in range(len(first_train) - window):
    window_data.append(first_train.iloc[i : i + window])

window_data = np.array(window_data)

In [158]:
window_data.shape

(230, 20, 6)

In [159]:
class TrainAlphaDataset(Dataset):
    def __init__(self, data: np.ndarray) -> None:
        self.data = data
        self.features = data[:, :, :-1]
        self.labels = data[:, :, -1]
        self._feature_normalization()

    def _feature_normalization(self):
        self.features = (
            self.features - self.features.mean(axis=0)
        ) / self.features.std(axis=0)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [160]:
first_train_dataset = TrainAlphaDataset(window_data)
first_train_loader = DataLoader(first_train_dataset, batch_size=1, shuffle=True)

In [161]:
from typing import Literal

import torch
from torch import nn


class ICLoss(nn.Module):
    def __init__(self, correlation_type: Literal = ["pearson", "spearman"]):
        """
        Initialize the loss function.
        :param correlation_type: Type of correlation ('pearson' or 'spearman'). Default is 'pearson'.
        """
        super().__init__()
        assert correlation_type in [
            "pearson",
            "spearman",
        ], "correlation_type must be 'pearson' or 'spearman'."
        self.correlation_type = correlation_type

    def forward(self, predictions, targets):
        """
        Compute the loss to maximize absolute Information Coefficient (IC).
        :param predictions: Tensor of model predictions (batch_size,).
        :param targets: Tensor of true values (batch_size,).
        :return: Loss value (to minimize).
        """

        if self.correlation_type == "pearson":
            pred_mean = torch.mean(predictions)
            target_mean = torch.mean(targets)
            cov = torch.sum((predictions - pred_mean) * (targets - target_mean))
            pred_std = torch.std(predictions)
            target_std = torch.std(targets)
            correlation = cov / (pred_std * target_std + 1e-6)

        elif self.correlation_type == "spearman":
            pred_rank = torchsort.soft_rank(predictions)
            target_rank = torchsort.soft_rank(targets)
            pred_n = pred_rank - pred_rank.mean()
            target_n = target_rank - target_rank.mean()
            pred_n = pred_n / pred_n.norm()
            target_n = target_n / target_n.norm()
            correlation = (pred_n * target_n).sum()

        loss = -torch.abs(correlation)
        return loss

In [162]:
class NeuralAlpha(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        return self.model(x)

In [166]:
class Args:
    def __init__(self):
        self.hidden_dim = 128
        self.output_dim = 1
        self.n_epochs = 50
        self.lr = 1e-3


args = Args()
model = NeuralAlpha(window_data.shape[-1] - 1, args.hidden_dim, args.output_dim, 0.5)
criterion = ICLoss(correlation_type="spearman")
# criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

In [168]:
for epoch in tqdm(range(args.n_epochs)):
    epoch_loss = 0
    model.train(True)
    for feature, label in first_train_loader:
        feature, label = feature.to(torch.float32).squeeze(0), label.to(torch.float32)
        optimizer.zero_grad()
        y_pred = model(feature).transpose(0, 1)
        loss = criterion(y_pred, label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f"Epoch {epoch} | Train Loss: {epoch_loss / len(first_train_loader)}")

  2%|▏         | 1/50 [00:00<00:15,  3.08it/s]

Epoch 0 | Train Loss: -0.5558868295472601


  4%|▍         | 2/50 [00:00<00:15,  3.13it/s]

Epoch 1 | Train Loss: -0.5553111881665561


  6%|▌         | 3/50 [00:00<00:14,  3.13it/s]

Epoch 2 | Train Loss: -0.5559055832660068


  8%|▊         | 4/50 [00:01<00:14,  3.15it/s]

Epoch 3 | Train Loss: -0.548792903708375


 10%|█         | 5/50 [00:01<00:14,  3.12it/s]

Epoch 4 | Train Loss: -0.5544002009798651


 12%|█▏        | 6/50 [00:01<00:13,  3.26it/s]

Epoch 5 | Train Loss: -0.5574514596682528


 14%|█▍        | 7/50 [00:02<00:12,  3.41it/s]

Epoch 6 | Train Loss: -0.5637235919985434


 16%|█▌        | 8/50 [00:02<00:11,  3.51it/s]

Epoch 7 | Train Loss: -0.5693113582613676


 18%|█▊        | 9/50 [00:02<00:11,  3.58it/s]

Epoch 8 | Train Loss: -0.56189568011981


 20%|██        | 10/50 [00:02<00:10,  3.64it/s]

Epoch 9 | Train Loss: -0.558287047210109


 22%|██▏       | 11/50 [00:03<00:10,  3.69it/s]

Epoch 10 | Train Loss: -0.5718779959675411


 24%|██▍       | 12/50 [00:03<00:10,  3.71it/s]

Epoch 11 | Train Loss: -0.5696520335081479


 26%|██▌       | 13/50 [00:03<00:09,  3.73it/s]

Epoch 12 | Train Loss: -0.5673199440960003


 28%|██▊       | 14/50 [00:04<00:09,  3.75it/s]

Epoch 13 | Train Loss: -0.5745666960011357


 30%|███       | 15/50 [00:04<00:09,  3.75it/s]

Epoch 14 | Train Loss: -0.5755582585525901


 32%|███▏      | 16/50 [00:04<00:09,  3.72it/s]

Epoch 15 | Train Loss: -0.5816632326044466


 34%|███▍      | 17/50 [00:04<00:08,  3.75it/s]

Epoch 16 | Train Loss: -0.5822327381323861


 36%|███▌      | 18/50 [00:05<00:08,  3.77it/s]

Epoch 17 | Train Loss: -0.5860198752873618


 38%|███▊      | 19/50 [00:05<00:08,  3.70it/s]

Epoch 18 | Train Loss: -0.5727129515665381


 40%|████      | 20/50 [00:05<00:08,  3.50it/s]

Epoch 19 | Train Loss: -0.5914458381902913


 42%|████▏     | 21/50 [00:05<00:08,  3.39it/s]

Epoch 20 | Train Loss: -0.5887528449377936


 44%|████▍     | 22/50 [00:06<00:08,  3.32it/s]

Epoch 21 | Train Loss: -0.5866119328562333


 46%|████▌     | 23/50 [00:06<00:08,  3.28it/s]

Epoch 22 | Train Loss: -0.5889250192629254


 48%|████▊     | 24/50 [00:06<00:08,  3.23it/s]

Epoch 23 | Train Loss: -0.6012370716051563


 50%|█████     | 25/50 [00:07<00:07,  3.21it/s]

Epoch 24 | Train Loss: -0.5935256474853857


 52%|█████▏    | 26/50 [00:07<00:07,  3.20it/s]

Epoch 25 | Train Loss: -0.604860110985844


 54%|█████▍    | 27/50 [00:07<00:07,  3.19it/s]

Epoch 26 | Train Loss: -0.6015929624357301


 56%|█████▌    | 28/50 [00:08<00:06,  3.17it/s]

Epoch 27 | Train Loss: -0.6036911980046527


 58%|█████▊    | 29/50 [00:08<00:06,  3.17it/s]

Epoch 28 | Train Loss: -0.6123349517421878


 60%|██████    | 30/50 [00:08<00:06,  3.16it/s]

Epoch 29 | Train Loss: -0.601037861722643


 62%|██████▏   | 31/50 [00:09<00:05,  3.17it/s]

Epoch 30 | Train Loss: -0.5983778388806336


 64%|██████▍   | 32/50 [00:09<00:05,  3.16it/s]

Epoch 31 | Train Loss: -0.605926869518083


 66%|██████▌   | 33/50 [00:09<00:05,  3.14it/s]

Epoch 32 | Train Loss: -0.6134305518120527


 68%|██████▊   | 34/50 [00:10<00:05,  3.14it/s]

Epoch 33 | Train Loss: -0.6127296073443216


 70%|███████   | 35/50 [00:10<00:04,  3.14it/s]

Epoch 34 | Train Loss: -0.6150567598397965


 72%|███████▏  | 36/50 [00:10<00:04,  3.14it/s]

Epoch 35 | Train Loss: -0.6135314994289175


 74%|███████▍  | 37/50 [00:11<00:04,  3.13it/s]

Epoch 36 | Train Loss: -0.6223691525059226


 76%|███████▌  | 38/50 [00:11<00:03,  3.13it/s]

Epoch 37 | Train Loss: -0.6211588457550692


 78%|███████▊  | 39/50 [00:11<00:03,  3.13it/s]

Epoch 38 | Train Loss: -0.6305402093040554


 80%|████████  | 40/50 [00:12<00:03,  3.12it/s]

Epoch 39 | Train Loss: -0.6223810281442559


 82%|████████▏ | 41/50 [00:12<00:02,  3.12it/s]

Epoch 40 | Train Loss: -0.6226992321484115


 84%|████████▍ | 42/50 [00:12<00:02,  3.12it/s]

Epoch 41 | Train Loss: -0.6388694108868747


 86%|████████▌ | 43/50 [00:12<00:02,  3.12it/s]

Epoch 42 | Train Loss: -0.6351990585741789


 88%|████████▊ | 44/50 [00:13<00:01,  3.11it/s]

Epoch 43 | Train Loss: -0.6274104411349348


 90%|█████████ | 45/50 [00:13<00:01,  3.11it/s]

Epoch 44 | Train Loss: -0.636331901051428


 92%|█████████▏| 46/50 [00:13<00:01,  3.11it/s]

Epoch 45 | Train Loss: -0.6504222961223644


 94%|█████████▍| 47/50 [00:14<00:00,  3.10it/s]

Epoch 46 | Train Loss: -0.6477749374735614


 96%|█████████▌| 48/50 [00:14<00:00,  3.09it/s]

Epoch 47 | Train Loss: -0.6559344680838125


 98%|█████████▊| 49/50 [00:14<00:00,  3.09it/s]

Epoch 48 | Train Loss: -0.6529668174684048


100%|██████████| 50/50 [00:15<00:00,  3.28it/s]

Epoch 49 | Train Loss: -0.6555453391907655





(torch.Size([1, 20]), torch.Size([20, 1]))

In [None]:
# for i in tqdm(range(0, len(subset), test_scale)):
#     if i + tot_scale > len(subset):
#         break

#     train.append(subset.iloc[i: i+train_scale])
#     val.append(subset.iloc[i+train_scale: i+train_scale+val_scale])
#     test.append(subset.iloc[i+train_scale+val_scale: i+train_scale+val_scale+test_scale])

 91%|█████████▏| 32/35 [00:00<00:00, 1926.42it/s]


In [31]:
train = np.array(train)
val = np.array(val)
test = np.array(test)

In [5]:
class AlphaDataset(Dataset):
    def __init__(self, stock_data, label):
        self.stock_data = stock_data
        self.price_features = ["開盤價", "最高價", "最低價", "收盤價"]
        self.volume_features = ["成交股數"]
        self.features = self.price_features + self.volume_features
        self.label = label
        self._normalize()

    def _normalize(self):
        self.stock_data[self.price_features] = (
            self.stock_data[self.price_features]
            .sub(self.stock_data[self.price_features].mean(axis=1), axis=0)
            .div(self.stock_data[self.price_features].std(axis=1), axis=0)
        )

        self.stock_data[self.volume_features] = (
            self.stock_data[self.volume_features]
            - self.stock_data[self.volume_features].min()
        ) / (
            self.stock_data[self.volume_features].max()
            - self.stock_data[self.volume_features].min()
        )

    def __len__(self):
        return len(self.stock_data) - 1

    def __getitem__(self, idx):
        features = self.stock_data[self.features]
        label = self.stock_data[self.label]

        return features.iloc[idx].values, label.iloc[idx]

In [49]:
class NeuralAlpha(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(),
            nn.Linear(hidden_dim, 64),
            nn.Linear(64, 32),
            nn.Linear(32, output_dim),
        )

    def forward(self, x):
        return self.model(x)

In [50]:
class Args:
    def __init__(self):
        self.hidden_dim = 128
        self.output_dim = 1
        self.n_epochs = 10
        self.lr = 1e-3
        self.batch_size = 256


args = Args()

In [51]:
train_dataset = AlphaDataset(subset_train, "next_day_ret")
train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
test_dataset = AlphaDataset(subset_test, "next_day_ret")
test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)
model = NeuralAlpha(
    input_dim=len(train_dataset.features),
    hidden_dim=args.hidden_dim,
    output_dim=args.output_dim,
    dropout=0.0,
)
criterion = nn.MSELoss()
optimier = torch.optim.Adam(model.parameters(), lr=args.lr)

In [None]:
for epoch in tqdm(range(args.n_epochs)):
    epoch_loss = 0
    for feature, label in train_loader:
        feature, label = feature.to(torch.float32), label.to(torch.float32)
        model.train()
        y_pred = model(feature).squeeze()
        loss = criterion(y_pred, label)
        loss.backward()
        nn.utils.clip_grad_value_(model.parameters(), 1)
        optimier.step()
        optimier.zero_grad()
        epoch_loss += loss.item()

        print(f"Epoch {epoch} | Train Loss: {epoch_loss}")
        # break

# model.eval()
# test_loss = 0
# for feature, label in test_loader:
#     feature, label = feature.to(torch.float32), label.to(torch.float32)
#     with torch.no_grad():
#         model.eval()
#         y_pred = model(feature).squeeze()
#         loss = criterion(y_pred, label)
#         test_loss += loss.item()

# print(f"Test Loss: {test_loss / len(test_loader)}")