In [None]:
from pathlib import Path
from typing import Any
import random
import datetime

from pydantic import BaseModel
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import polars as pl

import dl_engine as engine

In [None]:
# Load one daily price CSV and add a `datetime` column parsed from the
# "%Y/%m/%d" strings in `date`; the original `date` column is kept.
data_csv_path = Path("../data/daily/0000.csv")
df = pl.read_csv(data_csv_path).with_columns(
    pl.col("date").str.to_date("%Y/%m/%d").alias("datetime"),
)

In [None]:
def create_gaf_image(arr: np.ndarray) -> np.ndarray:
    """
    Encode a 1-D series as a two-channel Gramian Angular Field image.

    Method from "Deep Learning and Time Series-to-Image Encoding for
    Financial Forecasting".

    The series is min-max rescaled to [-1, 1] and every rescaled value is
    read as cos(phi). Channel 0 is the summation field cos(phi_i + phi_j),
    channel 1 is the difference field sin(phi_i - phi_j).

    Args:
        arr: 1-D array holding at least one numeric value.

    Returns:
        Array of shape (n, n, 2) with GASF in [..., 0] and GADF in [..., 1].

    Raises:
        ValueError: if ``arr`` is not 1-D or is empty.
    """
    values = np.asarray(arr)
    if values.ndim != 1:
        raise ValueError(f"arr must be 1-D, got shape {values.shape}")

    lowest, highest = values.min(), values.max()
    span = highest - lowest
    if span == 0:
        # A flat series has no range to rescale by; dividing would turn the
        # whole image into NaN. Place every point at the middle of [-1, 1].
        cos_arr = np.zeros(values.shape, dtype=float)
    else:
        cos_arr = ((values - highest) + (values - lowest)) / span

    # Clamp at zero so rounding can never hand sqrt a tiny negative number.
    sin_arr = np.sqrt(np.maximum(1 - cos_arr * cos_arr, 0))

    # cos(a + b) = cos a cos b - sin a sin b
    gasf = np.outer(cos_arr, cos_arr) - np.outer(sin_arr, sin_arr)
    # sin(a - b) = sin a cos b - cos a sin b
    gadf = np.outer(sin_arr, cos_arr) - np.outer(cos_arr, sin_arr)
    return np.stack([gasf, gadf], axis=-1)

In [None]:
def read_csv(csv_path: Path):
    """
    Read one price CSV and append a parsed ``datetime`` column.

    The ``date`` column is parsed with the ``%Y/%m/%d`` format and stored
    under the name ``datetime``; the original ``date`` column is kept.
    """
    parsed_date = pl.col("date").str.to_date("%Y/%m/%d").alias("datetime")
    return pl.read_csv(csv_path).with_columns(parsed_date)

class GAFDataloader(engine.base.BaseDataloader):
    """
    Dataloader that draws random GAF image / binary label batches.

    Every CSV in ``Params.csv_path_list`` is read once at construction and
    cut down to either the training period (rows strictly before
    ``test_start_date``) or the test period (rows selected by
    ``is_between(test_start_date, test_end_date)``).
    """

    class Params(BaseModel):
        batch_size: int = 32
        csv_path_list: list[Path] = [Path("../data/daily/0000.csv")]
        # Window length handed to sample_image; also the image side length
        # reported by output_shape.
        sample_size: int = 20
        steps_per_epoch: int = 1000
        # NOTE: today() is evaluated once, when this class body is executed,
        # not each time Params is instantiated.
        test_start_date: datetime.date = datetime.date.today()
        test_end_date: datetime.date = datetime.date.today()

    def __init__(self, params: Params, is_train: bool = True):
        """
        Load all CSVs and keep only the rows of the requested split.

        Args:
            params: loader configuration.
            is_train: True keeps rows before ``test_start_date``; False keeps
                rows between ``test_start_date`` and ``test_end_date``.
        """
        # NOTE(review): the base-class __init__ is not invoked here; confirm
        # that BaseDataloader does not require it.
        self.params = params
        self.dfs = [read_csv(p) for p in params.csv_path_list]

        if is_train:
            self.dfs = [df.filter(pl.col("datetime") < self.params.test_start_date) for df in self.dfs]
        else:
            self.dfs = [df.filter(pl.col("datetime").is_between(self.params.test_start_date, self.params.test_end_date)) for df in self.dfs]

    @property
    def steps_per_epoch(self) -> int:
        """Number of batches per epoch, taken from the parameters."""
        return self.params.steps_per_epoch

    @property
    def output_keys(self) -> list[str]:
        """Keys of the dict returned by get_next."""
        return ["input", "y_true"]

    @property
    def output_shape(self) -> list[int]:
        """Shape of a single image: (sample_size, sample_size, 2)."""
        return [self.params.sample_size, self.params.sample_size, 2]

    def get_next(self) -> dict[str, Any]:
        """
        Build one batch of ``batch_size`` randomly sampled images and labels.

        Each frame contributes a share of the batch proportional to its row
        count. The integer part of every share is taken first; the slots
        left over are handed out at random, weighted by the fractional parts.

        Returns:
            ``{"input": stacked images, "y_true": array of labels}``.

        Raises:
            ValueError: if the loader holds no rows at all, or if a frame
                is too short for sample_image to draw from.
        """
        batch_size = self.params.batch_size
        row_counts = np.array([len(df) for df in self.dfs])
        total_rows = row_counts.sum()
        if total_rows == 0:
            # Without this guard the shares below become NaN and the failure
            # surfaces as "cannot convert float NaN to integer".
            raise ValueError(
                "GAFDataloader has no rows to sample from; "
                "check csv_path_list and the configured date range"
            )

        shares = row_counts / total_rows * batch_size
        counts = [int(share) for share in shares]
        fractions = [share - count for share, count in zip(shares, counts)]
        residual = batch_size - sum(counts)

        if residual > 0:
            for idx in random.choices(range(len(self.dfs)), weights=fractions, k=residual):
                counts[idx] += 1

        images = []
        labels = []
        for df, count in zip(self.dfs, counts):
            for _ in range(count):
                image, label = sample_image(df, self.params.sample_size)
                images.append(image)
                labels.append(label)

        return {
            "input": np.stack(images, axis=0),
            "y_true": np.array(labels)
        }

def sample_image(df, sample_size, threshold=0.7):
    """
    Draw one random (GAF image, binary label) pair from a price frame.

    A target row is chosen uniformly at random so that ``sample_size`` rows
    ending at the target (inclusive) and ``sample_size`` rows after it both
    fit inside the frame. The image encodes the look-back ``close`` values.
    The label is 1.0 when

        (future_max - open[target]) / (future_max - future_min) > threshold

    where future_max / future_min are taken over the look-ahead ``close``
    values, and 0.0 otherwise.

    Args:
        df: frame with ``close`` and ``open`` columns.
        sample_size: length of both the look-back and look-ahead windows.
        threshold: cut-off applied to the ratio above.

    Returns:
        Tuple ``(image, label)`` with ``label`` a float, 0.0 or 1.0.

    Raises:
        ValueError: if the frame is too short to hold both windows.
    """
    min_idx, max_idx = sample_size, len(df) - sample_size - 1
    if max_idx < min_idx:
        # randint would fail here anyway, but with an opaque "empty range".
        raise ValueError(
            f"need at least {2 * sample_size + 1} rows to sample with "
            f"sample_size={sample_size}, got {len(df)}"
        )
    target_idx = random.randint(min_idx, max_idx)

    close = df["close"]
    arr = close[target_idx - sample_size + 1:target_idx + 1].to_numpy()
    future = close[target_idx + 1:target_idx + sample_size + 1]
    min_val, max_val = future.min(), future.max()

    image = create_gaf_image(arr)

    upside = max_val - df["open"][target_idx]
    span = max_val - min_val
    if span == 0:
        # Flat look-ahead window: the ratio has a zero denominator. Use its
        # limit instead of dividing: unbounded when there is upside, and
        # never above the threshold otherwise.
        is_positive = upside > 0
    else:
        is_positive = upside / span > threshold

    return image, float(is_positive)

In [None]:
class MSELoss(engine.base.BaseLoss):
    """
    Mean squared error with an offset, plus a running mean of the loss.

    The loss is ``mean(((|y_true - y_pred| - label_smoothing) * scale) ** 2)``.
    """

    class Params(BaseModel):
        # Offset subtracted from |y_true - y_pred| before squaring.
        label_smoothing: float = 0.05
        # Factor applied to the offset error before squaring.
        scale: float = 1.0

    def __init__(self, params: Params):
        super().__init__()
        self.params = params
        # One running mean per reported value.
        self.metrics = dict(loss=tf.keras.metrics.Mean())

    @property
    def output_keys(self) -> list[str]:
        """Keys of the dict returned by call."""
        return ["loss"]

    def update_metrics(self, data: dict[str, Any]) -> None:
        """Feed the ``loss`` entry of ``data`` into the running mean."""
        loss_tracker = self.metrics["loss"]
        loss_tracker(data["loss"])

    def get_metrics(self) -> dict[str, float]:
        """Return the current result of every tracked metric, keyed by name."""
        results = {}
        for name, metric in self.metrics.items():
            results[name] = metric.result()
        return results

    def reset_metrics(self) -> None:
        """Clear the accumulated state of the loss metric."""
        loss_tracker = self.metrics["loss"]
        loss_tracker.reset_state()

    def call(self, y_true, y_pred) -> dict[str, Any]:
        """Compute the offset squared error and return it under ``loss``."""
        # NOTE(review): y_true - y_pred relies on both having broadcast-
        # compatible shapes; confirm against the network output.
        settings = self.params
        abs_error = tf.abs(y_true - y_pred)
        shifted = (abs_error - settings.label_smoothing) * settings.scale
        return {
            "loss": tf.reduce_mean(shifted * shifted)
        }

In [None]:
# Smoke test: build the dataloader with its default parameters and pull one
# batch.
# NOTE(review): `params` and `data` are rebound to different objects by later
# cells (trainer parameters, trainer batch); re-running this cell afterwards
# overwrites those.
params = GAFDataloader.Params()
dataloader = GAFDataloader(params=params)
data = dataloader.get_next()

In [None]:
# Register the notebook-defined dataloader and loss under the names that are
# passed to get_default_params below.
engine.dataloader.dataloader_list["gaf"] = GAFDataloader
engine.losses.loss_list["mse_loss"] = MSELoss

params_path = Path("../params/20250119.json")

# Write the default trainer parameters only when the file is missing; the
# parameters are always loaded back from disk afterwards.
if not params_path.exists():
    params = engine.Trainer.Params.get_default_params(loss_name="mse_loss", dataloader_name="gaf", network_name="conv_net")
    # The params directory may not exist yet; open(..., "w") does not create it.
    params_path.parent.mkdir(exist_ok=True, parents=True)
    with open(params_path, "w", encoding="utf-8") as f:
        f.write(params.model_dump_json(indent=2))

with open(params_path, "r", encoding="utf-8") as f:
    params = engine.Trainer.Params.model_validate_json(f.read())

In [None]:
# Run directory: ../results/<params file stem>/train, created if missing.
output_root_dir = Path("../results", params_path.stem, "train")
output_root_dir.mkdir(parents=True, exist_ok=True)

# Checkpoint is given <run directory>/checkpoint; TrainLogger takes no
# arguments.
callbacks = engine.callbacks.CallbackList(
    [
        engine.callbacks.Checkpoint(output_root_dir / "checkpoint"),
        engine.callbacks.TrainLogger(),
    ]
)
trainer = engine.Trainer(params=params, callbacks=callbacks)

In [None]:
# Run training with the trainer built in the previous cell.
trainer.train()

In [None]:
# Draw one batch from the trainer's training dataloader and run the network
# on its "input" entry. This rebinds `data`.
data = trainer.train_dataloader.get_next()
y_pred = trainer.network(data["input"])

In [None]:
# Display the "y_true" entry of the batch drawn above.
data["y_true"]

In [None]:
# Display the network output for the same batch, to compare with y_true.
y_pred