In [None]:
# 機械学習導入

In [None]:
import datetime 
import random

import numpy as np
import polars as pl
import tensorflow as tf
import matplotlib.pyplot as plt

import stock

# Directory for the generated training archives, and the archive file inside it
# named after the date on which this cell is executed (YYYYMMDD.npz).
train_data_dir = stock.PROJECT_ROOT / "data" / "train"
date_stamp = datetime.date.today().strftime("%Y%m%d")
output_file_path = train_data_dir / "{}.npz".format(date_stamp)

In [None]:
# Estimate the market capitalization from EPS and net income.
def calc_estimated_capitalization(code, current_date=None):
    """Estimate a stock's market capitalization from its latest financial report.

    The share count is approximated as ``net_income * 1000000 / eps`` using the
    last report (ordered by announce date) with ``duration == 3`` and a
    non-negligible EPS, and is then multiplied by the last close price read with
    ``end_date=current_date``.
    (net_income is presumably expressed in millions of yen — TODO confirm.)

    Args:
        code: Stock code passed to the ``stock.kabutan`` readers.
        current_date: End date for the price data. ``None`` (the default) means
            "today", resolved every time the function is called.

    Returns:
        The estimated capitalization, or ``-1`` when there is no usable
        financial report or no price row to base the estimate on.
    """
    if current_date is None:
        # Resolve per call: a `datetime.date.today()` default in the signature is
        # evaluated only once, when the defining cell runs, and would go stale
        # in a kernel that stays alive past midnight.
        current_date = datetime.date.today()

    # NOTE(review): "annoounce_date" looks misspelled; it is kept as-is because
    # it has to match the column name produced by the reader — confirm.
    fdf = stock.kabutan.read_financial_csv(code).filter(
        (pl.col("duration") == 3) & (pl.col("eps").abs() > 1e-5)
    ).sort(pl.col("annoounce_date"))
    if len(fdf) == 0:
        # No report to derive a share count from; skip reading the price data.
        return -1

    df = stock.kabutan.read_data_csv(code, end_date=current_date).sort(pl.col("date"))
    if len(df) == 0:
        # No price row: indexing the last close would fail, so report "unknown".
        return -1

    num_stock = fdf["net_income"][-1] * 1000000 / fdf["eps"][-1]
    est_capit = num_stock * df["close"][-1]
    return est_capit

In [None]:
# Prepare the training data first.
target_data_dict = {}
stacked = []
codes = stock.kabutan.get_code_list()
max_hold_days = 10

for code in codes:
    capt = calc_estimated_capitalization(code)
    # Skip stocks whose estimated capitalization exceeds 100 billion yen.
    # A result of -1 (estimate unavailable) does not exceed it, so such stocks are kept.
    if capt > 100_000_000_000:
        continue

    # Highest close over the following `max_hold_days` rows (a trailing rolling
    # max shifted back by the window length), relative to the next row's open.
    best_future_close = pl.col("close").rolling_max(window_size=max_hold_days).shift(-max_hold_days)
    next_open = pl.col("open").shift(-1)

    # NOTE(review): the logarithm below is taken of the percentage gain, which is
    # undefined for gains <= 0% — confirm this is the intended definition.
    df = (
        stock.trend_template.calc_for_watch_list(code)
        .with_columns((best_future_close / next_open).alias("growing_rate"))
        .with_columns(((pl.col("growing_rate") - 1.0) * 100).log().alias("log_growing_rate"))
    )

    # Keep the full per-code history for feature extraction; only rows flagged
    # by "watch_list" become training candidates.
    target_data_dict[code] = df
    stacked.append(df.filter(pl.col("watch_list")).with_columns(pl.lit(code).alias("code")))

stacked_df = pl.concat(stacked)

In [None]:
# Decide the date that separates the training set from the validation set.
dates = stacked_df.sort("date")["date"]
# The date found 80% of the way through the sorted candidate rows: rows up to and
# including it form the training set, later rows form the validation set.
split_index = int(len(dates) * 0.8)
split_date = dates[split_index]

on_or_before_split = pl.col("date") <= split_date
after_split = pl.col("date") > split_date
train_df = stacked_df.filter(on_or_before_split)
valid_df = stacked_df.filter(after_split)
print(
    "Split date = {}, num train = {}, num_valid = {}".format(
        split_date, len(train_df), len(valid_df)
    )
)

In [None]:
# Each sample is built from the most recent `data_days` rows of OHLC prices and volume.
data_days = 30

def get_data_list(df):
    """Build model inputs and regression targets for every usable row of ``df``.

    For each row, the history of that row's code (taken from the module-level
    ``target_data_dict``) up to and including the row's date is looked up. The
    last ``data_days`` rows of open/high/low/close are divided by the latest
    close, and the volumes by the latest volume, then concatenated into one
    vector of length ``5 * data_days``.

    Rows are skipped when fewer than ``data_days`` rows of history exist or when
    the row has no ``growing_rate`` (it is null where the look-ahead window runs
    past the end of the series).

    Args:
        df: Frame with at least "code", "date" and "growing_rate" columns.

    Returns:
        ``(inputs, targets)`` as float32 arrays. ``targets`` holds the growth in
        percent, i.e. ``(growing_rate - 1) * 100``.
    """
    input_data_list = []
    true_data_list = []

    for i in range(len(df)):
        code = df["code"][i]
        date = df["date"][i]

        # The label must come from the frame being processed. Reading it from the
        # global train_df would attach training labels to validation rows.
        growing_rate = df["growing_rate"][i]
        if growing_rate is None:
            continue

        fdf = target_data_dict[code].filter(pl.col("date") <= date)
        if len(fdf) < data_days:
            continue

        # `open_` avoids shadowing the builtin `open`.
        open_ = fdf["open"].to_numpy()[-data_days:]
        high = fdf["high"].to_numpy()[-data_days:]
        low = fdf["low"].to_numpy()[-data_days:]
        close = fdf["close"].to_numpy()[-data_days:]
        volume = fdf["volume"].to_numpy()[-data_days:]

        # Normalise prices by the latest close and volumes by the latest volume.
        data = np.concatenate([
            open_ / close[-1],
            high / close[-1],
            low / close[-1],
            close / close[-1],
            volume / volume[-1],
        ])
        input_data_list.append(data)
        true_data_list.append((growing_rate - 1.0) * 100)

    return np.array(input_data_list, dtype=np.float32), np.array(true_data_list, dtype=np.float32)

In [None]:
# Build the train/validation arrays and persist them so later runs can reload them.
train_input, train_true = get_data_list(train_df)
valid_input, valid_true = get_data_list(valid_df)

# parents=True also creates missing ancestor directories; with exist_ok alone the
# call fails with FileNotFoundError when the parent "data" directory is absent.
train_data_dir.mkdir(parents=True, exist_ok=True)
# The arrays are passed positionally, so the archive stores them as arr_0 .. arr_3.
np.savez(output_file_path, train_input, train_true, valid_input, valid_true)

In [None]:
# Display the path of the archive file for reference.
output_file_path

In [None]:
# Reload the arrays from the archive. np.load returns a lazily-read NpzFile that
# keeps the file open, so use it as a context manager and pull every array out
# while it is still open.
with np.load(output_file_path) as npz:
    train_input, train_true, valid_input, valid_true = npz["arr_0"], npz["arr_1"], npz["arr_2"], npz["arr_3"]
# Disabled alternative: binarise the targets at 20% growth up front.
# train_true = (train_true > 20).astype(np.float32)[..., None]
# valid_true = (valid_true > 20).astype(np.float32)[..., None]

In [None]:
def loss_fn(y_true, y_pred):
    """Binary cross-entropy between the labels and the model output.

    ``y_pred`` is treated as raw logits (``from_logits=True``), so no sigmoid
    must be applied before calling this function.
    """
    # Earlier mean-squared-error objective, kept for reference:
    # return tf.reduce_mean(tf.square(y_true - y_pred))
    criterion = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    loss_value = criterion(y_true, y_pred)
    return loss_value


class Model(tf.keras.Layer):
    """Small fully connected network producing one raw output value per sample.

    Two 128-unit ReLU layers are followed by a single linear unit. No output
    activation is applied, which matches the ``from_logits=True`` loss used for
    training.
    """

    def __init__(self):
        super().__init__()

        # Sub-layers applied in order by `call`.
        self.layers = [
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(1),
        ]

    def call(self, x):
        """Run ``x`` through every sub-layer in sequence and return the result.

        The forward pass is implemented in ``call`` rather than by overriding
        ``__call__``: the base class ``__call__`` wraps ``call`` with Keras's own
        bookkeeping, which an override would bypass. ``model(x)`` keeps working
        unchanged for callers.
        """
        for layer in self.layers:
            x = layer(x)
        return x
    

def grad(model: tf.keras.Layer, x: tf.Tensor, y: tf.Tensor):
    """Run one forward pass and differentiate the loss w.r.t. the model's variables.

    Args:
        model: Layer to evaluate.
        x: Input batch.
        y: Target batch passed to ``loss_fn`` together with the prediction.

    Returns:
        ``(prediction, loss, gradients)`` where ``gradients`` is aligned with
        ``model.trainable_variables``.
    """
    with tf.GradientTape() as tape:
        prediction = model(x)
        loss_value = loss_fn(y, prediction)

    # The variable list is read after the forward pass has run.
    variables = model.trainable_variables
    gradients = tape.gradient(loss_value, variables)
    return prediction, loss_value, gradients


In [None]:
# Instantiate the network and an Adam optimizer with its default settings.
model = Model()
optimizer = tf.keras.optimizers.Adam()

In [None]:
# Mini-batch settings: every batch contains a fixed share of positive samples.
batch_size = 1000
steps_per_epoch = 50000 // batch_size
pos_ratio = 0.3
pos_num = int(batch_size * pos_ratio)
neg_num = batch_size - pos_num

# The label layout is identical for every batch: pos_num ones, then neg_num zeros.
true_arr = np.concatenate([np.ones((pos_num, 1), dtype=np.float32), np.zeros((neg_num, 1), dtype=np.float32)])

# Positives: target above 20 (percent growth). Negatives: target below 10.
# Samples in between are left out of training.
pos_train_input = train_input[(train_true > 20).reshape(-1)]
neg_train_input = train_input[(train_true < 10).reshape(-1)]

# Fail early with a clear message; otherwise np.random.randint raises an opaque
# error on the first step when one of the pools is empty.
if len(pos_train_input) == 0 or len(neg_train_input) == 0:
    raise ValueError(
        "Cannot build balanced batches: {} positive and {} negative training samples".format(
            len(pos_train_input), len(neg_train_input)
        )
    )

for epoch in range(100):
    for step in range(steps_per_epoch):
        # Draw each class with replacement so the batch composition stays fixed.
        pos_input = pos_train_input[np.random.randint(len(pos_train_input), size=pos_num), :]
        neg_input = neg_train_input[np.random.randint(len(neg_train_input), size=neg_num), :]
        # Named `batch_input` so the builtin `input` is not shadowed notebook-wide.
        batch_input = np.concatenate([pos_input, neg_input])

        pred, loss, grads = grad(model, batch_input, true_arr)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print("epoch = {}, step = {:2d}, loss = {}".format(epoch, step, loss), end="\r")

In [None]:
# Run the model on the validation inputs. The output is a raw logit (the loss
# is configured with from_logits=True), so a value above 0 is taken as a
# positive prediction.
valid_pred = model(valid_input)
valid_pred_y = valid_pred > 0

In [None]:
# Evaluate on the validation samples using the same thresholds as in training:
# target above 20 counts as positive, below 10 as negative; the rest is ignored.
is_positive = valid_true > 20
is_negative = valid_true < 10

# Predicted-positive counts within each group.
num_tp = valid_pred_y[is_positive].numpy().sum()
num_fp = valid_pred_y[is_negative].numpy().sum()

recall = num_tp / is_positive.sum()
specificity = 1.0 - num_fp / is_negative.sum()
precision = num_tp / (num_tp + num_fp)

print("recall = {:.3f}, precision = {:.3f}, specificity = {:.3f}".format(recall, precision, specificity))

In [None]:
# Display the raw true-positive and false-positive counts behind the metrics.
num_tp, num_fp