# 線形回帰モデルの作成

In [None]:
from pathlib import Path
import csv
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import stock

In [None]:
# Build the dataset from a default-constructed parameter object.
dataset_params = stock.dl.dataset.DatasetParams()
dataset = stock.dl.dataset.Dataset(dataset_params)

# Keep notebook-level handles on the per-market arrays and symbol lists.
us_data, jp_data = dataset.us_data, dataset.jp_data
us_symbols, jp_symbols = dataset.us_symbols, dataset.jp_symbols

# Quick sanity print: array shapes next to the number of symbols per market.
print(
    "{}, {}, len(us_symbols) = {}, len(jp_symbols) = {}".format(
        us_data.shape,
        jp_data.shape,
        len(us_symbols),
        len(jp_symbols),
    )
)

## データの整合性を確認（ソースコードが間違っていないことの確認）

In [None]:
# Re-derive the relative change of the first JP symbol straight from its raw
# CSV so it can be compared against what the Dataset object produced.
symbol = jp_symbols[0]

csv_path = dataset_params.jp_data_dir / f"{symbol}.csv"
assert csv_path.exists()

# newline="" is what the csv module requires for files handed to csv.reader;
# without it, line endings inside the file can be misinterpreted.
with open(csv_path, "r", newline="") as f:
    csv_reader = csv.reader(f)
    header = next(csv_reader)  # consume the header row; every remaining row must be numeric
    data = np.array([[float(val) for val in row] for row in csv_reader], dtype=np.float32)

# The arithmetic does not need the file handle, so it lives outside the
# `with` block and the file is closed as soon as parsing is done.
# NOTE(review): columns 1 and 4 are presumably the opening and closing
# prices — confirm against `header`.
start = data[:, 1]
end = data[:, 4]
change = (end - start) / start  # relative move from `start` to `end` per row

In [None]:
# Plot column 1 of the dataset's JP array across all rows.
# NOTE(review): presumably meant for visual comparison with `change`, which is
# computed from the CSV of jp_symbols[0]; column index 1 is plotted here, so
# confirm that this column really corresponds to that symbol.
plt.plot(jp_data[:, 1])

In [None]:
# Plot the relative change computed from the raw CSV (presumably to compare it
# by eye with the dataset-derived series plotted in the previous cell).
plt.plot(change)

In [None]:
# Re-extract the US columns from the combined array and append a constant
# column of ones, so a linear fit on us_data also learns an intercept term.
us_data = dataset.data[:, dataset._us_data_indices]
bias_column = np.ones((us_data.shape[0], 1), dtype=np.float32)
us_data = np.concatenate([us_data, bias_column], axis=1)
jp_data = dataset.data[:, dataset._jp_data_indices]
us_data.shape, jp_data.shape  # value is discarded here (not the final expression)

# First 80% of the rows form the training set, the rest the test set
# (rows are taken in their existing order; nothing is shuffled).
n_train = int(us_data.shape[0] * 0.8)

train_us, test_us = us_data[:n_train], us_data[n_train:]
train_jp, test_jp = jp_data[:n_train], jp_data[n_train:]

train_us.shape, train_jp.shape, test_us.shape, test_jp.shape

In [None]:
def least_square(a, b):
    """Fit a linear map from the rows of ``a`` to the rows of ``b``.

    Solves ``a @ x ~= b`` in the least-squares sense and returns ``x.T``, so
    the result can be applied to a batch of row vectors as
    ``(result @ rows.T).T``.

    The fit uses ``np.linalg.lstsq`` rather than explicitly inverting
    ``a.T @ a``: forming and inverting the normal matrix squares the condition
    number and raises ``LinAlgError`` when columns of ``a`` are collinear,
    whereas ``lstsq`` returns the minimum-norm solution in that case.

    Args:
        a: Array of shape ``(n_samples, n_features)``.
        b: Array of shape ``(n_samples, n_targets)`` (or ``(n_samples,)``).

    Returns:
        Array of shape ``(n_targets, n_features)`` (or ``(n_features,)`` when
        ``b`` is one-dimensional).
    """
    # rcond=None selects the machine-precision based cutoff for small
    # singular values and avoids the legacy-default FutureWarning.
    solution, _residuals, _rank, _singular_values = np.linalg.lstsq(a, b, rcond=None)
    return solution.T


def plot_pred(pred, true):
    """Print the correlation of ``true`` and ``pred`` and scatter-plot them.

    The Pearson correlation coefficient between the two 1-D series is printed,
    then ``true`` is drawn on the x-axis against ``pred`` on the y-axis on the
    current matplotlib axes, with black reference lines through the origin.

    Args:
        pred: Values for the y-axis.
        true: Values for the x-axis.
    """
    correlation = np.corrcoef(true, pred)[0, 1]
    print(correlation)

    # Zero lines make it easy to see in which quadrant the points fall.
    plt.axvline(x=0, color="black")
    plt.axhline(y=0, color="black")
    plt.scatter(true, pred)

In [None]:
# Per-row mean absolute value of the US features, and a cutoff at
# mean + 1 standard deviation of that quantity.
# NOTE(review): the original comment said "extract the days with large
# variation", but the mask below keeps the rows BELOW the cutoff (i.e. it
# drops the large-variation days) — confirm which one is intended.
abs_us = np.abs(train_us).mean(axis=1)
thr = abs_us.mean() + abs_us.std()
below_thr = abs_us < thr

# Fit the US -> JP linear map by least squares on the selected rows, then
# apply it to every test row.
matrix = least_square(train_us[below_thr], train_jp[below_thr])
pred_jp = (matrix @ test_us.T).T

In [None]:
# Compute the correlation coefficient between every US column and every
# JP column of the training data.
# abs_us is a mean of absolute values, so a threshold of 0 effectively keeps
# every row; raise it to restrict the analysis to larger-move days.
threshold = 0
selected = abs_us > threshold
target_us = train_us[selected][:, :-1]  # drop the last column (the constant bias term)
target_jp = train_jp[selected]

n_us, n_jp = target_us.shape[1], target_jp.shape[1]
corrmat = np.zeros((n_us, n_jp))
# corrmat[ius, ijp] = Pearson correlation of US column ius with JP column ijp.
for ius, ijp in np.ndindex(n_us, n_jp):
    corrmat[ius, ijp] = np.corrcoef(target_us[:, ius], target_jp[:, ijp])[0, 1]

In [None]:
# Turn the flat argmax of corrmat back into (row, column) = (US column,
# JP column) of the most strongly positively correlated pair.
us_idx, jp_idx = divmod(corrmat.argmax(), corrmat.shape[1])

# plot_pred puts its second argument on the x-axis, so the JP series is on x
# and the US series on y; the limits below zoom in around the origin.
plot_pred(target_us[:, us_idx], target_jp[:, jp_idx])
plt.xlim([-0.05, 0.05])
plt.ylim([-0.01, 0.01])

In [None]:
# Apply the fitted matrix to the training rows themselves and compare the
# predictions with the actual JP values (threshold 0 effectively keeps all rows).
threshold = 0
keep = abs_us > threshold
train_true = train_jp[keep]
train_pred = (matrix @ train_us[keep].T).T

# Predicted-vs-actual scatter for the first five JP columns.
for i in range(5):
    plot_pred(train_pred[:, i], train_true[:, i])

# TODO: more simply, check whether buying on the Tokyo Stock Exchange on days
# when the NASDAQ or the Dow went up actually results in a gain.