# Import

In [2]:
# import yfinance as yf
import pandas as pd
import os
from os.path import join
from datetime import datetime, timezone, timedelta
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
import logging
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from yforseer.networks import StockNet
from yforseer.datasets import StockDataset
from yforseer.trainers import StockNetTrainer
import mlflow
from tqdm import tqdm



# Dataset and loader

In [2]:

# Load the saved array and split it into a leading train part and a trailing
# test part along axis 1.
load_array_pth = 'data/yahoo/artifacts/data_array.npz'
# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open; the context manager closes it once the array has been read.
with np.load(load_array_pth) as npz_file:
    data = torch.from_numpy(npz_file['data']).to(torch.float32)

# The last 10% of the columns (rounded down) are held out for testing.
num_days = data.shape[1]
test_size = int(0.1 * num_days)
train_size = num_days - test_size
train_data = data[:, :train_size]
test_data = data[:, train_size:]

# Both datasets are built with the same windowing arguments.
dataset_kwargs = dict(memory=60, lookahead=30, mode='last')
train_dataset = StockDataset(data=train_data, **dataset_kwargs)
test_dataset = StockDataset(data=test_data, **dataset_kwargs)
print('train_dataset:', len(train_dataset))
print('test_dataset:', len(test_dataset))


train_dataset: 5283
test_dataset: 507


In [None]:
# Run configuration.
batch_size = 32
epochs = 20
lr = 0.0001
# Upper limit on the number of training batches consumed per epoch.
max_train_batches = 12


train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Train
# Hand the configured `lr` to the trainer instead of a second hard-coded
# literal, so editing the value above actually takes effect.
trainer = StockNetTrainer(lr=lr)
for epoch in tqdm(range(epochs)):
    # Per-epoch losses; only consumed by the (currently disabled) logging below.
    train_loss_list, test_loss_list = [], []

    trainer.model.train()
    for batch_idx, (X_train, y_train) in enumerate(tqdm(train_dataloader, leave=False)):
        train_loss, _ = trainer.train(X_train, y_train)
        train_loss_list.append(train_loss)
        # Stop right after the last allowed batch so no extra batch is fetched.
        if batch_idx + 1 >= max_train_batches:
            break

    trainer.model.eval()
    for X_test, y_test in tqdm(test_dataloader, leave=False):
        test_loss, _ = trainer.test(X_test, y_test)
        test_loss_list.append(test_loss)


    # Log metrics
#     mlflow.log_metric('train_loss', np.mean(train_loss_list), step=epoch)
#     mlflow.log_metric('test_loss', np.mean(test_loss_list), step=epoch)


# mlflow.end_run()

# Data Augmentation (Cauchy noise)

In [3]:
from scipy.stats import cauchy

load_array_pth = 'data/yahoo/artifacts/data_array.npz'

# NOTE: this rebinds `data` to a plain NumPy array (the dataset cell above
# stored a float32 torch tensor under the same name).
# The context manager closes the NpzFile handle that np.load leaves open.
with np.load(load_array_pth) as npz_file:
    data = npz_file['data']

data.shape

(37, 5968)

In [4]:
# First differences along axis 1, one row per series.
xdiff = np.diff(data, axis=1)
# Per-row median of the differences.
mds = np.median(xdiff, axis=1)
# Per-row median absolute deviation around that median; used further down as
# the Cauchy scale for each series.
abs_dev = np.abs(xdiff - mds[:, np.newaxis])
gammas = np.median(abs_dev, axis=1)
print(mds)
print(gammas)

[-1.34581207e-04 -3.73345853e-04 -1.87918788e-04 -6.57480518e-04
  7.47870637e-05  4.80577388e-05 -2.96552997e-04 -1.23780490e-04
 -4.78253810e-04 -2.17964666e-04 -3.73262899e-04 -2.63399862e-04
 -1.62304603e-04 -6.95632948e-05 -7.41880536e-05 -4.83317354e-05
 -1.70676309e-06 -1.65517486e-04 -4.94415664e-05  3.98083220e-06
 -1.16658931e-04 -1.07414616e-04 -9.09776322e-05  4.21008238e-05
 -7.77901132e-05 -1.34647649e-04 -2.29164183e-04 -4.35837442e-04
  9.85217158e-07 -3.44214456e-04 -8.16614127e-05 -2.70074302e-05
  1.67275089e-06 -1.03781213e-04 -1.08698463e-04  1.33341211e-06
 -4.16730064e-04]
[0.00456617 0.01406928 0.00936564 0.0135501  0.00558139 0.00475629
 0.00768252 0.01183807 0.00674872 0.00756358 0.0135226  0.01390011
 0.00651684 0.00445058 0.0037361  0.0037749  0.00553962 0.0081209
 0.00587823 0.00494116 0.01051899 0.00808454 0.00993664 0.00451333
 0.00405005 0.00456602 0.01164874 0.01710603 0.00505093 0.00645618
 0.00640125 0.01284077 0.00625733 0.00875654 0.0094962  0.00883

In [5]:

def truncated_cauchy(scale, N, bound, random_state=None):
    """Sample zero-centred Cauchy noise restricted to ``[-bound, bound]``.

    Rejection sampling: ``N`` values are drawn from ``Cauchy(0, scale)`` and
    only the entries whose magnitude exceeds ``bound`` are redrawn, repeatedly,
    until none remain. The expected number of rounds grows as ``bound``
    shrinks relative to ``scale``.

    Parameters
    ----------
    scale : float
        Scale (gamma) of the Cauchy distribution; must be positive and finite.
    N : int
        Number of samples to return; must be non-negative.
    bound : float
        Largest absolute value allowed in the result; must be positive.
    random_state : None, int, numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. ``None`` (the default) draws from NumPy's global
        random state, exactly as before this parameter existed.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``N`` with ``abs(x) <= bound`` everywhere.

    Raises
    ------
    ValueError
        If ``N`` is negative, ``scale`` is not positive and finite, or
        ``bound`` is not positive.
    """
    n_total = int(N)
    if n_total < 0:
        raise ValueError(f"N must be non-negative, got {N!r}")
    if not (np.all(np.isfinite(scale)) and np.all(np.asarray(scale) > 0)):
        raise ValueError(f"scale must be positive and finite, got {scale!r}")
    # A non-positive (or NaN) bound can never be satisfied by the redraw loop.
    if not np.all(np.asarray(bound) > 0):
        raise ValueError(f"bound must be positive, got {bound!r}")

    # A bare seed would restart the stream on every redraw and hand rejected
    # slots the very same values again, so convert it to one generator up front.
    if random_state is None or isinstance(
        random_state, (np.random.Generator, np.random.RandomState)
    ):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    noise = np.empty(n_total, dtype=float)
    # Every slot starts out as "needs a draw".
    mask = np.ones(n_total, dtype=bool)
    n_pending = n_total
    while n_pending > 0:
        noise[mask] = cauchy.rvs(loc=0, scale=scale, size=n_pending, random_state=rng)
        mask = np.abs(noise) > bound
        n_pending = int(mask.sum())
    return noise




In [6]:
%matplotlib qt
# Series (row) to preview and how many leading samples of it to use.
i = 2
t = 1000

# Cauchy scale for this series; the noise is restricted to five scales
# on either side of zero.
scale = gammas[i]
bound = 5 * scale
foox = data[i, :t]

# One noise value for every sample after the first.
noise = truncated_cauchy(scale, t - 1, bound)
# noise = cauchy.rvs(loc=0, scale=scale, size=t-1)

# Perturbed series: the first sample is kept, every later sample becomes the
# previous original sample plus its noise value.
foox_change = foox[:-1] + noise
foox2 = np.empty_like(foox)
foox2[0] = foox[0]
foox2[1:] = foox_change

fig, ax = plt.subplots(figsize=(10, 5))

# Original in navy, perturbed version in translucent red on top.
for series, line_kwargs in (
    (foox, dict(marker='x', color='navy')),
    (foox2, dict(marker='x', color='r', alpha=0.4)),
):
    ax.plot(series, **line_kwargs)
ax.set_title(f'Noise = Cauchy(0, {scale:0.5f})')


Text(0.5, 1.0, 'Noise = Cauchy(0, 0.00937)')

In [65]:
# Density of a zero-centred Cauchy with the selected series' scale,
# evaluated on 1000 evenly spaced points in [-2, 2].
xax = np.linspace(start=-2, stop=2, num=1000)
out = cauchy(scale=gammas[i]).pdf(xax)
plt.plot(xax, out)

[<matplotlib.lines.Line2D at 0x25e0534ae90>]

In [69]:
# 99.99th percentile of the zero-centred Cauchy with the selected series' scale.
cauchy.ppf(q=0.9999, loc=0, scale=gammas[i])

29.811767309109776

In [11]:
# Histogram of each series' first differences (along axis 1) with a Gaussian
# curve overlaid, one subplot per series.
xdiff = np.diff(data, axis=1)

mu_each = xdiff.mean(axis=1)
std_each = xdiff.std(axis=1)

# Size the grid from the data instead of assuming exactly 37 series.
n_series = xdiff.shape[0]
n_cols = 5
n_rows = max(1, -(-n_series // n_cols))  # ceiling division
# 3.5 inches per row reproduces the previous (20, 28) figure for 8 rows.
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 3.5 * n_rows), squeeze=False)
ax = ax.flatten()
# A dedicated loop name keeps the notebook-level `i` (the series picked in an
# earlier cell) from being overwritten.
for series_idx in range(n_series):

    mu, std = mu_each[series_idx], std_each[series_idx]
    # Plot window: three standard deviations either side of the mean.
    min_val = mu - 3 * std
    max_val = mu + 3 * std

    # Median and median absolute deviation; only the commented-out Cauchy
    # overlay below reads these.
    md = np.median(xdiff[series_idx])
    # gamma = np.quantile(xdiff[i], 0.50 + 0.125) - np.quantile(xdiff[i], 0.50 - 0.125)

    gamma = np.median(np.abs(xdiff[series_idx] - md))


    edges = np.linspace(min_val, max_val, 100)
    hist, _ = np.histogram(xdiff[series_idx], bins=edges)
    xax = np.linspace(min_val, max_val, 100)
    # The overlay is drawn with half the sample standard deviation.
    # NOTE(review): presumably a visual-fit choice — confirm.
    std = std / 2
    # gau = cauchy.pdf(xax, loc=md, scale=gamma)
    gau = np.exp(-0.5 * ((xax - mu) / std) ** 2) / (std * np.sqrt(2 * np.pi))
    # Rescale the curve so its peak matches the tallest histogram bar.
    gau = gau / gau.max() * hist.max()

    ax[series_idx].plot(xax, gau, c='r', lw=1)

    # The x positions are the left bin edges, so the bars must be edge-aligned;
    # the default centre alignment would shift every bar half a bin to the left.
    _ = ax[series_idx].bar(
        edges[:-1], hist, width=edges[1] - edges[0], align='edge', color='b', alpha=0.5
    )

# Hide grid cells that have no series to show.
for spare_ax in ax[n_series:]:
    spare_ax.set_visible(False)


In [14]:
# Show the per-series standard deviation of the first differences
# (computed as xdiff.std(axis=1)).
std_each

array([0.00952226, 0.03400638, 0.01979952, 0.03061999, 0.01425464,
       0.01106359, 0.02056497, 0.02997977, 0.01538702, 0.043559  ,
       0.03189735, 0.03558561, 0.01935995, 0.0097435 , 0.00778699,
       0.00800005, 0.01111082, 0.02270477, 0.01288581, 0.01082773,
       0.03310014, 0.02263263, 0.02330984, 0.01035722, 0.00815965,
       0.00947122, 0.02459829, 0.03968533, 0.01234657, 0.01421282,
       0.01470881, 0.03176873, 0.01671189, 0.02044283, 0.02069599,
       0.03890058, 0.03643183])

In [21]:
# Standard deviations laid out as shape (1, 2, 1): the first entry is 0 and
# the second is 10.
stds = torch.tensor([[[0.0], [10.0]]], dtype=torch.float32)
# One zero-mean normal draw per entry of `stds`.
torch.normal(mean=0.0, std=stds)

tensor([[[ 0.0000],
         [-7.3291]]])