In [2]:
import numpy as np
import pandas as pd

In [16]:
stock_price_df = pd.read_csv("stock_prices.csv")

In [17]:
print('(行数, 列数) =', stock_price_df.shape)
stock_price_df.tail()

(行数, 列数) = (2332531, 12)


Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
2332526,20211203_9990,2021-12-03,9990,514.0,528.0,513.0,528.0,44200,1.0,,False,0.034816
2332527,20211203_9991,2021-12-03,9991,782.0,794.0,782.0,794.0,35900,1.0,,False,0.025478
2332528,20211203_9993,2021-12-03,9993,1690.0,1690.0,1645.0,1645.0,7200,1.0,,False,-0.004302
2332529,20211203_9994,2021-12-03,9994,2388.0,2396.0,2380.0,2389.0,6500,1.0,,False,0.009098
2332530,20211203_9997,2021-12-03,9997,690.0,711.0,686.0,696.0,381100,1.0,,False,0.018414


In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTM(nn.Module):
    def __init__(self, input_size=8, sequence_num=31, lstm_dim=128,
                 num_layers=2, output_size=1):
        super().__init__()
        
        self.lstm = nn.LSTM(input_size, lstm_dim, num_layers, batch_first=True, bidirectional=True)
        self.linear1 = nn.Linear(lstm_dim*sequence_num*2, 1)
        self.bn1 = nn.BatchNorm1d(lstm_dim*sequence_num*2)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        x = lstm_out.reshape(lstm_out.shape[0], -1)
        x = self.linear1(self.bn1(x))
        return x

In [19]:
stock_price_df['ExpectedDividend'] = stock_price_df['ExpectedDividend'].fillna(0)
stock_price_df['SupervisionFlag'] = stock_price_df['SupervisionFlag'].map({True: 1, False: 0})
stock_price_df['Date'] = pd.to_datetime(stock_price_df['Date'])
stock_price_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2332531 entries, 0 to 2332530
Data columns (total 12 columns):
 #   Column            Dtype         
---  ------            -----         
 0   RowId             object        
 1   Date              datetime64[ns]
 2   SecuritiesCode    int64         
 3   Open              float64       
 4   High              float64       
 5   Low               float64       
 6   Close             float64       
 7   Volume            int64         
 8   AdjustmentFactor  float64       
 9   ExpectedDividend  float64       
 10  SupervisionFlag   int64         
 11  Target            float64       
dtypes: datetime64[ns](1), float64(7), int64(3), object(1)
memory usage: 213.5+ MB


In [20]:
stock_price_df = stock_price_df.dropna(how='any')
# 欠損情報確認
stock_price_df_na = (stock_price_df.isnull().sum() / len(stock_price_df)) * 100
stock_price_df_na = stock_price_df_na.drop(stock_price_df_na[stock_price_df_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :stock_price_df_na})
missing_data.head(22)

Unnamed: 0,Missing Ratio


In [21]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag']
stock_price_df[columns] = stdsc.fit_transform(stock_price_df[columns])
stock_price_df.head()

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20170104_1301,2017-01-04,1301,0.038994,0.035493,0.047764,0.041374,-0.169192,-0.007493,-0.053484,-0.025162,0.00073
1,20170104_1332,2017-01-04,1332,-0.566509,-0.566547,-0.56551,-0.565637,0.537161,-0.007493,-0.053484,-0.025162,0.012324
2,20170104_1333,2017-01-04,1333,0.155286,0.161205,0.163796,0.172227,-0.108081,-0.007493,-0.053484,-0.025162,0.006154
3,20170104_1376,2017-01-04,1376,-0.303174,-0.297439,-0.297504,-0.291909,-0.174323,-0.007493,-0.053484,-0.025162,0.011053
4,20170104_1377,2017-01-04,1377,0.188832,0.199886,0.200587,0.205779,-0.138713,-0.007493,-0.053484,-0.025162,0.003026


In [22]:
dataset_dict = {}
for sc in stock_price_df['SecuritiesCode'].unique():
    dataset_dict[str(sc)] = stock_price_df[stock_price_df['SecuritiesCode'] == sc].values[:, 3:].astype(np.float32)
print(dataset_dict['1301'].shape)

(1201, 9)


In [23]:
from torch.utils.data.sampler import SubsetRandomSampler
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, X, sequence_num=31, y=None, mode='train'):
        self.data = X
        self.teacher = y
        self.sequence_num = sequence_num
        self.mode = mode
    def __len__(self):
        return len(self.teacher)

    def __getitem__(self, idx):
        out_data = self.data[idx]
        if self.mode == 'train':
            out_label =  self.teacher[idx[-1]]
            return out_data, out_label
        else:
            return out_data
def create_dataloader(dataset, dataset_num, sequence_num=31, input_size=8, batch_size=32, shuffle=False):
    sampler = np.array([list(range(i, i+sequence_num)) for i in range(dataset_num-sequence_num+1)])
    if shuffle == True:
        np.random.shuffle(sampler)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size, sampler=sampler)
    return dataloader
#### Check operation ####
X_check, y_check = dataset_dict['1301'][:, :-1], dataset_dict['1301'][:, -1]
dataset_check = MyDataset(X_check, y=y_check, sequence_num=31, mode='train')
dataloader_check = create_dataloader(dataset_check, X_check.shape[0], sequence_num=31, input_size=8, batch_size=32, shuffle=False)
for b, tup in enumerate(dataloader_check):
    print('---------')
    print(tup[0].shape, tup[1].shape)
    break

---------
torch.Size([32, 31, 8]) torch.Size([32])


In [24]:
from tqdm import tqdm
epochs = 10
batch_size = 512
# Check wheter GPU is available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Model Instantiation
model = LSTM(input_size=8, sequence_num=31, lstm_dim=128, num_layers=2, output_size=1)
model.to(device)
model.train()
# setting optimizer
lr = 0.0001
weight_decay = 1.0e-05
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr, weight_decay=weight_decay)
# setting criterion
criterion = nn.MSELoss()
# set iteration counter
iteration = 0
# 
log_train = [[0], [np.inf]]
for epoch in range(epochs):
    epoch_loss = 0.0
    for sc in tqdm(stock_price_df['SecuritiesCode'].unique()):
        X, y = dataset_dict[str(sc)][:, :-1], dataset_dict[str(sc)][:, -1]
        dataset = MyDataset(X, y=y, sequence_num=31, mode='train')
        dataloader = create_dataloader(dataset, X.shape[0], sequence_num=31, input_size=8, batch_size=batch_size, shuffle=True)
        for data, targets in dataloader:
            data, targets = data.to(device), targets.to(device)
            
            optimizer.zero_grad()
            
            data = data.to(torch.float32)
            output = model.forward(data)
            targets = targets.to(torch.float32)
            
            loss = criterion(output.view(1,-1)[0], targets)
            
            loss.backward()
            
            optimizer.step()
            
            epoch_loss += loss.item()
            
            iteration += 1
    epoch_loss /= iteration
    print('epoch_loss={}'.format(epoch_loss))
    log_train[0].append(iteration)
    log_train[1].append(epoch_loss)

  0%|          | 9/2000 [00:36<2:14:55,  4.07s/it]


KeyboardInterrupt: 

In [None]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [None]:
count = 0
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    prices = prices.fillna(0)
    prices['SupervisionFlag'] = prices['SupervisionFlag'].map({True: 1, False: 0})
    prices['Date'] = pd.to_datetime(prices['Date'])
    pred_df = predict(model, prices)
    print(pred_df)
    env.predict(pred_df)
    count += 1

In [None]:
pred_df