In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler, TensorDataset)
import csv
from sklearn.preprocessing import MinMaxScaler

In [2]:
class STOCK_RNN(nn.Module):
    # 생성자 오버라이드
    def __init__(self, config):
        super(STOCK_RNN, self).__init__()

        self.input_size = config['input_size']
        self.hidden_size = config['hidden_size']
        self.output_size = config['output_size']
        self.num_layers = config['num_layers']
        self.batch_size = config['batch_size']

        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, bidirectional=False, batch_first=True)

        ''' self.num_layers
            ㅁ-ㅁ-ㅁ-...-ㅁ -> 이런 층이 몇 개있는가?
        '''

        ''' batch_first = True
            batch에 들어오는 순서대로 처리
        '''

        self.linear = nn.Linear(self.hidden_size, self.output_size)

    # forward 오버라이드 -> hypothesis 형성
    def forward(self, input_features):

        x, (h_n, c_n) = self.lstm(input_features)

        ''' lstm의 return
            output, (last hidden state, last cell state)

            - output : (batch, sequence, hidden)
        '''

        # output에서 맨 마지막 시퀀스의 (batch, hidden) 정보를 가져옴
        h_t = x[:, -1, :]

        hypothesis = self.linear(h_t)

        return hypothesis

In [3]:
def load_dataset(file_name):

    f = open(file_name, 'r', encoding='cp949')

    data = csv.reader(f, delimiter=',')

    # header 건너뛰기
    next(data)

    data_X, data_Y = [], []

    for row in data:
        # 오픈, 공가, 저가, 거래량
        data_X.append([float(i) for i in row[2:]])

        # 종가
        data_Y.append(float(row[1]))

    # data 범위가 매우 상이하기 때문에 MinMax '정규화' 적용 (단, 종속변수는 제외)
    scaler = MinMaxScaler()
    scaler.fit(data_X)
    data_X = scaler.transform(data_X)

    data_num = len(data_X)
    sequence_len = config['sequence_len']
    seq_data_X, seq_data_Y = [], []

    # window 크기만큼 슬라이딩 하면서 데이터 생성 : 원본 데이터의 시퀀스화
    for i in range(data_num - sequence_len):
        window_size = i + sequence_len
        seq_data_X.append(data_X[i:window_size])
        seq_data_Y.append([data_Y[window_size-1]])

    ''' 현재 우리가 하려는 건 매 시퀀스마다 output을 내는 과정이 아님

        예를 들어,
        [[1,2], [3,4], [5,6], [7,8]]와 같이 입력데이터가 구성돼있으면,
        [[1,2], [3,4], [5,6]]이 들어왔을 때 다음 예측, [[3,4], [5,6], [7,8]]이 들어왔을 때 다음 예측

        때문에 입력 데이터를 window size만큼 잘라서 구성해야함
        [[1,2], [3,4], [5,6], [7,8]]
        ->[[[1,2], [3,4], [5,6]], [[3,4], [5,6], [7,8]]]
    '''

    (train_X, train_Y) = (np.array(seq_data_X[:]), np.array(seq_data_Y[:]))
    train_X = torch.tensor(train_X, dtype=torch.float)
    train_Y = torch.tensor(train_Y, dtype=torch.float)

    print(train_X.shape) # (73, 3, 4)
    print(train_Y.shape) # (73, 1)

    return (train_X, train_Y)

In [4]:
def tensor2list(input_tensor):
    return input_tensor.cpu().detach().numpy().tolist()

In [5]:
def do_test(model, test_dataloader):
    model.eval()

    predicts, golds = [], []

    with torch.no_grad():
        for step, batch in enumerate(test_dataloader):
            batch = tuple(t.cuda() for t in batch)

            input_features, labels = batch
            hypothesis = model(input_features)

            # logits = (hypothesis > 0.5).float()
            '''
                우리가 지금 하려는 거는 classification이 아닌 regression이므로 argmax는 필요없다
            '''

            x = tensor2list(hypothesis[:, 0])
            y = tensor2list(labels)

            predicts.extend(x)
            golds.extend(y)

    # 소수점 이하 1자리로 변환
    predicts = [round(i, 1) for i in predicts]
    golds = [round(i[0], 1) for i in golds]

    print("pred = ", predicts)
    print("gold = ", golds)

In [6]:
def test(config):
    model = STOCK_RNN(config).cuda()
    model.load_state_dict(torch.load(os.path.join['output_dir'], config['model_name']))

    features, labels = load_dataset(config['file_name'])
    test_features = TensorDataset(features, labels)
    test_dataloader = DataLoader(test_features, shuffle=True, batch_size=config['batch_size'])

    do_test(model, test_dataloader)

In [7]:
def train(config):
    model = STOCK_RNN(config).cuda()

    (input_features, labels) = load_dataset(config['file_name'])
    tensor_features = TensorDataset(input_features, labels)
    train_dataloader = DataLoader(tensor_features, shuffle=True, batch_size=config['batch_size'])

    loss_func = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learn_rate'])

    for epoch in range(config['epoch']+1):
        model.train()

        costs = []

        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.cuda() for t in batch)

            input_features, labels = batch

            optimizer.zero_grad()

            hypothesis = model(input_features)

            cost = loss_func(hypothesis, labels)

            cost.backward()

            optimizer.step()

            costs.append(cost.data.item())

        print(f"Average Loss = {0:f}".format(np.mean(costs)))
        torch.save(model.state_dict(), os.path.join(config['output_dir'], 'epoch_{0:d}.pt').format(epoch))

        do_test(model, train_dataloader)

In [8]:
if __name__ == "__main__":

    root_dir = "/content/drive/MyDrive/연구실/기계학습/rnn"
    output_dir = os.path.join(root_dir, "output")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    config = {"mode": "train",
            "model_name":"epoch_{0:d}.pt".format(10),
            "output_dir":output_dir,
            "file_name": os.path.join(root_dir, "samsung-2020.csv"),
            "sequence_len": 3,
            "input_size": 4,
            "hidden_size": 10,
            "output_size": 1,
            "num_layers": 1,
            "batch_size": 1,
            "learn_rate": 0.1,
            "epoch": 10,
            }

    if config["mode"] == "train":
        train(config)
    else:
        test(config)

torch.Size([72, 3, 4])
torch.Size([72, 1])
Average Loss = 0.000000
pred =  [53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.0, 53.0, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.0, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.1, 53.0, 53.1, 53.1, 53.1, 53.1]
gold =  [54.2, 62.3, 56.4, 57.2, 60.7, 55.9, 45.4, 48.7, 61.3, 57.8, 60.8, 58.9, 60.5, 61.8, 59.7, 59.1, 60.2, 43.0, 59.0, 59.2, 54.6, 59.5, 55.5, 56.5, 59.2, 54.2, 48.3, 47.3, 47.8, 57.4, 56.8, 52.1, 57.9, 61.5, 48.3, 60.4, 55.4, 61.8, 57.2, 60.8, 60.0, 55.0, 42.5, 61.3, 59.5, 59.8, 45.4, 47.0, 56.4, 55.5, 59.9, 60.0, 56.5, 55.8, 60.4, 50.8, 61.4, 50.0, 56.8, 56.5, 61.1, 58.8, 54.2, 45.6, 60.7, 59.5, 62.4, 47.8, 48.9, 58.6, 50.0, 60.0]
Average Loss = 0.000000
pred =  [56.8, 56.8, 56.8, 