In [22]:
import warnings
import FinanceDataReader as fdr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Dense
# from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau
import math
warnings.filterwarnings('ignore')

In [23]:
tf.compat.v1.reset_default_graph() 
tf.compat.v1.disable_eager_execution()

In [24]:
# 표준화
def data_standardization(x):
    x_np = np.asarray(x)
    return (x_np - x_np.mean()) / x_np.std()
 
# 너무 작거나 너무 큰 값이 학습을 방해하는 것을 방지하고자 정규화한다
# x가 양수라는 가정하에 최소값과 최대값을 이용하여 0~1사이의 값으로 변환
# Min-Max scaling
def min_max_scaling(x):
    x_np = np.asarray(x)
    return (x_np - x_np.min()) / (x_np.max() - x_np.min() + 1e-7) # 1e-7은 0으로 나누는 오류 예방차원
 
# 정규화된 값을 원래의 값으로 되돌린다
# 정규화하기 이전의 org_x값과 되돌리고 싶은 x를 입력하면 역정규화된 값을 리턴한다
def reverse_min_max_scaling(org_x, x):
    org_x_np = np.asarray(org_x)
    x_np = np.asarray(x)
    return (x_np * (org_x_np.max() - org_x_np.min() + 1e-7)) + org_x_np.min()

In [25]:
input_data_column_cnt = 3  # 입력데이터의 컬럼 개수(Variable 개수)
output_data_column_cnt = 1 # 결과데이터의 컬럼 개수

seq_length = 30      # 1개 시퀀스의 길이
rnn_cell_hidden_dim = 20   # 각 셀의 (hidden)출력 크기
forget_bias = 1.0          # 망각편향
num_stacked_layers = 1     # stacked LSTM layers 개수
keep_prob = 0.2            # dropout할 때 keep할 비율

epoch_num = 1000           # 에폭 횟수(학습에서 훈련 데이터를 모두 소진했을 때의 횟수)
learning_rate = 0.01       # 학습률 

In [26]:
data = fdr.DataReader('005930','2002-01-01','')
all_data = data[['Close','Open','High','Low','Volume']].round(2)
all_data

Unnamed: 0_level_0,Close,Open,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2002-01-02,6160,5660,6160,5620,1142079
2002-01-03,6040,6100,6190,6020,885636
2002-01-04,6380,6380,6410,6290,1009482
2002-01-07,6360,6260,6430,6170,1029132
2002-01-08,6230,6390,6390,6190,760142
...,...,...,...,...,...
2022-12-01,62600,63100,63200,62300,16631445
2022-12-02,60400,62500,62500,60400,15331184
2022-12-05,60300,60900,61100,60000,13767787
2022-12-06,59200,59800,60100,59200,13888300


In [27]:
stock_info = data.values[:].astype(np.float) # 금액&거래량 문자열을 부동소수점형으로 변환한다

In [28]:
price = stock_info[:,:-2] # 시작가, 최고가, 최저가, 종료가
price.shape
len(stock_info)
for i in range(len(stock_info)):
    if price[i][1]==0:
        price[i][1]=price[i][3]

for i in range(len(stock_info)):
    if price[i][2]==0:
        price[i][2]=price[i][3]


In [29]:
norm_price = min_max_scaling(price) # 가격형태 데이터 정규화 처리

In [30]:
# 거래량형태 데이터를 정규화한다
# ['Open','High','Low','Close','Adj Close','Volume']에서 마지막 'Volume'만 취함
# [:,-1]이 아닌 [:,-1:]이므로 주의하자! 스칼라가아닌 벡터값 산출해야만 쉽게 병합 가능
volume = stock_info[:,-2:-1]
norm_volume = min_max_scaling(volume) # 거래량형태 데이터 정규화 처리
print("volume.shape: ", volume.shape)
print("volume[0]: ", volume[0])
print("norm_volume[0]: ", norm_volume[0])

volume.shape:  (5175, 1)
volume[0]:  [1142079.]
norm_volume[0]:  [0.01264674]


In [31]:
# 행은 그대로 두고 열을 우측에 붙여 합친다
x = np.concatenate((norm_price, norm_volume), axis=1) # axis=1, 세로로 합친다
print("x.shape: ", x.shape)
print("x[0]: ", x[0])    # x의 첫 값
print("x[-1]: ", x[-1])  # x의 마지막 값

x.shape:  (5175, 5)
x[0]:  [0.05847107 0.06363636 0.05805785 0.06363636 0.01264674]
x[-1]:  [0.60743802 0.61570248 0.60433884 0.60847107 0.13271139]


In [32]:
y = x[:, [-2]] # 타켓은 주식 종가이다
print("y[0]: ",y[0])     # y의 첫 값
print("y[-1]: ",y[-1])   # y의 마지막 값

y[0]:  [0.06363636]
y[-1]:  [0.60847107]


In [33]:
dataX = [] # 입력으로 사용될 Sequence Data
dataY = [] # 출력(타켓)으로 사용

for i in range(0, len(y) - seq_length):
    _x = x[i : i+seq_length]
    _y = y[i + seq_length] # 다음 나타날 주가(정답)
    dataX.append(_x) # dataX 리스트에 추가
    dataY.append(_y) # dataY 리스트에 추가

In [34]:
# def ts_train_test(data, time_steps, for_periods):
#     ts_train= data['2002-01-01':'2016-10-14'].iloc[:,0:1].values
#     ts_test= data['2016-10-14':].iloc[:,0:1].values
#     ts_train_len = len(ts_train)
#     ts_test_len = len(ts_test)
    
#     #정규화
#     from sklearn.preprocessing import MinMaxScaler
#     sc= MinMaxScaler(feature_range=(0,1))
#     ts_train_scaled = sc.fit_transform(ts_train)
    
#     #training data의 samples와 time steps로 원본데이터 슬라이싱하기
#     X_train=[]
#     Y_train=[]
#     Y_train_stacked=[]
#     for i in range(time_steps,ts_train_len-1):
#         X_train.append(ts_train[i-time_steps:i,0])
#         Y_train.append(ts_train[i:i+for_periods,0])
#     X_train,Y_train=np.array(X_train), np.array(Y_train)
    
#     X_train = np.reshape(X_train,(X_train.shape[0], X_train.shape[1],1))
    
#     inputs = pd.concat((data['Close']['2002-01-01':'2016-10-14'],data['Close']['2016-10-14':]), axis=0).values
#     inputs = inputs[len(inputs)-len(ts_test)-time_steps:]
#     inputs = inputs.reshape(-1,1)
    
#     X_test = []
#     for i in range(time_steps, ts_test_len+ time_steps-for_periods):
#         X_test.append(inputs[i-time_steps:i,0])
#     X_test = np.array(X_test)
#     X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1],1))
    
#     return X_train, Y_train, X_test

In [35]:
# X_train, Y_train, X_test = ts_train_test(data,seq_length,3)

# X_train_see= pd.DataFrame(np.reshape(X_train, (X_train.shape[0], X_train.shape[1])))
# Y_train_see= pd.DataFrame(Y_train)
# pd.concat([X_train_see, Y_train_see], axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,0.1
0,5660,6100,6380,6260,6390,6210,6360,6240,6050,6430,...,7219,7370,7100,7000,6880,6910,6890,6680,6699,"[6660, 7019, 6800]"
1,6100,6380,6260,6390,6210,6360,6240,6050,6430,6150,...,7370,7100,7000,6880,6910,6890,6680,6699,6660,"[7019, 6800, 6900]"
2,6380,6260,6390,6210,6360,6240,6050,6430,6150,5900,...,7100,7000,6880,6910,6890,6680,6699,6660,7019,"[6800, 6900, 7059]"
3,6260,6390,6210,6360,6240,6050,6430,6150,5900,6010,...,7000,6880,6910,6890,6680,6699,6660,7019,6800,"[6900, 7059, 6770]"
4,6390,6210,6360,6240,6050,6430,6150,5900,6010,5800,...,6880,6910,6890,6680,6699,6660,7019,6800,6900,"[7059, 6770, 7000]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3606,30120,30000,30000,30240,30660,30400,31380,31360,30960,30380,...,31640,31420,31000,31080,31459,31800,32200,32020,33920,"[34000, 33000, 31999]"
3607,30000,30000,30240,30660,30400,31380,31360,30960,30380,30580,...,31420,31000,31080,31459,31800,32200,32020,33920,34000,"[33000, 31999, 29900]"
3608,30000,30240,30660,30400,31380,31360,30960,30380,30580,31320,...,31000,31080,31459,31800,32200,32020,33920,34000,33000,"[31999, 29900, 31000]"
3609,30240,30660,30400,31380,31360,30960,30380,30580,31320,31480,...,31080,31459,31800,32200,32020,33920,34000,33000,31999,"[29900, 31000, 30960]"


In [36]:
def ts_train_test_normalize(all_data,time_steps,for_periods):

    # create training and test set
    ts_train= data['2002-01-01':'2016-10-14'].iloc[:,0:1].values
    ts_test= data['2016-10-14':].iloc[:,0:1].values
    ts_train_len = len(ts_train)
    ts_test_len = len(ts_test)

    # scale the data
    from sklearn.preprocessing import MinMaxScaler
    sc = MinMaxScaler(feature_range=(0,1))
    ts_train_scaled = sc.fit_transform(ts_train)

    # create training data of s samples and t time steps
    X_train = []
    y_train = []
    y_train_stacked = []
    for i in range(time_steps,ts_train_len-1): 
        X_train.append(ts_train_scaled[i-time_steps:i,0])
        y_train.append(ts_train_scaled[i:i+for_periods,0])
    X_train, y_train = np.array(X_train), np.array(y_train)

    # Reshaping X_train for efficient modelling
    X_train = np.reshape(X_train, (X_train.shape[0],X_train.shape[1],1))

    inputs = pd.concat((all_data["Close"]['2002-01-01':'2016-10-14'], all_data["Close"]['2016-10-14':]),axis=0).values
    inputs = inputs[len(inputs)-len(ts_test) - time_steps:]
    inputs = inputs.reshape(-1,1)
    inputs  = sc.transform(inputs)

    # Preparing X_test
    X_test = []
    for i in range(time_steps,ts_test_len+time_steps-for_periods):
        X_test.append(inputs[i-time_steps:i,0])
        
    X_test = np.array(X_test)
    X_test = np.reshape(X_test, (X_test.shape[0],X_test.shape[1],1))

    return X_train, y_train , X_test, sc

In [45]:
ts_train= data['2002-01-01':'2016-10-14'].iloc[:,0:1].values
ts_test= data['2016-10-14':].iloc[:,0:1].values
ts_train_len = len(ts_train)
ts_test_len = len(ts_test)
for a in range(ts_test_len):
    if ts_test[a]==0:
        ts_test[a]=ts_test[a-1]
        
for a in range(ts_train_len):
    if ts_train[a]==0:
        ts_train[a]=ts_train[a-1]

In [59]:
X_train, Y_train, X_test, sc = ts_train_test_normalize(data,30,3)

X_train_see= pd.DataFrame(np.reshape(X_train, (X_train.shape[0], X_train.shape[1])))
Y_train_see= pd.DataFrame(Y_train)
pd.concat([X_train_see, Y_train_see], axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,0.1
0,0.011855,0.027197,0.036960,0.032775,0.037308,0.031032,0.036262,0.032078,0.025453,0.038703,...,0.029289,0.030683,0.030683,0.030683,0.037657,0.034170,0.034868,0.049477,0.056485,"[0.052266387726638774, 0.05509065550906553, 0...."
1,0.027197,0.036960,0.032775,0.037308,0.031032,0.036262,0.032078,0.025453,0.038703,0.028940,...,0.030683,0.030683,0.030683,0.037657,0.034170,0.034868,0.049477,0.056485,0.052266,"[0.05509065550906553, 0.04811715481171547, 0.0..."
2,0.036960,0.032775,0.037308,0.031032,0.036262,0.032078,0.025453,0.038703,0.028940,0.020223,...,0.030683,0.030683,0.037657,0.034170,0.034868,0.049477,0.056485,0.052266,0.055091,"[0.04811715481171547, 0.05369595536959554, 0.0..."
3,0.032775,0.037308,0.031032,0.036262,0.032078,0.025453,0.038703,0.028940,0.020223,0.024059,...,0.030683,0.037657,0.034170,0.034868,0.049477,0.056485,0.052266,0.055091,0.048117,"[0.05369595536959554, 0.042887029288702916, 0...."
4,0.037308,0.031032,0.036262,0.032078,0.025453,0.038703,0.028940,0.020223,0.024059,0.016736,...,0.037657,0.034170,0.034868,0.049477,0.056485,0.052266,0.055091,0.048117,0.053696,"[0.042887029288702916, 0.04497907949790794, 0...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3626,0.981869,0.975593,0.986053,0.951185,0.934449,0.931660,0.963040,0.958856,0.918410,0.926778,...,0.917713,0.910042,0.895397,0.898187,0.911402,0.923291,0.937238,0.930962,0.997211,"[1.0, 0.9651324965132496, 0.9302301255230127]"
3627,0.975593,0.986053,0.951185,0.934449,0.931660,0.963040,0.958856,0.918410,0.926778,0.923291,...,0.910042,0.895397,0.898187,0.911402,0.923291,0.937238,0.930962,0.997211,1.000000,"[0.9651324965132496, 0.9302301255230127, 0.857..."
3628,0.986053,0.951185,0.934449,0.931660,0.963040,0.958856,0.918410,0.926778,0.923291,0.935146,...,0.895397,0.898187,0.911402,0.923291,0.937238,0.930962,0.997211,1.000000,0.965132,"[0.9302301255230127, 0.8570432357043235, 0.895..."
3629,0.951185,0.934449,0.931660,0.963040,0.958856,0.918410,0.926778,0.923291,0.935146,0.963738,...,0.898187,0.911402,0.923291,0.937238,0.930962,0.997211,1.000000,0.965132,0.930230,"[0.8570432357043235, 0.8953974895397491, 0.894..."


In [47]:
def actual_pred_plot(preds):
    
    actual_pred = pd.DataFrame(columns = ['Close','prediction'])
    actual_pred['Close']= data.loc['2002-01-01':'2016-10-14'][0:len(preds)]
    actual_pred['prediction'] = preds[:,0]
    
    from keras.metrics import MeanSquaredError
    m=MeanSquaredError()
    m.update_state(np.array(actual_pred['Close']), np.array(actual_pred['prediction']))
    return (m.result().numpy(), actual_pred.plot())

In [60]:
def LSTM_model(X_train, y_train, X_test, sc):
    # create a model
    from keras.models import Sequential
    from keras.layers import Dense, LSTM, Dropout,SimpleRNN
    from keras.optimizers import SGD
    
    # The LSTM architecture
    my_LSTM_model = Sequential()
    my_LSTM_model.add(LSTM(units=50, return_sequences=True, input_shape=(seq_length,5), activation='tanh'))
    #my_LSTM_model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1],1), activation='tanh'))
    #my_LSTM_model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1],1), activation='tanh'))
    my_LSTM_model.add(LSTM(units=64, activation='tanh'))
    #my_LSTM_model.add(Dropout(keep_prob))
    my_LSTM_model.add(Dense(units=2))

    # Compiling
    my_LSTM_model.compile(optimizer=SGD(lr=0.01, decay=1e-7, momentum=0.9, nesterov=False),loss='mean_squared_error')
    # Fitting to the training set
    my_LSTM_model.fit(X_train,y_train,epochs=50,batch_size=150, verbose=0)

    LSTM_prediction = my_LSTM_model.predict(X_test)
    LSTM_prediction = sc.inverse_transform(LSTM_prediction)

    return my_LSTM_model, LSTM_prediction


In [49]:
# # 학습용/테스트용 데이터 생성
# train_size = int(len(dataY) * 0.7)
# test_size = len(dataY) - train_size


# # 데이터 생성
# trainX = np.array(dataX[0:train_size])
# trainY = np.array(dataY[0:train_size])

# # 테스트용 데이터 생성
# testX = np.array(dataX[train_size:len(dataX)])
# testY = np.array(dataY[train_size:len(dataY)])

# print('All data size: ',len(dataY))
# print('Training data size: ',train_size)
# print('Testing data size: ',test_size)

In [50]:
# my_LSTM_model, LSTM_prediction = LSTM_model(trainX, trainY, testX, sc)

In [61]:
my_LSTM_model, LSTM_prediction = LSTM_model(X_train, Y_train, X_test, sc)

ValueError: Error when checking input: expected lstm_6_input to have shape (50, 5) but got array with shape (30, 1)

In [None]:
# 학습용/테스트용 데이터 생성
train_size = int(len(dataY) * 0.7)
test_size = len(dataY) - train_size


# 데이터 생성
trainX = np.array(dataX[0:train_size])
trainY = np.array(dataY[0:train_size])

# 테스트용 데이터 생성
testX = np.array(dataX[train_size:len(dataX)])
testY = np.array(dataY[train_size:len(dataY)])

print('All data size: ',len(dataY))
print('Training data size: ',train_size)
print('Testing data size: ',test_size)

All data size:  5137
Training data size:  3595
Testing data size:  1542


In [None]:
# 텐서플로우 플레이스홀더 생성
# 입력 X, 출력 Y를 생성한다

X = tf.compat.v1.placeholder(tf.float32, [None, seq_length, input_data_column_cnt])
print("X: ", X)
Y = tf.compat.v1.placeholder(tf.float32, [None, 1])
print("Y: ", Y)

# 검증용 측정지표를 산출하기 위한 targets, predictions를 생성한다
targets = tf.compat.v1.placeholder(tf.float32, [None, 1])
print("targets: ", targets)

predictions = tf.compat.v1.placeholder(tf.float32, [None, 1])
print("predictions: ", predictions)

X:  Tensor("Placeholder:0", shape=(None, 30, 5), dtype=float32)
Y:  Tensor("Placeholder_1:0", shape=(None, 1), dtype=float32)
targets:  Tensor("Placeholder_2:0", shape=(None, 1), dtype=float32)
predictions:  Tensor("Placeholder_3:0", shape=(None, 1), dtype=float32)


In [None]:
x = Input((seq_length, input_dim))
lstm = LSTM(128, return_sequences=True, activation='tanh')(x)
td = TimeDistributed(Dense(out_size, activation='softmax'))(lstm)
second_input = Input((seq_len, out_size)) # object instanciated and hold as a var.
out = merge([td, second_input], mode='mul')
model = Model(input=[x, second_input], output=out) # second input provided to model.compile(...)

# then I add two inputs
model.fit([trainX, filter], trainY, ...)