In [1]:
# -*- coding: utf-8 -*-
"""
 LSTM prediction
"""
# 导入库函数
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import read_csv
import math
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


In [3]:
TRAIN_FLAG=1 # 如果是要训练模型，选择1，模型保存到my_model_.h5
DATA_FILE= './data_file/CSI.csv'
MODEL_NAME='my_model_2d'

In [4]:
def read_file(file_name):
    # 代码,名称,日期,开盘价(元),最高价(元),最低价(元),收盘价(元),成交额(百万),成交量(股)
    close_p = []
    df=pd.read_csv(file_name,index_col='日期')

    close_p=df['收盘价(元)'].to_frame()
    close_p.insert(1,'turnover',df['成交额(百万)'])
    close_p.rename(columns={'收盘价(元)':'close'},inplace=True)

    earning=[]
    close_p.dropna(axis=0,inplace=True)
    # 删除掉价格数据为0的日期（错误日期）
    for day in close_p.index:
        # print(day,close_p[day])
        if close_p.loc[day]['close']==0:
            close_p.drop(day,inplace=True)
    close_np=np.array(close_p['close'])
    # 获得前一日的收盘数据（可视为当日开盘价）
    open_np=np.roll(close_np,1)
    earning=(close_np-open_np)/open_np*100
    earning[0]=0.0
    close_p['earning']=earning
    return close_p

In [6]:
# 读取数据
close_earning=read_file(DATA_FILE)
close_earning.drop(close_earning.index[0],inplace=True)
dataset=close_earning[['earning','turnover']].values.reshape(-1,2)
print(dataset)

[[-1.30700000e+00  1.32919000e+03]
 [ 1.69211596e+00  1.91986000e+03]
 [-8.99733966e-01  1.62922000e+03]
 ...
 [ 9.47595829e-01  2.44204350e+05]
 [ 1.13659847e-01  1.94980140e+05]
 [ 1.48009887e+00  2.17238390e+05]]


In [20]:
def cal_the_return(testPredict,testY):
    compare=pd.DataFrame(testPredict,)
    compare['testy']=testY[0]
    # 测试集的实际收益率和测试集的预测收益率
    compare.columns=['predy','testy']
    compare.to_csv('compare.csv')
    accy = [1]
    accp = [[1]for i in range(6)]
    for row in compare.index:
        if row == 0:
            continue
        pred = compare['predy'].loc[row]
        real = compare['testy'].loc[row]
        accy.append(accy[row - 1] * (1 + real/100))
        for i in range(6):
            if pred > i*0.1:  # 如果预期收益率大于0.1%*i，则当日持有
                accp[i].append(accp[i][row - 1] * (1 + real/100))
            else:
                accp[i].append(accp[i][row - 1])
    compare['acc_y'] = accy
    for i in range(6):
        compare['acc_p'+str(i)] = accp[i]

    return compare

# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):  # 后一个数据和前look_back个数据有关系
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1):
        a = dataset[i:(i + look_back)]
        dataX.append(a)  # .apeend方法追加元素
        dataY.append(dataset[i + look_back])
    return np.array(dataX), np.array(dataY)  # 生成输入数据和输出数据


np.random.seed(7)  # 随机数生成时算法所用开始的整数值
# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))  # 归一化0-1
dataset = scaler.fit_transform(dataset)
# split into train and test sets  #训练集和测试集分割
train_size = int(len(dataset) * 0.67)  # %67的训练集，剩下测试集
test_size = len(dataset) - train_size
train, test = dataset[0:train_size, :], dataset[train_size:len(dataset), :]  # 训练集和测试集

# use this function to prepare the train and test datasets for modeling
look_back = 20
trainX, trainY = create_dataset(train, look_back)  # 训练输入输出
testX, testY = create_dataset(test, look_back)  # 测试输入输出
print(testY)
# reshape input to be [samples, time steps, features]#注意转化数据维数
trainX = np.reshape(trainX, (trainX.shape[0], 2, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 2, testX.shape[1]))
print(testX.shape)
# def mean_squared_error(y_true, y_pred):
#     return K.mean(K.square(y_pred - y_true), axis=-1)




[[0.43349171 0.60123325]
 [0.61686523 0.53465515]
 [0.5895973  0.61191388]
 ...
 [0.6754602  0.43910885]
 [0.52590691 0.50264272]
 [0.48160793 0.40077067]]
(1196, 2, 20)


In [29]:
dataset

array([[0.40614197, 0.        ],
       [0.56545608, 0.00122242],
       [0.42777609, 0.00062093],
       ...,
       [0.52590691, 0.50264272],
       [0.48160793, 0.40077067],
       [0.55419366, 0.44683527]])

In [28]:
dataset[1,:].max()

0.565456084867432

In [19]:
# 建立LSTM模型
model = Sequential()
model.add(LSTM(11, input_shape=(2, look_back)))  # 隐层11个神经元 （可以断调整此参数提高预测精度）
model.add(Dense(2))
model.compile(loss='mse', optimizer='adam')  # 评价函数mse，优化器adam
model.fit(trainX, trainY, epochs=100, batch_size=100, verbose=1)  # 100次迭代


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x2339ebb5208>

In [16]:
# save the model
# model.save_weights("my_model_weights.h5") # only save the weight
model.save('./model_file/{}.h5'.format(MODEL_NAME))


trainPredict = model.predict(trainX)
testPredict = model.predict(testX)
# 数据反归一化
# print('Pre:',trainPredict)
print('test:',testPredict)
print(trainPredict.shape)
print(trainY.shape)
trainPredict=trainPredict.reshape(trainPredict.shape[0],2)
trainY=trainY.reshape(trainY.shape[0],2)
testPredict=testPredict.reshape(testPredict.shape[0],2)
testY=testY.reshape(testY.shape[0],2)

print(scaler.data_max_)
print(trainPredict.shape)
print(trainY.shape)

test: [[0.47175276 0.5260254 ]
 [0.465431   0.5240078 ]
 [0.46902654 0.54439324]
 ...
 [0.44557127 0.43460917]
 [0.4640763  0.4637625 ]
 [0.43945122 0.46167147]]
(2448, 2)
(2448, 2)
[1. 1.]
(2448, 2)
(2448, 2)


In [18]:
scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [17]:
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])


trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:, 0]))
print('Train Score: %.5f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:, 0]))
print('Test Score: %.5f RMSE' % (testScore))


trainPredictPlot = np.empty_like(dataset)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(trainPredict) + look_back, :] = trainPredict

# shift test predictions for plotting
testPredictPlot = np.empty_like(dataset)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(trainPredict) + (look_back * 2) + 1:len(dataset) - 1, :] = testPredict


compare=cal_the_return(testPredict,testY)
print('index    return={};\nStrategy return={}'.format(compare['acc_y'].iloc[-1],compare['acc_p0'].iloc[-1]))
# compare[['acc_p0','acc_p1','acc_p2','acc_p3','acc_p4','acc_p5','acc_y']].plot()
plt.figure(figsize=(20, 6))
l1, = plt.plot(compare['acc_y'], color='red', linewidth=5)
l2, = plt.plot(compare['acc_p0'], color='b', linewidth=2)
l3, = plt.plot(compare['acc_p5'], color='g', linewidth=2)
plt.ylabel('Height m')
plt.legend([l1, l2, l3], ('CSI500', 'S0', 'S1'), loc='best')
plt.title('LSTM Prediction--{}'.format(MODEL_NAME))
plt.savefig('./img/收益率曲线_{}.svg'.format(MODEL_NAME),format='svg')

# compare=cal_the_return(trainPredict,trainY)
# compare[['acc_p0','acc_y']].plot()

plt.show()


ValueError: Found array with dim 3. Estimator expected <= 2.