In [1]:
# 引入相关的包
import pandas as pd
import numpy as np
import talib
import matplotlib.pyplot as plt
from datetime import datetime,date
from keras.models import Sequential
from keras.layers import Dense, LSTM,Dropout
from keras import utils
from keras import optimizers
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.decomposition import PCA

In [2]:
# 从csv中读取数据
np.set_printoptions(suppress=True)
stock = pd.read_csv('.\\data\\stock.csv',encoding='gb2312')
dollar = pd.read_csv('.\\data\\dollar.csv')
rate = pd.read_csv('.\\data\\rate.csv')

In [3]:
# 去除不合理数据
stock=stock.mask(stock.eq('None')).dropna()

# 时间格式转换
stock['日期']=pd.to_datetime(stock['日期'],format='%Y/%m/%d')
dollar['日期']=pd.to_datetime(dollar['日期'],format='%Y年%m月%d日')
rate['日期']=pd.to_datetime(rate['日期'],format='%Y年%m月%d日')

# 去除%
dollar['d涨跌幅']=dollar['d涨跌幅'].str.strip("%").astype(float)
rate['r涨跌幅']=rate['r涨跌幅'].str.strip("%").astype(float)

# 合并
stock=pd.merge(stock, dollar, on='日期')
stock=pd.merge(stock, rate, on='日期')
stock.head()

Unnamed: 0,日期,股票代码,名称,收盘价,最高价,最低价,开盘价,前收盘,涨跌额,涨跌幅,...,d开盘价,d最高价,d最低价,d成交量,d涨跌幅,r收盘,r开盘,r高,r低,r涨跌幅
0,2010-01-04,'000001,上证指数,3243.76,3295.279,3243.319,3289.75,3277.139,-33.379,-1.0185,...,77.92,78.19,77.26,-,-0.42,6.8285,6.8297,6.8297,6.827,0.02
1,2010-01-05,'000001,上证指数,3282.179,3290.512,3221.462,3254.468,3243.76,38.419,1.1844,...,77.37,77.71,77.09,-,0.12,6.8268,6.8283,6.8288,6.8257,-0.02
2,2010-01-06,'000001,上证指数,3254.215,3295.868,3253.044,3277.517,3282.179,-27.964,-0.852,...,77.65,78.0,77.36,-,-0.17,6.8278,6.8277,6.8291,6.8267,0.01
3,2010-01-07,'000001,上证指数,3192.776,3268.819,3176.707,3253.991,3254.215,-61.439,-1.888,...,77.37,78.08,77.3,-,0.54,6.8281,6.8269,6.8282,6.8258,0.0
4,2010-01-08,'000001,上证指数,3195.997,3198.92,3149.017,3177.259,3192.776,3.221,0.1009,...,77.98,78.19,77.35,-,-0.56,6.8276,6.8278,6.8285,6.826,-0.01


In [4]:
# 加入技术指标
# MA
stock['ma_5_data'] = talib.MA(stock['收盘价'].values, timeperiod=5)
stock['ma_30_data'] = talib.MA(stock['收盘价'].values, timeperiod=30)

# KDJ
stock['k_data'], stock['d_data'] = talib.STOCH(stock['最高价'].values, stock['最低价'].values, stock['收盘价'].values, fastk_period=9,slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
stock['j_data'] = 3 * stock['k_data'] - 2 * stock['d_data']

# WMS
stock['wms_data'] = -talib.WILLR(stock['最高价'].values, stock['最低价'].values, stock['收盘价'].values, timeperiod=14)

# RSI
stock['rsi'] = talib.RSI(stock['收盘价'].values)

# CCI
stock['cci_data'] = talib.CCI(stock['最高价'].values, stock['最低价'].values, stock['收盘价'].values)

# MOM
stock['mom_data'] = talib.MOM(stock['收盘价'].values)

# BOLL
stock['boll_upper'], stock['boll_middle'], stock['boll_lower'] = talib.BBANDS(stock['收盘价'].values, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)

stock.head()

Unnamed: 0,日期,股票代码,名称,收盘价,最高价,最低价,开盘价,前收盘,涨跌额,涨跌幅,...,k_data,d_data,j_data,wms_data,rsi,cci_data,mom_data,boll_upper,boll_middle,boll_lower
0,2010-01-04,'000001,上证指数,3243.76,3295.279,3243.319,3289.75,3277.139,-33.379,-1.0185,...,,,,,,,,,,
1,2010-01-05,'000001,上证指数,3282.179,3290.512,3221.462,3254.468,3243.76,38.419,1.1844,...,,,,,,,,,,
2,2010-01-06,'000001,上证指数,3254.215,3295.868,3253.044,3277.517,3282.179,-27.964,-0.852,...,,,,,,,,,,
3,2010-01-07,'000001,上证指数,3192.776,3268.819,3176.707,3253.991,3254.215,-61.439,-1.888,...,,,,,,,,,,
4,2010-01-08,'000001,上证指数,3195.997,3198.92,3149.017,3177.259,3192.776,3.221,0.1009,...,,,,,,,,,,


In [5]:
#去除Nan
stock=stock.dropna()
stock.head()

Unnamed: 0,日期,股票代码,名称,收盘价,最高价,最低价,开盘价,前收盘,涨跌额,涨跌幅,...,k_data,d_data,j_data,wms_data,rsi,cci_data,mom_data,boll_upper,boll_middle,boll_lower
29,2010-02-12,'000001,上证指数,3018.133,3018.858,2993.437,2996.088,2985.499,32.634,1.0931,...,81.448809,64.18581,115.974807,41.010328,44.070893,89.020124,28.841,3227.855957,3034.5943,2841.332643
30,2010-02-22,'000001,上证指数,3003.398,3026.659,3002.811,3016.703,3018.133,-14.735,-0.4882,...,86.243917,76.846039,105.039674,18.213755,42.473471,106.651209,62.038,3192.606098,3022.9093,2853.212502
31,2010-02-23,'000001,上证指数,2982.575,2998.907,2938.753,2998.907,3003.398,-20.823,-0.6933,...,80.500078,82.730935,76.038364,32.263353,40.253007,2.323344,47.862,3145.310711,3009.69435,2874.077989
32,2010-02-24,'000001,上证指数,3022.177,3023.739,2955.069,2964.984,2982.575,39.602,1.3278,...,79.30111,82.015035,73.873261,3.2802,46.031625,80.006144,18.342,3122.429676,3003.2107,2883.991724
33,2010-02-25,'000001,上证指数,3060.618,3063.004,3022.373,3026.656,3022.177,38.441,1.272,...,84.410916,81.404035,90.424679,1.379326,50.987051,178.449498,65.31,3097.949016,2998.29845,2898.647884


In [6]:
# 取技术指标，美元指数涨跌、USD/CNY汇率涨跌
tech_index=stock.values[:, 23:]
tech_index=np.column_stack((tech_index, stock['d涨跌幅'].values))
tech_index=np.column_stack((tech_index, -stock['r涨跌幅'].values))
tech_index=tech_index.astype('float64')
tech_index

array([[2974.0296    , 3098.66296667,   81.44880899, ..., 2841.33264288,
           0.41      ,    0.01      ],
       [2987.6744    , 3090.6509    ,   86.24391711, ..., 2853.21250175,
          -0.16      ,    0.02      ],
       [2994.421     , 3080.6641    ,   80.50007788, ..., 2874.07798851,
           0.42      ,   -0.        ],
       ...,
       [3459.63866   , 3405.93504333,   97.65374697, ..., 3307.29760543,
          -0.49      ,    0.08      ],
       [3494.00676   , 3410.48127   ,   97.18879219, ..., 3293.92806328,
           0.1       ,   -0.1       ],
       [3526.35714   , 3416.26067333,   99.02595924, ..., 3280.97408321,
           0.32      ,   -0.22      ]])

In [7]:
# 离散化
feature_tech_index = tech_index.copy()
for i in range(tech_index.shape[0]):
    # MA
    feature_tech_index[i][0]=0
    if i>0 and tech_index[i-1][0]<tech_index[i-1][1] and tech_index[i][0]>tech_index[i][1] :
        feature_tech_index[i][0]=1
    elif i>0 and tech_index[i-1][0]>tech_index[i-1][1] and tech_index[i][0]>tech_index[i][1] and tech_index[i][0]<tech_index[i-1][0] :
        feature_tech_index[i][0]=1
    elif i>0 and tech_index[i-1][0]>tech_index[i-1][1] and tech_index[i][0]<tech_index[i][1] :
        feature_tech_index[i][0]=-1
    elif i>0 and tech_index[i-1][0]<tech_index[i-1][1] and tech_index[i][0]<tech_index[i][1] and tech_index[i][0]>tech_index[i-1][0] :
        feature_tech_index[i][0]=-1
    feature_tech_index[i][1]=feature_tech_index[i][0]

    # K
    feature_tech_index[i][2]=0
    if tech_index[i][2]>90 :
        feature_tech_index[i][2]=-1
    elif tech_index[i][2]<10 :
        feature_tech_index[i][2]=1
    elif i>0 and tech_index[i-1][2]<tech_index[i-1][3] and tech_index[i][2]>tech_index[i][3] :#gold
        feature_tech_index[i][2]=1
    elif i>0 and tech_index[i-1][2]>tech_index[i-1][3] and tech_index[i][2]<tech_index[i][3] :#death
        feature_tech_index[i][2]=-1

    # D
    feature_tech_index[i][3]=0
    if tech_index[i][3]>80 :
        feature_tech_index[i][3]=-1
    elif tech_index[i][3]<20 :
        feature_tech_index[i][3]=1
    elif i>0 and tech_index[i-1][2]<tech_index[i-1][3] and tech_index[i][2]>tech_index[i][3] :#gold
        feature_tech_index[i][3]=1
    elif i>0 and tech_index[i-1][2]>tech_index[i-1][3] and tech_index[i][2]<tech_index[i][3] :#death
        feature_tech_index[i][3]=-1

    # J
    feature_tech_index[i][4]=0
    if tech_index[i][4]>100 :
        feature_tech_index[i][4]=-1
    elif feature_tech_index[i][4]<0 :
        feature_tech_index[i][4]=1

    # WMS
    feature_tech_index[i][5]=0
    if tech_index[i][5]>80 :
        feature_tech_index[i][5]=1
    elif tech_index[i][5]<20 :
        feature_tech_index[i][5]=-1

    # RST
    feature_tech_index[i][6]=0
    if i>0 and tech_index[i-1][6]<50 and tech_index[i][6]>50 :
        feature_tech_index[i][6]=1
        
    elif i>0 and tech_index[i-1][6]>50 and tech_index[i][6]<50 :
        feature_tech_index[i][6]=-1
        
    elif tech_index[i][6]>80 :
        feature_tech_index[i][6]=-1
        
    elif tech_index[i][6]<20 :
        feature_tech_index[i][6]=1

    # CCI
    feature_tech_index[i][7]=0
    if i>0 and tech_index[i-1][7]>100 and tech_index[i][7]<100 :
        feature_tech_index[i][7]=-1
    elif i>0 and tech_index[i-1][7]<100 and tech_index[i][7]>100 :
        feature_tech_index[i][7]=1
    elif i>0 and tech_index[i-1][7]>-100 and tech_index[i][7]<-100 :
        feature_tech_index[i][7]=-1
    elif i>0 and tech_index[i-1][7]>-100 and tech_index[i][7]<-100 :
        feature_tech_index[i][7]=1

    # MOM
    feature_tech_index[i][8]=0
    if i>0 and tech_index[i-1][8]>600 and tech_index[i][8]>600 and tech_index[i][8]<tech_index[i-1][8] :
        feature_tech_index[i][8]=-1
    elif i>0 and tech_index[i-1][8]<-600 and tech_index[i][8]<-600 and tech_index[i][8]>tech_index[i-1][8] :
        feature_tech_index[i][8]=1
    elif i>0 and tech_index[i-1][8]>400 and tech_index[i][8]>400 and tech_index[i][8]>tech_index[i-1][8] :
        feature_tech_index[i][8]=-1
    elif i>0 and tech_index[i-1][8]<-400 and tech_index[i][8]<-400 and tech_index[i][8]<tech_index[i-1][8] :
        feature_tech_index[i][8]=1
    elif i>0 and tech_index[i-1][8]>200 and tech_index[i][8]>200 and tech_index[i][8]>tech_index[i-1][8] :
        feature_tech_index[i][8]=-1
    elif i>0 and tech_index[i-1][8]<-200 and tech_index[i][8]<-200 and tech_index[i][8]<tech_index[i-1][8] :
        feature_tech_index[i][8]=1

    # BOLL
    feature_tech_index[i][9]=0
    if i>0 and stock['开盘价'].values[i]>tech_index[i-1][9] :
        feature_tech_index[i][9]=-1
    elif i>0 and stock['开盘价'].values[i]<tech_index[i-1][11] :
        feature_tech_index[i][9]=1
    feature_tech_index[i][10]=feature_tech_index[i][9]
    feature_tech_index[i][11]=feature_tech_index[i][9]

feature_tech_index

array([[ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.41,  0.01],
       [-1.  , -1.  ,  0.  , ...,  0.  , -0.16,  0.02],
       [-1.  , -1.  , -1.  , ...,  0.  ,  0.42, -0.  ],
       ...,
       [ 0.  ,  0.  , -1.  , ..., -1.  , -0.49,  0.08],
       [ 0.  ,  0.  , -1.  , ..., -1.  ,  0.1 , -0.1 ],
       [ 0.  ,  0.  , -1.  , ..., -1.  ,  0.32, -0.22]])

In [8]:
# 归一化
feature_tech_index[:,12:]=(feature_tech_index[:,12:]-np.mean(feature_tech_index[:,12:],axis=0))/np.std(feature_tech_index[:,12:],axis=0)
# feature_tech_index=(feature_tech_index-np.mean(feature_tech_index,axis=0))/np.std(feature_tech_index,axis=0)
feature_tech_index

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.90680123,  0.04252684],
       [-1.        , -1.        ,  0.        , ...,  0.        ,
        -0.36766685,  0.0930931 ],
       [-1.        , -1.        , -1.        , ...,  0.        ,
         0.92916031, -0.00803942],
       ...,
       [ 0.        ,  0.        , -1.        , ..., -1.        ,
        -1.10551678,  0.39649068],
       [ 0.        ,  0.        , -1.        , ..., -1.        ,
         0.21366947, -0.51370205],
       [ 0.        ,  0.        , -1.        , ..., -1.        ,
         0.70556942, -1.12049721]])

In [9]:
# PCA
estimator = PCA(n_components=5)
pca_tech_index = estimator.fit_transform(feature_tech_index)
print(pca_tech_index.shape)
pca_tech_index

(2648, 5)


array([[ 0.58358365, -0.08818434, -0.47112185, -0.49740663, -0.15754559],
       [-0.62764882, -0.24938487, -1.23692688,  1.04315487, -0.9042286 ],
       [ 0.48231212, -1.01553604, -1.37085295,  0.24907504,  1.30944396],
       ...,
       [-1.41135292, -1.8638472 ,  0.43560413,  0.64744006, -0.29388998],
       [ 0.14941547, -2.03416226,  0.17953722,  0.42793695, -0.31976051],
       [ 0.88251614, -2.25212413,  0.0901947 ,  0.60441893, -0.50407138]])

In [10]:
# 划分训练集、测试集
x_train=[]
y_train=[]
x_test=[]
y_test=[]
divide = pca_tech_index.shape[0]*9/10
for i in range(15, tech_index.shape[0]):
    if i<divide:
        tmp = pca_tech_index[i-15:i, :]
        x_train.append(tmp)
        if float(stock.iloc[i]['涨跌幅'])>0:
            y_train.append(1)
        else:
            y_train.append(0)
    elif i>2635:
        tmp=pca_tech_index[i-15:i,:]
        x_test.append(tmp)
        if float(stock.iloc[i]['涨跌幅'])>0:
            y_test.append(1)
        else:
            y_test.append(0)

x_train,y_train,x_test,y_test=np.array(x_train),np.array(y_train),np.array(x_test),np.array(y_test)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
print(x_train.dtype)
print(x_train[0])
print(y_train)

(2369, 15, 5)
(12, 15, 5)
(2369,)
(12,)
float64
[[ 0.58358365 -0.08818434 -0.47112185 -0.49740663 -0.15754559]
 [-0.62764882 -0.24938487 -1.23692688  1.04315487 -0.9042286 ]
 [ 0.48231212 -1.01553604 -1.37085295  0.24907504  1.30944396]
 [-0.33990517 -0.81921266 -1.1480932   0.82559559 -0.11598218]
 [-0.44931377 -0.27369228 -1.26673323  0.96703723 -1.03434827]
 [-1.16631821 -1.19935668 -0.56082626  1.5708175   0.39637875]
 [ 0.17495868 -1.5286561  -1.5984611   0.5367448   0.07262795]
 [-0.47763872 -1.95077684  1.46653079 -0.2871234  -0.45673749]
 [-1.2374683  -1.03519859  0.7186559   0.97695717  0.20531528]
 [ 1.28585436 -1.28562765  0.76653129 -1.58885044  0.96838018]
 [-0.13090801 -0.37008424  1.42454426 -0.49275318  0.14744538]
 [ 0.1667273   0.15853698  1.1726608  -0.74971634 -0.2124346 ]
 [ 0.44270836  0.10661     0.89227968 -0.97217437 -0.34505158]
 [-0.12995382  0.20678367  1.42815328 -0.57179833 -0.07684097]
 [-0.12536281  0.27034237  0.17848091  0.23029498  0.00280682]]
[0 0 1

In [11]:
# 独热化
label_cnt=len(Counter(y_train))
y_train=utils.to_categorical(y_train, label_cnt)
y_test=utils.to_categorical(y_test, label_cnt)
print(y_train)

[[1. 0.]
 [1. 0.]
 [0. 1.]
 ...
 [0. 1.]
 [1. 0.]
 [0. 1.]]


In [12]:
# LSTM模型搭建
model=Sequential()
model.add(LSTM(units=50, activation='relu',input_shape=(x_train.shape[1], x_train.shape[2]),return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=25))
model.add(Dense(y_train.shape[1], activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 15, 50)            11200     
_________________________________________________________________
dropout (Dropout)            (None, 15, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 25)                1275      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 52        
Total params: 32,727
Trainable params: 32,727
Non-trainable params: 0
____________________________________________________

In [13]:
# 模型训练
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 
h = model.fit(x_train, y_train, validation_split=0.2, batch_size=1, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
# 测试
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
print(f'测试集损失值: {test_loss}, 测试集准确率: {test_acc}')

测试集损失值: 0.65190190076828, 测试集准确率: 0.75
