In [1]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, CuDNNLSTM, Dropout
import os
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.losses import huber_loss
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import StandardScaler
import seaborn
import logging


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Classify a return value
def label_ret(ret):
    '''
    Map a return onto a three-way class label.

    class 0: ret < -0.1
    class 1: -0.1 <= ret <= 0.1 (unprofitable band)
    class 2: everything else (ret > 0.1 for ordinary numbers)
    '''
    if ret < -0.1:
        return 0
    if ret <= 0.1:
        # already known not to be below -0.1, so this is the middle band
        return 1
    return 2
def label_ret_bi(ret):
    '''
    Binary label for a return: 0 when ret <= 0, otherwise 1.
    '''
    return 0 if ret <= 0 else 1

def label_ret2(ret):
    '''
    Map a return onto one of five class labels.

    0: -0.1 <= ret <= 0.1
    1:  0.1 <  ret <= 0.3
    2:  0.3 <  ret
    3: -0.3 <= ret <  -0.1
    4:         ret <  -0.3

    Raises:
        ValueError: if ret satisfies none of the comparisons (e.g. NaN).
    '''
    # Thresholds are tested from the top down, so each test only needs one bound.
    if ret > 0.3:
        return 2
    if ret > 0.1:
        return 1
    if ret >= -0.1:
        return 0
    if ret >= -0.3:
        return 3
    if ret < -0.3:
        return 4
    raise ValueError

        
def generate_sequence(X_df, y_series, seq_length):
    '''
    Cut a feature frame into overlapping windows of `seq_length` consecutive rows.

    Each window is paired with the target value and the index label of the
    window's LAST row.

    Args:
        X_df: feature DataFrame.
        y_series: target Series; its index must equal X_df's index.
        seq_length: number of rows per window (positive integer).

    Returns:
        (dataX, dataY, index) where dataX is a list of DataFrame windows,
        dataY a list of targets and index a pd.Index of the last-row labels.
        All three are empty when X_df has fewer than seq_length rows.

    Raises:
        AssertionError: if the two indexes differ.
        ValueError: if seq_length is smaller than 1.
    '''
    assert (X_df.index == y_series.index).all()
    if seq_length < 1:
        raise ValueError('seq_length must be a positive integer')

    dataX = []
    dataY = []
    index = []
    for start in range(X_df.shape[0] - seq_length + 1):
        last = start + seq_length - 1
        # Use positional access throughout: plain `series[i]` is a LABEL lookup
        # on an integer index, which picks the wrong row (or raises KeyError)
        # once the index does not start at 0, e.g. for a later data split.
        dataX.append(X_df.iloc[start:last + 1])
        dataY.append(y_series.iloc[last])
        index.append(y_series.index[last])

    return dataX, dataY, pd.Index(index)

In [3]:
variety = 'RB'

# Read the factor matrix and the regression target for this variety. Each store
# is opened read-only inside a `with` block so its file handle is closed as soon
# as the frame has been read, instead of staying open for the kernel's lifetime.
with pd.HDFStore('/home/data/vb/training_x_150.h5', mode='r') as factor_store:
    factor_df = factor_store.get(variety)
with pd.HDFStore('/home/data/vb/training_y_reg_150.h5', mode='r') as y_store:
    y_series = y_store.get(variety)
helper_df = pd.read_parquet('/home/data/vb/training_helper_150_{}.parquet'.format(variety))

# Turn the target into class labels
ret_y_series = np.exp(y_series) - 1  # exp(y) - 1; presumably y is a log return -- TODO confirm

label_y_series = ret_y_series.transform(label_ret2).rename('Y_label')  # class label per row
ret_label_df = pd.concat([ret_y_series, label_y_series], axis=1)  # return and label side by side
assert (factor_df.index == label_y_series.index).all()  # features and labels must share one index


# Align the three sources on their index and drop missing values
pd.options.mode.use_inf_as_na = True  # so that dropna() below also removes +/-inf
df = helper_df.join(factor_df, how='inner').join(label_y_series, how='inner')
df = df.dropna()

# Re-derive factor_df, label_y_series and helper_df from the aligned frame
factor_df = df[factor_df.columns]
label_y_series = df[label_y_series.name]
helper_df = df[helper_df.columns]

assert (factor_df.index == label_y_series.index).all() and \
        (label_y_series.index == helper_df.index).all()     # all three share one index again

# train / val / test split, in original row order (shuffle=False):
# the last 20% of rows become the test set, then the last 10% of the remainder the validation set
factor_df_train, factor_df_test, label_y_series_train, label_y_series_test = \
train_test_split(factor_df, label_y_series, test_size=0.2, shuffle=False)
factor_df_train, factor_df_val, label_y_series_train, label_y_series_val = \
train_test_split(factor_df_train, label_y_series_train, test_size=0.1, shuffle=False)


# Standardise the features. The scaler is fitted on the training split only and
# returns plain ndarrays, so the DataFrame index/columns are lost at this point.
scaler = StandardScaler()
scaler.fit(factor_df_train)
factor_train_normalized = scaler.transform(factor_df_train)
factor_val_normalized = scaler.transform(factor_df_val)
factor_test_normalized = scaler.transform(factor_df_test)


# Put the index and column names back onto the scaled arrays
factor_df_train_normalized = pd.DataFrame(factor_train_normalized, 
                                          index=factor_df_train.index, columns=factor_df_train.columns)
factor_df_val_normalized = pd.DataFrame(factor_val_normalized, 
                                        index=factor_df_val.index, columns=factor_df_val.columns)
factor_df_test_normalized = pd.DataFrame(factor_test_normalized, 
                                         index=factor_df_test.index, columns=factor_df_test.columns)
# The bare arrays are no longer needed once wrapped
del factor_train_normalized
del factor_val_normalized
del factor_test_normalized



In [4]:
# Build the windowed (time-series) inputs for the LSTM

NUM_CLASSES = 5
seq_length = 10

# -- training split --
X_train, y_train, index_train = generate_sequence(factor_df_train_normalized, label_y_series_train, seq_length)
X_train = np.array([window.values for window in X_train])  # list of frames -> ndarray
y_cat_train = keras.utils.to_categorical(y_train, num_classes=NUM_CLASSES)  # labels -> one-hot

# -- validation split --
X_val, y_val, index_val = generate_sequence(factor_df_val_normalized, label_y_series_val, seq_length)
X_val = np.array([window.values for window in X_val])  # list of frames -> ndarray
y_cat_val = keras.utils.to_categorical(y_val, num_classes=NUM_CLASSES)  # labels -> one-hot

# -- test split --
X_test, y_test, index_test = generate_sequence(factor_df_test_normalized, label_y_series_test, seq_length)
X_test = np.array([window.values for window in X_test])  # list of frames -> ndarray
y_cat_test = keras.utils.to_categorical(y_test, num_classes=NUM_CLASSES)  # labels -> one-hot

In [5]:
def set_gpu_option():
    """
    Restrict CUDA to device "1" and install a TensorFlow session with
    `gpu_options.allow_growth` enabled as the Keras backend session.

    Returns:
        The tf.Session that was handed to Keras.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    session = tf.Session(config=session_config)
    keras.backend.tensorflow_backend.set_session(session)

    return session
    
def create_lstm_model(input_shape, num_classes):
    """
    Build (and print a summary of) a two-layer CuDNNLSTM classifier.

    Args:
        input_shape: shape of one input sample, passed to the first LSTM layer.
        num_classes: width of the final softmax layer.

    Returns:
        The uncompiled Sequential model.
    """
    scale = 1  # common multiplier for the hidden-layer widths
    model = Sequential([
        CuDNNLSTM(32 * scale, input_shape=input_shape, return_sequences=True),
        CuDNNLSTM(16 * scale, return_sequences=False),
        Dense(16 * scale, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    model.summary()

    return model

def create_lstm_model_dropout(input_shape, num_classes):
    """
    Build (and print a summary of) a two-layer CuDNNLSTM classifier with a
    Dropout(0.25) layer after each hidden layer.

    Args:
        input_shape: shape of one input sample, passed to the first LSTM layer.
        num_classes: width of the final softmax layer.

    Returns:
        The uncompiled Sequential model.
    """
    scale = 2  # common multiplier for the hidden-layer widths
    model = Sequential([
        CuDNNLSTM(64 * scale, input_shape=input_shape, return_sequences=True),
        Dropout(0.25),
        CuDNNLSTM(32 * scale, return_sequences=False),
        Dropout(0.25),
        Dense(16 * scale, activation='relu'),
        Dropout(0.25),
        Dense(num_classes, activation='softmax'),
    ])
    model.summary()

    return model

In [6]:
from keras import backend as K

def Precision(y_true, y_pred):
    """Precision: rounded true positives divided by rounded predicted positives."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    true_positives = K.sum(hits)
    predicted_positives = K.sum(flagged)
    # epsilon keeps the ratio finite when nothing was predicted positive
    return true_positives / (predicted_positives + K.epsilon())
    
def Recall(y_true, y_pred):
    """Recall: rounded true positives divided by rounded possible positives."""
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    true_positives = K.sum(hits)
    possible_positives = K.sum(actual)
    # epsilon keeps the ratio finite when there are no positives at all
    return true_positives / (possible_positives + K.epsilon())
 
def F1(y_true, y_pred):
    """F1 score computed from the Precision and Recall metrics of this notebook."""
    p = Precision(y_true, y_pred)
    r = Recall(y_true, y_pred)
    # epsilon guards the division when both precision and recall are zero
    return 2 * ((p * r) / (p + r + K.epsilon()))


In [42]:
# Set up the GPU session, then build and compile the dropout LSTM classifier
sess = set_gpu_option()

input_shape = (X_train.shape[1], X_train.shape[2])  # per-sample shape: axes 1 and 2 of X_train
print('input_shape: ', input_shape)

model = create_lstm_model_dropout(input_shape=input_shape, num_classes=NUM_CLASSES)

adam = keras.optimizers.Adam(learning_rate=1e-3)
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=[
        'categorical_accuracy',
        keras.metrics.Precision(),
        keras.metrics.Recall(),
        F1,  # custom metric defined in this notebook
    ],
)

input_shape:  (10, 176)
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_3 (CuDNNLSTM)     (None, 10, 128)           156672    
_________________________________________________________________
dropout_4 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
cu_dnnlstm_4 (CuDNNLSTM)     (None, 64)                49664     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_6 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (

In [8]:
# One TensorBoard log directory per run, named after the current timestamp
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp
tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    write_grads=True,
)

In [7]:
# Share of each class in every split (column sums of the one-hot labels divided
# by the total count); the largest entry is the majority-class baseline.
majority_label_train = y_cat_train.sum(axis=0) / y_cat_train.sum()
majority_label_val = y_cat_val.sum(axis=0) / y_cat_val.sum()
majority_label_test = y_cat_test.sum(axis=0) / y_cat_test.sum()

print('majority_label_train: \n', majority_label_train)
print('majority_label_val: \n', majority_label_val)
print('majority_label_test: \n', majority_label_test)


majority_label_train: 
 [0.21709464 0.16683045 0.2442199  0.21006767 0.16178733]
majority_label_val: 
 [0.15989976 0.13099416 0.2877193  0.18730159 0.23408522]
majority_label_test: 
 [0.28852832 0.21841614 0.15291132 0.25373933 0.08640491]


In [44]:
# Train for 50 epochs without shuffling, validating on the validation windows
model.fit(
    x=X_train,
    y=y_cat_train,
    validation_data=(X_val, y_cat_val),
    epochs=50,
    batch_size=None,
    shuffle=False,
    verbose=True,
    # callbacks=[tensorboard_callback],
)


Train on 53935 samples, validate on 5985 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x7f66a3c2a3c8>

In [45]:
# Score the trained model on the test windows; the result lists the loss first,
# followed by the metrics passed to model.compile().
model.evaluate(x=X_test, y=y_cat_test)



[4.919103020276779,
 0.22122061252593994,
 0.21677398681640625,
 0.18656517565250397,
 0.1988639384508133]

In [46]:
# Softmax output for every test window: one row per window, one column per class
prediction = model.predict(X_test)
prediction

array([[2.7456611e-01, 1.1654310e-01, 1.4132613e-01, 2.5481370e-01,
        2.1275097e-01],
       [3.4269160e-01, 2.7704176e-01, 2.3469804e-01, 1.1959654e-01,
        2.5972046e-02],
       [3.7739170e-01, 9.2459977e-02, 3.5829518e-02, 3.5073563e-01,
        1.4358313e-01],
       ...,
       [3.4104082e-01, 1.1278135e-01, 8.1267700e-02, 2.9250365e-01,
        1.7240644e-01],
       [5.6892210e-01, 3.8844913e-01, 1.9255552e-02, 2.3313148e-02,
        6.0119033e-05],
       [7.6469284e-01, 2.1602356e-01, 4.6928335e-04, 1.8812284e-02,
        2.0248576e-06]], dtype=float32)

In [47]:
# One-hot ground-truth labels of the test windows, for comparison with `prediction`
y_cat_test

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [19]:
# Scratch check of the custom Precision metric, one step at a time.
# NOTE(review): this rebinds `sess`, which previously held the session returned
# by set_gpu_option(); the `.eval()` calls in the following cells presumably rely
# on this interactive session being the default one -- confirm.
# NOTE(review): the saved output has two columns, so it presumably comes from an
# earlier two-class run rather than the current five-class setup.
import tensorflow as tf
sess = tf.InteractiveSession()
# element-wise product: non-zero only where the one-hot label is 1
y_cat_test * prediction




array([[0.        , 0.47203258],
       [0.        , 0.50471234],
       [0.        , 0.49812973],
       ...,
       [0.        , 0.2736347 ],
       [0.        , 0.35306215],
       [0.        , 0.7921562 ]], dtype=float32)

In [21]:
# Precision check, next step: clip the label * prediction product into [0, 1]
K.clip(y_cat_test * prediction, 0, 1).eval()

array([[0.        , 0.47203258],
       [0.        , 0.50471234],
       [0.        , 0.49812973],
       ...,
       [0.        , 0.2736347 ],
       [0.        , 0.35306215],
       [0.        , 0.7921562 ]], dtype=float32)

In [22]:
# Precision check, next step: round the clipped product to whole numbers (0 or 1)
K.round(K.clip(y_cat_test * prediction, 0, 1)).eval()

array([[0., 0.],
       [0., 1.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 1.]], dtype=float32)

In [23]:
# Sum of the rounded hits: the "true positives" term used by Precision and Recall
K.sum(K.round(K.clip(y_cat_test * prediction, 0, 1))).eval()

7693.0

In [28]:
# Sum of the rounded predictions: the "predicted positives" denominator of Precision
K.sum(K.round(K.clip(prediction, 0, 1))).eval()

14976.0

In [29]:
# End-to-end value of the custom Precision metric on the test predictions
Precision(y_cat_test, prediction).eval()

0.51368856

In [None]:
# Wrap the predictions in a DataFrame keyed by index_test, i.e. the index label
# of the last row of each test window (as returned by generate_sequence)
prediction_df = pd.DataFrame(prediction, index=index_test)

In [None]:
# Helper rows (e.g. `close`) for the timestamps that have a prediction.
# The duplicate removal is chained so that a fresh frame is assigned, instead of
# mutating the result of a `.loc` selection in place.
# NOTE(review): drop_duplicates() compares row VALUES, not index labels, so the
# result can end up shorter than prediction_df -- confirm this is intended.
helper_df_test = helper_df.loc[prediction_df.index].drop_duplicates()

In [None]:
# Interactive line chart: close price together with predicted class probabilities
from pyecharts.charts import Line
from pyecharts.globals import CurrentConfig, NotebookType
from pyecharts import options as opts

CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB  # target JupyterLab rendering

line = Line()
line.add_xaxis(prediction_df.index.tolist())
line.add_yaxis('close', helper_df_test.close)
# The probability series are attached to the y-axis with index 1.
# NOTE(review): only columns 0-2 are plotted although NUM_CLASSES is 5 -- confirm.
line.add_yaxis('0', prediction_df[0], yaxis_index=1)
line.add_yaxis('1', prediction_df[1], yaxis_index=1)
line.add_yaxis('2', prediction_df[2], yaxis_index=1)

line.set_global_opts(datazoom_opts=opts.DataZoomOpts())
line.extend_axis(yaxis=opts.AxisOpts())  # adds the extra y-axis referenced above

line.load_javascript()


In [None]:
# Display the chart built in the previous cell inside the notebook
line.render_notebook()

In [None]:
# Inspect the test-window index (generate_sequence already returns it as a pd.Index)
pd.Index(index_test)

In [None]:
# Shape of the prediction array
prediction.shape

In [None]:
# Index of the raw (un-windowed) test split. The split frame is `factor_df_test`;
# no separate `factor_df_test_index` variable is assigned anywhere in this
# notebook, so the index is read from the frame itself.
factor_df_test.index

In [None]:
# Shape of the first 1000 one-hot training labels. A bare `y_cat` is never
# assigned in this notebook; the one-hot arrays are y_cat_train / y_cat_val /
# y_cat_test, and the training one is presumably the one meant here.
y_cat_train[:1000].shape


In [None]:
# Inspect the features joined with their labels (rows present in both)
factor_df.join(label_y_series, how='inner')

In [None]:
# Drop rows with missing values, then list any rows that still contain one
# NOTE(review): the dropna call mutates `factor_df` in place.
factor_df.dropna(inplace=True)
factor_df[factor_df.isna().any(axis=1)]

In [None]:
# Flag the rows that contain +inf or -inf in any factor column.
# Comparing against the numeric infinities is required: a comparison with the
# string 'inf' never matches a numeric value, so it would always report False.
factor_df.isin([np.inf, -np.inf]).any(axis=1)

In [None]:
# Load the factor matrix of variety 'A'. The store is opened read-only inside a
# `with` block so the file handle is closed once the frame has been read.
# NOTE(review): this rebinds `variety` and `factor_df`, and the path has no `vb/`
# directory, unlike the data-loading cell near the top -- confirm both are intended.
variety = 'A'
with pd.HDFStore('/home/data/training_x_150.h5', mode='r') as factor_store:
    factor_df = factor_store.get(variety)

In [None]:
# Set the pandas option that treats +/-inf as missing values
# (the same option is already set in the data-loading cell)
pd.options.mode.use_inf_as_na = True

In [None]:
# List the rows of factor_df that contain at least one missing value
factor_df[factor_df.isna().any(axis=1)]