In [86]:
import pandas as pd
from keras.models import Sequential
from keras.layers import (Dense, CuDNNLSTM, Dropout, Conv1D, Conv2D, Reshape, Activation, MaxPooling2D, Flatten,
                        BatchNormalization)
import os
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.losses import huber_loss
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import StandardScaler
import seaborn


In [87]:
# Classify a return value into discrete classes
def label_ret(ret):
    '''
    Map a single return value to a three-class label.

    class 0: ret < -0.1
    class 1: -0.1 <= ret <= 0.1   (unprofitable band)
    class 2: ret > 0.1

    A missing return (NaN / None) is propagated as NaN instead of being
    labelled. Every comparison against NaN is False, so without this guard
    a missing value would silently fall through to class 2 and survive a
    later ``dropna``.
    '''
    if pd.isna(ret):
        return float('nan')
    if ret < -0.1:
        return 0
    if ret <= 0.1:
        return 1
    return 2
def label_ret_bi(ret):
    '''
    Map a single return value to a binary label.

    class 0: ret <= 0
    class 1: ret > 0

    A missing return (NaN / None) is propagated as NaN. ``NaN <= 0`` is
    False, so without this guard a missing value would be labelled 1 and
    would no longer be removed by a later ``dropna``.
    '''
    if pd.isna(ret):
        return float('nan')
    return 0 if ret <= 0 else 1

def label_ret2(ret):
    '''
    Map a single return value to one of five classes.

    class 0: -0.1 <= ret <= 0.1
    class 1:  0.1 <  ret <= 0.3
    class 2:  ret >  0.3
    class 3: -0.3 <= ret < -0.1
    class 4:  ret < -0.3

    Raises ValueError when no band matches (e.g. ret is NaN, for which every
    comparison is False).
    '''
    if -0.1 <= ret <= 0.1:
        return 0
    if ret > 0.1:
        # positive side: moderate vs. large gain
        return 1 if ret <= 0.3 else 2
    if ret < -0.1:
        # negative side: moderate vs. large loss
        return 3 if ret >= -0.3 else 4
    raise ValueError

        
def generate_sequence(X_df, y_series, seq_length):
    '''
    Cut a feature frame into overlapping windows of ``seq_length`` rows.

    Window ``k`` covers rows ``k .. k + seq_length - 1`` (by position) and is
    paired with the target and the index label of its LAST row.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature rows; must share its index with ``y_series``.
    y_series : pandas.Series
        Targets aligned with ``X_df``.
    seq_length : int
        Number of consecutive rows per window; must be >= 1.

    Returns
    -------
    (dataX, dataY, index)
        ``dataX`` is a list of DataFrame windows, ``dataY`` the list of
        matching targets and ``index`` a ``pd.Index`` of the last-row labels.
        All three are empty when ``seq_length`` exceeds the number of rows.

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    AssertionError
        If the two indexes differ.
    '''
    if seq_length < 1:
        raise ValueError('seq_length must be a positive integer')
    # Index.equals also copes with different lengths, where an element-wise
    # `==` would raise instead of failing the assertion.
    assert X_df.index.equals(y_series.index)

    dataX = list()
    dataY = list()
    index = list()
    for last in range(seq_length - 1, X_df.shape[0]):
        first = last - seq_length + 1
        # Use .iloc throughout: plain `y_series[pos]` is a LABEL lookup on an
        # integer index (wrong row / KeyError) and a deprecated positional
        # fallback on any other index type.
        dataX.append(X_df.iloc[first:last + 1])
        dataY.append(y_series.iloc[last])
        index.append(y_series.index[last])

    return dataX, dataY, pd.Index(index)

In [88]:
variety = 'RB'

# Open the HDF5 stores read-only inside context managers so the file handles
# are released as soon as the data is in memory (they were previously left open).
with pd.HDFStore('/home/data/vb/training_x_150.h5', mode='r') as factor_store:
    factor_df = factor_store.get(variety)
with pd.HDFStore('/home/data/vb/training_y_reg_150.h5', mode='r') as y_store:
    y_series = y_store.get(variety)
helper_df = pd.read_parquet('/home/data/vb/training_helper_150_{}.parquet'.format(variety))

# Classify the returns
ret_y_series = np.exp(y_series) - 1  # exp(y) - 1; presumably y holds log returns -- TODO confirm

# Class labels. Rows whose return is missing are masked back to NaN so that the
# dropna() below removes them; comparisons with NaN are all False, so the label
# function alone could assign such rows a class.
label_y_series = (ret_y_series.transform(label_ret_bi)
                  .where(ret_y_series.notna())
                  .rename('Y_label'))
ret_label_df = pd.concat([ret_y_series, label_y_series], axis=1)  # returns and labels side by side
assert (factor_df.index == label_y_series.index).all()  # features and labels must share one index


# Align on the index and drop incomplete rows. +/-inf is converted to NaN
# explicitly rather than through the global `use_inf_as_na` option, which is
# deprecated in recent pandas and changes behaviour for the whole session.
df = helper_df.join(factor_df, how='inner').join(label_y_series, how='inner')
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Re-derive factor_df, label_y_series and helper_df from the cleaned frame
factor_df = df[factor_df.columns]
label_y_series = df[label_y_series.name].astype(int)  # NaN rows are gone; restore integer labels
helper_df = df[helper_df.columns]

assert (factor_df.index == label_y_series.index).all() and \
        (label_y_series.index == helper_df.index).all()     # all three must still share one index


# Normalize the features; the scaler returns a bare ndarray (DataFrame wrapper is lost)
# NOTE(review): the scaler is fit on ALL rows, including those later used as
# validation windows, so validation-period statistics leak into the training
# inputs. Consider fitting on the training span only.
scaler = StandardScaler()
scaler.fit(factor_df)
factor_normalized = scaler.transform(factor_df)



# Put the DataFrame index and columns back
factor_df_normalized = pd.DataFrame(factor_normalized, 
                                          index=factor_df.index, columns=factor_df.columns)

del factor_normalized




In [89]:
# Build the windowed time-series samples fed to the sequence models

SEQ_LENGTH = 100
X, y, index = generate_sequence(factor_df_normalized, label_y_series, SEQ_LENGTH)

# Stack the list of per-window DataFrames into a single ndarray
X = np.array([window_df.values for window_df in X])

# One-hot encode the integer class labels
y_cat = keras.utils.to_categorical(y)


In [90]:
def set_gpu_option(gpu_id="1"):
    """Pin TensorFlow to the given GPU(s) and register a session with Keras.

    Parameters
    ----------
    gpu_id : str or int, optional
        Value written to ``CUDA_VISIBLE_DEVICES`` (e.g. ``"0"`` or ``"0,1"``).
        Defaults to ``"1"``, the device that used to be hard-coded.

    Returns
    -------
    The ``tf.Session`` that was installed as the Keras backend session.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    config = tf.ConfigProto()
    # Allocate GPU memory on demand instead of reserving it all up front.
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.tensorflow_backend.set_session(sess)

    return sess
    
def create_lstm_model(input_shape, num_classes, multiplier=1):
    """Build a two-layer CuDNNLSTM classifier and print its summary.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, passed to the first layer as ``input_shape``.
    num_classes : int
        Width of the final softmax layer.
    multiplier : int, optional
        Scales the layer widths (64/32/16 units times ``multiplier``).
        Defaults to 1, the previously hard-coded value.

    Returns
    -------
    The uncompiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(CuDNNLSTM(64 * multiplier, input_shape=input_shape, return_sequences=True))
    # Only the last time step of the second LSTM feeds the dense head.
    model.add(CuDNNLSTM(32 * multiplier, return_sequences=False))
    model.add(Dense(16 * multiplier, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()

    return model

def create_lstm_model_dropout(input_shape, num_classes, multiplier=8, dropout_rate=0.25):
    """Build a two-layer CuDNNLSTM classifier with dropout between the LSTMs.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, passed to the first layer as ``input_shape``.
    num_classes : int
        Width of the final softmax layer.
    multiplier : int, optional
        Scales the layer widths (64/32/16 units times ``multiplier``).
        Defaults to 8, the previously hard-coded value.
    dropout_rate : float, optional
        Rate of the Dropout layer placed after the first LSTM. Defaults to
        0.25, the previously hard-coded value.

    Returns
    -------
    The uncompiled ``Sequential`` model; its summary is printed as a side effect.
    """
    model = Sequential()
    model.add(CuDNNLSTM(64 * multiplier, input_shape=input_shape, return_sequences=True))
    model.add(Dropout(dropout_rate))
    # Only the last time step of the second LSTM feeds the dense head.
    model.add(CuDNNLSTM(32 * multiplier, return_sequences=False))
    model.add(Dense(16 * multiplier, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()

    return model

def create_cnn_model(input_shape, num_classes):
    """Build a single-convolution 2D CNN classifier and print its summary.

    ``input_shape`` must be a tuple: a trailing channel axis of size 1 is
    appended to it so each sample can be convolved as a one-channel image.
    Returns the uncompiled ``Sequential`` model.
    """
    image_shape = input_shape + (1,)
    layer_stack = [
        Reshape(image_shape, input_shape=input_shape),
        Conv2D(32, (5, 5), padding='same', activation='relu'),
        Flatten(),
        Dense(16, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.summary()

    return model

def create_cnn_conv1d_model(input_shape, num_classes):
    """Build a three-stage Conv1D classifier and print its summary.

    Each convolution (kernel size 30, default padding) is followed by batch
    normalization; the flattened features pass through a 1024-unit dense
    layer, another batch normalization and a softmax over ``num_classes``.
    Returns the uncompiled ``Sequential`` model.
    """
    conv_stages = [
        Conv1D(filters=20, kernel_size=30, activation='relu', input_shape=input_shape),
        BatchNormalization(),
        Conv1D(filters=30, kernel_size=30, activation='relu'),
        BatchNormalization(),
        Conv1D(filters=30, kernel_size=30, activation='relu'),
        BatchNormalization(),
    ]
    dense_head = [
        Flatten(),
        Dense(1024, activation='relu'),
        BatchNormalization(),
        Dense(num_classes, activation='softmax'),
    ]

    model = Sequential()
    for layer in conv_stages + dense_head:
        model.add(layer)
    model.summary()

    return model

In [91]:
from keras import backend as K
def Precision(y_true, y_pred):
    """Precision: rounded true positives divided by rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before summing; the sums
    run over every element of the batch. ``K.epsilon()`` guards the division.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())
    
def Recall(y_true, y_pred):
    """Recall: rounded true positives divided by the actual positives.

    Both tensors are clipped to [0, 1] and rounded before summing; the sums
    run over every element of the batch. ``K.epsilon()`` guards the division.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())
 
def F1(y_true, y_pred):
    """F1 score: harmonic mean of Precision and Recall for the batch."""
    p = Precision(y_true, y_pred)
    r = Recall(y_true, y_pred)
    # K.epsilon() keeps the ratio finite when both precision and recall are 0.
    return 2 * ((p * r) / (p + r + K.epsilon()))


In [99]:
# Number of classes = width of the one-hot label matrix
NUM_CLASSES = y_cat.shape[1]

# Configure the GPU session before any model is built
sess = set_gpu_option()

# One sample has shape (time steps, features)
input_shape = (X.shape[1], X.shape[2])
print('input_shape: ', input_shape)

model = create_cnn_conv1d_model(input_shape=input_shape, num_classes=NUM_CLASSES)

adam = keras.optimizers.Adam(learning_rate=1e-3)
tracked_metrics = [
    'accuracy',
    keras.metrics.Precision(),
    keras.metrics.Recall(),
    F1,
]
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=tracked_metrics)

input_shape:  (100, 176)
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_20 (Conv1D)           (None, 71, 20)            105620    
_________________________________________________________________
batch_normalization_25 (Batc (None, 71, 20)            80        
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 42, 30)            18030     
_________________________________________________________________
batch_normalization_26 (Batc (None, 42, 30)            120       
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 13, 30)            27030     
_________________________________________________________________
batch_normalization_27 (Batc (None, 13, 30)            120       
_________________________________________________________________
flatten_7 (Flatten)          

In [105]:
# Walk-forward training: fit on `num_train` consecutive samples, validate on the
# `num_test` samples that follow, then slide both windows forward by `num_test`.
num_test = 200
num_train = 1000
train_start = 0
train_end = train_start + num_train
val_start = train_end
val_end = val_start + num_test

# Per-fold training history, kept so every window's metrics can be inspected later.
fold_histories = []

while val_end <= X.shape[0]:
    # The same model instance keeps training across folds (weights carry over).
    history = model.fit(x=X[train_start:train_end], y=y_cat[train_start:train_end], epochs=10, verbose=True,
                        batch_size=None, validation_data=(X[val_start:val_end], y_cat[val_start:val_end]),
                        shuffle=False)
    fold_histories.append(history.history)

    # Advance the windows. Without this step the loop condition never changes,
    # so the loop would refit the first window forever.
    train_start += num_test
    train_end = train_start + num_train
    val_start = train_end
    val_end = val_start + num_test
    


Train on 800 samples, validate on 200 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [110]:
# Total number of windowed samples available
len(X)

74824

In [84]:
# Round the predicted class probabilities of the last 100 samples to 0/1
recent_probabilities = model.predict(X[-100:])
np.rint(recent_probabilities)

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.

In [97]:
# Share of each class among the last `num_test` one-hot labels
recent_labels = y_cat[-num_test:]
recent_labels.sum(axis=0) / recent_labels.sum()

array([0.44, 0.56], dtype=float32)