In [82]:
import pandas as pd
from keras.models import Sequential
from keras.layers import (Dense, CuDNNLSTM, Dropout, Conv1D, Conv2D, Reshape, Activation, MaxPooling2D, Flatten,
                        BatchNormalization)
import os
import tensorflow as tf
import keras
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.losses import huber_loss
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import StandardScaler
import seaborn


In [83]:
# Helpers that turn a return value into a class label
def label_ret(ret):
    """Bucket a return value into one of three classes.

    class 0: ret < -0.1
    class 1: -0.1 <= ret <= 0.1  (unprofitable band)
    class 2: anything else, i.e. ret > 0.1 -- and also any value for which
             both comparisons above are false (e.g. NaN)
    """
    if ret < -0.1:
        return 0
    if -0.1 <= ret <= 0.1:
        return 1
    return 2
def label_ret_bi(ret):
    """Binary label for a return: 0 when ``ret <= 0``, otherwise 1.

    Values for which ``ret <= 0`` is false (including NaN) are labelled 1.
    """
    return 0 if ret <= 0 else 1
def label_ret2(ret):
    """Return 0 when ``ret`` lies strictly inside (-0.1, 0.1), otherwise 1.

    The band is open: -0.1 and 0.1 themselves are labelled 1.
    """
    inside_band = -0.1 < ret < 0.1
    return 0 if inside_band else 1
    
def label_ret3(ret):
    """Three-way label for a return with a +/-1 band.

    Returns:
        0 when -1 <= ret <= 1, 1 when ret > 1, 2 when ret < -1.

    Raises:
        ValueError: when none of the comparisons holds (e.g. ``ret`` is NaN).
    """
    if 1 < ret:
        return 1
    if ret < -1:
        return 2
    if -1 <= ret <= 1:
        return 0
    raise ValueError
        
def generate_sequence(X_df, y_series, seq_length):
    """Cut aligned features/labels into sliding windows of ``seq_length`` rows.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` of ``X_df`` and is paired
    with the label and the index entry of its last row.

    Rows are selected by position (``.iloc``). Plain ``series[n]`` is a *label*
    lookup when the index holds integers, which returned the wrong row (or
    raised ``KeyError``) for any integer index other than 0..n-1.

    Args:
        X_df: DataFrame of features; its index must equal ``y_series.index``.
        y_series: Series of labels aligned with ``X_df``.
        seq_length: number of rows per window; must be >= 1.

    Returns:
        ``(dataX, dataY, index)``: a list of DataFrame windows, a list of
        labels, and a ``pd.Index`` with the index entry of each window's last
        row. All three are empty when ``X_df`` has fewer than ``seq_length`` rows.

    Raises:
        ValueError: if ``seq_length`` is smaller than 1.
        AssertionError: if the two indexes are not element-wise equal.
    """
    if seq_length < 1:
        raise ValueError('seq_length must be >= 1, got {}'.format(seq_length))
    assert (X_df.index == y_series.index).all()

    dataX = list()
    dataY = list()
    index = list()
    for start in range(0, X_df.shape[0] - seq_length + 1):
        last = start + seq_length - 1  # position of the window's final row
        dataX.append(X_df.iloc[start:last + 1])
        dataY.append(y_series.iloc[last])
        index.append(y_series.index[last])

    return dataX, dataY, pd.Index(index)

In [84]:
variety = 'RB'
# Load the inputs. The HDF5 stores are opened as context managers so their file
# handles are closed as soon as the frames are read (they used to stay open).
with pd.HDFStore('/home/data/vb/training_x_150.h5', mode='r') as factor_store:
    factor_df = factor_store.get(variety)
with pd.HDFStore('/home/data/vb/training_y_reg_150.h5', mode='r') as y_store:
    y_series = y_store.get(variety)
helper_df = pd.read_parquet('/home/data/vb/training_helper_150_{}.parquet'.format(variety))

# Turn the regression target into class labels
ret_y_series = np.exp(y_series) - 1 # recover the raw return from the stored target

label_y_series = ret_y_series.transform(label_ret_bi).rename('Y_label') # class labels
ret_label_df = pd.concat([ret_y_series, label_y_series], axis=1) # return and label side by side
assert (factor_df.index == label_y_series.index).all() # features and labels must share one index


# Align the three sources on their index and drop rows with missing values
# NOTE(review): this is a process-wide pandas option (inf counts as NA from here on);
# newer pandas releases deprecate it -- confirm against the installed version.
pd.options.mode.use_inf_as_na = True
df = helper_df.join(factor_df, how='inner').join(label_y_series, how='inner')
df = df.dropna()

# Re-derive factor_df, label_y_series and helper_df from the aligned frame
factor_df = df[factor_df.columns]
label_y_series = df[label_y_series.name]
helper_df = df[helper_df.columns]

assert (factor_df.index == label_y_series.index).all() and \
        (label_y_series.index == helper_df.index).all()     # all three must share one index

# train / val / test split; shuffle=False keeps the original row order
factor_df_train, factor_df_test, label_y_series_train, label_y_series_test = \
    train_test_split(factor_df, label_y_series, test_size=0.2, shuffle=False)
factor_df_train, factor_df_val, label_y_series_train, label_y_series_val = \
    train_test_split(factor_df_train, label_y_series_train, test_size=0.1, shuffle=False)


# Standardise the features. The scaler is fitted on the training split only and
# then applied to all splits; transform() returns plain arrays (no index/columns).
scaler = StandardScaler()
scaler.fit(factor_df_train)
factor_train_normalized = scaler.transform(factor_df_train)
factor_val_normalized = scaler.transform(factor_df_val)
factor_test_normalized = scaler.transform(factor_df_test)


# Put the DataFrame index and columns back on the scaled arrays
factor_df_train_normalized = pd.DataFrame(factor_train_normalized,
                                          index=factor_df_train.index, columns=factor_df_train.columns)
factor_df_val_normalized = pd.DataFrame(factor_val_normalized,
                                        index=factor_df_val.index, columns=factor_df_val.columns)
factor_df_test_normalized = pd.DataFrame(factor_test_normalized,
                                         index=factor_df_test.index, columns=factor_df_test.columns)
# The intermediate arrays are no longer needed
del factor_train_normalized
del factor_val_normalized
del factor_test_normalized



In [86]:
# Build fixed-length windows for the sequence models

NUM_CLASSES = 2
seq_length = 100

# Each split goes through the same three steps:
#   1. cut into sliding windows (list of DataFrames + labels + index),
#   2. stack the windows into one ndarray,
#   3. one-hot encode the labels.

# -- train
X_train, y_train, index_train = generate_sequence(factor_df_train_normalized, label_y_series_train, seq_length)
X_train = np.array([window.values for window in X_train])
y_cat_train = keras.utils.to_categorical(y_train, num_classes=NUM_CLASSES)

# -- validation
X_val, y_val, index_val = generate_sequence(factor_df_val_normalized, label_y_series_val, seq_length)
X_val = np.array([window.values for window in X_val])
y_cat_val = keras.utils.to_categorical(y_val, num_classes=NUM_CLASSES)

# -- test
X_test, y_test, index_test = generate_sequence(factor_df_test_normalized, label_y_series_test, seq_length)
X_test = np.array([window.values for window in X_test])
y_cat_test = keras.utils.to_categorical(y_test, num_classes=NUM_CLASSES)

In [5]:
# Per-sample shape of X_train with one extra trailing axis appended
# (the same `input_shape + (1,)` expression create_cnn_model feeds to Reshape).
X_train.shape[1:] + (1,)

(100, 176, 1)

In [87]:
def set_gpu_option(visible_devices="1"):
    """Create a TensorFlow session and register it as the Keras session.

    Args:
        visible_devices: value written to the ``CUDA_VISIBLE_DEVICES``
            environment variable (e.g. ``"0"``, ``1`` or ``"0,1"``). Defaults
            to ``"1"``, the device that used to be hard-coded here.

    Returns:
        The ``tf.Session`` that was handed to Keras.
    """
    # The environment variable is set before the session is created.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(visible_devices)
    config = tf.ConfigProto()
    # allow_growth: per the TF docs, GPU memory is then allocated on demand
    # rather than reserved up front.
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.tensorflow_backend.set_session(sess)

    return sess
    
def create_lstm_model(input_shape, num_classes, multiplier=1):
    """Build a two-layer CuDNNLSTM classifier and print its summary.

    Layer stack: CuDNNLSTM(64*m, returns sequences) -> CuDNNLSTM(32*m)
    -> Dense(16*m, relu) -> Dense(num_classes, softmax), with ``m = multiplier``.

    Args:
        input_shape: per-sample input shape passed to the first layer.
        num_classes: number of units in the softmax output layer.
        multiplier: width multiplier applied to every hidden layer. Defaults
            to 1, the value that used to be hard-coded.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(CuDNNLSTM(64 * multiplier, input_shape=input_shape, return_sequences=True))
    model.add(CuDNNLSTM(32 * multiplier, return_sequences=False))
    model.add(Dense(16 * multiplier, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()

    return model

def create_lstm_model_dropout(input_shape, num_classes, multiplier=8, dropout_rate=0.25):
    """Build a two-layer CuDNNLSTM classifier with dropout between the LSTMs.

    Layer stack: CuDNNLSTM(64*m, returns sequences) -> Dropout(dropout_rate)
    -> CuDNNLSTM(32*m) -> Dense(16*m, relu) -> Dense(num_classes, softmax),
    with ``m = multiplier``. The model summary is printed before returning.

    Args:
        input_shape: per-sample input shape passed to the first layer.
        num_classes: number of units in the softmax output layer.
        multiplier: width multiplier applied to every hidden layer. Defaults
            to 8, the value that used to be hard-coded.
        dropout_rate: rate of the Dropout layer. Defaults to 0.25, the value
            that used to be hard-coded.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(CuDNNLSTM(64 * multiplier, input_shape=input_shape, return_sequences=True))
    model.add(Dropout(dropout_rate))
    model.add(CuDNNLSTM(32 * multiplier, return_sequences=False))
    model.add(Dense(16 * multiplier, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()

    return model

def create_cnn_model(input_shape, num_classes):
    """Build a small 2-D convolutional classifier and print its summary.

    The input is reshaped to ``input_shape + (1,)`` (one trailing axis added),
    then passed through one 5x5 convolution with 32 filters, flattened, and
    classified by Dense(16, relu) -> Dropout(0.5) -> Dense(num_classes, softmax).

    Args:
        input_shape: per-sample input shape as a tuple (it is concatenated
            with ``(1,)``).
        num_classes: number of units in the softmax output layer.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    model = Sequential()
    stack = [
        Reshape(input_shape+(1,), input_shape=input_shape),
        Conv2D(32, (5, 5), padding='same', activation='relu'),
        Flatten(),
        Dense(16, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]
    for layer in stack:
        model.add(layer)

    model.summary()

    return model

def create_cnn_conv1d_model(input_shape, num_classes):
    """Build a three-stage Conv1D classifier and print its summary.

    Three Conv1D layers (20, 30 and 30 filters, kernel size 30, relu), each
    followed by BatchNormalization, then Flatten -> Dense(1024, relu)
    -> BatchNormalization -> Dense(num_classes, softmax).

    Args:
        input_shape: per-sample input shape passed to the first Conv1D layer.
        num_classes: number of units in the softmax output layer.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    model = Sequential()

    # Only the first convolution carries the input_shape argument.
    for position, n_filters in enumerate((20, 30, 30)):
        first_layer_kwargs = {'input_shape': input_shape} if position == 0 else {}
        model.add(Conv1D(filters=n_filters, kernel_size=30, activation='relu', **first_layer_kwargs))
        model.add(BatchNormalization())

    # Classification head
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()

    return model

In [88]:
from keras import backend as K
def Precision(y_true, y_pred):
    """Precision: true positives / predicted positives.

    Both counts are summed over every element of the tensors (no axis is
    given), after rounding the clipped values to 0/1.
    NOTE(review): with one-hot targets this pools all class columns together,
    which is presumably why the evaluate output in this notebook shows the
    same value for Precision, Recall and F1 -- confirm this is intended.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))    # true positives
    flagged = K.round(K.clip(y_pred, 0, 1))          # predicted positives
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())
    
def Recall(y_true, y_pred):
    """Recall: true positives / possible positives.

    Both counts are summed over every element of the tensors (no axis is
    given), after rounding the clipped values to 0/1.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))    # true positives
    actual = K.round(K.clip(y_true, 0, 1))           # possible positives
    return K.sum(hits) / (K.sum(actual) + K.epsilon())
 
def F1(y_true, y_pred):
    """F1-score: ``2 * P * R / (P + R + epsilon)`` built from Precision and Recall."""
    p = Precision(y_true, y_pred)
    r = Recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))


In [89]:
# Configure the TensorFlow/Keras session before any model is created.
sess = set_gpu_option()
# Per-sample input shape: axes 1 and 2 of the windowed training array.
input_shape = (X_train.shape[1], X_train.shape[2])
print('input_shape: ', input_shape)
# The Conv1D variant is the model trained below; the other create_* builders are alternatives.
model = create_cnn_conv1d_model(input_shape=input_shape ,num_classes=NUM_CLASSES)
adam = keras.optimizers.Adam(learning_rate=1e-3)
# Metrics are reported in this order after the loss: accuracy, Precision, Recall, F1.
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy', Precision, Recall, F1])

input_shape:  (100, 176)
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_22 (Conv1D)           (None, 71, 20)            105620    
_________________________________________________________________
batch_normalization_29 (Batc (None, 71, 20)            80        
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 42, 30)            18030     
_________________________________________________________________
batch_normalization_30 (Batc (None, 42, 30)            120       
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 13, 30)            27030     
_________________________________________________________________
batch_normalization_31 (Batc (None, 13, 30)            120       
_________________________________________________________________
flatten_8 (Flatten)          

In [90]:
# Share of each class in every split: per-class column totals of the one-hot
# labels divided by the grand total.
majority_label_train = y_cat_train.sum(axis=0) / y_cat_train.sum()
majority_label_val = y_cat_val.sum(axis=0) / y_cat_val.sum()
majority_label_test = y_cat_test.sum(axis=0) / y_cat_test.sum()

print('majority_label_train: \n', majority_label_train)
print('majority_label_val: \n', majority_label_val)
print('majority_label_test: \n', majority_label_test)


majority_label_train: 
 [0.50641656 0.49358344]
majority_label_val: 
 [0.5216285 0.4783715]
majority_label_test: 
 [0.51504767 0.4849523 ]


In [13]:
# TensorBoard logging: one timestamped directory under logs/fit/ per run.
# NOTE(review): the fit call in this notebook has its `callbacks=` argument
# commented out, so this callback is created but not used there.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, write_grads=True)

In [91]:
# Train for 10 epochs, validating on the validation split after each epoch.
# shuffle=False: the training windows are not shuffled between epochs.
# batch_size=None: no explicit batch size is set -- presumably the Keras default applies; confirm.
# The TensorBoard callback is left disabled in the trailing comment.
model.fit(x=X_train, y=y_cat_train, epochs=10, verbose=True, batch_size=None, validation_data=(X_val, y_cat_val), 
          shuffle=False)#, callbacks=[tensorboard_callback])


Train on 53845 samples, validate on 5895 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f5be87d49e8>

In [92]:
# Score the held-out test split. The returned list is the loss followed by the
# compiled metrics in order: accuracy, Precision, Recall, F1.
model.evaluate(x=X_test, y=y_cat_test)



[2.210644110478686,
 0.4905951917171478,
 0.4897398054599762,
 0.4897398054599762,
 0.4897397756576538]

In [93]:
# Softmax outputs for every test window: one row per window, one column per class.
prediction = model.predict(X_test)
prediction

array([[0.5476969 , 0.45230305],
       [0.3923277 , 0.60767233],
       [0.37865564, 0.6213443 ],
       ...,
       [0.6948963 , 0.30510372],
       [0.5194539 , 0.48054606],
       [0.59105015, 0.40894982]], dtype=float32)

In [94]:
import numpy as np
# Number of test windows whose predicted class (argmax over the class axis) is not 0.
# np.count_nonzero counts in vectorised code; the builtin sum() walked the
# boolean array one element at a time.
np.count_nonzero(np.argmax(prediction, axis=1) != 0)

10704