In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import data_handle
import tensorflow as tf

# 载入并处理数据
1.读取数据

In [2]:
# Load the training CSV. read_data returns four objects, unpacked here as
# X, dummies, weight and label (their exact contents are defined in data_handle).
X, dummies, weight, label = data_handle.read_data('data/stock_train_data_20170910.csv')

2.将数据标准化，并去除极端值

In [3]:
# Scale the features; quantile_percent=0.995 is the cut-off passed for extreme values.
# scaled_features is read later by train(), indexed as scaled_features[column][0] and [1].
# NOTE(review): X is overwritten here, so re-running this cell on its own may
# scale already-scaled data — confirm against data_handle.scale_feature.
X, scaled_features = data_handle.scale_feature(X,dummies,quantile_percent=0.995)

3.将数据进行随机分组，分成测试集与训练集

In [4]:
# Hold out 10% of the rows (test_size=0.1) and report the shape of each piece.
X_train, Y_train, X_test, Y_test = data_handle.data_split(X, label, test_size=0.1)
shape_report = (
    ('X_train shape:', X_train),
    ('Y_train shape:', Y_train),
    ('X_test shape:', X_test),
    ('Y_test shape:', Y_test),
)
print(' \n '.join('{} {}'.format(title, arr.shape) for title, arr in shape_report))

X_train shape: (289506, 116) 
 Y_train shape: (289506,) 
 X_test shape: (32168, 116) 
 Y_test shape: (32168,)


4.定义分批获取数据函数

In [5]:
def get_batches(X, Y, batch_size):
    '''
    Yield consecutive mini-batches covering every row of X and Y exactly once.

    Args:
        X: 2-D array-like supporting slicing, ``.shape`` and ``.reshape``
           (rows are samples, columns are features).
        Y: 1-D array-like of the same length as X supporting slicing and ``.reshape``.
        batch_size: positive number of rows per batch; the final batch may be smaller.

    Yields:
        (x, y) where x has shape (rows, X.shape[1]) and y has shape (rows, 1).

    Raises:
        ValueError: if batch_size is not positive.
    '''
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')
    data_len = len(X)
    for start in range(0, data_len, batch_size):
        # Clamp to the data length. Using -1 as the end index here would drop
        # the last sample (and produce an empty batch when only one row is left).
        end = min(start + batch_size, data_len)
        x = X[start:end].reshape(-1, X.shape[1])
        y = Y[start:end].reshape(-1, 1)
        yield x, y

# 模型构建

In [6]:
def build_inputs(num_features):
    '''
    Create the graph's feed placeholders.

    Args:
        num_features: width of the feature placeholder's second dimension.

    Returns:
        Tuple of three float32 placeholders:
        'inputs' with shape [None, num_features],
        'targets' with shape [None, 1],
        and 'keep_prob' with no fixed shape.
    '''
    feature_ph = tf.placeholder(tf.float32, shape=[None, num_features], name='inputs')
    label_ph = tf.placeholder(tf.float32, shape=[None, 1], name='targets')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob')
    return feature_ph, label_ph, keep_prob_ph

In [7]:
def fc_model(inputs,keep_prob):
    '''
    Fully connected network producing one un-activated logit per row.

    Three ReLU dense layers of 58, 29 and 14 units are stacked, each followed
    by dropout driven by keep_prob, and a final 1-unit dense layer named
    'logits' with no activation. All kernels use
    tf.truncated_normal_initializer() with its default arguments.

    Args:
        inputs: tensor fed to the first dense layer.
        keep_prob: keep probability passed to every tf.nn.dropout call.

    Returns:
        The output tensor of the 'logits' layer.
    '''
    hidden = inputs
    for units in (58, 29, 14):
        hidden = tf.layers.dense(
            hidden,
            units,
            activation=tf.nn.relu,
            kernel_initializer=tf.truncated_normal_initializer(),
        )
        hidden = tf.nn.dropout(hidden, keep_prob)
    return tf.layers.dense(
        hidden,
        1,
        activation=None,
        kernel_initializer=tf.truncated_normal_initializer(),
        name='logits',
    )

# 训练模型

In [22]:
def train(X_train,Y_train,X_test,Y_test,keep_prob,epoch_count, batch_size, learning_rate=0.001, num_features=116):
    '''
    Build the network, train it with Adam, then score the submission CSV.

    The graph is created in the current default graph. After training, the
    file 'data/stock_test_data_20170910.csv' is read, preprocessed, scored with
    the trained weights, and the result is written to 'proba.csv' with the
    columns 'id' and 'proba'.

    Reads the module-level ``scaled_features`` mapping: for every feature
    column, element [0] is subtracted and element [1] is the divisor.

    Args:
        X_train, Y_train: training features and labels, fed in mini-batches.
        X_test, Y_test: held-out features and labels, evaluated every 1000 steps.
        keep_prob: dropout keep probability used for training steps
            (evaluation and prediction always use 1.0).
        epoch_count: number of passes over the training data.
        batch_size: rows per training mini-batch.
        learning_rate: Adam learning rate.
        num_features: number of columns of the model input.
    '''
    inputs, targets, k_p = build_inputs(num_features)
    logits = fc_model(inputs,k_p)
    out = tf.sigmoid(logits)
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,labels=targets))
    train_opt = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
    correct_pred = tf.equal(tf.cast(tf.round(out), tf.int32), tf.cast(targets, tf.int32))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # The held-out feed never changes, so build it once instead of per evaluation.
    eval_feed = {inputs: X_test.reshape(-1, num_features),
                 targets: Y_test.reshape(-1, 1),
                 k_p: 1.0}

    steps = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch_i in range(epoch_count):
            for x,y in get_batches(X_train,Y_train,batch_size):
                steps += 1
                _, train_loss, train_accuracy = sess.run([train_opt, loss, accuracy], feed_dict={inputs:x, targets:y, k_p:keep_prob})

                if steps % 1000 == 0:
                    test_loss, test_accuracy = sess.run([loss, accuracy], feed_dict=eval_feed)
                    print("Epoch {}/{}.".format(epoch_i+1, epoch_count),
                          "train_loss: {:.4f}..".format(train_loss),
                          "train_acc: {:.4f}..".format(train_accuracy),
                          "test_loss:{:.4f}..".format(test_loss),
                          "test_acc:{:.4f}..".format(test_accuracy))

        data = pd.read_csv('data/stock_test_data_20170910.csv')
        dummies = pd.get_dummies(data['group'], prefix='group', drop_first=False)
        X = data.drop(['group','id'],axis=1)
        for each in X.columns:
            X[each] = (X[each] - scaled_features[each][0])/scaled_features[each][1]
            upper = X[each].quantile(0.995)
            # Cap only this column. Omitting the column selector would overwrite
            # every feature of the matching rows with this column's quantile.
            X.loc[X[each] > upper, each] = upper

        X = pd.concat([X, dummies], axis=1).values
        output = sess.run(out, feed_dict={inputs:X.reshape(-1,num_features),k_p:1.0})
        print(len(output))
        print(len(data))
        # `out` has shape (rows, 1); flatten it so it is stored as a plain column.
        data['proba'] = output.reshape(-1)
        data[['id','proba']].to_csv('proba.csv',index=False)

In [28]:
# Hyperparameters for this run.
epochs = 800
batch_size = 1000
learning_rate = 0.0003
keep_prob = 0.80

# train() creates its ops inside a fresh graph rather than the global default graph.
with tf.Graph().as_default():
    train(X_train, Y_train, X_test, Y_test,
          keep_prob=keep_prob,
          epoch_count=epochs,
          batch_size=batch_size,
          learning_rate=learning_rate)

Epoch 4/800. train_loss: 13.4332.. train_acc: 0.5030.. test_loss:4.5527.. test_acc:0.5145..
Epoch 7/800. train_loss: 2.0233.. train_acc: 0.4960.. test_loss:0.7463.. test_acc:0.4780..
Epoch 11/800. train_loss: 0.8968.. train_acc: 0.4750.. test_loss:0.6932.. test_acc:0.5130..
Epoch 14/800. train_loss: 1.0306.. train_acc: 0.5430.. test_loss:0.6916.. test_acc:0.5300..
Epoch 18/800. train_loss: 0.7090.. train_acc: 0.5380.. test_loss:0.6911.. test_acc:0.5300..
Epoch 21/800. train_loss: 0.7137.. train_acc: 0.5460.. test_loss:0.6913.. test_acc:0.5300..
Epoch 25/800. train_loss: 0.7151.. train_acc: 0.5450.. test_loss:0.6910.. test_acc:0.5300..
Epoch 28/800. train_loss: 0.6932.. train_acc: 0.5230.. test_loss:0.6912.. test_acc:0.5300..
Epoch 32/800. train_loss: 0.7041.. train_acc: 0.5290.. test_loss:0.6910.. test_acc:0.5300..
Epoch 35/800. train_loss: 0.7426.. train_acc: 0.5020.. test_loss:0.6910.. test_acc:0.5300..
Epoch 38/800. train_loss: 0.6954.. train_acc: 0.4940.. test_loss:0.6910.. test_ac

Epoch 311/800. train_loss: 0.6528.. train_acc: 0.5970.. test_loss:0.6412.. test_acc:0.6216..
Epoch 314/800. train_loss: 0.6540.. train_acc: 0.6040.. test_loss:0.6418.. test_acc:0.6220..
Epoch 318/800. train_loss: 0.6373.. train_acc: 0.6410.. test_loss:0.6419.. test_acc:0.6217..
Epoch 321/800. train_loss: 0.6560.. train_acc: 0.6000.. test_loss:0.6408.. test_acc:0.6239..
Epoch 325/800. train_loss: 0.6387.. train_acc: 0.6060.. test_loss:0.6403.. test_acc:0.6234..
Epoch 328/800. train_loss: 0.6451.. train_acc: 0.6100.. test_loss:0.6407.. test_acc:0.6239..
Epoch 332/800. train_loss: 0.6662.. train_acc: 0.6070.. test_loss:0.6396.. test_acc:0.6244..
Epoch 335/800. train_loss: 0.6518.. train_acc: 0.6080.. test_loss:0.6399.. test_acc:0.6242..
Epoch 338/800. train_loss: 0.6415.. train_acc: 0.6250.. test_loss:0.6386.. test_acc:0.6248..
Epoch 342/800. train_loss: 0.6454.. train_acc: 0.6170.. test_loss:0.6399.. test_acc:0.6243..
Epoch 345/800. train_loss: 0.6550.. train_acc: 0.6060.. test_loss:0.63

Epoch 618/800. train_loss: 0.6112.. train_acc: 0.6680.. test_loss:0.6240.. test_acc:0.6398..
Epoch 621/800. train_loss: 0.6332.. train_acc: 0.6420.. test_loss:0.6239.. test_acc:0.6405..
Epoch 625/800. train_loss: 0.6296.. train_acc: 0.6370.. test_loss:0.6248.. test_acc:0.6403..
Epoch 628/800. train_loss: 0.6242.. train_acc: 0.6230.. test_loss:0.6240.. test_acc:0.6408..
Epoch 632/800. train_loss: 0.6430.. train_acc: 0.6040.. test_loss:0.6241.. test_acc:0.6416..
Epoch 635/800. train_loss: 0.6462.. train_acc: 0.6340.. test_loss:0.6233.. test_acc:0.6409..
Epoch 638/800. train_loss: 0.6301.. train_acc: 0.6160.. test_loss:0.6224.. test_acc:0.6416..
Epoch 642/800. train_loss: 0.6295.. train_acc: 0.6340.. test_loss:0.6234.. test_acc:0.6417..
Epoch 645/800. train_loss: 0.6294.. train_acc: 0.6320.. test_loss:0.6235.. test_acc:0.6404..
Epoch 649/800. train_loss: 0.6077.. train_acc: 0.6490.. test_loss:0.6222.. test_acc:0.6403..
Epoch 652/800. train_loss: 0.6330.. train_acc: 0.6240.. test_loss:0.62