In [1]:
%pylab inline
%matplotlib inline
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier,\
                              GradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

Populating the interactive namespace from numpy and matplotlib


# 同样载入多个交易日数据
但是这次我们多做一点操作：

In [2]:
file_dir = "/l1/data/FBDQA2021A_MMP_Challenge_ver0.2/data"

sym = 4
dates = list(range(24))
df = pd.DataFrame()
for date in dates:
    if (date & 1):
        file_name = f"snapshot_sym{sym}_date{date//2}_am.csv"
    else:
        file_name = f"snapshot_sym{sym}_date{date//2}_am.csv"
    new_df = pd.read_csv(os.path.join(file_dir,file_name))
    # 价格+1（从涨跌幅还原到对前收盘价的比例）
    new_df['bid1'] = new_df['n_bid1']+1
    new_df['bid2'] = new_df['n_bid2']+1
    new_df['bid3'] = new_df['n_bid3']+1
    new_df['bid4'] = new_df['n_bid4']+1
    new_df['bid5'] = new_df['n_bid5']+1
    new_df['ask1'] = new_df['n_ask1']+1
    new_df['ask2'] = new_df['n_ask2']+1
    new_df['ask3'] = new_df['n_ask3']+1
    new_df['ask4'] = new_df['n_ask4']+1
    new_df['ask5'] = new_df['n_ask5']+1
    # 均线特征
    new_df['ask1_ma5']  = new_df['ask1'].rolling(window=5,  min_periods=1).mean()
    new_df['ask1_ma10'] = new_df['ask1'].rolling(window=10, min_periods=1).mean()
    new_df['ask1_ma20'] = new_df['ask1'].rolling(window=20, min_periods=1).mean()
    new_df['ask1_ma40'] = new_df['ask1'].rolling(window=40, min_periods=1).mean()
    new_df['ask1_ma60'] = new_df['ask1'].rolling(window=60, min_periods=1).mean()
    new_df['bid1_ma5']  = new_df['bid1'].rolling(window=5,  min_periods=1).mean()
    new_df['bid1_ma10'] = new_df['bid1'].rolling(window=10, min_periods=1).mean()
    new_df['bid1_ma20'] = new_df['bid1'].rolling(window=20, min_periods=1).mean()
    new_df['bid1_ma40'] = new_df['bid1'].rolling(window=40, min_periods=1).mean()
    new_df['bid1_ma60'] = new_df['bid1'].rolling(window=60, min_periods=1).mean()
    
    # 量价组合
    new_df['spread1'] =  new_df['ask1'] - new_df['bid1']
    new_df['spread2'] =  new_df['ask2'] - new_df['bid2']
    new_df['spread3'] =  new_df['ask3'] - new_df['bid3']
    new_df['mid_price1'] =  new_df['ask1'] + new_df['bid1']
    new_df['mid_price2'] =  new_df['ask2'] + new_df['bid2']
    new_df['mid_price3'] =  new_df['ask3'] + new_df['bid3']
    new_df['weighted_ab1'] = (new_df['ask1'] * new_df['n_bsize1'] + new_df['bid1'] * new_df['n_asize1']) / (new_df['n_bsize1'] + new_df['n_asize1'])
    new_df['weighted_ab2'] = (new_df['ask2'] * new_df['n_bsize2'] + new_df['bid2'] * new_df['n_asize2']) / (new_df['n_bsize2'] + new_df['n_asize2'])
    new_df['weighted_ab3'] = (new_df['ask3'] * new_df['n_bsize3'] + new_df['bid3'] * new_df['n_asize3']) / (new_df['n_bsize3'] + new_df['n_asize3'])

    new_df['relative_spread1'] = new_df['spread1'] / new_df['mid_price1']
    new_df['relative_spread2'] = new_df['spread2'] / new_df['mid_price2']
    new_df['relative_spread3'] = new_df['spread3'] / new_df['mid_price3']
    
    # 对量取对数
    new_df['bsize1'] = new_df['n_bsize1'].map(np.log)
    new_df['bsize2'] = new_df['n_bsize2'].map(np.log)
    new_df['bsize3'] = new_df['n_bsize3'].map(np.log)
    new_df['bsize4'] = new_df['n_bsize4'].map(np.log)
    new_df['bsize5'] = new_df['n_bsize5'].map(np.log)
    new_df['asize1'] = new_df['n_asize1'].map(np.log)
    new_df['asize2'] = new_df['n_asize2'].map(np.log)
    new_df['asize3'] = new_df['n_asize3'].map(np.log)
    new_df['asize4'] = new_df['n_asize4'].map(np.log)
    new_df['asize5'] = new_df['n_asize5'].map(np.log)
    new_df['amount'] = new_df['amount_delta'].map(np.log1p)

    df = df.append(new_df)

In [3]:
df[['relative_spread1','relative_spread2','relative_spread3',
                     'weighted_ab1','weighted_ab2','weighted_ab3',
                     'spread1','spread2','spread3',]].describe()

Unnamed: 0,relative_spread1,relative_spread2,relative_spread3,weighted_ab1,weighted_ab2,weighted_ab3,spread1,spread2,spread3
count,47976.0,47976.0,47976.0,47976.0,47976.0,47976.0,47976.0,47976.0,47976.0
mean,0.000414,0.001214,0.002011,1.002546,1.002676,1.002854,0.000831,0.002434,0.004033
std,8.7e-05,0.000102,0.000114,0.012736,0.012572,0.012379,0.000176,0.000208,0.000235
min,0.000381,0.001143,0.001905,0.980991,0.980183,0.981029,0.000769,0.002308,0.003846
25%,0.000391,0.001173,0.001955,0.992223,0.992568,0.993116,0.000782,0.002346,0.003909
50%,0.000395,0.001185,0.001976,0.999631,0.999526,0.999704,0.00079,0.00237,0.003949
75%,0.000408,0.001225,0.002047,1.012603,1.012702,1.012337,0.000826,0.002479,0.004132
max,0.002368,0.003157,0.003946,1.057024,1.057434,1.056919,0.004796,0.006395,0.007994


In [4]:
feature_col_names = ['bid1','bid2','bid3','bid4','bid5',
                     'ask1','ask2','ask3','ask4','ask5',
                     'bsize1','bsize2','bsize3','bsize4','bsize5',
                     'asize1','asize2','asize3','asize4','asize5',
                     'relative_spread1','relative_spread2','relative_spread3',
                     'weighted_ab1','weighted_ab2','weighted_ab3',
                     'spread1','spread2','spread3','amount',
                     'ask1_ma5','ask1_ma10','ask1_ma20','ask1_ma40','ask1_ma60',
                     'bid1_ma5','bid1_ma10','bid1_ma20','bid1_ma40','bid1_ma60'
                    ]
label_col_name = ['label_5']

In [5]:
train_sample_nums = 40000

train_data = np.ascontiguousarray(df[feature_col_names][:train_sample_nums].values)
train_label = df[label_col_name][:train_sample_nums].values.reshape(-1)

test_data = np.ascontiguousarray(df[feature_col_names][train_sample_nums:].values)
test_label = df[label_col_name][train_sample_nums:].values.reshape(-1)

In [6]:
train_data.shape

(40000, 40)

In [7]:
# 确定有无na值
df.isnull().any().any()

False

In [8]:
# 确定有无inf值
np.all(np.isfinite(train_data))

True

In [9]:
print("在训练集中：")
print("标签为0的样本个数：", sum(train_label == 0))
print("标签为1的样本个数：", sum(train_label == 1))
print("标签为2的样本个数：", sum(train_label == 2))

print("在测试集中：")
print("标签为0的样本个数：", sum(test_label == 0))
print("标签为1的样本个数：", sum(test_label == 1))
print("标签为2的样本个数：", sum(test_label == 2))

在训练集中：
标签为0的样本个数： 8636
标签为1的样本个数： 22758
标签为2的样本个数： 8606
在测试集中：
标签为0的样本个数： 1570
标签为1的样本个数： 5070
标签为2的样本个数： 1336


In [10]:
%%time
## 对样本依据类别加权：
model = SVC(class_weight='balanced')
model.fit(train_data,train_label)

CPU times: user 2min 22s, sys: 820 ms, total: 2min 23s
Wall time: 2min 23s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
# 训练集
y_hat = model.predict(train_data)
y = train_label
# 总体准确率：
print("总体准确率：", sum(y_hat == y)/len(y_hat))
# 所有不为1的标签的召回率（即仅考虑真实标签为上涨或下跌样本是否被正确分类）
index = y != 1
print("训练集上涨下跌召回率：", sum(y_hat[index]==y[index])/sum(index))
# 所有不为1的标签的准确率（即仅考虑预测为上涨或下跌样本是否是正确）
index = y_hat != 1
print("训练集上涨下跌准确率：", sum(y_hat[index]==y[index])/sum(index))

总体准确率： 0.5074
训练集上涨下跌召回率： 0.568785523721146
训练集上涨下跌准确率： 0.3862544308782985


In [12]:
## 测试集
y_hat = model.predict(test_data)
y = test_label
# 总体准确率：
print("总体准确率：", sum(y_hat == y)/len(y_hat))
# 所有不为1的标签的召回率（即仅考虑真实标签为上涨或下跌样本是否被正确分类）
index = y != 1
print("测试集上涨下跌召回率：", sum(y_hat[index]==y[index])/sum(index))
# 所有不为1的标签的准确率（即仅考虑预测为上涨或下跌样本是否是正确）
index = y_hat != 1
print("测试集上涨下跌准确率：", sum(y_hat[index]==y[index])/sum(index))

总体准确率： 0.5466399197592778
测试集上涨下跌召回率： 0.5350997935306263
测试集上涨下跌准确率： 0.37218764959310674


In [13]:
%%time
model = SVC()
grid_params = [{'kernel':['rbf','linear'],'C':[0.5,1,5], 'class_weight':['balanced']}]
Grid = GridSearchCV(model, grid_params, cv = 5, scoring = 'balanced_accuracy',refit=True, n_jobs = 6)
Grid.fit(train_data,train_label)

CPU times: user 2min 25s, sys: 741 ms, total: 2min 26s
Wall time: 13min 59s


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=6,
             param_grid=[{'C': [0.5, 1, 5], 'class_weight': ['balanced'],
                          'kernel': ['rbf', 'linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='balanced_accuracy', verbose=0)

In [14]:
# 训练集
y_hat = Grid.best_estimator_.predict(train_data)
y = train_label
# 总体准确率：
print("总体准确率：", sum(y_hat == y)/len(y_hat))
# 所有不为1的标签的召回率（即仅考虑真实标签为上涨或下跌样本是否被正确分类）
index = y != 1
print("训练集上涨下跌召回率：", sum(y_hat[index]==y[index])/sum(index))
# 所有不为1的标签的准确率（即仅考虑预测为上涨或下跌样本是否是正确）
index = y_hat != 1
print("训练集上涨下跌准确率：", sum(y_hat[index]==y[index])/sum(index))

总体准确率： 0.5074
训练集上涨下跌召回率： 0.568785523721146
训练集上涨下跌准确率： 0.3862544308782985


In [15]:
## 测试集
y_hat = Grid.best_estimator_.predict(test_data)
y = test_label
# 总体准确率：
print("总体准确率：", sum(y_hat == y)/len(y_hat))
# 所有不为1的标签的召回率（即仅考虑真实标签为上涨或下跌样本是否被正确分类）
index = y != 1
print("测试集上涨下跌召回率：", sum(y_hat[index]==y[index])/sum(index))
# 所有不为1的标签的准确率（即仅考虑预测为上涨或下跌样本是否是正确）
index = y_hat != 1
print("测试集上涨下跌准确率：", sum(y_hat[index]==y[index])/sum(index))

总体准确率： 0.5466399197592778
测试集上涨下跌召回率： 0.5350997935306263
测试集上涨下跌准确率： 0.37218764959310674
