In [None]:
%pylab inline
%matplotlib inline
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# 载入多个交易日的数据

In [None]:
# Directory holding the per-session snapshot CSV files.
file_dir = "/l1/data/FBDQA2021A_MMP_Challenge_ver0.2/data"

sym = 4
dates = list(range(12))

# Each loop index selects one half-day file: `date // 2` is the trading-day
# number and the parity of `date` picks the session (odd -> am, even -> pm).
# NOTE(review): with this mapping the pm file of a day is loaded before its am
# file; confirm whether chronological (am, then pm) order is intended, because
# the later positional train/test split depends on row order.
#
# Frames are collected in a list and concatenated once. `DataFrame.append` was
# removed in pandas 2.0 and copied the whole accumulated frame on every call.
frames = []
for date in dates:
    session = "am" if (date & 1) else "pm"
    file_name = f"snapshot_sym{sym}_date{date//2}_{session}.csv"
    frames.append(pd.read_csv(os.path.join(file_dir, file_name)))
df = pd.concat(frames)


In [None]:
# Display the combined snapshot DataFrame as this cell's output.
df

In [None]:
# Check whether the data contains any NA values; if it does they must be
# handled (open question: is filling with 0 always appropriate?).
df.isna().values.any()

In [None]:
# Feature columns: n_bid1..n_bid5 followed by n_ask1..n_ask5.
feature_col_names = [
    f"n_{side}{level}"
    for side in ("bid", "ask")
    for level in range(1, 6)
]
# Target column, kept as a one-element list so it selects a DataFrame.
label_col_name = ["label_5"]

In [None]:
# Number of leading rows used as the training set; the remaining rows are the test set.
train_sample_nums = 20_000

In [None]:
# Mind the array shapes and memory contiguity: features are forced into
# C-contiguous arrays and labels are flattened to 1-D.
feature_frame = df[feature_col_names]
label_frame = df[label_col_name]

# The first `train_sample_nums` rows (by position) are used for training ...
train_data = np.ascontiguousarray(feature_frame.iloc[:train_sample_nums].values)
train_label = label_frame.iloc[:train_sample_nums].values.reshape(-1)

# ... and every remaining row is used for testing.
test_data = np.ascontiguousarray(feature_frame.iloc[train_sample_nums:].values)
test_label = label_frame.iloc[train_sample_nums:].values.reshape(-1)

In [None]:
# Show the memory-layout flags of the training array to confirm it is C-contiguous.
train_data.flags

## 别忘了看一下标签分布：

In [None]:
# Print how many samples carry each label (0, 1, 2) in the training and test sets.
for split_title, split_labels in (("在训练集中：", train_label), ("在测试集中：", test_label)):
    print(split_title)
    for label_value in (0, 1, 2):
        print(f"标签为{label_value}的样本个数：", sum(split_labels == label_value))

## 做一下简单的训练：

In [None]:
# Train a baseline SVM classifier with default hyperparameters.
model = SVC()
model.fit(X=train_data, y=train_label)

定义一个统计结果的函数：

In [None]:
def check_metric(y, y_hat):
    """Print overall and per-label prediction statistics for labels 0, 1 and 2.

    Labels other than 1 are treated as the price-move (up/down) classes, and two
    extra ratios are reported for them:

    * recall    - correct predictions among samples whose TRUE label is not 1;
    * precision - correct predictions among samples whose PREDICTED label is not 1.

    Args:
        y: array-like of true labels.
        y_hat: array-like of predicted labels, same shape as ``y``.

    Returns:
        None. All results are written to stdout.

    Raises:
        ValueError: if ``y`` and ``y_hat`` have different shapes.
    """
    # Accept lists / Series as well as ndarrays; boolean-mask logic needs arrays.
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    if y.shape != y_hat.shape:
        raise ValueError(
            f"y and y_hat must have the same shape, got {y.shape} and {y_hat.shape}"
        )

    # Small constant that keeps the ratios defined when a denominator is zero.
    eps = 1e-6
    correct = y_hat == y
    n_samples = len(y_hat)

    # Overall picture.
    n_correct = int(np.count_nonzero(correct))
    print("预测正确的标签数：", n_correct)
    print("总体正确率：", n_correct / n_samples if n_samples else float("nan"))

    # Per true label: number of correctly predicted samples.
    for label in (0, 1, 2):
        print(f"真实标签为{label}样本的正确预测个数：",
              int(np.count_nonzero(correct & (y == label))))

    ## The up/down predictions are what matter most.
    # Recall over all samples whose true label is not 1. `eps` is added once to
    # the denominator; the previous `sum(index + 1e-6)` added it to every
    # element, inflating the denominator by n_samples * 1e-6.
    index = y != 1
    print("上涨下跌召回率：",
          int(np.count_nonzero(correct & index)) / (int(np.count_nonzero(index)) + eps))
    # Precision over all samples whose predicted label is not 1.
    index = y_hat != 1
    print("上涨下跌准确率：",
          int(np.count_nonzero(correct & index)) / (int(np.count_nonzero(index)) + eps))

In [None]:
# After training, check the metrics on the training set (how well does it fit?).
y = train_label
y_hat = model.predict(X=train_data)
check_metric(y=y, y_hat=y_hat)

In [None]:
# Then check the metrics on the test set.
y = test_label
y_hat = model.predict(X=test_data)
check_metric(y=y, y_hat=y_hat)

## 结果不太好，简单调参看一看：

问题1：类别不均匀，静止tick太多

In [None]:
## Weight the samples according to their class.
model2 = SVC(class_weight='balanced')
model2.fit(X=train_data, y=train_label)

In [None]:
## As before, look at the training set first.
y = train_label
y_hat = model2.predict(X=train_data)
check_metric(y=y, y_hat=y_hat)

In [None]:
## Then look at the test set.
y = test_label
y_hat = model2.predict(X=test_data)
check_metric(y=y, y_hat=y_hat)

按类别加权之后，上涨和下跌样本的召回率和准确率都大幅上升！

问题2： SVM参数这么多，有没有什么简单的参数调整方法？

In [None]:
# Exhaustive search over kernel and C (class weighting always 'balanced'),
# scored by accuracy with 5-fold cross-validation; the best setting is refit.
model = SVC()
grid_params = [
    {
        'kernel': ['rbf', 'linear'],
        'C': [0.5, 1, 5],
        'class_weight': ['balanced'],
    }
]
Grid = GridSearchCV(
    estimator=model,
    param_grid=grid_params,
    scoring='accuracy',
    n_jobs=6,
    refit=True,
    cv=5,
)
Grid.fit(train_data, train_label)

In [None]:
# Show the estimator that the grid search refit with its best parameter combination.
Grid.best_estimator_

In [None]:
# Metrics of the best grid-search estimator on the training set.
y = train_label
y_hat = Grid.best_estimator_.predict(X=train_data)
check_metric(y=y, y_hat=y_hat)

In [None]:
# Metrics of the best grid-search estimator on the test set.
y = test_label
y_hat = Grid.best_estimator_.predict(X=test_data)
check_metric(y=y, y_hat=y_hat)

为什么效果并没有特别显著的提升？
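- scoring 的方式？`accuracy` 在类别不均衡时主要由占多数的静止 tick（标签 1）决定，网格搜索选出的参数未必对上涨/下跌样本最优；可以尝试 `f1_macro` 或 `balanced_accuracy`。
- 参数覆盖范围太小？目前只搜索了两种 kernel 和三个 C 值，没有搜索 `gamma` 等其他参数。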

数据分析就是一步一步思考，一点一点接近“更优解”的探索过程