In [50]:
# coding=utf8

import pandas as pd
import numpy as np
from sklearn.svm import SVC
import matplotlib.pyplot as plt

In [51]:
# 加载数据
df = pd.read_csv('data/DQC00-1d.txt',
                 sep=',',
                 nrows=3000,
                 index_col=['datetime'],  # 设置行索引
                 parse_dates=['datetime'],  # 解析时间 20100104130500.0
                 date_parser=lambda x: pd.to_datetime(x, format='%Y%m%d%H%M%S.%f'),  # 时间解析的格式，进行毫秒级数据解析
                 usecols=['datetime', 'open', 'high', 'low', 'close', 'volume'],  # 设置需要用到的列
                 encoding='utf-8',
                 float_precision='round_trip',  # 所有数据会当做string读取, 使用时再进行相应的转换为float
                 )

In [52]:
# pd.reset_option('display.float_format')  # 重置
pd.set_option('precision', 2)  # 显示小数点后的位数
pd.set_option('display.max_rows', 300)  # 控制显示的最大行数
pd.set_option('display.min_rows', 20)  # 确定显示的部分有多少行
# pd.set_option('display.float_format',  '{:,.2f}'.format) # 数字格式化显示 用逗号格式化大值数字 设置数字精度
# 指定列名设置计算精确度，未指定的保持原样
df.round({'open': 0, 'high': 0, 'low': 0, 'close': 0})
# 统一保持1位小数
df.round(0)
# 设置np输出精度
np.set_printoptions(precision=2)

In [53]:
print(df.dtypes)
df.head()

open      float64
high      float64
low       float64
close     float64
volume    float64
dtype: object


Unnamed: 0_level_0,open,high,low,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-09-22,1143.23,1147.88,1106.05,1106.98,77908.0
2004-09-23,1097.68,1099.54,1089.32,1093.04,37196.0
2004-09-24,1082.81,1101.4,1082.81,1095.83,29367.0
2004-09-27,1092.11,1100.47,1075.38,1076.31,17336.0
2004-09-28,1077.24,1088.39,1077.24,1082.81,26681.0


In [54]:
# 2.1 缺失值处理

# 检查数据中是否有缺失值，以下两种方式均可
#Flase:对应特征的特征值中无缺失值
#True：有缺失值
print(df.isnull().any())
print(np.isnan(df).any())
#删除包含缺失值的行
df.dropna(inplace=True)
#返回每一列缺失值统计个数
print(df.isnull().sum())

open      False
high      False
low       False
close     False
volume    False
dtype: bool
open      False
high      False
low       False
close     False
volume    False
dtype: bool
open      0
high      0
low       0
close     0
volume    0
dtype: int64


In [55]:
# 2.2 确定特征值 目标值
# df.reset_index(level=0, inplace=True)   #时间索引成为列

# 特征值取 开 高 低 收 价
X = df.loc[:, ['open', 'high', 'low', 'close']].values
print(X.dtype)
print(X.ndim)
print(X.shape)
print(X[:3, ])

float64
2
(3000, 4)
[[1143.23 1147.88 1106.05 1106.98]
 [1097.68 1099.54 1089.32 1093.04]
 [1082.81 1101.4  1082.81 1095.83]]


In [56]:
# 收盘价>开盘价 标记为类别1 收盘价<=开盘价 标记为类别0
df['flag'] = df.apply(lambda x: 0 if (x['open'] > x['close']) else 1, axis=1)

# 收盘价>开盘价 标记类别为1 收盘价<开盘价 标记类别为-1 收盘价=开盘价 标记类别为0
# epsilon = 0.1  #误差
# df['flag'] = df.apply(
#     lambda x: 0 if (abs(x['close'] - x['open']) <= epsilon)
#     else ( 1 if (x['close'] - x['open'] > epsilon) else -1), axis=1)

print(df.head())

# 目标值取收盘
Y = df.loc[:, 'flag'].values
print(Y.shape)
print(Y.ndim)
print(Y[:3])
y1 = Y.copy()

               open     high      low    close   volume  flag
datetime                                                     
2004-09-22  1143.23  1147.88  1106.05  1106.98  77908.0     0
2004-09-23  1097.68  1099.54  1089.32  1093.04  37196.0     0
2004-09-24  1082.81  1101.40  1082.81  1095.83  29367.0     1
2004-09-27  1092.11  1100.47  1075.38  1076.31  17336.0     0
2004-09-28  1077.24  1088.39  1077.24  1082.81  26681.0     1
(3000,)
1
[0 0 1]


In [57]:
print(X.shape,X.ndim)
print(Y.shape,Y.ndim)

(3000, 4) 2
(3000,) 1


In [58]:
#将数据划分为训练集和验证集
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=1)

In [59]:
## 混淆测试集目标类别的 指定概率50%的错误
# err = y_train
# for i in range(int(len(err) * 0.5)):
#     index = int(np.random.randint(len(err)))
#     err[index] = 0 if err[index] == 1 else 1

In [60]:
## 查看测试集是否与未混淆后不同
# print(np.sum(y_train>0)+np.sum(y_test>0), np.sum(Y>0))


In [61]:
# print(y_train[:100])
# print(y_test[:100])
# print(x_train[:3])
# print(x_test[:3])
# print(y_train[:3])
# print(y_test[:3])


In [62]:
# 3. 特征工程（标准化）
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()

# 训练集 计算平均值和标准偏差
x_train_stand = standardScaler.fit_transform(x_train)
# 以训练集的平均值和标准偏差来缩放测试集
x_test_stand = standardScaler.transform(x_test)

In [63]:
# 目标类别为0，1 不用标准化
print(y_train[:10])

[1 1 1 1 1 1 0 0 1 0]


In [64]:
# 拟合分类模型
rbf = SVC(kernel='rbf', C=10, gamma=1, probability=True)
linear = SVC(kernel='linear', C=10, gamma='auto', probability=True)
poly = SVC(kernel='poly', C=10, gamma='auto', degree=3, coef0=1, probability=True)

In [65]:
svcs = [rbf, linear, poly]
kernel_label = ["rbf", "linear", "poly"]
for ix, svc in enumerate(svcs):
    performance = svc.fit(x_train_stand, y_train.ravel())
    # 获取预测值
    y_test_pred = performance.predict(x_test_stand)
    # 显示估计器
    print(f'ix=%s, svcs=%s' %(ix, svcs[ix]))
    # 获取这个估计器的参数
    print(f'估计器的参数: %s' %(svc.get_params()))
    # https://blog.csdn.net/gracejpw/article/details/101546293
    # 返回预测的决定系数R^2
    # R^2越接近于1，模型的拟合优度越高。
    print(f'训练集R2评分: %s ' %(performance.score(x_train_stand, y_train)))
    print(f'测试集R2评分: %s ' %(svc.score(x_test_stand, y_test)))

ix=0, svcs=SVC(C=10, gamma=1, probability=True)
估计器的参数: {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'rbf', 'max_iter': -1, 'probability': True, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
训练集R2评分: 0.9373333333333334 
测试集R2评分: 0.9253333333333333 
ix=1, svcs=SVC(C=10, gamma='auto', kernel='linear', probability=True)
估计器的参数: {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': -1, 'probability': True, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
训练集R2评分: 0.972 
测试集R2评分: 0.9626666666666667 
ix=2, svcs=SVC(C=10, coef0=1, gamma='auto', kernel='poly', probability=True)
估计器的参数: {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 1, 'decision_function_shape': 'ovr', 'degree'

In [66]:
# 将标准化后数据，还原成样本数据
x_train_inverse = standardScaler.inverse_transform(x_train_stand)
x_test_inverse = standardScaler.inverse_transform(x_test_stand)

In [67]:
clf = linear
print(clf.kernel)
print(x_test_inverse[:10])
print(y_test_pred[:10])
print(y_test[:10])


linear
[[2029.02 2038.41 2027.32 2034.15]
 [1996.33 2006.41 1995.49 2006.41]
 [1773.85 1777.66 1764.33 1764.33]
 [2043.78 2050.77 2037.66 2040.28]
 [1574.49 1581.   1573.56 1575.42]
 [1383.96 1394.18 1376.52 1381.17]
 [1064.22 1066.08 1060.51 1060.51]
 [1075.38 1077.24 1074.45 1074.45]
 [1328.19 1339.34 1323.54 1339.34]
 [2086.23 2090.43 2083.71 2084.55]]
[1 1 0 0 1 0 0 1 1 0]
[1 1 0 0 1 0 0 0 1 0]


In [68]:
print(y_train[:10])
print(y_test[:10])
print(x_train_stand.shape)
print(x_test_stand.shape)
print(y_train.shape)
print(y_train.ndim)
print(x_test_stand[:5])
print(y_test_pred[:50])

[1 1 1 1 1 1 0 0 1 0]
[1 1 0 0 1 0 0 0 1 0]
(2250, 4)
(750, 4)
(2250,)
1
[[ 0.89  0.89  0.91  0.91]
 [ 0.78  0.79  0.81  0.82]
 [ 0.07  0.06  0.07  0.04]
 [ 0.94  0.93  0.94  0.93]
 [-0.57 -0.57 -0.55 -0.57]]
[1 1 0 0 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 1 0 1 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0
 0 1 1 0 0 0 1 0 1 1 1 0 1]


In [69]:
print(y_test.shape)
print(y_test.sum())
print(y_test_pred.sum())

(750,)
378
397
