In [94]:
# coding=utf8

import pandas as pd
import numpy as np
from sklearn.svm import SVC
import matplotlib.pyplot as plt

In [95]:
# Load the first 500 rows of the daily-bar CSV, keeping only the OHLCV columns.
# The timestamp column holds values such as 20100104130500.0, so it is read as raw
# text and converted explicitly. read_csv's `date_parser` argument is deprecated since
# pandas 2.0; the documented replacement is to read the column as a string and call
# pd.to_datetime on it, which behaves the same on old and new pandas versions.
df = pd.read_csv('data/DQC00-1d.txt',
                 sep=',',
                 nrows=500,
                 usecols=['datetime', 'open', 'high', 'low', 'close', 'volume'],  # columns to load
                 dtype={'datetime': str},  # keep the raw timestamp text for the explicit parse below
                 encoding='utf-8',
                 float_precision='round_trip',  # selects the C parser's round-trip float converter
                 )
# Parse the timestamp text (date, time and fractional seconds) and use it as the row index.
df['datetime'] = pd.to_datetime(df['datetime'], format='%Y%m%d%H%M%S.%f')
df = df.set_index('datetime')

In [227]:
# Display configuration for pandas and NumPy output.
# The fully qualified key 'display.precision' is used on purpose: the short pattern
# 'precision' matches more than one option on recent pandas versions and then raises
# OptionError, while the full key works on every version.
pd.set_option('display.precision', 2)  # digits shown after the decimal point
pd.set_option('display.max_rows', 300)  # maximum number of rows displayed
pd.set_option('display.min_rows', 20)  # number of rows shown when the output is truncated
# Alternative number formatting (thousands separator + fixed precision):
#   pd.set_option('display.float_format', '{:,.2f}'.format)
# and pd.reset_option('display.float_format') restores the default.
# Note: DataFrame.round (e.g. df.round(0) or df.round({'open': 0})) returns a new frame;
# it has no effect unless the result is assigned or displayed, so the stored data is
# deliberately left unrounded here.
# NumPy print precision
np.set_printoptions(precision=2)

In [228]:
# Print the dtype of every column, then preview the first five rows as the cell output.
dtype_table = df.dtypes
print(dtype_table)
df.head(n=5)

open      float64
high      float64
low       float64
close     float64
volume    float64
flag        int64
dtype: object


Unnamed: 0_level_0,open,high,low,close,volume,flag
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-09-22,1143.23,1147.88,1106.05,1106.98,77908.0,0
2004-09-23,1097.68,1099.54,1089.32,1093.04,37196.0,-1
2004-09-24,1082.81,1101.4,1082.81,1095.83,29367.0,0
2004-09-27,1092.11,1100.47,1075.38,1076.31,17336.0,-1
2004-09-28,1077.24,1088.39,1077.24,1082.81,26681.0,0


In [303]:
# 2.1 Missing-value handling

# Check each column for missing values; either of the two checks below works.
#   False: the column has no missing values
#   True:  the column has at least one missing value
null_mask = df.isnull()
print(null_mask.any())
print(np.isnan(df).any())
# Drop every row that contains a missing value (df is modified in place).
df.dropna(inplace=True)
# Per-column count of the missing values that remain after the drop.
print(df.isnull().sum())

open      False
high      False
low       False
close     False
volume    False
flag      False
dtype: bool
open      False
high      False
low       False
close     False
volume    False
flag      False
dtype: bool
open      0
high      0
low       0
close     0
volume    0
flag      0
dtype: int64


In [304]:
# 2.2 Choose the features and the target
# Features: the open, high and low prices, extracted as a NumPy array.
X = df[['open', 'high', 'low']].values
# Shape, number of dimensions and the first three rows, one per line.
print(X.shape, X.ndim, X[:3,], sep='\n')

(500, 3)
2
[[1143.23 1147.88 1106.05]
 [1097.68 1099.54 1089.32]
 [1082.81 1101.4  1082.81]]


In [305]:
# Label every row: class 1 when close > open, class 0 when close <= open.
# This is vectorised with np.where instead of a row-wise DataFrame.apply: the
# condition is the same `open >= close` test, but it is evaluated on whole columns
# (no Python call per row), and the dtype is pinned to int64 so the label column
# does not depend on apply's result inference.
df['flag'] = np.where(df['open'] >= df['close'], 0, 1).astype('int64')
print(df.head())

# Target: the 0/1 flag column (the class label, not the close price itself).
y = df.loc[:, 'flag'].values
print(y.shape)
print(y.ndim)
print(y[:3])
# Independent copy of the labels; it is not used by any other cell visible here.
y1 = y.copy()


               open     high      low    close   volume  flag
datetime                                                     
2004-09-22  1143.23  1147.88  1106.05  1106.98  77908.0     0
2004-09-23  1097.68  1099.54  1089.32  1093.04  37196.0     0
2004-09-24  1082.81  1101.40  1082.81  1095.83  29367.0     1
2004-09-27  1092.11  1100.47  1075.38  1076.31  17336.0     0
2004-09-28  1077.24  1088.39  1077.24  1082.81  26681.0     1
(500,)
1
[0 0 1]


In [306]:
# Split the samples into a training set and a held-out evaluation set.
# No size is passed, so scikit-learn's default split applies (375 / 125 rows for the
# 500 samples here); random_state=1 makes the shuffle reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [307]:
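# (Disabled experiment) Inject label noise by flipping randomly chosen class labels.
# NOTE: the code below operates on y_train (the training labels), not on the test set.
# `err = y_train` is an alias rather than a copy, so any flip would modify y_train in place.
# With the factor `* 0` the loop runs zero times; use `* 0.5` to attempt flips on 50% of
# the positions (indices are drawn with replacement, so some may be flipped twice).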
# err = y_train
# for i in range(int(len(err)*0)):
#     index = int(np.random.randint(len(err)))
#     err[index] = 0 if err[index] == 1 else 1

In [308]:
# Dump the complete training and test label arrays, one after the other.
print(y_train, y_test, sep='\n')

[1 0 0 0 0 1 1 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 1 1
 0 1 1 0 1 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1
 0 0 1 0 1 0 1 1 1 1 0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1
 1 1 1 0 1 0 1 1 0 0 0 0 0 1 1 1 0 1 0 1 1 0 1 0 1 1 0 0 1 1 1 1 1 0 0 0 1
 0 1 1 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 0 1 0 0 1 1 1 0
 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 1 1 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1
 1 1 1 1 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0 0 0 1 0 1 1
 1 0 1 1 0 1 0 1 0 0 1 1 0 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 1 1 0 1 0 1 1 0 0
 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 1 0 1
 0 1 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0
 1 0 1 0 1]
[1 1 1 1 0 0 1 0 0 1 1 1 0 1 1 1 1 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0
 1 1 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0
 1 0 1 0 1 0 1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0
 0 0 0 0 0 0 

In [309]:
# Preview the first three samples of each split: features first, then labels.
print(X_train[:3], X_test[:3], y_train[:3], y_test[:3], sep='\n')

[[1209.22 1220.37 1204.57]
 [1355.14 1364.44 1348.64]
 [1376.52 1386.74 1365.37]]
[[1258.48 1265.92 1255.69]
 [1373.73 1398.83 1370.94]
 [1070.73 1077.24 1069.8 ]]
[1 0 0]
[1 1 1]


In [310]:
# 3. Feature engineering (standardisation)
from sklearn.preprocessing import StandardScaler

standardScaler = StandardScaler()

# Learn the per-feature mean and standard deviation from the TRAINING set only,
# and scale the training set in the same step.
x_train_stand = standardScaler.fit_transform(X_train)
# Scale the test set with those same training statistics; the scaler is not
# re-fitted here, so no information from the test set leaks into the scaling.
x_test_stand = standardScaler.transform(X_test)

In [311]:
# The target is already a 0/1 class label, so it is not standardised.
# Show the first ten training labels as a sanity check.
print(y_train[:10])

[1 0 0 0 0 1 1 1 0 0]


In [312]:
# Build three SVC classifiers that share C=100 and differ in their kernel settings.
# The `svr_` prefix is kept because a later cell refers to these names, even though
# the objects are classifiers (SVC).
svr_rbf = SVC(C=100, kernel='rbf', gamma=1)
svr_lin = SVC(C=100, kernel='linear', gamma='auto')
svr_poly = SVC(C=100, kernel='poly', degree=3, gamma='auto', coef0=1)

In [313]:
# Pick the linear-kernel classifier as the model under test.
svc = svr_lin

# Train on the standardised training features. The value returned by fit() is kept
# as `performance` because later cells call score()/predict() on it
# (per scikit-learn's API, fit() returns the fitted estimator).
performance = svc.fit(X=x_train_stand, y=y_train.ravel())
# Predicted class labels for the standardised test features.
y_test_pred = performance.predict(x_test_stand)

In [314]:
# Show the parameters of the selected estimator as the cell output.
svc.get_params()

{'C': 100,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [315]:
# Sanity checks, one item per line: shapes of the scaled train/test features,
# shape and dimensionality of the training labels, the first five scaled test
# rows, and the full vector of test predictions.
print(
    x_train_stand.shape,
    x_test_stand.shape,
    y_train.shape,
    y_train.ndim,
    x_test_stand[:5],
    y_test_pred[:],
    sep='\n',
)

(375, 3)
(125, 3)
(375,)
1
[[ 0.38  0.39  0.42]
 [ 1.45  1.62  1.5 ]
 [-1.36 -1.35 -1.33]
 [-1.34 -1.37 -1.33]
 [ 0.86  0.81  0.87]]
[1 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0
 1 1 1 1 0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 1 1 1 0 0 1
 1 1 1 0 1 0 1 1 0 0 1 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0
 0 1 0 0 0 0 1 1 0 0 1 1 0 0]


In [316]:
# Background link kept from the original cell:
# https://blog.csdn.net/gracejpw/article/details/101546293
# For a classifier such as SVC, score() reports the mean accuracy on the given
# samples and labels (it is not the R^2 coefficient used by regressors).
# The closer the value is to 1, the better the predictions match the labels.
train_score = performance.score(x_train_stand, y_train)
test_score = svc.score(x_test_stand, y_test)
print('训练集评分 ', train_score)
print('测试集评分 ', test_score)

训练集评分  0.888
测试集评分  0.864


In [317]:
# Labels are 0/1, so each sum is the number of class-1 samples:
# first in the true test labels, then in the predictions.
print(y_test.sum(), y_test_pred.sum(), sep='\n')

53
62
