# 基于机器学习的pyborker量化交易策略

In [1]:
import pybroker
from pybroker.ext.data import AKShare
from pybroker import ExecContext, StrategyConfig, Strategy
from pybroker.data import DataSource
import matplotlib.pyplot as plt
from datetime import datetime
import riskfolio as rp
import akshare as ak
import pandas as pd
import numpy as np
import sqlite3
import datetime

import talib
from pybroker.vect import cross

#正常显示画图时出现的中文和负号
from pylab import mpl

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

mpl.rcParams['font.sans-serif']=['SimHei']
mpl.rcParams['axes.unicode_minus']=False

akshare = AKShare()

pybroker.enable_data_source_cache('akshare')

<diskcache.core.Cache at 0x1639a5872d0>

## 基于机器学习算法进行下个交易日的股票收益预测

在scikit-learn中，`r2_score` 是用于评估回归模型性能的一个重要指标，也被称为**决定系数（Coefficient of Determination）**。它衡量的是模型预测值对目标变量变化的解释程度，取值范围通常为 `(-∞, 1]`。

### 基本原理
- **R² = 1**：表示模型完全拟合数据，预测值与真实值完全一致。
- **R² = 0**：表示模型预测效果等同于简单平均基准线（即预测值恒为目标变量的均值）。
- **R² < 0**：表示模型性能比简单平均基准线还差，可能存在过拟合或错误的建模。


In [2]:
conn=sqlite3.connect(r'I:\量化金融\stock_2018.db')
stock_daily1=pd.read_sql("select * from stock_daily where 股票代码<'003000.SZ'",con=conn)
stock_daily1["交易日期"]=pd.to_datetime(stock_daily1["交易日期"].astype(str))

In [5]:
stock_daily1.columns

Index(['index', '交易日期', '股票代码', '股票简称', '开盘价', '最高价', '最低价', '收盘价', '成交量(手)',
       '成交额(千元)', '换手率(%)', '量比', '市盈率(静态)', '市盈率(TTM)', '市盈率(动态)', '市净率',
       '市销率', '市销率(TTM)', '股息率(%)', '股息率(TTM)(%)', '总股本(万股)', '流通股本(万股)',
       '总市值(万元)', '流通市值(万元)'],
      dtype='object')

In [3]:
stock_daily1.columns=['index', "date","symbol", '股票简称', "open","high","low","close","volume",
       '成交额(千元)', '换手率(%)', '量比', '市盈率(静态)', '市盈率(TTM)', '市盈率(动态)', '市净率',
       '市销率', '市销率(TTM)', '股息率(%)', '股息率(TTM)(%)', '总股本(万股)', '流通股本(万股)',
       '总市值(万元)', '流通市值(万元)']

In [4]:
def func0(x):
    return x.pct_change().shift(-1)

In [5]:
stock_daily1["return_s1"]=stock_daily1.groupby("symbol", group_keys=False).close.apply(func0)

In [6]:
xy=stock_daily1[['换手率(%)','市盈率(静态)','总市值(万元)',"return_s1"]].dropna()

In [7]:
xy_x=xy[['换手率(%)','市盈率(静态)','总市值(万元)']].values
xy_y=xy["return_s1"].values

In [8]:
x1,x2,y1,y2=train_test_split(xy_x,xy_y,test_size=0.7)#分割数据出训练集与测试集，0.7是两者行数的比例

In [22]:
x1

array([[-0.13972127,  0.27547332, -0.1416296 ],
       [-0.73762917, -0.38246096,  0.17937243],
       [-0.16066238, -0.27286238, -0.27403239],
       ...,
       [ 1.33949352,  0.15112145, -0.29883629],
       [-0.24321984, -0.22960699, -0.11603221],
       [ 1.68755648,  0.47981947, -0.32925919]])

In [9]:
clf = GradientBoostingRegressor()
#clf = LinearRegression()
clf = clf.fit(x1,y1)

In [24]:
clf

In [13]:
clf.predict(x2)

array([3.79243795e-04, 1.83752795e-03, 8.82648021e-05, ...,
       3.52756560e-04, 1.78506119e-04, 1.39414674e-03])

In [10]:
r2_score(y2,clf.predict(x2))

0.0020314384798882923

In [15]:
mean_absolute_error(y2,clf.predict(x2))

0.019961936150356283

## pybroker中的机器学习算法训练

在 `pybroker` 中，`bootstrap_sample_size=100` 是通过 `StrategyConfig` 类设置的参数，用于控制自举法（Bootstrapping）的样本量。自举法是一种统计重采样技术，用于估计交易策略性能指标的分布和置信区间，从而更可靠地评估策略的稳定性和风险。


 **自举法的核心思想**
传统回测通常只基于单一历史路径评估策略，但市场存在随机性。自举法通过以下步骤模拟这种随机性：
1. **有放回抽样**：从实际交易结果（如每日收益）中随机抽取样本，允许重复抽取。
2. **重建分布**：每次抽样后，计算策略的性能指标（如收益率、夏普比率等）。
3. **多次迭代**：重复抽样和计算过程，生成大量的性能指标样本，形成分布。
4. **统计推断**：基于分布计算置信区间、标准差等，评估策略的可靠性。


 **`bootstrap_sample_size=100` 的具体含义**
这意味着在评估策略时：
- **每次窗口测试**（如前向分析中的每个时间窗口），`pybroker` 会执行 **100 次自举抽样**。
- **每次抽样**会从实际交易结果中随机抽取相同数量的样本点（有放回），并计算一组新的性能指标。
- **最终结果**：基于这 100 组指标的分布，计算置信区间（如 95% 置信区间），量化策略的不确定性。

例如，若夏普比率的自举分布为 `1.2 ± 0.3`（95% 置信区间），则表明策略的真实夏普比率有 95% 的概率在 0.9 到 1.5 之间，帮助你更全面地理解策略的风险。


 **为什么需要自举法？**
1. **传统指标的局限性**：单次回测的夏普比率或收益率可能受特定历史路径的影响，无法反映策略的真实风险。
2. **捕捉随机性**：自举法通过模拟不同的市场路径，揭示策略在各种场景下的表现。
3. **统计显著性**：通过置信区间判断策略的盈利是否偶然，避免过度自信。


 **参数选择建议**
- **样本量越大**：结果越接近真实分布，但计算成本越高。
- **常见取值**：100-1000 之间。`100` 是一个平衡计算效率和准确性的常见选择。
- **实际应用**：若需要更精确的置信区间，可增加到 500 或 1000，但计算时间会相应延长。

In [11]:
def train_slr0(symbol, train_data, test_data):
    x1=train_data[['close','high','low','volume']]
    y1=train_data['return_s1']
    #model = DecisionTreeRegressor()
    model = GradientBoostingRegressor()
    #model = LinearRegression()
    model.fit(x1, y1)

    x2=test_data[['close','high','low','volume']]
    y2=test_data['return_s1']
    y_pred = model.predict(x2)
    r2 = r2_score(y2, y_pred)
    print(symbol, f'R^2={r2}')

    return model,["close","high","low","volume"]

In [12]:
model_slr = pybroker.model('slr', train_slr0)
pyb_data_pe=stock_daily1[["date","symbol","open","high","low","close","volume","return_s1"]].dropna()

In [13]:
pybroker.register_columns('return_s1')
config = StrategyConfig(bootstrap_sample_size=100)
strategy = Strategy(pyb_data_pe, '2018-03-19', '2021-02-15', config)
strategy.add_execution(None, ['000001.SZ','002594.SZ'], models=model_slr)

In [14]:
result = strategy.backtest(train_size=0.5)

Backtesting: 2018-03-19 00:00:00 to 2021-02-15 00:00:00

Train split: 2018-03-20 00:00:00 to 2019-08-28 00:00:00
000001.SZ R^2=-0.6884334867922908
002594.SZ R^2=-0.2150875731114421
Finished training models: 0:00:00 

Finished backtest: 0:00:00


## pybroker中的算法回测

In [15]:
def hold_long(ctx):
    if not ctx.long_pos():
        # Buy if the next bar is predicted to have a positive return:
        if ctx.preds('slr')[-1] > 0.:
            ctx.buy_shares = ctx.calc_target_shares(0.5)
    else:
        # Sell if the next bar is predicted to have a negative return:
        if ctx.preds('slr')[-1] < 0.:
            ctx.sell_shares = ctx.calc_target_shares(0.5)

model_slr = pybroker.model('slr', train_slr0)

pybroker.register_columns('return_s1')
config = StrategyConfig(bootstrap_sample_size=100)
strategy = Strategy(pyb_data_pe, '2018-03-19', '2022-02-15', config)
strategy.add_execution(hold_long, ['000001.SZ','002594.SZ'], models=model_slr)

result = strategy.backtest(train_size=0.5)

Backtesting: 2018-03-19 00:00:00 to 2022-02-15 00:00:00

Train split: 2018-03-19 00:00:00 to 2020-03-02 00:00:00
000001.SZ R^2=-0.26581129990175056
002594.SZ R^2=-0.9508445974775328
Finished training models: 0:00:00 

Test split: 2020-03-03 00:00:00 to 2022-02-15 00:00:00


  0% (0 of 475) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--
 10% (51 of 475) |##                     | Elapsed Time: 0:00:00 ETA:   0:00:00
 29% (141 of 475) |######                | Elapsed Time: 0:00:00 ETA:   0:00:00
 48% (231 of 475) |##########            | Elapsed Time: 0:00:00 ETA:   0:00:00
 69% (331 of 475) |###############       | Elapsed Time: 0:00:00 ETA:   0:00:00
 90% (431 of 475) |###################   | Elapsed Time: 0:00:00 ETA:   0:00:00
100% (475 of 475) |######################| Elapsed Time: 0:00:00 Time:  0:00:00



Finished backtest: 0:00:05


In [16]:
result.trades

Unnamed: 0_level_0,type,symbol,entry_date,exit_date,entry,exit,shares,pnl,return_pct,agg_pnl,bars,pnl_per_bar,stop,mae,mfe
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,long,000001.SZ,2020-03-10,2020-03-11,1595.50,1610.24,31,456.94,0.92,456.94,1,456.94,,-25.65,25.66
2,long,000001.SZ,2020-03-16,2020-03-17,1539.83,1479.24,32,-1938.88,-3.93,-1481.94,1,-1938.88,,-60.59,38.75
3,long,002594.SZ,2020-03-13,2020-03-18,55.46,54.36,864,-950.40,-1.98,-2432.34,3,-316.80,,-2.59,2.96
4,short,002594.SZ,2020-03-18,2020-03-24,54.36,49.56,36,172.80,9.69,-2259.54,4,43.20,,-1.35,5.56
5,long,000001.SZ,2020-03-18,2020-03-26,1430.11,1422.47,33,-252.12,-0.53,-2511.66,6,-42.02,,-129.91,49.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,long,000001.SZ,2022-01-17,2022-01-18,1824.33,1829.92,12,67.08,0.31,-39150.17,1,67.08,,-20.15,20.14
170,short,000001.SZ,2022-01-18,2022-01-21,1829.92,1945.76,4,-463.36,-5.95,-39613.53,3,-154.45,,-124.24,21.26
171,long,000001.SZ,2022-01-21,2022-01-24,1945.76,1922.82,11,-252.34,-1.18,-39865.87,1,-252.34,,-22.94,19.59
172,short,000001.SZ,2022-01-24,2022-02-07,1922.82,1807.54,4,461.12,6.38,-39404.75,5,92.22,,-22.38,152.21


In [17]:
result.metrics_df.head()

Unnamed: 0,name,value
0,trade_count,173.0
1,initial_market_value,100000.0
2,end_market_value,62344.77
3,total_pnl,-38457.83
4,unrealized_pnl,802.6


## pybroker中的算法前向测试

https://www.pybroker.com/zh-cn/latest/notebooks/6.%20Training%20a%20Model.html

In [18]:
strategy.clear_executions()
strategy.add_execution(hold_long, ['000001.SZ','002594.SZ'], models=model_slr)

result = strategy.walkforward(
    warmup=20,
    windows=3,
    train_size=0.5,
    lookahead=1,
    calc_bootstrap=True
)

Backtesting: 2018-03-19 00:00:00 to 2022-02-15 00:00:00

Train split: 2018-03-21 00:00:00 to 2019-03-12 00:00:00
000001.SZ R^2=-0.1357062022922011
002594.SZ R^2=-0.2507676498010858
Finished training models: 0:00:00 

Test split: 2019-03-13 00:00:00 to 2020-03-03 00:00:00


  0% (0 of 237) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--
 25% (61 of 237) |#####                  | Elapsed Time: 0:00:00 ETA:   0:00:00
 55% (131 of 237) |############          | Elapsed Time: 0:00:00 ETA:   0:00:00
 84% (201 of 237) |##################    | Elapsed Time: 0:00:00 ETA:   0:00:00
100% (237 of 237) |######################| Elapsed Time: 0:00:00 Time:  0:00:00



Train split: 2019-03-13 00:00:00 to 2020-03-03 00:00:00
000001.SZ R^2=-0.33170840978823257
002594.SZ R^2=-0.565988750552967
Finished training models: 0:00:00 

Test split: 2020-03-04 00:00:00 to 2021-02-23 00:00:00


  0% (0 of 237) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--
 21% (51 of 237) |####                   | Elapsed Time: 0:00:00 ETA:   0:00:00
 55% (131 of 237) |############          | Elapsed Time: 0:00:00 ETA:   0:00:00
 93% (221 of 237) |####################  | Elapsed Time: 0:00:00 ETA:   0:00:00
100% (237 of 237) |######################| Elapsed Time: 0:00:00 Time:  0:00:00



Train split: 2020-03-04 00:00:00 to 2021-02-23 00:00:00
000001.SZ R^2=-0.42795351524708747
002594.SZ R^2=-0.41951301434582167
Finished training models: 0:00:00 

Test split: 2021-02-24 00:00:00 to 2022-02-15 00:00:00


  0% (0 of 237) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--
 17% (41 of 237) |###                    | Elapsed Time: 0:00:00 ETA:   0:00:00
 42% (101 of 237) |#########             | Elapsed Time: 0:00:00 ETA:   0:00:00
 67% (161 of 237) |##############        | Elapsed Time: 0:00:00 ETA:   0:00:00
 89% (211 of 237) |###################   | Elapsed Time: 0:00:00 ETA:   0:00:00
100% (237 of 237) |######################| Elapsed Time: 0:00:00 Time:  0:00:00



Calculating bootstrap metrics: sample_size=100, samples=10000...
Calculated bootstrap metrics: 0:00:06 

Finished backtest: 0:00:08


In [19]:
result.orders

Unnamed: 0_level_0,type,symbol,date,shares,limit_price,fill_price,fees
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,buy,002594.SZ,2019-04-12,895,,55.08,0.0
2,buy,000001.SZ,2019-04-15,33,,1498.93,0.0
3,sell,000001.SZ,2019-04-16,33,,1519.46,0.0
4,sell,002594.SZ,2019-04-16,907,,56.06,0.0
5,buy,000001.SZ,2019-04-18,32,,1538.36,0.0
...,...,...,...,...,...,...,...
424,sell,002594.SZ,2022-01-26,210,,251.78,0.0
425,buy,002594.SZ,2022-01-27,204,,250.05,0.0
426,sell,002594.SZ,2022-02-09,212,,245.53,0.0
427,buy,002594.SZ,2022-02-10,207,,247.91,0.0


In [68]:
result.metrics_df.head()

Unnamed: 0,name,value
0,trade_count,413.0
1,initial_market_value,100000.0
2,end_market_value,130676.6
3,total_pnl,34315.21
4,unrealized_pnl,-3638.61
