# 基于机器学习的pyborker量化交易策略

In [1]:
import pybroker
from pybroker.ext.data import AKShare
from pybroker import ExecContext, StrategyConfig, Strategy
from pybroker.data import DataSource
import matplotlib.pyplot as plt
from datetime import datetime
import riskfolio as rp
import akshare as ak
import pandas as pd
import numpy as np
import sqlite3
import datetime
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
import talib
from pybroker.vect import cross

#正常显示画图时出现的中文和负号
from pylab import mpl

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb

mpl.rcParams['font.sans-serif']=['SimHei']
mpl.rcParams['axes.unicode_minus']=False

akshare = AKShare()

pybroker.enable_data_source_cache('akshare')

<diskcache.core.Cache at 0x27a18d81190>

## 股票预测的因子构建

pip install lightgbm -i https://mirrors.aliyun.com/pypi/simple

In [2]:
conn=sqlite3.connect(r'I:\量化金融\stock_2018.db')
stock_daily1=pd.read_sql("select * from stock_daily where 股票代码<'003000.SZ'",con=conn)
stock_daily1["交易日期"]=pd.to_datetime(stock_daily1["交易日期"].astype(str))

In [3]:
stock_daily1.columns

Index(['index', '交易日期', '股票代码', '股票简称', '开盘价', '最高价', '最低价', '收盘价', '成交量(手)',
       '成交额(千元)', '换手率(%)', '量比', '市盈率(静态)', '市盈率(TTM)', '市盈率(动态)', '市净率',
       '市销率', '市销率(TTM)', '股息率(%)', '股息率(TTM)(%)', '总股本(万股)', '流通股本(万股)',
       '总市值(万元)', '流通市值(万元)'],
      dtype='object')

In [3]:
stock_daily1.columns=['index', "date","symbol", '股票简称', "open","high","low","close","volume",
       '成交额(千元)', '换手率(%)', '量比', '市盈率(静态)', '市盈率(TTM)', '市盈率(动态)', '市净率',
       '市销率', '市销率(TTM)', '股息率(%)', '股息率(TTM)(%)', '总股本(万股)', '流通股本(万股)',
       '总市值(万元)', '流通市值(万元)']

In [4]:
stock_daily1_d=stock_daily1.drop(["index","股票简称"],axis=1)

In [6]:
#stock_daily1_d

In [5]:
def func0(x):
    return x.pct_change().shift(-1)

stock_daily1_d["return_s1"]=stock_daily1_d.groupby("symbol", group_keys=False).close.apply(func0)

In [6]:
def compute_bb(stock_data):

    SMA20_close=talib.SMA(stock_data, timeperiod=20)

    high, mid, low = talib.BBANDS(stock_data, timeperiod=20)

    x,y,z=talib.MACD(stock_data,fastperiod=12, slowperiod=26, signalperiod=9)

    dongliang=talib.MOM(stock_data, timeperiod=10)

    rsi_6=talib.RSI(stock_data, timeperiod=6)
    rsi_24=talib.RSI(stock_data, timeperiod=24)

    up = talib.MAX(stock_data, 20)
    down = talib.MIN(stock_data, 20)

    return_30=stock_data.pct_change(30)
    return_5=stock_data.pct_change(5)
    return_10=stock_data.pct_change(10)


    df=pd.concat([SMA20_close,high,low,x,y,z,dongliang,rsi_6,rsi_24,up,down,return_30,return_5,return_10],axis=1)
    df.columns=["SMA20_close","b_high","b_low","MACD_x","MACD_y","MACD_z","dong10","rsi_6","rsi_24","up","down","return_30","return_5","return_10"]

    return df

In [7]:
z=stock_daily1_d.groupby("symbol", group_keys=False).close.apply(compute_bb)
stock_daily1_d=stock_daily1_d.join(z)
stock_daily1_d["close-o"]=stock_daily1_d["close"]-stock_daily1_d["open"]
stock_daily1_d["high-l"]=stock_daily1_d["high"]-stock_daily1_d["low"]

In [11]:
#stock_daily1_d

## 基于lightgbm的股票预测

* 格点调参

In [17]:
stock_daily1_d.date.max()

Timestamp('2023-02-17 00:00:00')

In [18]:
stock_daily1_d.date.min()

Timestamp('2018-01-02 00:00:00')

In [12]:
xy=stock_daily1_d[stock_daily1_d.date<datetime.datetime(2021,1,1)].iloc[:,2:].dropna()

In [13]:
xy_x=xy.drop("return_s1",axis=1)
xy_y=xy["return_s1"]
x1,x2,y1,y2=train_test_split(xy_x,xy_y,test_size=0.7)#分割数据出训练集与测试集，0.7是两者行数的比例

In [14]:
clf = LinearRegression()
#clf = GradientBoostingRegressor()
clf = clf.fit(x1,y1)
print(r2_score(y2,clf.predict(x2)))

0.0016471867933682827


In [15]:
# 创建一个LGBMRegressor模型，目标函数为回归，叶子节点数为31，学习率为0.1，估计器个数为30
gbm = lgb.LGBMRegressor(objective='regression',num_leaves=31,learning_rate=0.1,n_estimators=30)
gbm.fit(x1,y1)
y_pred = gbm.predict(x2)
print(r2_score(y2,y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 171108, number of used features: 36
[LightGBM] [Info] Start training from score 0.000342
0.014361957641128353


**交叉验证的工作原理**
在每次迭代中：
1. **数据分割**：将训练数据（`x1`, `y1`）随机分成3份，其中2份作为训练集，1份作为验证集。
2. **模型训练与评估**：使用当前参数组合在训练集上训练模型，并在验证集上评估性能。
3. **重复迭代**：重复上述过程3次，每次使用不同的验证集。
4. **结果汇总**：最终返回3次评估结果的平均值作为该参数组合的性能指标。


**为什么使用交叉验证？**
1. **更可靠的评估**：单一验证集可能受数据划分的随机性影响，交叉验证通过多次平均减少这种偏差。
2. **充分利用数据**：所有训练数据都有机会作为验证集，提高模型评估的稳定性。
3. **防止过拟合**：帮助选择在不同数据子集上都表现稳定的参数，避免模型对特定数据过拟合。


**`cv=3` 的具体示例**
假设你的训练数据有900个样本，`cv=3` 会将数据分为3份（每份300个样本）：

- **第一次迭代**：使用第1+2份（600个样本）训练，第3份（300个样本）验证。
- **第二次迭代**：使用第1+3份训练，第2份验证。
- **第三次迭代**：使用第2+3份训练，第1份验证。

最终，每个参数组合的性能是这3次验证分数的平均值。

**注意事项**
- **计算成本**：折数越多，计算时间越长（例如 `cv=10` 比 `cv=3` 慢约3倍）。
- **数据量**：数据量较小时，建议使用较高的折数（如 `cv=5` 或 `cv=10`）；数据量较大时，`cv=3` 通常足够。

In [16]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'num_leaves': [31, 50],
    'learning_rate': [0.05, 0.1, 0.15],
    'n_estimators': [20, 30, 50]
}
gbm = lgb.LGBMRegressor(objective='regression')
grid_search = GridSearchCV(gbm, param_grid, cv=3)
grid_search.fit(x1,y1)
print(grid_search.best_params_)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 114072, number of used features: 36
[LightGBM] [Info] Start training from score 0.000273
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 114072, number of used features: 36
[LightGBM] [Info] Start training from score 0.000400
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019933 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 114072, number of used features: 36
[LightGBM] [Info] Start 

In [17]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x2)
print(r2_score(y2,y_pred))

0.015561255775650817


In [19]:
xy_t=stock_daily1_d[(stock_daily1_d.date>datetime.datetime(2021,1,1))&(stock_daily1_d.date<datetime.datetime(2021,5,1))].iloc[:,2:].dropna()
xy_x=xy_t.drop("return_s1",axis=1)
xy_y=xy_t["return_s1"]
y_pred = best_model.predict(xy_x)
print(r2_score(xy_y,y_pred))

0.003189569666169456


In [20]:
y_pred 

array([ 0.0016841 ,  0.00032934,  0.00093547, ..., -0.00053129,
        0.00057716,  0.00047847])

## 在pybroker上使用单日收益率预测模型

In [21]:
y_pred=pd.Series(y_pred,index=xy_x.index,name="pred")

In [22]:
y_pred

730        0.001684
731        0.000329
732        0.000935
733        0.000918
734        0.000882
             ...   
1726342   -0.001794
1726343   -0.002777
1726344   -0.000531
1726345    0.000577
1726346    0.000478
Name: pred, Length: 64898, dtype: float64

In [24]:
#stock_daily1_d[(stock_daily1_d.date>datetime.datetime(2021,1,1))&(stock_daily1_d.date<datetime.datetime(2021,5,1))].iloc[:,0:7]

In [25]:
pyb_data_pe=stock_daily1_d[(stock_daily1_d.date>datetime.datetime(2021,1,1))&(stock_daily1_d.date<datetime.datetime(2021,5,1))].iloc[:,0:7]
pyb_data_pe = pd.concat([pyb_data_pe, y_pred], axis=1)
pyb_data_pe.fillna(0,inplace=True)

In [26]:
pyb_data_pe

Unnamed: 0,date,symbol,open,high,low,close,volume,pred
730,2021-01-04,000001.SZ,2121.0359,2121.0359,2047.7436,2065.5114,1554216.43,0.001684
731,2021-01-05,000001.SZ,2043.3016,2052.1855,1976.6722,2017.7603,1821352.10,0.000329
732,2021-01-06,000001.SZ,2007.7659,2172.1184,1998.8820,2172.1184,1934945.12,0.000935
733,2021-01-07,000001.SZ,2167.6765,2218.7590,2135.4723,2209.8751,1584185.30,0.000918
734,2021-01-08,000001.SZ,2209.8751,2232.0849,2144.3562,2204.3226,1195473.22,0.000882
...,...,...,...,...,...,...,...,...
1730540,2021-04-26,002999.SZ,12.3500,12.6800,12.2300,12.3200,54335.98,0.000000
1730541,2021-04-27,002999.SZ,12.5800,12.6300,11.8800,11.9400,57535.60,0.000000
1730542,2021-04-28,002999.SZ,11.9900,12.0100,11.7200,11.8400,40535.97,0.000000
1730543,2021-04-29,002999.SZ,12.0000,12.1900,11.7700,11.7900,49342.27,0.000000


In [24]:
#pyb_data_pe

In [25]:
pyb_data_pe.pred.describe()

count    111632.000000
mean          0.000292
std           0.002206
min          -0.062534
25%           0.000000
50%           0.000000
75%           0.000606
max           0.089635
Name: pred, dtype: float64

In [54]:
pyb_data_pe.symbol.unique()[0:10]


array(['000001.SZ', '000002.SZ', '000004.SZ', '000005.SZ', '000006.SZ',
       '000007.SZ', '000008.SZ', '000009.SZ', '000010.SZ', '000011.SZ'],
      dtype=object)

In [28]:
def hold_long(ctx):
    if not ctx.long_pos():
        # Buy if the next bar is predicted to have a positive return:
        if ctx.pred[-1] > 0.:
            ctx.buy_shares = ctx.calc_target_shares(0.01)
    else:
        # Sell if the next bar is predicted to have a negative return:
        if ctx.pred[-1] < 0.:
            ctx.sell_shares = ctx.calc_target_shares(0.01)

In [33]:
pybroker.register_columns('pred')
config = StrategyConfig(initial_cash=10000000)
strategy = Strategy(pyb_data_pe, '2021-01-01', '2021-05-01',config)
strategy.add_execution(hold_long, pyb_data_pe.symbol.unique()[250:350])
result = strategy.backtest()

Backtesting: 2021-01-01 00:00:00 to 2021-05-01 00:00:00



Test split: 2021-01-04 00:00:00 to 2021-04-30 00:00:00


  0% (0 of 79) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  1% (1 of 79) |                         | Elapsed Time: 0:00:00 ETA:   0:00:18
 13% (11 of 79) |###                     | Elapsed Time: 0:00:00 ETA:   0:00:03
 26% (21 of 79) |######                  | Elapsed Time: 0:00:00 ETA:   0:00:01
 39% (31 of 79) |#########               | Elapsed Time: 0:00:00 ETA:   0:00:01
 51% (41 of 79) |############            | Elapsed Time: 0:00:01 ETA:   0:00:00
 64% (51 of 79) |###############         | Elapsed Time: 0:00:01 ETA:   0:00:00
 77% (61 of 79) |##################      | Elapsed Time: 0:00:01 ETA:   0:00:00
 89% (71 of 79) |#####################   | Elapsed Time: 0:00:01 ETA:   0:00:00
100% (79 of 79) |########################| Elapsed Time: 0:00:01 ETA:  00:00:00
100% (79 of 79) |########################| Elapsed Time: 0:00:01 Time:  0:00:01



Finished backtest: 0:00:02


In [34]:
result.metrics_df.head()

Unnamed: 0,name,value
0,trade_count,847.0
1,initial_market_value,10000000.0
2,end_market_value,10054134.95
3,total_pnl,43682.91
4,unrealized_pnl,10452.04


## 轮动策略

In [35]:
stock_n=5
config = StrategyConfig(max_long_positions=stock_n)
pybroker.param('target_size', 1 / stock_n)

0.2

In [36]:
def rand_ld(ctxs: dict[str, ExecContext]):
    symbols=[]
    ld=[]
    for ctx in ctxs.values():
        symbols.append(ctx.symbol)
        ld.append(ctx.pred[-1])
    
    v_rank=pd.DataFrame({"symbol":symbols,"ld":ld}).sort_values(by="ld",ascending=False)
    pybroker.param('symbols', v_rank.head(stock_n).symbol.values)

def rotate(ctx: ExecContext):
    if ctx.long_pos():
        if ctx.symbol not in pybroker.param('symbols'):
            ctx.sell_all_shares()
    else:
        ctx.buy_limit_price = ctx.close[-1] * 1.095
        target_size = pybroker.param('target_size')
        ctx.buy_shares = ctx.calc_target_shares(target_size)
        ctx.score = ctx.pred[-1]

In [100]:
pyb_data_pe

Unnamed: 0,date,symbol,open,high,low,close,volume,pred
730,2021-01-04,000001.SZ,2121.0359,2121.0359,2047.7436,2065.5114,1554216.43,0.000274
731,2021-01-05,000001.SZ,2043.3016,2052.1855,1976.6722,2017.7603,1821352.10,-0.000640
732,2021-01-06,000001.SZ,2007.7659,2172.1184,1998.8820,2172.1184,1934945.12,0.000760
733,2021-01-07,000001.SZ,2167.6765,2218.7590,2135.4723,2209.8751,1584185.30,0.000760
734,2021-01-08,000001.SZ,2209.8751,2232.0849,2144.3562,2204.3226,1195473.22,0.001234
...,...,...,...,...,...,...,...,...
1730540,2021-04-26,002999.SZ,12.3500,12.6800,12.2300,12.3200,54335.98,0.000000
1730541,2021-04-27,002999.SZ,12.5800,12.6300,11.8800,11.9400,57535.60,0.000000
1730542,2021-04-28,002999.SZ,11.9900,12.0100,11.7200,11.8400,40535.97,0.000000
1730543,2021-04-29,002999.SZ,12.0000,12.1900,11.7700,11.7900,49342.27,0.000000


In [37]:
pybroker.register_columns('pred')
config = StrategyConfig(initial_cash=10000000)
strategy = Strategy(pyb_data_pe, '2021-01-01', '2021-05-01', config)
strategy.add_execution(rotate, pyb_data_pe.symbol.unique())
strategy.set_before_exec(rand_ld)
result = strategy.backtest()

Backtesting: 2021-01-01 00:00:00 to 2021-05-01 00:00:00



Test split: 2021-01-04 00:00:00 to 2021-04-30 00:00:00


  0% (0 of 79) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  1% (1 of 79) |                         | Elapsed Time: 0:00:03 ETA:   0:04:37
 13% (11 of 79) |###                     | Elapsed Time: 0:00:07 ETA:   0:00:48
 26% (21 of 79) |######                  | Elapsed Time: 0:00:09 ETA:   0:00:27
 39% (31 of 79) |#########               | Elapsed Time: 0:00:11 ETA:   0:00:18
 51% (41 of 79) |############            | Elapsed Time: 0:00:13 ETA:   0:00:12
 64% (51 of 79) |###############         | Elapsed Time: 0:00:15 ETA:   0:00:08
 77% (61 of 79) |##################      | Elapsed Time: 0:00:17 ETA:   0:00:05
 89% (71 of 79) |#####################   | Elapsed Time: 0:00:19 ETA:   0:00:02
100% (79 of 79) |########################| Elapsed Time: 0:00:21 ETA:  00:00:00
100% (79 of 79) |########################| Elapsed Time: 0:00:21 Time:  0:00:21



Finished backtest: 0:00:22


In [144]:
result.trades

Unnamed: 0_level_0,type,symbol,entry_date,exit_date,entry,exit,shares,pnl,return_pct,agg_pnl,bars,pnl_per_bar,stop,mae,mfe
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,long,000001.SZ,2021-01-05,2021-01-06,2014.43,2085.50,968,68795.76,3.53,68795.76,1,68795.76,,-37.76,71.07
2,long,000002.SZ,2021-01-05,2021-01-06,4234.59,4346.16,467,52103.19,2.63,120898.95,1,52103.19,,-63.87,111.57
3,long,000004.SZ,2021-01-05,2021-01-06,84.69,83.05,23412,-38395.68,-1.94,82503.27,1,-38395.68,,-1.64,0.98
4,long,000005.SZ,2021-01-05,2021-01-06,23.08,22.57,85633,-43672.83,-2.21,38830.44,1,-43672.83,,-0.51,0.28
5,long,000006.SZ,2021-01-05,2021-01-06,199.15,197.32,9924,-18160.92,-0.92,20669.52,1,-18160.92,,-2.38,2.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,long,000004.SZ,2021-04-29,2021-04-30,64.76,64.96,27478,5495.60,0.31,-1165769.24,1,5495.60,,-1.69,1.69
566,long,000008.SZ,2021-04-29,2021-04-30,50.53,50.42,33750,-3712.50,-0.22,-1169481.74,1,-3712.50,,-0.34,0.34
567,long,000020.SZ,2021-04-29,2021-04-30,16.00,15.92,1,-0.08,-0.50,-1169481.82,1,-0.08,,-0.14,0.15
568,long,000040.SZ,2021-04-29,2021-04-30,11.20,11.11,1,-0.09,-0.80,-1169481.91,1,-0.09,,-0.11,0.11


In [40]:
result.metrics_df.head()

Unnamed: 0,name,value
0,trade_count,581.0
1,initial_market_value,10000000.0
2,end_market_value,8596660.57
3,total_pnl,-1440405.7
4,unrealized_pnl,37066.27


## 基于机器学习的因子排序策略实现

In [60]:
def func10(x):
    return x.pct_change(periods=10).shift(-10)

stock_daily1_d["return_s10"]=stock_daily1_d.groupby("symbol", group_keys=False).close.apply(func10)

In [61]:
xy=stock_daily1_d[stock_daily1_d.date<datetime.datetime(2021,1,1)].iloc[:,2:].dropna()
xy_x=xy.drop(["return_s1","return_s10"],axis=1)
xy_y=xy["return_s10"]
x1,x2,y1,y2=train_test_split(xy_x,xy_y,test_size=0.8)#分割数据出训练集与测试集，0.7是两者行数的比例

In [62]:
gbm = lgb.LGBMRegressor(objective='regression',num_leaves=31,learning_rate=0.2,n_estimators=50)
gbm.fit(x1,y1)
y_pred = gbm.predict(x2)
print(r2_score(y2,y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017790 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9180
[LightGBM] [Info] Number of data points in the train set: 114070, number of used features: 36
[LightGBM] [Info] Start training from score 0.003236
0.051109060658275474


In [65]:
xy_t=stock_daily1_d[(stock_daily1_d.date>datetime.datetime(2021,1,1))&(stock_daily1_d.date<datetime.datetime(2021,5,1))].iloc[:,2:].dropna()
xy_x=xy_t.drop(["return_s1","return_s10"],axis=1)
xy_y=xy_t["return_s10"]
y_pred = gbm.predict(xy_x)
print(r2_score(xy_y,y_pred))

-0.016545272675347666


In [53]:
y_pred=pd.Series(y_pred,index=xy_x.index,name="pred")
pyb_data_pe=stock_daily1_d[(stock_daily1_d.date>datetime.datetime(2021,1,1))&(stock_daily1_d.date<datetime.datetime(2021,5,1))].iloc[:,0:7]
pyb_data_pe = pd.concat([pyb_data_pe, y_pred], axis=1)
pyb_data_pe.fillna(0,inplace=True)

In [70]:
#pyb_data_pe

In [54]:
def buy_highest_volume(ctx):
    # If there are no long positions across all tickers being traded:
    if not tuple(ctx.long_positions()):
        ctx.buy_shares = ctx.calc_target_shares(0.2)
        ctx.hold_bars = 10
        ctx.score = ctx.pred[-1]

In [66]:
pybroker.register_columns('pred')
strategy = Strategy(pyb_data_pe, '2021-01-01', '2021-05-01')
strategy.add_execution(buy_highest_volume, pyb_data_pe.symbol.unique())
result = strategy.backtest()

Backtesting: 2021-01-01 00:00:00 to 2021-05-01 00:00:00



Test split: 2021-01-04 00:00:00 to 2021-04-30 00:00:00


  0% (0 of 79) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  1% (1 of 79) |                         | Elapsed Time: 0:00:03 ETA:   0:03:55
 13% (11 of 79) |###                     | Elapsed Time: 0:00:05 ETA:   0:00:33
 26% (21 of 79) |######                  | Elapsed Time: 0:00:05 ETA:   0:00:15
 39% (31 of 79) |#########               | Elapsed Time: 0:00:06 ETA:   0:00:09
 51% (41 of 79) |############            | Elapsed Time: 0:00:06 ETA:   0:00:05
 64% (51 of 79) |###############         | Elapsed Time: 0:00:06 ETA:   0:00:03
 77% (61 of 79) |##################      | Elapsed Time: 0:00:06 ETA:   0:00:02
 89% (71 of 79) |#####################   | Elapsed Time: 0:00:07 ETA:   0:00:00
100% (79 of 79) |########################| Elapsed Time: 0:00:07 ETA:  00:00:00
100% (79 of 79) |########################| Elapsed Time: 0:00:07 Time:  0:00:07



Finished backtest: 0:00:08


In [68]:
result.metrics_df.head()

Unnamed: 0,name,value
0,trade_count,49.0
1,initial_market_value,100000.0
2,end_market_value,93075.55
3,total_pnl,-7356.97
4,unrealized_pnl,432.52


In [74]:
#result.trades

In [63]:
xy_y

730       -0.023118
731        0.076500
732        0.017382
733       -0.002513
734        0.026700
             ...   
1726342   -0.070123
1726343   -0.033101
1726344    0.005406
1726345    0.002943
1726346   -0.010211
Name: return_s1, Length: 64898, dtype: float64

## 基于akshare数据源的机器学习算法策略

In [71]:
stock_board_industry_cons_em_df = ak.stock_board_industry_cons_em(symbol="汽车整车")
print(stock_board_industry_cons_em_df)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

    序号      代码     名称     最新价   涨跌幅    涨跌额      成交量           成交额     振幅  \
0    1  600686   金龙汽车   13.07  2.91   0.37   265232  3.461206e+08   6.30   
1    2  600375   汉马科技    6.67  1.37   0.09   826696  5.562643e+08  11.55   
2    3  000550   江铃汽车   21.17  0.43   0.09    45711  9.747963e+07   3.32   
3    4  600166   福田汽车    2.68  0.00   0.00   372367  9.971643e+07   1.87   
4    5  200550  江  铃Ｂ    9.93 -0.10  -0.01      283  2.812130e+05   0.91   
5    6  000868   安凯客车    5.83 -0.17  -0.01   134752  7.844414e+07   2.57   
6    7  601777   千里科技    8.37 -0.24  -0.02   263089  2.187911e+08   3.93   
7    8  600303   曙光股份    4.07 -0.25  -0.01   102177  4.212229e+07   2.94   
8    9  000957   中通客车   11.34 -0.61  -0.07   166388  1.898896e+08   2.45   
9   10  600006   东风股份    7.65 -1.03  -0.08   580774  4.495370e+08   3.23   
10  11  600418   江淮汽车   40.36 -1.10  -0.45   570086  2.335257e+09   5.02   
11  12  000800   一汽解放    7.19 -1.24  -0.09    88571  6.410694e+07   1.65   
12  13  0009

In [72]:
stock_board_industry_cons_em_df.代码.unique()

array(['600686', '600375', '000550', '600166', '200550', '000868',
       '601777', '600303', '000957', '600006', '600418', '000800',
       '000951', '200625', '301039', '600104', '601238', '600066',
       '000625', '601633', '000572', '600841', '600733', '000980',
       '601127', '002594'], dtype=object)

In [73]:
df = akshare.query(
    symbols=stock_board_industry_cons_em_df.代码.unique(),
    start_date='1/1/2016',
    end_date='1/1/2025',
    adjust="hfq",
    timeframe="1d",
)
df

Loaded cached bar data.



Unnamed: 0,date,symbol,open,high,low,close,volume
19674,2016-01-04,000957,44.77,44.77,40.37,40.37,49046
19675,2016-01-05,000957,38.38,41.65,37.87,41.65,101643
19676,2016-01-06,000957,41.72,44.17,41.47,43.83,86237
19677,2016-01-07,000957,42.75,42.88,39.85,39.96,18357
19678,2016-01-08,000957,41.56,42.53,38.23,41.18,58311
...,...,...,...,...,...,...,...
39197,2024-12-25,200550,29.18,29.20,29.10,29.16,1910
39198,2024-12-26,200550,29.13,29.26,29.12,29.17,1985
39199,2024-12-27,200550,29.20,29.25,29.16,29.18,693
39200,2024-12-30,200550,29.14,29.18,29.11,29.11,1665


In [74]:
z=df.groupby("symbol", group_keys=False).close.apply(compute_bb)
df=df.join(z)
df["close-o"]=df["close"]-df["open"]
df["high-l"]=df["high"]-df["low"]

In [75]:
df["return_s1"]=df.groupby("symbol", group_keys=False).close.apply(func0)

In [76]:
xy=df[df.date<datetime.datetime(2024,1,1)].iloc[:,2:].dropna()

In [77]:
xy_x=xy.drop("return_s1",axis=1)
xy_y=xy["return_s1"]
x1,x2,y1,y2=train_test_split(xy_x,xy_y,test_size=0.8)#分割数据出训练集与测试集，0.7是两者行数的比例

In [78]:
param_grid = {
    'num_leaves': [31, 50],
    'learning_rate': [0.05, 0.1, 0.15],
    'n_estimators': [20, 30, 50]
}
gbm = lgb.LGBMRegressor(objective='regression')
grid_search = GridSearchCV(gbm, param_grid, cv=3)
grid_search.fit(x1,y1)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5355
[LightGBM] [Info] Number of data points in the train set: 6296, number of used features: 21
[LightGBM] [Info] Start training from score -0.000248
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5355
[LightGBM] [Info] Number of data points in the train set: 6297, number of used features: 21
[LightGBM] [Info] Start training from score -0.000156
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000739 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5355
[LightGBM] [Info] Number of data points in the train set: 6297, number of used features: 21
[LightGBM] [Info] Start trai

In [79]:
print(grid_search.best_params_)

{'learning_rate': 0.05, 'n_estimators': 20, 'num_leaves': 31}


In [80]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x2)
print(r2_score(y2,y_pred))

-0.004853512439754848


In [81]:
xy_t=df[(df.date>datetime.datetime(2024,1,1))].iloc[:,2:].dropna()
xy_x=xy_t.drop("return_s1",axis=1)
xy_y=xy_t["return_s1"]
y_pred = best_model.predict(xy_x)
print(r2_score(xy_y,y_pred))

-0.0115241069145251


In [82]:
y_pred=pd.Series(y_pred,index=xy_x.index,name="pred")
pyb_data_pe=df[df.date>datetime.datetime(2024,1,1)].iloc[:,0:7]
pyb_data_pe = pd.concat([pyb_data_pe, y_pred], axis=1)
pyb_data_pe.fillna(0,inplace=True)

In [86]:
pybroker.register_columns('pred')
config = StrategyConfig(initial_cash=10000000)
strategy = Strategy(pyb_data_pe, '2024-01-01', '2024-05-01',config)
strategy.add_execution(hold_long, pyb_data_pe.symbol.unique())
result = strategy.backtest()

Backtesting: 2024-01-01 00:00:00 to 2024-05-01 00:00:00

Test split: 2024-01-02 00:00:00 to 2024-04-30 00:00:00


  0% (0 of 78) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  1% (1 of 78) |                         | Elapsed Time: 0:00:00 ETA:   0:00:07
 14% (11 of 78) |###                     | Elapsed Time: 0:00:00 ETA:   0:00:01
 26% (21 of 78) |######                  | Elapsed Time: 0:00:00 ETA:   0:00:01
 39% (31 of 78) |#########               | Elapsed Time: 0:00:00 ETA:   0:00:00
 52% (41 of 78) |############            | Elapsed Time: 0:00:00 ETA:   0:00:00
 65% (51 of 78) |###############         | Elapsed Time: 0:00:00 ETA:   0:00:00
 78% (61 of 78) |##################      | Elapsed Time: 0:00:00 ETA:   0:00:00
 91% (71 of 78) |#####################   | Elapsed Time: 0:00:01 ETA:   0:00:00
100% (78 of 78) |########################| Elapsed Time: 0:00:01 ETA:  00:00:00
100% (78 of 78) |########################| Elapsed Time: 0:00:01 Time:  0:00:01



Finished backtest: 0:00:01


In [87]:
result.metrics_df.head()

Unnamed: 0,name,value
0,trade_count,510.0
1,initial_market_value,10000000.0
2,end_market_value,9949515.08
3,total_pnl,-73616.58
4,unrealized_pnl,23131.66
