<a href="https://colab.research.google.com/github/wannasmile/colab_code_note/blob/main/QUANT014.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys, site  # 导入 sys 和 site 模块，用于系统相关操作和站点配置
from pathlib import Path  # 导入 pathlib 模块，用于处理文件路径

try:
    import qlib  # Probe for an existing qlib installation
except ImportError:
    # qlib is not importable, so install it with pip
    ! pip install --upgrade numpy  # Install or upgrade numpy
    ! pip install pyqlib  # Install the pyqlib package
    if "google.colab" in sys.modules:  # Running inside Google Colab
        # The Google Colab environment may be old; pyyaml is downgraded for compatibility with other packages
        ! pip install pyyaml==5.4.1  # Pin pyyaml to version 5.4.1
    # Re-run the site configuration so the newly installed packages take effect
    site.main()

# Locate qlib's helper script get_data.py, looking first in ../scripts
# relative to the current working directory.
scripts_dir = Path.cwd().parent.joinpath("scripts")
if not scripts_dir.joinpath("get_data.py").exists():
    # Not available locally: fall back to ~/tmp/qlib_code/scripts (created on
    # demand) and download the script from the qlib GitHub repository.
    scripts_dir = Path("~/tmp/qlib_code/scripts").expanduser().resolve()
    scripts_dir.mkdir(parents=True, exist_ok=True)
    import requests  # imported lazily; only needed on the download path

    with requests.get(
        "https://raw.githubusercontent.com/microsoft/qlib/main/scripts/get_data.py", timeout=10
    ) as resp:
        # Fail loudly on an HTTP error instead of saving an error page as
        # get_data.py, which would only surface later as a confusing import error.
        resp.raise_for_status()
        with open(scripts_dir.joinpath("get_data.py"), "wb") as fp:
            fp.write(resp.content)

In [None]:
import qlib  # 导入 qlib 库，用于量化投资研究
import pandas as pd  # 导入 pandas 库，用于数据处理
from qlib.constant import REG_CN  # 从 qlib.constant 模块导入 REG_CN 常量，表示中国市场
from qlib.utils import exists_qlib_data, init_instance_by_config  # 从 qlib.utils 模块导入函数，用于检查数据和初始化实例
from qlib.workflow import R  # 从 qlib.workflow 模块导入 R 对象，用于管理实验记录
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord  # 从 qlib.workflow.record_temp 模块导入记录类，用于记录信号和投资组合分析
from qlib.utils import flatten_dict  # 从 qlib.utils 模块导入函数，用于展平字典

In [None]:
# Use the default data location.
# NOTE: the data has to be downloaded remotely first, e.g.:
#   python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data
provider_uri = "~/.qlib/qlib_data/cn_data"
data_missing = not exists_qlib_data(provider_uri)
if data_missing:
    print(f"在 {provider_uri} 中未找到 Qlib 数据")
    # Make the directory that holds get_data.py importable before using GetData.
    sys.path.append(str(scripts_dir))
    from get_data import GetData

    downloader = GetData()
    downloader.qlib_data(target_dir=provider_uri, region=REG_CN)
# Initialise qlib against the data directory with the REG_CN region setting.
qlib.init(provider_uri=provider_uri, region=REG_CN)

[1190:MainThread](2025-03-15 07:09:29,242) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[1190:MainThread](2025-03-15 07:09:29,247) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[1190:MainThread](2025-03-15 07:09:29,254) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/root/.qlib/qlib_data/cn_data')}


In [None]:
# Stock universe and benchmark, both set to the CSI 300 index.
market, benchmark = "csi300", "SH000300"

In [None]:
# Install the catboost package with pip (notebook shell escape).
# NOTE(review): the model configured in this notebook is LGBModel - confirm catboost is still required.
!pip install catboost



In [None]:
###################################
# Train model
###################################
# Arguments passed to the Alpha158 data handler.
data_handler_config = dict(
    start_time="2008-01-01",  # first date of the data range
    end_time="2020-08-01",  # last date of the data range
    fit_start_time="2008-01-01",  # start of the fitting window
    fit_end_time="2014-12-31",  # end of the fitting window
    instruments=market,  # stock universe to use
)

# Hyper-parameters passed to LGBModel.
lgb_params = dict(
    loss="mse",  # mean squared error loss
    colsample_bytree=0.8879,  # column sampling ratio
    learning_rate=0.0421,
    subsample=0.8789,  # row sampling ratio
    lambda_l1=205.6999,  # L1 regularisation coefficient
    lambda_l2=580.9768,  # L2 regularisation coefficient
    max_depth=8,  # maximum tree depth
    num_leaves=210,  # number of leaves
    num_threads=20,  # number of threads
)
model_config = {
    "class": "LGBModel",
    "module_path": "qlib.contrib.model.gbdt",
    "kwargs": lgb_params,
}

# Data handler wrapped by the dataset.
handler_config = {
    "class": "Alpha158",
    "module_path": "qlib.contrib.data.handler",
    "kwargs": data_handler_config,
}
# Date ranges of the train / validation / test segments.
dataset_segments = dict(
    train=("2008-01-01", "2014-12-31"),
    valid=("2015-01-01", "2016-12-31"),
    test=("2017-01-01", "2020-08-01"),
)
dataset_config = {
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": handler_config,
        "segments": dataset_segments,
    },
}

# Complete task description: the model plus the dataset it is trained on.
task = {
    "model": model_config,
    "dataset": dataset_config,
}

# Build the model and dataset instances from their config dictionaries.
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

# Train inside an experiment named "train_model".
with R.start(experiment_name="train_model"):
    # Log the flattened task configuration as the experiment parameters.
    flat_params = flatten_dict(task)
    R.log_params(**flat_params)
    model.fit(dataset)
    # Store the fitted model under the "trained_model" key.
    R.save_objects(trained_model=model)
    # Keep the recorder id so the model can be looked up again later.
    train_recorder = R.get_recorder()
    rid = train_recorder.id

[1190:MainThread](2025-03-15 07:12:54,398) INFO - qlib.timer - [log.py:127] - Time cost: 180.278s | Loading data Done
[1190:MainThread](2025-03-15 07:12:55,653) INFO - qlib.timer - [log.py:127] - Time cost: 0.319s | DropnaLabel Done
[1190:MainThread](2025-03-15 07:12:59,511) INFO - qlib.timer - [log.py:127] - Time cost: 3.856s | CSZScoreNorm Done
[1190:MainThread](2025-03-15 07:12:59,527) INFO - qlib.timer - [log.py:127] - Time cost: 5.127s | fit & process data Done
[1190:MainThread](2025-03-15 07:12:59,530) INFO - qlib.timer - [log.py:127] - Time cost: 185.409s | Init data Done
[1190:MainThread](2025-03-15 07:12:59,549) INFO - qlib.workflow - [exp.py:258] - Experiment 990456155965144949 starts running ...
[1190:MainThread](2025-03-15 07:12:59,843) INFO - qlib.workflow - [recorder.py:345] - Recorder fd27a8f59cc845a9b7738211d05d70f9 starts running under Experiment 990456155965144949 ...
[1190:MainThread](2025-03-15 07:12:59,861) INFO - qlib.workflow - [recorder.py:378] - Fail to log the

Training until validation scores don't improve for 50 rounds
[20]	train's l2: 0.990585	valid's l2: 0.99431
[40]	train's l2: 0.986931	valid's l2: 0.993693
[60]	train's l2: 0.984352	valid's l2: 0.99349
[80]	train's l2: 0.982319	valid's l2: 0.993382
[100]	train's l2: 0.980442	valid's l2: 0.99331
[120]	train's l2: 0.97871	valid's l2: 0.993247
[140]	train's l2: 0.976987	valid's l2: 0.993334
[160]	train's l2: 0.97536	valid's l2: 0.993338
Early stopping, best iteration is:
[122]	train's l2: 0.978519	valid's l2: 0.993238


[1190:MainThread](2025-03-15 07:14:42,737) INFO - qlib.timer - [log.py:127] - Time cost: 0.280s | waiting `async_log` Done


In [None]:
###################################
# Prediction, backtest and analysis
###################################
# Configuration consumed by PortAnaRecord: executor, strategy and backtest settings.
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",  # one step per day
            "generate_portfolio_metrics": True,  # produce portfolio metrics
        },
    },
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            # The separate `model` / `dataset` keyword arguments are deprecated in
            # qlib's signal strategies; the (model, dataset) pair is passed through
            # `signal` instead, which yields the same model-driven signal.
            "signal": (model, dataset),
            "topk": 50,  # select the top 50 stocks
            "n_drop": 5,  # drop 5 stocks
        },
    },
    "backtest": {
        "start_time": "2017-01-01",  # backtest start date
        "end_time": "2020-08-01",  # backtest end date
        "account": 100000000,  # initial capital: 100 million
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",  # trading frequency: daily
            "limit_threshold": 0.095,  # price-limit threshold: 9.5%
            "deal_price": "close",  # deal at the close price
            "open_cost": 0.0005,  # opening fee: 0.05%
            "close_cost": 0.0015,  # closing fee: 0.15%
            "min_cost": 5,  # minimum fee: 5 yuan
        },
    },
}

# Run prediction, backtest and analysis inside the "backtest_analysis" experiment.
with R.start(experiment_name="backtest_analysis"):
    # Fetch the recorder of the earlier "train_model" run and reload its model.
    train_recorder = R.get_recorder(recorder_id=rid, experiment_name="train_model")
    model = train_recorder.load_object("trained_model")

    # Prediction: record the signal under the recorder of the current run.
    recorder = R.get_recorder()
    ba_rid = recorder.id  # kept so this run's artifacts can be loaded afterwards
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # Backtest and portfolio analysis.
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[1190:MainThread](2025-03-15 07:14:42,775) INFO - qlib.workflow - [exp.py:258] - Experiment 208014624712546602 starts running ...
[1190:MainThread](2025-03-15 07:14:42,793) INFO - qlib.workflow - [recorder.py:345] - Recorder b5f44ac05ccd49999d0cbb40930121bb starts running under Experiment 208014624712546602 ...
[1190:MainThread](2025-03-15 07:14:42,805) INFO - qlib.workflow - [recorder.py:378] - Fail to log the uncommitted code of $CWD(/content) when run git diff.
[1190:MainThread](2025-03-15 07:14:42,814) INFO - qlib.workflow - [recorder.py:378] - Fail to log the uncommitted code of $CWD(/content) when run git status.
[1190:MainThread](2025-03-15 07:14:42,825) INFO - qlib.workflow - [recorder.py:378] - Fail to log the uncommitted code of $CWD(/content) when run git diff --cached.
[1190:MainThread](2025-03-15 07:14:47,066) INFO - qlib.workflow - [record_temp.py:198] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 208014624712546602
[1190:MainThread](2025-03-

'The following are prediction results of the LGBModel model.'
                          score
datetime   instrument          
2017-01-03 SH600000   -0.042865
           SH600008    0.005925
           SH600009    0.030596
           SH600010   -0.013973
           SH600015   -0.141758




backtest loop:   0%|          | 0/871 [00:00<?, ?it/s]

  return np.nanmean(self.data)
  return np.nanmean(self.data)
  return np.nanmean(self.data)
[1190:MainThread](2025-03-15 07:16:14,522) INFO - qlib.workflow - [record_temp.py:515] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 208014624712546602
[1190:MainThread](2025-03-15 07:16:14,560) INFO - qlib.workflow - [record_temp.py:540] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 208014624712546602


'The following are analysis results of benchmark return(1day).'
                       risk
mean               0.000477
std                0.012295
annualized_return  0.113561
information_ratio  0.598699
max_drawdown      -0.370479
'The following are analysis results of the excess return without cost(1day).'
                       risk
mean               0.000748
std                0.005687
annualized_return  0.178088
information_ratio  2.029781
max_drawdown      -0.057265
'The following are analysis results of the excess return with cost(1day).'
                       risk
mean               0.000553
std                0.005685
annualized_return  0.131583
information_ratio  1.500251
max_drawdown      -0.066180
'The following are analysis results of indicators(1day).'
     value
ffr    1.0
pa     0.0
pos    0.0


[1190:MainThread](2025-03-15 07:16:16,267) INFO - qlib.timer - [log.py:127] - Time cost: 0.000s | waiting `async_log` Done


In [None]:
from qlib.contrib.report import analysis_model, analysis_position  # 导入模型分析和持仓分析模块
from qlib.data import D  # 导入数据模块

# Re-open the recorder of the "backtest_analysis" run and show its summary.
recorder = R.get_recorder(recorder_id=ba_rid, experiment_name="backtest_analysis")
print(recorder)

# Load the saved artifacts: predictions, then the 1-day portfolio analysis outputs.
_load_artifact = recorder.load_object
pred_df = _load_artifact("pred.pkl")
report_normal_df = _load_artifact("portfolio_analysis/report_normal_1day.pkl")
positions = _load_artifact("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = _load_artifact("portfolio_analysis/port_analysis_1day.pkl")

{'class': 'Recorder', 'id': 'b5f44ac05ccd49999d0cbb40930121bb', 'name': 'mlflow_recorder', 'experiment_id': '208014624712546602', 'start_time': '2025-03-15 07:14:42', 'end_time': '2025-03-15 07:16:16', 'status': 'FINISHED'}


In [None]:
analysis_position.report_graph(report_normal_df)  # Generate and display the position analysis report graph from the normal report data

In [None]:
analysis_position.risk_analysis_graph(analysis_df, report_normal_df)  # Generate and display the risk analysis graph from the portfolio analysis data and the normal report data

In [None]:
label_df = dataset.prepare("test", col_set="label")  # Prepare the "label" column set of the "test" segment
label_df.columns = ["label"]  # Set the column names to the single name "label"

In [None]:
# Place labels and predictions side by side, then restore the index order of label_df.
label_and_pred = pd.concat([label_df, pred_df], axis=1, sort=True)
pred_label = label_and_pred.reindex(label_df.index)
# Generate and display the IC graph from the merged prediction/label data.
analysis_position.score_ic_graph(pred_label)

In [None]:
analysis_model.model_performance_graph(pred_label)  # Generate and display the model performance graphs from the merged prediction/label data