In [1]:
import os
import re

import numpy as np
import qlib
from gplearn.functions import make_function
from qlib.data.dataset.loader import QlibDataLoader

# Location of the qlib CN dataset. Set the QLIB_PROVIDER_URI environment
# variable to point the notebook at a different copy of the data; the fallback
# is the path this notebook was originally run with, so existing setups keep
# working unchanged.
provider_uri = os.environ.get(
    "QLIB_PROVIDER_URI",
    "/data/home/dinghj/zr-alphagen/zr-alpha-training-base/.qlib/qlib_data/cn_data",
)
qlib.init(provider_uri=provider_uri, region='cn')

# Date ranges for the train / test split (adjust as needed).
train_start = '2020-01-01'
train_end   = '2020-12-31'
test_start  = '2021-01-01'
test_end    = '2021-06-30'

# Stock universe: CSI 300 constituents.
universe = "csi300"


[83248:MainThread](2025-05-01 21:43:20,019) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[83248:MainThread](2025-05-01 21:43:21,630) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[83248:MainThread](2025-05-01 21:43:21,632) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/home/dinghj/zr-alphagen/zr-alpha-training-base/.qlib/qlib_data/cn_data')}


In [None]:
# Qlib expressions that produce the feature columns, in column order.
feature_expressions = [
    "$close",
    "$volume",
    "$high - $low",
    "Mean($close, 5)",
    "Std($close, 10)",
    "Rank($close, 5)",
    "$close/Ref($close, 1) - 1",
    "Mean($volume, 5)",
]
# Every column is named after the expression that generates it; keep this a
# separate list object so the two can be edited independently later.
feature_names = list(feature_expressions)

# Label: next-day return (the following close relative to the current close).
label_names = ["LABEL"]
label_expression = ["Ref($close, -1)/$close - 1"]

# A single loader yields both column groups: "feature" and "label".
data_loader_config = dict(
    feature=(feature_expressions, feature_names),
    label=(label_expression, label_names),
)
data_loader = QlibDataLoader(config=data_loader_config)

def _load_clean(start_time, end_time):
    """Load features and labels for ``universe`` over one period and drop unusable rows.

    Parameters
    ----------
    start_time, end_time : str
        Inclusive date bounds passed straight to ``data_loader.load``.

    Returns
    -------
    The loaded frame with every row containing NaN or +/-inf removed.
    """
    raw = data_loader.load(instruments=universe, start_time=start_time, end_time=end_time)
    # Rolling/shifted expressions leave NaN at the start of each series, and the
    # ratio expressions can yield +/-inf on a zero denominator. dropna() does
    # not treat inf as missing, so convert it to NaN first; otherwise such rows
    # would survive and break estimators that require finite input.
    return raw.replace([np.inf, -np.inf], np.nan).dropna()


# Load the training and test sets.
train_df = _load_clean(train_start, train_end)
test_df  = _load_clean(test_start, test_end)


In [3]:
import numpy as np

# Split each frame into its feature block (DataFrame) and its LABEL column (Series).
X_train_df, y_train_df = train_df["feature"], train_df["label"]["LABEL"]
X_test_df, y_test_df = test_df["feature"], test_df["label"]["LABEL"]

# gplearn works on plain numpy arrays, so strip the pandas wrappers.
X_train, X_test = X_train_df.values, X_test_df.values
y_train, y_test = y_train_df.values, y_test_df.values

print("Train samples:", X_train.shape, "Test samples:", X_test.shape)
# Example output: Train samples: (n_samples, 8) Test samples: (n_samples, 8)


Train samples: (72734, 8) Test samples: (35330, 8)


In [None]:
from gplearn.genetic import SymbolicTransformer
from scipy.stats import spearmanr
import pandas as pd
from gplearn import _program

# 1) Configure the transformer
n_components = 10
transformer_params = dict(
    # NOTE(review): stock gplearn spells its built-in functions in lowercase
    # ('add', 'abs') and has no 'Mean'; these names presumably come from a
    # customized gplearn build - confirm against the installed package.
    function_set=('Add', 'Abs', 'Mean'),
    n_components=n_components,
    generations=10,
    metric='spearman',            # rank correlation (IC) is the fitness
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.05,
    p_point_mutation=0.1,
    parsimony_coefficient=0.001,
    max_samples=0.9,
    feature_names=list(X_train_df.columns),
    verbose=1,
    random_state=42,
)
transformer = SymbolicTransformer(**transformer_params)

# 2) Fit on the training set, then generate the new features for both sets
transformer.fit(X_train, y_train)
X_train_new = transformer.transform(X_train)
X_test_new = transformer.transform(X_test)

# Programs kept by the fitted transformer, one per output column.
programs = transformer._best_programs

# 4) Compute each factor's IC and map X0, X1, ... placeholders back to feature names.
# The substitution is done in one regex pass on whole tokens: replacing
# "X1", "X2", ... one after another with str.replace would also rewrite the
# "X1" inside "X10", and could re-substitute text inside an already inserted name.
name_by_placeholder = {f"X{i}": name for i, name in enumerate(feature_names)}

records = []
for idx, prog in enumerate(programs):
    # LISP-style formula; placeholders without a known name are left untouched.
    expr = re.sub(
        r"\bX\d+\b",
        lambda m: name_by_placeholder.get(m.group(0), m.group(0)),
        str(prog),
    )

    # Spearman IC on the train / test set, computed over all pooled samples
    # (not a per-day cross-sectional IC).
    ic_tr = spearmanr(X_train_new[:, idx], y_train).correlation
    ic_te = spearmanr(X_test_new[:, idx],  y_test).correlation

    records.append({
        "factor_id": idx,
        "formula":   expr,
        "IC_train":  ic_tr,
        "IC_test":   ic_te,
    })

# 5) Collect into a DataFrame sorted by IC_train, best first
result_df = pd.DataFrame(records).sort_values("IC_train", ascending=False)

# 6) Print & (optionally) save to csv
print(result_df.to_string(index=False))

# # To save the table to CSV:
# result_df.to_csv("discovered_factors.csv", index=False)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left


AttributeError: '_Program' object has no attribute 'get_all_indices'

In [9]:
# Programs kept by the fitted transformer, one per output column.
# NOTE(review): this cell also reads X_train_new / X_test_new, which are only
# assigned in the fitting cell - it cannot run on a fresh kernel by itself.
programs = transformer._best_programs

# 4) Compute each factor's IC and map X0, X1, ... placeholders back to feature names.
# The substitution is done in one regex pass on whole tokens: replacing
# "X1", "X2", ... one after another with str.replace would also rewrite the
# "X1" inside "X10", and could re-substitute text inside an already inserted name.
name_by_placeholder = {f"X{i}": name for i, name in enumerate(feature_names)}

records = []
for idx, prog in enumerate(programs):
    # LISP-style formula; placeholders without a known name are left untouched.
    expr = re.sub(
        r"\bX\d+\b",
        lambda m: name_by_placeholder.get(m.group(0), m.group(0)),
        str(prog),
    )

    # Spearman IC on the train / test set, computed over all pooled samples
    # (not a per-day cross-sectional IC).
    ic_tr = spearmanr(X_train_new[:, idx], y_train).correlation
    ic_te = spearmanr(X_test_new[:, idx],  y_test).correlation

    records.append({
        "factor_id": idx,
        "formula":   expr,
        "IC_train":  ic_tr,
        "IC_test":   ic_te,
    })

# 5) Collect into a DataFrame sorted by IC_train, best first
result_df = pd.DataFrame(records).sort_values("IC_train", ascending=False)

# 6) Print & (optionally) save to csv
print(result_df.to_string(index=False))

 factor_id                                                     formula  IC_train  IC_test
         0                            add(std10, div(std10, range_hl))  0.041338 0.019815
         1                add(std10, add(std10, div(std10, range_hl)))  0.041005 0.016199
         5                add(std10, add(std10, div(std10, range_hl)))  0.041005 0.016199
         9                add(std10, add(std10, div(std10, range_hl)))  0.041005 0.016199
         8                add(std10, add(std10, div(std10, range_hl)))  0.041005 0.016199
         7                add(std10, add(std10, div(std10, range_hl)))  0.041005 0.016199
         3                         add(range_hl, div(std10, range_hl))  0.040738 0.015601
         4                         add(range_hl, div(std10, range_hl))  0.040738 0.015601
         6                         add(range_hl, div(std10, range_hl))  0.040738 0.015601
         2 add(add(std10, div(std10, range_hl)), div(std10, range_hl))  0.040139 0.022891


In [30]:
# Scratch check: all() over a sequence of truthy values evaluates to True.
all((1, 1, 1, 1))

True