In [1]:
import numpy as np
import qlib
from qlib.data import D

# 1. Initialise Qlib; replace provider_uri with your own dataset or a test path.
qlib.init(
    region="cn",
    provider_uri="/data/home/dinghj/zr-alphagen/zr-alpha-training-base/.qlib/qlib_data/cn_data",
)

[25392:MainThread](2025-05-21 20:41:47,707) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[25392:MainThread](2025-05-21 20:41:48,950) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[25392:MainThread](2025-05-21 20:41:48,951) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/home/dinghj/zr-alphagen/zr-alpha-training-base/.qlib/qlib_data/cn_data')}


In [2]:
from gplearn._program import _Program
from gplearn.fitness import _fitness_map

In [3]:
# 3. Build one fixed _Program instance from an explicit, hand-written tree.
prog = _Program(
    # Declare every operator the tree below actually uses. The original cell
    # listed only "Mean" although the root node is "Add"; this now matches the
    # configuration used by the Phase 4 cell further down.
    function_set=["Mean", "Add"],
    # Arity codes follow config.py: Mean is tagged 4 (rolling) and Add is 2.
    # __str__ renders an arity-4 function as a two-argument call
    # (series, window).
    arities={4: ["Mean"], 2: ["Add"]},
    init_depth=(1, 1),
    init_method="full",
    n_features=1,
    const_range=(1, 10),
    metric=_fitness_map["mse"],        # any available fitness; only needed for construction
    p_point_replace=0.1,
    parsimony_coefficient=0.0,
    random_state=np.random.RandomState(0),
    transformer=None,
    feature_names=["$close"],          # used to recognise variable names
    # The tree under test, flattened in prefix order:
    # Add(Mean($close, 5), 2)
    program=["Add", "Mean", "$close", 5, 2],
    # Per the original author's note this is only a placeholder that __str__
    # does not read.
    qlib_config={
        "data_client": D,
        "instruments": D.instruments(market="csi300"),
        "start_time": "2021-01-01",
        "end_time": "2021-02-01",
        "freq": "day",
    }
)

In [4]:
# Render the program's node list as an expression string via _Program.__str__;
# the cell output shows the resulting text.
str(prog)

'Add(Mean($close, 5), 2)'

In [5]:
from qlib.data import D
# Evaluate the same expression directly through Qlib, as a reference for the
# values that the _Program instance should reproduce.
expr = "Add(Mean($close, 5), 2)"
df = D.features(
    D.instruments('csi300'),
    [expr],
    start_time="2021-01-01",
    end_time="2021-02-01",
    freq="day",
)
print(df.head())

                       Add(Mean($close, 5), 2)
instrument datetime                           
SH600000   2021-01-04                13.998880
           2021-01-05                14.026306
           2021-01-06                14.098881
           2021-01-07                14.176306
           2021-01-08                14.213806


In [6]:
# Execute the program and display the resulting array.
# NOTE(review): None is passed as the input matrix; presumably the data is
# fetched through qlib_config instead - confirm against _Program.execute.
out = prog.execute(None)
out

Add(Mean($close, 5), 2)


array([13.99888 , 14.026306, 14.098881, ..., 10.326578,  9.925967,
        9.630958], dtype=float32)

# Phase 4

In [7]:
import numpy as np
import qlib
from qlib.data import D
from gplearn._program import _Program
from gplearn.config import functions_arity

# --- 1. Initialise Qlib (same provider path as the first cell) ---
qlib.init(
    region="cn",
    provider_uri="/data/home/dinghj/zr-alphagen/zr-alpha-training-base/.qlib/qlib_data/cn_data",
)

# --- 2. Constructor arguments shared by every _Program built below ---
common_args = dict(
    function_set=["Mean", "Add"],
    # config.py tags Mean as arity 4 (rolling) and Add as arity 2.
    arities={4: ["Mean"], 2: ["Add"]},
    init_depth=(1, 1),
    init_method="full",
    n_features=1,
    const_range=(1, 10),
    metric=None,  # only passed to satisfy the constructor; not used here
    p_point_replace=0.0,
    parsimony_coefficient=0.0,
    random_state=np.random.RandomState(0),
    transformer=None,
    feature_names=["$close"],
    qlib_config=dict(
        data_client=D,
        instruments=D.instruments(market="csi300"),
        start_time="2021-01-01",
        end_time="2021-02-01",
        freq="day",
    ),
)

# --- 3. Two small hand-written _Program instances ---
# prog1 is Add(Mean($close, 5), 2) in prefix order.
prog1 = _Program(program=["Add", "Mean", "$close", 5, 2], **common_args)
# NOTE(review): prog2 uses "Sub" and "$open", neither of which appears in
# function_set / feature_names above (n_features is 1) - confirm this is
# intended.
prog2 = _Program(program=["Sub", "$close", "$open"], **common_args)

[25392:MainThread](2025-05-21 20:41:53,196) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[25392:MainThread](2025-05-21 20:41:53,199) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[25392:MainThread](2025-05-21 20:41:53,200) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/home/dinghj/zr-alphagen/zr-alpha-training-base/.qlib/qlib_data/cn_data')}


In [8]:
def assert_valid(nodes):
    """
    Check that ``nodes`` can be turned back into a ``_Program``.

    The node list is passed to the ``_Program`` constructor together with the
    notebook-level ``common_args``. Any exception raised during construction
    is reported as an ``AssertionError`` that carries the original message,
    i.e. the structure is treated as invalid.
    """
    try:
        _Program(program=nodes, **common_args)
    except Exception as err:
        failure = f"非法程序结构: {err}"
        raise AssertionError(failure)

def assert_window_constants(nodes, arity_map=None):
    """
    Check that every rolling function in ``nodes`` has a numeric window.

    ``nodes`` is a program flattened in prefix order, e.g.
    ``["Add", "Mean", "$close", 5, 2]`` for ``Add(Mean($close, 5), 2)``.
    A function whose arity code is 4 is a rolling operator that takes two
    arguments: (series, window).

    The window is the node that follows the *complete* first-argument
    subtree. It sits at ``idx + 2`` only when the first argument is a single
    terminal; for ``Mean(Sub($open, $open), 30)`` it sits three nodes later,
    so the subtree is walked instead of assuming a fixed offset.

    Parameters
    ----------
    nodes : sequence
        Prefix-ordered program nodes (function names, feature names, numbers).
    arity_map : mapping, optional
        Function name -> arity code. Defaults to the notebook-level
        ``functions_arity``.

    Raises
    ------
    AssertionError
        If a rolling function is missing its arguments, or if its window
        argument is not a real number. ``bool`` is rejected; NumPy integer
        and floating scalars are accepted.
    """
    import numbers  # local import so the cell does not depend on another cell

    if arity_map is None:
        arity_map = functions_arity

    rolling_code = 4

    def child_count(node):
        # Anything that is not a known function name is a terminal.
        if not isinstance(node, str):
            return 0
        code = arity_map.get(node, 0)
        # NOTE(review): code 4 means two children (series, window); every
        # other code is assumed to equal the number of children - confirm
        # against config.py if further category codes exist.
        return 2 if code == rolling_code else code

    total = len(nodes)
    for idx, node in enumerate(nodes):
        if not (isinstance(node, str) and arity_map.get(node, 0) == rolling_code):
            continue

        # Skip over the whole first-argument subtree: `pending` counts the
        # nodes still required to complete it.
        pos = idx + 1
        pending = 1
        while pending and pos < total:
            pending += child_count(nodes[pos]) - 1
            pos += 1

        if pending or pos >= total:
            raise AssertionError(f"函数 {node} 缺少参数")

        win = nodes[pos]
        # bool is a subclass of int but is never a meaningful window size.
        if isinstance(win, bool) or not isinstance(win, numbers.Real):
            raise AssertionError(f"{node} 的窗口参数不是常量: got {win!r}")

In [9]:
# Crossover check: call prog1.crossover with prog2's node list and an RNG
# seeded with 1, then run both structural assertions on the first returned
# value.
# NOTE(review): the names suggest the other two values describe the removed
# nodes of the receiver and of the donor - confirm against _Program.crossover.
offspring, removed, donor_removed = prog1.crossover(prog2.program, np.random.RandomState(1))
assert_valid(offspring)
assert_window_constants(offspring)

In [10]:
# Subtree-mutation check: mutate prog1 with an RNG seeded with 2, then run
# both structural assertions on the mutated node list.
# The unused third return value gets an ordinary name instead of `_`: in
# IPython, assigning to `_` shadows the last-output variable and stops it
# from being updated for the rest of the session.
mutated, removed, _unused = prog1.subtree_mutation(np.random.RandomState(2))
assert_valid(mutated)
assert_window_constants(mutated)

['Mean', 'Add']
['$close']


In [11]:
# Hoist-mutation check: call prog1.hoist_mutation with an RNG seeded with 3,
# then run both structural assertions on the first returned value.
# Note that `removed` is rebound here, replacing the value from earlier cells.
hoisted, removed = prog1.hoist_mutation(np.random.RandomState(3))
assert_valid(hoisted)
assert_window_constants(hoisted)

In [12]:
# Point-mutation check: call prog1.point_mutation with an RNG seeded with 4,
# then run both structural assertions on the first returned value.
# NOTE(review): `mutated_nodes` presumably lists the positions that were
# changed - confirm against _Program.point_mutation.
point_mut, mutated_nodes = prog1.point_mutation(np.random.RandomState(4))
assert_valid(point_mut)
assert_window_constants(point_mut)

In [13]:
import numpy as np
import pandas as pd
from gplearn.genetic import SymbolicTransformer

# Synthetic data. A dedicated, seeded RandomState is used (instead of the
# global np.random state) so that X, y and therefore the reported IC values
# are the same on every Restart & Run All.
# NOTE(review): 6300 presumably matches the number of rows Qlib returns for
# csi300 over the configured date range - confirm.
n_samples = 6300
n_features = 2
data_rng = np.random.RandomState(0)
X = data_rng.randn(n_samples, n_features)
y = data_rng.randn(n_samples)

# Build a deliberately tiny SymbolicTransformer so the cell runs quickly.
transformer = SymbolicTransformer(
    population_size=10,             # very small population
    hall_of_fame=5,                  # keep at most the 5 best programs
    n_components=3,                  # emit 3 features in the end
    generations=1,                   # run a single generation
    function_set=("Add", "Mean", "Sub"),    # reduced operator set
    metric="pearson",                # correlation coefficient as the fitness metric
    parsimony_coefficient=0.0,
    qlib_config=common_args["qlib_config"],
    feature_names=["$close", "$open"],        # one name per column of X
    random_state=0
)

  raise ValueError('invalid type found in `function_set`.'


In [14]:
# Fit the transformer on (X, y) and keep the transformed feature matrix;
# a later cell reads X_new column by column, one column per best program.
X_new = transformer.fit_transform(X, y)

['Add', 'Mean', 'Sub']
['$close', '$open']
Sub(Add(Sub(Mean(Sub($open, $open), 30), Sub(Mean($close, 64), Sub($close, $open))), $close), Sub(Mean(Sub(Sub($open, $open), Mean($close, 64)), 64), Sub($close, Mean(Mean($open, 64), 2))))
['Add', 'Mean', 'Sub']
['$close', '$open']
Add(Add(Mean(Mean(Sub($close, $open), 12), 64), Sub(Add(Mean($open, 5), Sub($close, $open)), Add(Mean($open, 5), Add($close, $close)))), Mean(Mean(Sub(Mean($open, 12), Mean($close, 30)), 30), 30))
['Add', 'Mean', 'Sub']
['$close', '$open']
Sub(Sub(Mean(Mean(Add($close, $close), 2), 5), Sub(Sub(Mean($open, 2), Mean($open, 30)), Sub(Sub($open, $open), Add($open, $close)))), Sub(Add(Add(Add($open, $close), Add($open, $close)), Add(Add($close, $open), Add($close, $open))), Add(Mean(Mean($open, 12), 2), Add(Add($open, $close), Sub($open, $open)))))
['Add', 'Mean', 'Sub']
['$close', '$open']
Add(Add($open, Add(Mean(Mean($close, 64), 2), $close)), Sub($open, Sub($open, $open)))
['Add', 'Mean', 'Sub']
['$close', '$open']
M

In [16]:
# Grab the fitted programs from the transformer's private `_best_programs`
# attribute; the next cell pairs programs[i] with column i of X_new.
programs = transformer._best_programs

In [24]:
from scipy.stats import spearmanr

# One record per best program: its formula string and the Spearman rank
# correlation (IC) between the matching column of X_new and y.
# A comprehension keeps its loop variables local, so the `prog` and `expr`
# globals created by earlier cells are no longer overwritten by this cell.
records = [
    {
        "formula": str(best_program),
        "IC": spearmanr(X_new[:, column], y).correlation,
    }
    for column, best_program in enumerate(programs)
]

# Collect the records into a DataFrame. Rows stay in program order: no
# sorting is applied and nothing is written to disk.
result_df = pd.DataFrame(records)

# Display the table.
result_df

Unnamed: 0,formula,IC
0,"Mean(Sub(Sub($close, $open), Add($close, $clos...",0.007669
1,"Add(Mean(Sub(Sub($open, $close), Mean($open, 1...",0.007837
2,"Mean(Sub(Sub(Mean($close, 5), Sub(Add(Sub($clo...",0.007013
