In [1]:
import tushare as ts

# 1）初始化
ts.set_token('d9328f1b9d1a325374a671d8dbe5de2f40914c468f6fff40c350e67d')  
pro = ts.pro_api()

In [14]:
# fetch_sh_sz_active_since_2021_industry.py

import tushare as ts
import pandas as pd

def fetch_all_sh_sz_map() -> dict:
    """
    获取沪深两市所有在市和已退市股票的行业映射（尽可能全，不限日期）。

    返回
    ----
    dict { ts_code: industry }
    """
    # 1) 拉当前在市和已退市的股票，一次性取 exchange 字段作交易所筛选
    df_all = pro.stock_basic(
        # 不指定 list_status，默认返回所有（'L','D','P'等）
        fields="ts_code,industry,exchange"
    )
    # 2) 只保留上交所（exchange=='SSE'）和深交所（'SZSE'）的
    df_shsz = df_all[df_all["exchange"].isin(["SSE", "SZSE"])].copy()

    # 3) 构造映射 dict
    mapping = dict(zip(df_shsz["ts_code"], df_shsz["industry"]))

    return mapping

def save_to_csv(mapping: dict, path: str = "/data/home/dinghj/zr-alphagen/zr-alpha-training-base/paper/AlphaEvolve/1.csv"):
    """
    保存为 CSV，格式：
      instrument,industry
    """
    df = pd.DataFrame(list(mapping.items()), columns=["instrument", "industry"])
    df['instrument'] = df['instrument'].str.replace(
        r'(\d+)\.(\w+)',     # 匹配：一串数字 + “.” + 一串字母
        r'\2\1',             # 替换为：字母 + 数字
        regex=True
    )    
    df.to_csv(path, index=False)
    print(f"已保存 {len(mapping)} 条沪深所有股票映射到 {path}")

if __name__ == "__main__":
    df_active = fetch_all_sh_sz_map()
    save_to_csv(df_active)


已保存 5151 条沪深所有股票映射到 /data/home/dinghj/zr-alphagen/zr-alpha-training-base/paper/AlphaEvolve/1.csv


In [8]:
df = pro.index_classify(level='L1', src='SW2021')

In [6]:
df_cat = pro.index_classify(
    level="L1",
    src="SW2021",
    fields=["index_code", "industry_name"],
)

In [8]:
import pandas as pd
def fetch_full_industry_map():
    # 1) 申万 L1 映射（只含沪深）
    df_cat = pro.index_classify(level="L1", src="SW2021", fields=["index_code", "industry_name"])
    sw_map = {}
    for _, row in df_cat.iterrows():
        ind_code, ind_name = row["index_code"], row["industry_name"]
        df_mem = pro.index_member(index_code=ind_code, fields=["con_code"])
        for ts_code in df_mem["con_code"].dropna().unique():
            sw_map[ts_code] = ind_name

    # 2) stock_basic 拿全市场行业
    df_basic = pro.stock_basic(list_status="L", fields=["ts_code", "industry"])
    # df_basic['industry'] 是 Tushare 自带的行业字段（CSRC/无注）
    basic_map = dict(zip(df_basic["ts_code"], df_basic["industry"]))

    # 3) 合并：以 sw_map 为准，其他用 basic_map
    full_map = sw_map.copy()
    for ts_code, ind in basic_map.items():
        if ts_code not in full_map:
            full_map[ts_code] = ind  # 北交所/其他交易所股票会被补上

    return full_map

full_map = fetch_full_industry_map()
df = pd.DataFrame(list(full_map.items()), columns=["instrument", "industry"])
print("共 %d 只股票被映射到行业" % len(full_map))

共 5681 只股票被映射到行业


In [7]:
mapping = {}
for _, row in df_cat.iterrows():
    ind_code = row["index_code"]
    ind_name = row["industry_name"]
    # 只拉取 con_code（成分股代码）即可
    df_mem = pro.index_member(
        index_code=ind_code,
        fields=["con_code"],
    )
    # 将每只成分股映射到该行业
    for ts_code in df_mem["con_code"].dropna().unique():
        mapping[ts_code] = ind_code

In [8]:
import pandas as pd
df = pd.DataFrame(list(mapping.items()), columns=["instrument", "industry"])

In [9]:
df['instrument'] = df['instrument'].str.replace(
    r'(\d+)\.(\w+)',     # 匹配：一串数字 + “.” + 一串字母
    r'\2\1',             # 替换为：字母 + 数字
    regex=True
)
df

Unnamed: 0,instrument,industry
0,SZ000019,801200.SI
1,SZ000034,801770.SI
2,SZ000048,801180.SI
3,SZ000061,801200.SI
4,SZ000505,801180.SI
...,...,...
5405,SH688701,801970.SI
5406,SZ001328,801980.SI
5407,SZ301371,801980.SI
5408,SH603193,801980.SI


In [10]:
df.to_csv("/data/home/dinghj/zr-alphagen/zr-alpha-training-base/paper/AlphaEvolve/industry.csv", index=False)

In [3]:
import my_qlib
my_qlib.init(
    provider_uri="/data/home/dinghj/zr-alphagen/zr-alpha-training-base/.qlib/qlib_data/cn_data",
    region="cn"
)

[95764:MainThread](2025-07-08 09:39:17,476) INFO - my_qlib.Initialization - [config.py:420] - default_conf: client.
[95764:MainThread](2025-07-08 09:39:18,681) INFO - my_qlib.Initialization - [__init__.py:74] - my_qlib successfully initialized based on client settings.
[95764:MainThread](2025-07-08 09:39:18,683) INFO - my_qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/home/dinghj/zr-alphagen/zr-alpha-training-base/.qlib/qlib_data/cn_data')}


In [4]:
from my_qlib.data.ops import load_relation_map_from_csv
load_relation_map_from_csv("/data/home/dinghj/zr-alphagen/zr-alpha-training-base/paper/AlphaEvolve/industry.csv")

In [5]:
from my_qlib.data import D
df = D.features(
    instruments=D.instruments(market='csi800'),
    fields=[
        # "Mean($close, 5)",
        "RelationRank($close)",
    ],
    start_time="2023-01-01",
    end_time="2023-06-30",
    freq='day'
)
df

Instrument SH600627 的行业映射不存在，请先加载关系映射。
Instrument SZ000043 的行业映射不存在，请先加载关系映射。
Instrument SZ000022 的行业映射不存在，请先加载关系映射。
Instrument SH600263 的行业映射不存在，请先加载关系映射。
Instrument SH600786 的行业映射不存在，请先加载关系映射。
Instrument SH600991 的行业映射不存在，请先加载关系映射。
Instrument SH600553 的行业映射不存在，请先加载关系映射。
Instrument SH689009 的行业映射不存在，请先加载关系映射。


Unnamed: 0_level_0,Unnamed: 1_level_0,RelationRank($close)
instrument,datetime,Unnamed: 2_level_1
SH600000,2023-01-03,0.976190
SH600000,2023-01-04,0.976190
SH600000,2023-01-05,0.976190
SH600000,2023-01-06,0.952381
SH600000,2023-01-09,0.952381
...,...,...
SZ301029,2023-06-26,0.106272
SZ301029,2023-06-27,0.090592
SZ301029,2023-06-28,0.094077
SZ301029,2023-06-29,0.090592
