In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from tqdm import tqdm
import time
import os
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def find_optimal_estimators_rfe(X_subset, y, cv_folds, base_random_state, max_depth_rf, estimator_range, verbose=False):
    """Pick the ``n_estimators`` value with the best mean K-fold CV R2.

    For every candidate tree count, a fresh ``RandomForestRegressor`` is fitted
    on each training fold and scored with ``r2_score`` on the held-out fold.
    The candidate with the highest mean score wins; on a tie the candidate
    evaluated first is kept, because only a strictly better mean replaces the
    current best.

    Args:
        X_subset: Feature table; rows are selected positionally with ``.iloc``.
        y: Target values; rows are selected positionally with ``.iloc``.
        cv_folds: Number of ``KFold`` splits.
        base_random_state: Seed used for shuffling the folds and for every forest.
        max_depth_rf: ``max_depth`` passed to each forest.
        estimator_range: ``(lower, upper, step)``. Candidates are
            ``lower, lower + step, ...`` up to and including ``upper`` but never
            beyond it. A candidate equal to 0 is skipped.
        verbose: When true, print progress for each candidate.

    Returns:
        tuple: ``(best_n_estimators, best_mean_r2)``. When no candidate is
        evaluated the result is ``(estimator_range[0], -np.inf)``.
    """
    lower, upper, step = estimator_range[0], estimator_range[1], estimator_range[2]

    best_n_estimators = lower
    best_mean_r2 = -np.inf

    if verbose:
        print(f"    开始为当前子集优化n_estimators (范围: {lower}-{upper}, 步长: {step})...")

    # The splitter does not depend on the candidate, so build it once.
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=base_random_state)

    # `upper + 1` makes the bound inclusive without overshooting it. The previous
    # `upper + step` evaluated candidates above `upper` whenever `upper - lower`
    # was not a multiple of `step` (e.g. (50, 951, 50) tried 1000).
    for n_est in range(lower, upper + 1, step):
        if n_est == 0:
            continue  # a forest needs at least one tree

        cv_scores = []
        for train_idx, val_idx in kf.split(X_subset, y):
            X_train_fold, X_val_fold = X_subset.iloc[train_idx], X_subset.iloc[val_idx]
            y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

            model_rf = RandomForestRegressor(
                n_estimators=n_est,
                max_depth=max_depth_rf,
                random_state=base_random_state,
                n_jobs=-1
            )
            model_rf.fit(X_train_fold, y_train_fold)
            y_pred_val = model_rf.predict(X_val_fold)
            cv_scores.append(r2_score(y_val_fold, y_pred_val))

        mean_r2 = np.mean(cv_scores)

        if verbose:
            print(f"      测试 n_estimators = {n_est}, 平均CV R2 = {mean_r2:.4f}")

        if mean_r2 > best_mean_r2:
            best_mean_r2 = mean_r2
            best_n_estimators = n_est

    if verbose:
        print(f"    当前子集最优n_estimators = {best_n_estimators}, 对应平均CV R2 = {best_mean_r2:.4f}")

    return best_n_estimators, best_mean_r2

In [None]:
class StableRFE:
    """Recursive feature elimination driven by random-forest importances.

    One ``fit`` call first tunes ``n_estimators`` on the full feature set via
    ``find_optimal_estimators_rfe`` and then repeatedly refits a
    ``RandomForestRegressor`` with that tree count, dropping the feature with
    the smallest importance each round until ``n_features_to_select`` remain.

    Attributes set by ``fit``:
        selected_features_: List of surviving column names.
        feature_ranking_: ``pd.Series`` mapping feature -> rank. Survivors have
            rank 1; the last feature eliminated has rank 2, and so on.
        optimized_n_estimators_for_this_run_: Tree count chosen by the search.
        initial_r2_on_full_set_with_opt_n_est_: Mean CV R2 reported by the
            search for that tree count on the full feature set.
    """

    def __init__(self, n_features_to_select, max_depth, cv_folds, random_state_base, verbose=0,
                 estimator_range=(50, 951, 50)):
        """Store the run configuration.

        Args:
            n_features_to_select: Number of features to keep.
            max_depth: ``max_depth`` passed to every random forest.
            cv_folds: Number of CV folds used while tuning ``n_estimators``.
            random_state_base: Seed used for the tuning step and every forest.
            verbose: 0 = silent, 1 = summary + progress bar, 2 = per-step detail.
            estimator_range: ``(start, stop, step)`` triple forwarded unchanged to
                ``find_optimal_estimators_rfe``. The default is the range that
                was previously hard-coded inside ``fit``.
        """
        self.n_features_to_select = n_features_to_select
        self.max_depth = max_depth
        self.cv_folds = cv_folds
        self.random_state_base = random_state_base
        self.verbose = verbose
        self.estimator_range = estimator_range

        self.selected_features_ = None
        self.feature_ranking_ = None
        self.optimized_n_estimators_for_this_run_ = None
        self.initial_r2_on_full_set_with_opt_n_est_ = None

    def fit(self, X, y):
        """Run one complete RFE pass.

        Args:
            X: Feature DataFrame; its column names identify the features.
            y: Target values aligned with the rows of ``X``.

        Returns:
            tuple: ``(selected_features_, feature_ranking_,
            optimized_n_estimators_for_this_run_,
            initial_r2_on_full_set_with_opt_n_est_)``.
        """
        if self.verbose >= 1:
            print(f"\n--- 开始RFE运行 (种子: {self.random_state_base}) ---")
        if self.verbose >= 2:
            print(f"  正在为本次RFE运行优化n_estimators (基于初始 {X.shape[1]} 个特征)...")

        # Tune the tree count once, on the full feature set; the same value is
        # reused for every elimination round below.
        self.optimized_n_estimators_for_this_run_, self.initial_r2_on_full_set_with_opt_n_est_ = \
            find_optimal_estimators_rfe(
                X_subset=X, y=y, cv_folds=self.cv_folds,
                base_random_state=self.random_state_base, max_depth_rf=self.max_depth,
                estimator_range=self.estimator_range, verbose=(self.verbose >= 2)
            )

        if self.verbose >= 1:
            print(f"  种子 {self.random_state_base}: 用于RFE过程的最优n_estimators = {self.optimized_n_estimators_for_this_run_} "
                  f"(在完整特征集上的CV R2: {self.initial_r2_on_full_set_with_opt_n_est_:.4f})")

        # No defensive copy is needed: X_current is only ever rebound to a new
        # column selection, never modified in place.
        X_current = X
        current_features_names = X.columns.tolist()
        eliminated_features_in_order = []

        num_features_to_eliminate = X.shape[1] - self.n_features_to_select

        pbar_rfe_steps = tqdm(range(num_features_to_eliminate),
                              desc=f"RFE (种子 {self.random_state_base}, n_est={self.optimized_n_estimators_for_this_run_})",
                              disable=(self.verbose < 1), leave=False)
        try:
            for _ in pbar_rfe_steps:
                model = RandomForestRegressor(
                    n_estimators=self.optimized_n_estimators_for_this_run_, max_depth=self.max_depth,
                    random_state=self.random_state_base, n_jobs=-1
                )
                model.fit(X_current, y)
                importances = pd.Series(model.feature_importances_, index=X_current.columns)
                least_important_feature = importances.idxmin()

                eliminated_features_in_order.append(least_important_feature)
                current_features_names.remove(least_important_feature)
                X_current = X_current[current_features_names]

                if self.verbose >= 2:
                    print(f"    已剔除: {least_important_feature} (重要性: {importances.min():.4f})")
                if self.verbose >= 1:
                    pbar_rfe_steps.set_postfix_str(f"已剔除: {least_important_feature}, 剩余: {len(current_features_names)}")
        finally:
            # Close the bar even when a fit raises, so it does not linger.
            pbar_rfe_steps.close()

        self.selected_features_ = current_features_names[:]

        # Survivors share rank 1; eliminated features are ranked by how late
        # they were dropped (last dropped -> rank 2).
        ranking_dict = {feature: 1 for feature in self.selected_features_}
        for rank_val, feature in enumerate(reversed(eliminated_features_in_order)):
            ranking_dict[feature] = rank_val + 2
        self.feature_ranking_ = pd.Series(ranking_dict).sort_values()

        if self.verbose >= 1:
            print(f"  种子 {self.random_state_base}: 选择的特征: {self.selected_features_}")
            print(f"--- 完成RFE运行 (种子: {self.random_state_base}) ---")

        return self.selected_features_, self.feature_ranking_, self.optimized_n_estimators_for_this_run_, self.initial_r2_on_full_set_with_opt_n_est_

In [None]:
def multi_seed_stable_rfe(X, y, n_features_to_select, max_depth, cv_folds,
                           n_seeds=300, start_seed=1, verbose_level=1,
                           output_dir_name="RFEPCC300times"):  # <-- edit the default output name here
    """Repeat ``StableRFE`` with consecutive seeds and rank features by selection frequency.

    Seeds ``start_seed, start_seed + 1, ...`` (``n_seeds`` of them) each run a
    full RFE pass. Results are written to a new directory named
    ``f"{output_dir_name}_{timestamp}"``:

    * ``rfe_selection_log.txt`` - every message passed to the internal logger
    * ``rfe_seed_details.csv`` - one row per seed
    * ``rfe_feature_frequency.csv`` - selection count / percentage per feature
    * ``feature_selection_frequency.png`` - horizontal bar chart of the counts

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        n_features_to_select: Features kept by each RFE run and number of
            features recommended at the end.
        max_depth: ``max_depth`` forwarded to ``StableRFE``.
        cv_folds: CV folds forwarded to ``StableRFE``.
        n_seeds: Number of RFE runs; must be at least 1.
        start_seed: Seed of the first run.
        verbose_level: Console verbosity, also forwarded to ``StableRFE``.
            Messages are always appended to the log file regardless of level.
        output_dir_name: Prefix of the output directory.

    Returns:
        tuple: ``(recommended_features, feature_freq_df, seed_details_df)``.
        ``recommended_features`` holds the ``n_features_to_select`` most
        frequently selected feature names.

    Raises:
        ValueError: If ``n_seeds`` is smaller than 1.
    """
    # Fail before creating any output: with zero runs the per-seed summary
    # frame has no columns and the summary statistics below cannot be computed.
    if n_seeds < 1:
        raise ValueError(f"n_seeds must be at least 1, got {n_seeds!r}")

    feature_counter = Counter()
    seed_results_summary = []

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    output_dir = f"{output_dir_name}_{timestamp}"
    os.makedirs(output_dir, exist_ok=True)
    log_file_path = os.path.join(output_dir, "rfe_selection_log.txt")
    seed_details_path = os.path.join(output_dir, 'rfe_seed_details.csv')
    feature_freq_path = os.path.join(output_dir, 'rfe_feature_frequency.csv')
    plot_path = os.path.join(output_dir, 'feature_selection_frequency.png')

    def log_print(message, V_level=1):
        """Echo ``message`` when verbose enough and always append it to the log file."""
        if verbose_level >= V_level:
            print(message)
        with open(log_file_path, "a", encoding='utf-8') as f:
            f.write(message + "\n")

    log_print(f"多种子RFE (选择模式) - 运行开始于 {timestamp}", V_level=0)
    log_print(f"参数: n_features_to_select={n_features_to_select}, max_depth={max_depth}, "
              f"cv_folds={cv_folds}, n_seeds={n_seeds}, start_seed={start_seed}", V_level=0)
    log_print(f"输入数据维度: X={X.shape}, y={y.shape}", V_level=0)
    log_print("-" * 30, V_level=0)

    pbar_seeds = tqdm(range(n_seeds), desc="多种子RFE选择", disable=(verbose_level < 1))
    try:
        for i in pbar_seeds:
            current_seed = start_seed + i
            if verbose_level >= 1:
                pbar_seeds.set_postfix_str(f"当前种子: {current_seed}")

            rfe_instance = StableRFE(
                n_features_to_select=n_features_to_select, max_depth=max_depth,
                cv_folds=cv_folds, random_state_base=current_seed, verbose=verbose_level
            )

            # The per-run ranking is not needed for the frequency analysis.
            selected_features, _ranking, n_est_for_seed, initial_r2_full = rfe_instance.fit(X, y)

            feature_counter.update(selected_features)

            seed_results_summary.append({
                "seed": current_seed,
                "selected_features": sorted(selected_features),
                "n_selected": len(selected_features),
                "n_estimators_used": n_est_for_seed,
                "initial_full_set_cv_r2": initial_r2_full
            })
    finally:
        # Close the bar even if one of the runs raises.
        pbar_seeds.close()

    log_print("-" * 30, V_level=0)
    log_print("多种子RFE选择完成。", V_level=0)

    seed_details_df = pd.DataFrame(seed_results_summary)
    seed_details_df.to_csv(seed_details_path, index=False, encoding='utf-8-sig')
    log_print(f"\n种子运行详情已保存至: {seed_details_path}", V_level=0)

    feature_freq_list = feature_counter.most_common()
    feature_freq_df = pd.DataFrame(feature_freq_list, columns=['feature', 'selection_count'])
    feature_freq_df['selection_frequency_percent'] = (feature_freq_df['selection_count'] / n_seeds) * 100
    # A stable sort keeps tied features in the same order as `feature_freq_list`,
    # so the CSV / plot agree with the recommendation derived from that list.
    feature_freq_df = feature_freq_df.sort_values(by='selection_count', ascending=False, kind='stable')

    feature_freq_df.to_csv(feature_freq_path, index=False, encoding='utf-8-sig')
    log_print(f"特征选择频率已保存至: {feature_freq_path}", V_level=0)

    # Figure height grows with the number of features so labels stay legible.
    fig, ax = plt.subplots(figsize=(12, max(6, len(feature_freq_df) * 0.3)))
    try:
        sns.barplot(x='selection_count', y='feature', data=feature_freq_df, orient='h', palette="viridis", ax=ax)
        ax.set_title(f'特征选择频率 ({n_seeds} 次RFE运行, 目标特征数: {n_features_to_select})', fontsize=14)
        ax.set_xlabel(f'选择次数 (共 {n_seeds} 次运行)', fontsize=12)
        ax.set_ylabel('特征', fontsize=12)
        fig.tight_layout()
        fig.savefig(plot_path)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    log_print(f"特征选择频率图已保存至: {plot_path}", V_level=0)

    log_print(f"\n--- 多种子RFE总结 ({n_seeds} 个种子) ---", V_level=0)
    log_print(f"每次RFE运行的目标特征数: {n_features_to_select}", V_level=0)
    log_print(f"平均使用的n_estimators (每次独立优化): {seed_details_df['n_estimators_used'].mean():.1f}", V_level=0)
    log_print(f"平均初始CV R2 (在完整特征集上): {seed_details_df['initial_full_set_cv_r2'].mean():.4f}", V_level=0)

    recommended_features = [feat for feat, _count in feature_freq_list[:n_features_to_select]]
    log_print(f"\n推荐的 {len(recommended_features)} 个特征 (基于最高频率):", V_level=0)
    log_print(", ".join(recommended_features), V_level=0)

    log_print(f"\n完整结果与日志保存在目录: {output_dir}", V_level=0)

    return recommended_features, feature_freq_df, seed_details_df

In [None]:
# --- Input data: edit the two values below for a different dataset ---
data_file_path = '/Users/yangmingyue/Desktop/78/降维后PCC/ABO478PCC.csv'  # <-- edit here
target_column_name = 'TCF'  # <-- edit here

# Load the table, then split it into the feature matrix and the target column.
full_df = pd.read_csv(data_file_path)
X_df = full_df.drop(columns=[target_column_name])
y_s = full_df[target_column_name]

# --- Matplotlib settings so that Chinese labels render correctly ---
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei', 'SimHei', 'Arial Unicode MS', 'sans-serif'],
    'axes.unicode_minus': False,  # draw the minus sign as ASCII '-'
})

In [None]:
# --- Run configuration ---
TARGET_N_FEATURES = 7     # <-- edit here
N_SEEDS_TO_RUN = 300      # <-- edit here
CV_FOLDS_FOR_TUNING = 10
VERBOSE_LEVEL = 1         # 0: silent, 1: summary, 2: detailed

print(f"\n开始多种子RFE选择，目标是找出最稳定的 {TARGET_N_FEATURES} 个特征...")

# Collect the keyword arguments in one place so the call itself stays short.
rfe_run_options = dict(
    n_features_to_select=TARGET_N_FEATURES,
    max_depth=None,  # random-forest max_depth; None means unlimited
    cv_folds=CV_FOLDS_FOR_TUNING,
    n_seeds=N_SEEDS_TO_RUN,
    start_seed=1,
    verbose_level=VERBOSE_LEVEL,
    output_dir_name="RFEPCC300times",  # <-- edit here
)
recommended_feats, freq_df, seeds_df = multi_seed_stable_rfe(X=X_df, y=y_s, **rfe_run_options)

print("\n--- RFE运行结束 ---")

In [None]:
# --- Summary of the multi-seed RFE results ---
print(f"最终推荐的 {len(recommended_feats)} 个特征: {', '.join(recommended_feats)}")

print("\n特征选择频率 (部分展示):")
if not freq_df.empty:
    print(freq_df.head())
else:
    print("特征频率DataFrame为空。")

print("\n各种子运行详情 (部分展示):")
if not seeds_df.empty:
    print(seeds_df.head())
else:
    print("各种子运行详情DataFrame为空。")

# Base name of the output directory; keep it in sync with the
# `output_dir_name` passed to multi_seed_stable_rfe.  # <-- edit here
# The previous hint looked up a '输出目录名基底' column that seeds_df never
# contains, so it always fell back to the stale name 'RFE10times'.
output_dir_base = "RFEPCC300times"
print(f"\n查看输出目录 '{output_dir_base}_{{时间戳格式YYYYMMDD-HHMMSS}}' 获取完整结果。")
print(f"请检查基于 '{output_dir_base}' 并带有时间戳的输出目录获取完整结果。")