In [2]:
import pandas as pd
import string

def rename_columns(data):
    start = 3

    # Save original tail column names before renaming
    orig_cols = list(data.columns)
    orig_tail = orig_cols[start:]

    tail_count = max(0, data.shape[1] - start)
    new_tail = list(string.ascii_uppercase[:tail_count]) 

    data.columns = ["start", "ECI", "y"] + new_tail

    # Build mapping from letter to original column name for later decoding
    letter_to_source = {new_tail[i]: orig_tail[i] for i in range(len(new_tail))}
    # Attach mapping to DataFrame attrs to survive through later transforms
    data.attrs['letter_to_source'] = letter_to_source
    return data

def expand_data(df: pd.DataFrame, n: int) -> pd.DataFrame:
    if n <= 0:
        # If no lag requested, just keep base columns
        base_cols = [c for c in ['start', 'ECI', 'y'] if c in df.columns]
        return df[base_cols].copy()

    cols = list(df.columns)
    # Prefer contiguous tail starting from 'A' (output of rename_columns)
    if 'A' in cols:
        start_idx = cols.index('A')
        feat_cols = cols[start_idx:]
    else:
        # Fallback: uppercase letter-like column names in original order (A, B, ..., AA, AB, ...)
        feat_candidates = {c for c in cols if c.isalpha() and c.upper() == c}
        feat_cols = [c for c in cols if c in feat_candidates]

    base_cols = [c for c in ['start', 'ECI', 'y'] if c in df.columns]

    # Build lag features with vectorized shift (efficient, no Python loops over rows)
    out = {}
    for col in feat_cols:
        s = df[col]
        for k in range(1, n + 1):
            out[f'{col}_{k}'] = s.shift(k)

    features = pd.DataFrame(out, index=df.index)
    res = pd.concat([df[base_cols], features], axis=1)

    # Propagate and build decoding map for lagged features
    letter_to_source = df.attrs.get('letter_to_source', {})
    feature_decode = {}
    for col in feat_cols:
        src = letter_to_source.get(col, col)
        for k in range(1, n + 1):
            feature_decode[f'{col}_{k}'] = {'source': src, 'lag': k}
    res.attrs['letter_to_source'] = letter_to_source
    res.attrs['feature_decode'] = feature_decode
    trimmed = res.iloc[n:].reset_index(drop=True)
    trimmed.attrs['letter_to_source'] = letter_to_source
    trimmed.attrs['feature_decode'] = feature_decode
    return trimmed

def prepare_data():
    data = pd.read_excel("drama.xlsx")
    df_ = rename_columns(data)
    return expand_data(df_, 5)

df = prepare_data()
df.head()

Unnamed: 0,start,ECI,y,A_1,A_2,A_3,A_4,A_5,B_1,B_2,...,K_1,K_2,K_3,K_4,K_5,L_1,L_2,L_3,L_4,L_5
0,2025-08-24,28826434,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2025-08-25,28826434,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2025-08-26,28826434,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2025-08-27,28826434,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2025-08-28,28826434,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

def train_xgb_and_rank_features(
    df: pd.DataFrame,
    target_col: str = "y",
    drop_cols=("start", "ECI"),
    importance_type: str = "gain",  # 'gain' | 'weight' | 'cover' | 'total_gain' | 'total_cover'
    n_estimators: int = 400,
    max_depth: int = 5,
    learning_rate: float = 0.05,
    subsample: float = 0.8,
    colsample_bytree: float = 0.8,
    reg_lambda: float = 1.0,
    random_state: int = 42,
):
    """
    Train an XGBoost classifier and return sorted feature importances.
    - Features: all columns except [drop_cols + target_col]
    - Target: y (auto-encode to integers if not numeric)
    """
    # 1) Build X, y
    feat_cols = [c for c in df.columns if c not in set(drop_cols) | {target_col}]
    X = df[feat_cols].apply(pd.to_numeric, errors="coerce")
    y = pd.to_numeric(df[target_col], errors="ignore")

    # 2) Drop rows with NaN in X or y to ensure clean training data
    mask = X.notna().all(axis=1) & pd.notna(y)
    X, y = X.loc[mask], y.loc[mask]

    # 3) Encode non-numeric y to category codes for classification
    if not np.issubdtype(y.dtype, np.number):
        y = y.astype("category").cat.codes

    classes = np.unique(y)
    num_class = len(classes)
    objective = "binary:logistic" if num_class == 2 else "multi:softprob"

    # 4) Construct classifier (only set num_class when > 2)
    params = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_lambda=reg_lambda,
        objective=objective,
        eval_metric="logloss" if num_class == 2 else "mlogloss",
        n_jobs=0,
        tree_method="hist",
        random_state=random_state,
    )
    if num_class > 2:
        params["num_class"] = num_class

    clf = XGBClassifier(**params)
    clf.fit(X, y)

    # 5) Get feature importance
    booster = clf.get_booster()
    score = booster.get_score(importance_type=importance_type)  
    if score and not next(iter(score)).startswith("f"):
        # Good case: keys are real feature names
        imp_df = pd.DataFrame([(name, score.get(name, 0.0)) for name in X.columns],
                              columns=["feature", "importance"])
    else:
        # Fallback: use sklearn-style importances aligned with columns
        imp_df = pd.DataFrame({"feature": X.columns, "importance": clf.feature_importances_})

    imp_df = imp_df.sort_values("importance", ascending=False).reset_index(drop=True)
    return clf, imp_df

clf, importance_df = train_xgb_and_rank_features(df, target_col="y")

# Decode feature names back to original column names with lag info if mapping is available
letter_map = df.attrs.get('letter_to_source', {})
feat_decode = df.attrs.get('feature_decode', {})

def _decode(name: str) -> str:
    info = feat_decode.get(name)
    if isinstance(info, dict) and 'source' in info and 'lag' in info:
        return f"{info['source']}_lag{info['lag']}"
    # Fallback: try to parse Letter_Lag pattern
    try:
        letter, lag = name.split('_', 1)
        return f"{letter_map.get(letter, letter)}_lag{lag}"
    except Exception:
        return name

if isinstance(importance_df, pd.DataFrame) and 'feature' in importance_df.columns:
    importance_df['original_feature'] = importance_df['feature'].map(_decode)
    print(importance_df[["feature", "original_feature", "importance"]].head(30))
else:
    print(importance_df.head(30))  # 查看前30个最重要特征(无解码映射时的回退)
print(importance_df.head(30))  # 查看前30个最重要特征

   feature      original_feature  importance
0      K_2       是否弱覆盖（0，1）_lag2    0.214508
1      K_1       是否弱覆盖（0，1）_lag1    0.154559
2      A_1       是否过覆盖（0，1）_lag1    0.000000
3      I_5  是否邻区配置数据异常（0，1）_lag5    0.000000
4      G_3    小区eRAB拥塞（0,1）_lag3    0.000000
5      G_4    小区eRAB拥塞（0,1）_lag4    0.000000
6      G_5    小区eRAB拥塞（0,1）_lag5    0.000000
7      H_1      是否干扰小区（0，1）_lag1    0.000000
8      H_2      是否干扰小区（0，1）_lag2    0.000000
9      H_3      是否干扰小区（0，1）_lag3    0.000000
10     H_4      是否干扰小区（0，1）_lag4    0.000000
11     H_5      是否干扰小区（0，1）_lag5    0.000000
12     I_1  是否邻区配置数据异常（0，1）_lag1    0.000000
13     I_2  是否邻区配置数据异常（0，1）_lag2    0.000000
14     I_3  是否邻区配置数据异常（0，1）_lag3    0.000000
15     I_4  是否邻区配置数据异常（0，1）_lag4    0.000000
16     J_1  小区过覆盖导致高负荷（0,1）_lag1    0.000000
17     A_2       是否过覆盖（0，1）_lag2    0.000000
18     J_2  小区过覆盖导致高负荷（0,1）_lag2    0.000000
19     J_3  小区过覆盖导致高负荷（0,1）_lag3    0.000000
20     J_4  小区过覆盖导致高负荷（0,1）_lag4    0.000000
21     J_5

In [4]:
df.attrs['letter_to_source']['A']

'是否过覆盖（0，1）'