# 1. Open file as DataFrame

In [1]:
import pandas as pd

In [37]:
df = pd.read_parquet("E:\\parquet\\EG.5_57-Fc_10_c43b80c0-ce9a-11f0-9f6b-f02f74d321c5.parquet")
df

Unnamed: 0,FSC-H,SSC-H,FITC-H,FSC-A,SSC-A,FITC-A,label,FITC_raw,FITC_log10,FITC_asinh,FITC_pct,FITC_z_robust,FSC_ratio,SSC_ratio,gmm_prob_infected
0,27852.0,129713.0,209.0,215725.13,955365.88,1179.00,False,1179.00,3.071882,1.002460,0.627628,0.392773,7.745409,7.365228,1.0
1,360777.0,524272.0,366.0,14136050.00,39242472.00,5594.38,False,5594.38,3.747830,2.422804,0.888889,1.359204,39.182237,74.851360,1.0
2,247020.0,354271.0,442.0,7274405.00,22703936.00,3165.06,False,3165.06,3.500519,1.869390,0.813814,1.005614,29.448648,64.086352,1.0
3,9323.0,34349.0,116.0,33711.19,108659.44,196.44,False,196.44,2.295435,0.195198,0.354354,-0.717347,3.615917,3.163395,1.0
4,24168.0,-64401.0,246.0,110837.34,-1343954.13,1036.00,False,1036.00,3.015779,0.906602,0.600601,0.312560,4.586120,20.868529,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,9964.0,-524287.0,752.0,27170.91,-1835004.50,1414.81,False,1414.81,3.151005,1.146560,0.656156,0.505898,2.726908,3.500000,1.0
662,54244.0,316714.0,473.0,1218225.00,-20122002.00,260.94,False,260.94,2.418202,0.258066,0.384384,-0.541822,22.458244,-63.533668,1.0
663,6463.0,-168922.0,115.0,17693.34,-518386.75,156.56,False,156.56,2.197446,0.155927,0.327327,-0.857446,2.737636,3.068794,1.0
664,5974.0,203992.0,360.0,17268.06,322255.69,559.41,False,559.41,2.748506,0.533709,0.483483,-0.069572,2.890536,1.579747,1.0


# 2. Scale DataFrame

In [38]:
from sklearn.preprocessing import MinMaxScaler

# Предположим, df — ваш датафрейм
scaler = MinMaxScaler(feature_range=(1e-5, 1))
df_scaled = pd.DataFrame(scaler.fit_transform(df[["FSC-H", "SSC-H", "FSC-A", "SSC-A", "FITC-A"]]), columns=df[["FSC-H", "SSC-H", "FSC-A", "SSC-A", "FITC-A"]].columns)
df_scaled["label"] = df["label"].astype(int)
df_scaled

Unnamed: 0,FSC-H,SSC-H,FSC-A,SSC-A,FITC-A,label
0,0.052961,0.623717,0.008651,0.515035,0.009830,0
1,0.688095,1.000000,0.573392,0.989684,0.010419,0
2,0.471076,0.837873,0.295018,0.784654,0.010095,0
3,0.017613,0.532770,0.001267,0.504538,0.009699,0
4,0.045933,0.438594,0.004396,0.486530,0.009811,0
...,...,...,...,...,...,...
661,0.018836,0.000010,0.001002,0.480442,0.009862,0
662,0.103310,0.802056,0.049322,0.253736,0.009708,0
663,0.012157,0.338915,0.000617,0.496764,0.009694,0
664,0.011224,0.694555,0.000600,0.507186,0.009748,0


# 3. Create statistics

In [None]:
import seaborn as sns
sns.set_theme(style="ticks")

# df = sns.load_dataset("penguins")
sns.pairplot(df, hue="label")

KeyboardInterrupt: 

# 4. Balance data

In [39]:
# Импортируем необходимые библиотеки
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Предположим, что ваш датафрейм называется df
# Разделяем датафрейм на признаки и целевую переменную
X = df_scaled.drop('label', axis=1)  # признаки
y = df_scaled['label']  # целевая переменная

# Разбиваем данные на тренировочную и тестовую выборки
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

# Применяем SMOTE для балансировки тренировочной выборки
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X,  y)

# Проверяем баланс классов после балансировки
print("Распределение классов до балансировки:", y.value_counts())
print("Распределение классов после балансировки:", y_balanced.value_counts())

ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

# 5. Create new features

In [6]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import umap


# ============================================================
# 1. Полиномиальные признаки
# ============================================================
def add_polynomial_features(X: pd.DataFrame, degree=2):
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    poly_data = poly.fit_transform(X)
    poly_cols = poly.get_feature_names_out(X.columns)

    df_poly = pd.DataFrame(poly_data, columns=poly_cols, index=X.index)

    # Удаляем оригинальные столбцы, чтобы не дублировать
    df_poly = df_poly.drop(columns=X.columns, errors="ignore")

    return pd.concat([X, df_poly], axis=1)



# ============================================================
# 2. Статистические агрегаты
# ============================================================
def add_stat_features(X: pd.DataFrame):
    X_new = X.copy()
    X_new["feat_sum"]  = X.sum(axis=1)
    X_new["feat_mean"] = X.mean(axis=1)
    X_new["feat_std"]  = X.std(axis=1)
    X_new["feat_min"]  = X.min(axis=1)
    X_new["feat_max"]  = X.max(axis=1)
    X_new["feat_cv"]   = X.std(axis=1) / (X.mean(axis=1) + 1e-9)
    return X_new



# ============================================================
# 3. Отношения признаков (ratios)
# ============================================================
def add_ratio_features(X: pd.DataFrame):
    X_new = X.copy()
    cols = X.columns

    # Все попарные отношения
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            c1, c2 = cols[i], cols[j]
            X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)
            X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)

    return X_new



# ============================================================
# 4. Логарифмические признаки
# ============================================================
def add_log_features(X: pd.DataFrame):
    X_new = X.copy()
    for col in X.columns:
        X_new["log_" + col] = np.log1p(X[col])
    return X_new



# ============================================================
# 5. PCA признаки
# ============================================================
def add_pca_features(X: pd.DataFrame, n_components=3):
    pca = PCA(n_components=n_components)
    pca_res = pca.fit_transform(X)
    df_pca = pd.DataFrame(
        pca_res,
        columns=[f"PCA{i+1}" for i in range(n_components)],
        index=X.index,
    )
    return pd.concat([X, df_pca], axis=1)



# ============================================================
# 6. UMAP признаки
# ============================================================
def add_umap_features(X: pd.DataFrame, n_components=2, n_neighbors=15, min_dist=0.1):
    reducer = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=42,
    )
    emb = reducer.fit_transform(X)

    df_umap = pd.DataFrame(
        emb,
        columns=[f"UMAP{i+1}" for i in range(n_components)],
        index=X.index,
    )
    return pd.concat([X, df_umap], axis=1)



# ============================================================
# 7. Кластеризационные признаки
# ============================================================
def add_cluster_features(X: pd.DataFrame, n_clusters=5):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)

    X_new = X.copy()
    X_new["cluster"] = labels

    # Расстояния до центров — очень сильные признаки
    dists = kmeans.transform(X)
    for i in range(n_clusters):
        X_new[f"dist_to_cluster_{i}"] = dists[:, i]

    return X_new



# ============================================================
# 8. Полный пайплайн генерации всех признаков
# ============================================================
def generate_all_features(
    X,
    poly_degree=2,
    pca_components=3,
    umap_components=2,
    n_clusters=5
):
    X_ext = X.copy()

    print("> add_polynomial_features")
    X_ext = add_polynomial_features(X_ext, degree=poly_degree)
    print("> add_stat_features")
    X_ext = add_stat_features(X_ext)
    print("> add_ratio_features")
    X_ext = add_ratio_features(X_ext)
    print("> add_log_features")
    X_ext = add_log_features(X_ext)
    print("> add_pca_features")
    X_ext = add_pca_features(X_ext, n_components=pca_components)
    print("> add_umap_features")
    X_ext = add_umap_features(X_ext, n_components=umap_components)
    print("> add_cluster_features")
    X_ext = add_cluster_features(X_ext, n_clusters=n_clusters)

    return X_ext

X_final = generate_all_features(
    X_balanced,
    poly_degree=2,
    pca_components=3,
    umap_components=2,
    n_clusters=5
)
X_final

> add_polynomial_features
> add_stat_features
> add_ratio_features


  X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)
  X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)
  X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)
  X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)
  X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)
  X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)
  X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)
  X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)
  X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)
  X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)
  X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)
  X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)
  X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)
  X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)
  X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)
  X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)
  X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)
  X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)
  X_new[f"{c2}_to_{c1}"] = X[c2] / (X[c1] + 1e-9)
  X_new[f"{c1}_to_{c2}"] = X[c1] / (X[c2] + 1e-9)


> add_log_features


  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" + col] = np.log1p(X[col])
  X_new["log_" +

> add_pca_features
> add_umap_features


  warn(


> add_cluster_features


Unnamed: 0,FSC-H,SSC-H,FSC-A,SSC-A,FITC-A,FSC-H^2,FSC-H SSC-H,FSC-H FSC-A,FSC-H SSC-A,FSC-H FITC-A,...,PCA2,PCA3,UMAP1,UMAP2,cluster,dist_to_cluster_0,dist_to_cluster_1,dist_to_cluster_2,dist_to_cluster_3,dist_to_cluster_4
0,0.002405,0.003302,0.703293,0.511719,0.991729,0.000006,0.000008,0.001692,0.001231,0.002386,...,-150209.463550,-73790.860670,-0.109670,11.714401,0,7.080976e+05,1.518126e+10,1.544325e+10,9.046370e+09,1.014105e+10
1,0.003848,0.005217,0.905527,0.949851,0.991849,0.000015,0.000020,0.003484,0.003655,0.003816,...,131965.358591,27820.799718,1.758180,21.360153,0,4.680637e+05,1.518154e+10,1.544429e+10,9.046805e+09,1.014188e+10
2,0.002315,0.002781,0.709210,0.505006,0.991809,0.000005,0.000006,0.001642,0.001169,0.002296,...,-303621.376626,-155437.640387,-1.631208,16.740513,0,1.052435e+06,1.518120e+10,1.544283e+10,9.046284e+09,1.014072e+10
3,0.002320,0.002593,0.700302,0.500021,0.991702,0.000005,0.000006,0.001625,0.001160,0.002301,...,-357052.126786,-187695.475230,-5.838109,19.678785,0,1.137402e+06,1.518121e+10,1.544273e+10,9.046290e+09,1.014063e+10
4,0.002343,0.002723,0.701166,0.497428,0.991714,0.000005,0.000006,0.001643,0.001166,0.002324,...,-304485.707151,-159567.165245,-2.483861,16.427511,0,1.009540e+06,1.518123e+10,1.544287e+10,9.046315e+09,1.014074e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81939,0.003035,0.003098,0.714508,0.506798,0.991969,0.000009,0.000009,0.002169,0.001538,0.003011,...,-61236.934052,-64383.649214,-4.981483,-6.506984,0,9.491782e+04,1.518149e+10,1.544380e+10,9.046707e+09,1.014147e+10
81940,0.002984,0.003300,0.712860,0.508927,0.992062,0.000009,0.000010,0.002127,0.001519,0.002960,...,-30192.943573,-44773.153161,1.621307,-3.333443,0,2.755771e+04,1.518148e+10,1.544385e+10,9.046691e+09,1.014151e+10
81941,0.003440,0.003653,0.724509,0.515702,0.992274,0.000012,0.000013,0.002492,0.001774,0.003413,...,78290.408571,736.259980,13.743039,18.774908,0,3.706678e+05,1.518158e+10,1.544423e+10,9.046841e+09,1.014182e+10
81942,0.003099,0.003397,0.716195,0.510770,0.992165,0.000010,0.000011,0.002219,0.001583,0.003075,...,2079.262421,-31168.248691,10.419333,-2.054430,0,9.537153e+04,1.518151e+10,1.544396e+10,9.046735e+09,1.014160e+10


In [14]:
# feature_selection_utils.py
import numpy as np
import pandas as pd
from typing import List, Optional, Tuple

from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.cluster import FeatureAgglomeration
from sklearn.pipeline import make_pipeline
from sklearn.utils.multiclass import type_of_target


# ---------- utilities ----------
def _safe_copy_and_clean(X: pd.DataFrame) -> pd.DataFrame:
    X = X.copy()
    # replace infs and large values
    X = X.replace([np.inf, -np.inf], np.nan)
    # fillna with column median (safe default)
    X = X.fillna(X.median())
    return X


# ---------- 0. quick summary ----------
def summarize_features(X: pd.DataFrame) -> pd.DataFrame:
    Xc = _safe_copy_and_clean(X)
    desc = pd.DataFrame({
        'dtype': Xc.dtypes,
        'n_unique': Xc.nunique(),
        'n_null': Xc.isnull().sum(),
        'mean': Xc.mean(),
        'std': Xc.std(),
        'min': Xc.min(),
        'max': Xc.max(),
        'skew': Xc.skew(),
    })
    desc['abs_mean_over_std'] = (desc['mean'].abs() / (desc['std'] + 1e-12))
    return desc.sort_values('std')


# ---------- 1. remove zero/low variance ----------
def remove_low_variance(X: pd.DataFrame, threshold: float = 1e-8) -> pd.DataFrame:
    Xc = _safe_copy_and_clean(X)
    sel = VarianceThreshold(threshold=threshold)
    sel.fit(Xc)
    cols = Xc.columns[sel.get_support()]
    return Xc[cols]


# ---------- 2. remove duplicate / near-duplicate columns ----------
def remove_duplicate_columns(X: pd.DataFrame, tol: float = 1e-12) -> pd.DataFrame:
    Xc = _safe_copy_and_clean(X)
    # drop exactly duplicated columns first
    Xc = Xc.loc[:, ~Xc.T.duplicated()]
    # drop columns that are linear duplicates (correlation ~ 1)
    corr = Xc.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [c for c in upper.columns if any(upper[c] > (1 - 1e-12))]
    return Xc.drop(columns=to_drop)


# ---------- 3. remove highly correlated features ----------
def remove_highly_correlated(X: pd.DataFrame, threshold: float = 0.95) -> pd.DataFrame:
    Xc = _safe_copy_and_clean(X)
    corr = Xc.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > threshold)]
    return Xc.drop(columns=to_drop), to_drop


# ---------- 4. unsupervised feature clustering — выбираем репрезентанты ----------
def feature_cluster_select(X: pd.DataFrame, n_clusters: int = 20, strategy: str = 'variance') -> Tuple[List[str], pd.DataFrame]:
    """
    Clusters features into n_clusters using FeatureAgglomeration.
    For each cluster, selects one representative feature:
      - 'variance' -> feature with largest variance in the cluster
      - 'corr_to_component' -> feature with highest absolute corr to cluster component
    Returns (list_of_selected_feature_names, reduced_df).
    """
    Xc = _safe_copy_and_clean(X)
    # standardize features (important for clustering)
    scaler = StandardScaler()
    Xs = scaler.fit_transform(Xc)

    agg = FeatureAgglomeration(n_clusters=max(1, min(n_clusters, Xc.shape[1])), metric='euclidean', linkage='ward')
    agg.fit(Xs)
    labels = getattr(agg, "labels_", None)
    if labels is None:
        # fallback: treat as 1 cluster
        labels = np.zeros(Xc.shape[1], dtype=int)

    # get transformed components
    components = agg.transform(Xs)  # shape (n_samples, n_clusters)

    selected = []
    for cl in range(components.shape[1]):
        idxs = np.where(labels == cl)[0]
        feats = Xc.columns[idxs].tolist()
        if strategy == 'variance':
            # pick feature with maximum variance
            variances = Xc[feats].var(axis=0)
            pick = variances.idxmax()
        else:
            # correlation to component
            comp = components[:, cl]
            corrs = {f: abs(np.corrcoef(Xc[f].values, comp)[0, 1]) for f in feats}
            pick = max(corrs, key=corrs.get)
        selected.append(pick)

    selected = list(dict.fromkeys(selected))  # keep order, unique
    return selected, Xc[selected]


# ---------- 5. unsupervised PCA-based selection ----------
def pca_feature_loadings_select(X: pd.DataFrame, n_components: int = 5, top_k_per_comp: int = 5) -> Tuple[List[str], pd.DataFrame]:
    """
    Fit PCA, for each of top n_components pick top_k_per_comp features by absolute loading.
    """
    Xc = _safe_copy_and_clean(X)
    scaler = StandardScaler()
    Xs = scaler.fit_transform(Xc)
    pca = PCA(n_components=min(n_components, Xs.shape[1]))
    pca.fit(Xs)
    loadings = np.abs(pca.components_)  # shape (n_components, n_features)
    selected = set()
    for i in range(loadings.shape[0]):
        idxs = np.argsort(loadings[i])[::-1][:top_k_per_comp]
        for j in idxs:
            selected.add(Xc.columns[j])
    selected = list(selected)
    return selected, Xc[selected]


# ---------- 6. supervised: mutual information ----------
def select_k_best_mutual_info(X: pd.DataFrame, y: pd.Series, k: int = 30) -> Tuple[List[str], pd.DataFrame]:
    Xc = _safe_copy_and_clean(X)
    # detect type
    t = type_of_target(y)
    if t in ('continuous', 'continuous-multioutput'):
        mi = mutual_info_regression(Xc, y)
    else:
        mi = mutual_info_classif(Xc, y)
    mi = np.array(mi)
    order = np.argsort(mi)[::-1][:min(k, Xc.shape[1])]
    selected = Xc.columns[order].tolist()
    return selected, Xc[selected]


# ---------- 7. supervised: model-based (RandomForest / SelectFromModel) ----------
def select_via_model_importance(X: pd.DataFrame, y: pd.Series, n_features_to_select: int = 30, problem: str = 'auto', random_state: int = 42) -> Tuple[List[str], pd.DataFrame, np.ndarray]:
    """
    Trains a RandomForest and uses feature importances to select top features.
    Returns (selected_feature_names, reduced_df, importances_array_sorted_by_feature_names).
    """
    Xc = _safe_copy_and_clean(X)
    # scale not strictly necessary for tree models, but safe
    # detect problem type
    if problem == 'auto':
        t = type_of_target(y)
        if t in ('continuous', 'continuous-multioutput'):
            problem = 'regression'
        else:
            problem = 'classification'

    if problem == 'regression':
        model = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=random_state)
    else:
        model = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=random_state)

    model.fit(Xc, y)
    importances = model.feature_importances_
    order = np.argsort(importances)[::-1][:min(n_features_to_select, Xc.shape[1])]
    selected = Xc.columns[order].tolist()
    importances_by_feature = pd.Series(importances, index=Xc.columns).sort_values(ascending=False)
    return selected, Xc[selected], importances_by_feature


# ---------- 8. L1-based sparse selection (Logistic/Lasso) ----------
def select_via_l1(X: pd.DataFrame, y: pd.Series, n_features_to_select: Optional[int] = None, problem: str = 'auto', random_state: int = 42) -> Tuple[List[str], pd.DataFrame]:
    Xc = _safe_copy_and_clean(X)
    scaler = StandardScaler()
    Xs = scaler.fit_transform(Xc)

    if problem == 'auto':
        t = type_of_target(y)
        if t in ('continuous', 'continuous-multioutput'):
            problem = 'regression'
        else:
            problem = 'classification'

    if problem == 'classification':
        model = LogisticRegression(penalty='l1', solver='saga', max_iter=2000, random_state=random_state)
        # you may want CV; here simple
        model.fit(Xs, y)
        coefs = np.abs(model.coef_).sum(axis=0)
    else:
        model = LassoCV(cv=5, random_state=random_state, max_iter=20000)
        model.fit(Xs, y)
        coefs = np.abs(model.coef_)

    feat_series = pd.Series(coefs, index=Xc.columns)
    feat_series = feat_series.sort_values(ascending=False)
    if n_features_to_select is None:
        # choose non-zero features
        selected = feat_series[feat_series > 1e-8].index.tolist()
    else:
        selected = feat_series.index[:n_features_to_select].tolist()
    return selected, Xc[selected]


# ---------- 9. combined pipeline for convenience ----------
def auto_feature_selector(X: pd.DataFrame, y: Optional[pd.Series] = None,
                          unsupervised_keep: int = 50,
                          supervised_keep: int = 40,
                          intermediate_corr_threshold: float = 0.98) -> Tuple[List[str], pd.DataFrame, dict]:
    """
    Full pipeline:
      - cleaning
      - remove low-variance + duplicates
      - remove extremely correlated (threshold)
      - if y is None: unsupervised selection (feature clustering + PCA)
      - if y provided: combine mutual info + model importance + L1
    Returns (selected_features, X_reduced, diagnostics dict)
    """
    Xc = _safe_copy_and_clean(X)

    # 1. low-variance
    Xv = remove_low_variance(Xc, threshold=1e-12)

    # 2. remove duplicate columns
    Xd = remove_duplicate_columns(Xv)

    # 3. remove extremely correlated features
    Xr, dropped_corr = remove_highly_correlated(Xd, threshold=intermediate_corr_threshold)

    diagnostics = {
        'n_initial': X.shape[1],
        'n_after_var': Xv.shape[1],
        'n_after_dup': Xd.shape[1],
        'n_after_corr': Xr.shape[1],
        'dropped_corr': dropped_corr
    }

    if y is None:
        # unsupervised strategy: feature clustering + PCA loadings
        sel1, df1 = feature_cluster_select(Xr, n_clusters=min(unsupervised_keep, Xr.shape[1]), strategy='corr_to_component')
        sel2, df2 = pca_feature_loadings_select(Xr, n_components=8, top_k_per_comp=3)
        # union (preserve order)
        selected = list(dict.fromkeys(sel1 + sel2))
        selected = selected[:unsupervised_keep]
        return selected, Xr[selected], diagnostics
    else:
        # supervised: mutual info + model importances + l1
        sel_mi, _ = select_k_best_mutual_info(Xr, y, k=supervised_keep)
        sel_rf, _, importances = select_via_model_importance(Xr, y, n_features_to_select=supervised_keep)
        sel_l1, _ = select_via_l1(Xr, y, n_features_to_select=supervised_keep)
        # create ranked union: weight model importance highest, then MI, then L1
        combined = []
        for s in [list(importances.index), sel_mi, sel_l1]:
            for f in s:
                if f not in combined and f in Xr.columns:
                    combined.append(f)
        selected = combined[:supervised_keep]
        diagnostics.update({'mi_top': sel_mi[:10], 'rf_top': list(importances.index[:10]), 'l1_top': sel_l1[:10]})
        return selected, Xr[selected], diagnostics


In [None]:
X_final[X_final[""]]

Unnamed: 0,FSC-H,SSC-H,FSC-A,SSC-A,FITC-A,FSC-H^2,FSC-H SSC-H,FSC-H FSC-A,FSC-H SSC-A,FSC-H FITC-A,...,PCA2,PCA3,UMAP1,UMAP2,cluster,dist_to_cluster_0,dist_to_cluster_1,dist_to_cluster_2,dist_to_cluster_3,dist_to_cluster_4
0,0.002405,0.003302,0.703293,0.511719,0.991729,0.000006,0.000008,0.001692,0.001231,0.002386,...,-150209.463550,-73790.860670,-0.109670,11.714401,0,7.080976e+05,1.518126e+10,1.544325e+10,9.046370e+09,1.014105e+10
1,0.003848,0.005217,0.905527,0.949851,0.991849,0.000015,0.000020,0.003484,0.003655,0.003816,...,131965.358591,27820.799718,1.758180,21.360153,0,4.680637e+05,1.518154e+10,1.544429e+10,9.046805e+09,1.014188e+10
2,0.002315,0.002781,0.709210,0.505006,0.991809,0.000005,0.000006,0.001642,0.001169,0.002296,...,-303621.376626,-155437.640387,-1.631208,16.740513,0,1.052435e+06,1.518120e+10,1.544283e+10,9.046284e+09,1.014072e+10
3,0.002320,0.002593,0.700302,0.500021,0.991702,0.000005,0.000006,0.001625,0.001160,0.002301,...,-357052.126786,-187695.475230,-5.838109,19.678785,0,1.137402e+06,1.518121e+10,1.544273e+10,9.046290e+09,1.014063e+10
4,0.002343,0.002723,0.701166,0.497428,0.991714,0.000005,0.000006,0.001643,0.001166,0.002324,...,-304485.707151,-159567.165245,-2.483861,16.427511,0,1.009540e+06,1.518123e+10,1.544287e+10,9.046315e+09,1.014074e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81939,0.003035,0.003098,0.714508,0.506798,0.991969,0.000009,0.000009,0.002169,0.001538,0.003011,...,-61236.934052,-64383.649214,-4.981483,-6.506984,0,9.491782e+04,1.518149e+10,1.544380e+10,9.046707e+09,1.014147e+10
81940,0.002984,0.003300,0.712860,0.508927,0.992062,0.000009,0.000010,0.002127,0.001519,0.002960,...,-30192.943573,-44773.153161,1.621307,-3.333443,0,2.755771e+04,1.518148e+10,1.544385e+10,9.046691e+09,1.014151e+10
81941,0.003440,0.003653,0.724509,0.515702,0.992274,0.000012,0.000013,0.002492,0.001774,0.003413,...,78290.408571,736.259980,13.743039,18.774908,0,3.706678e+05,1.518158e+10,1.544423e+10,9.046841e+09,1.014182e+10
81942,0.003099,0.003397,0.716195,0.510770,0.992165,0.000010,0.000011,0.002219,0.001583,0.003075,...,2079.262421,-31168.248691,10.419333,-2.054430,0,9.537153e+04,1.518151e+10,1.544396e+10,9.046735e+09,1.014160e+10


## Быстрая разведка

In [8]:
desc = summarize_features(X_final)
desc.head(40)

Unnamed: 0,dtype,n_unique,n_null,mean,std,min,max,skew,abs_mean_over_std
log_feat_mean_to_feat_sum,float64,31740,0,0.04879,1.174596e-13,0.04879016,0.04879,0.0,43661680000.0
feat_mean_to_feat_sum,float64,33199,0,0.05,1.233326e-13,0.05,0.05,0.0,44510410000.0
log_feat_sum_to_feat_mean,float64,65743,0,3.044522,4.698385e-11,3.044522,3.044522,0.0,63448900000.0
feat_sum_to_feat_mean,float64,75161,0,20.0,9.866607e-10,20.0,20.0,0.0,20249870000.0
log_feat_min_to_feat_sum,float64,81944,0,2e-06,2.637407e-07,1.832738e-11,3e-06,0.0,6.414842
feat_min_to_feat_sum,float64,81944,0,2e-06,2.637411e-07,1.832738e-11,3e-06,0.0,6.414838
log_feat_min_to_feat_cv,float64,81944,0,7e-06,1.238375e-06,5.551735e-11,1.5e-05,-0.815469,6.002608
feat_min_to_feat_cv,float64,81944,0,7e-06,1.238383e-06,5.551735e-11,1.5e-05,-0.815463,6.00259
log_feat_min,float64,77814,0,9e-06,1.540601e-06,1e-10,1.8e-05,-0.902689,6.137096
feat_min,float64,77814,0,9e-06,1.540615e-06,1e-10,1.8e-05,-0.902683,6.137072


## Разделение данных

In [27]:
from sklearn.model_selection import train_test_split

X = X_final  # если эти поля — не фичи
y = y_balanced   # твоя целевая переменная

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

## Очистка — low variance + дубликаты (последовательно)

In [28]:
Xv = remove_low_variance(X_train, threshold=1e-12)   # порог можно поднять до 1e-8
Xd = remove_duplicate_columns(Xv)
print("after var:", Xv.shape, "after dup:", Xd.shape)

after var: (65555, 1357) after dup: (65555, 1117)


## Убрать высоко-коррелированные фичи (filter)

In [29]:
Xr, dropped = remove_highly_correlated(Xd, threshold=0.95)
print("dropped by corr:", dropped)

dropped by corr: ['SSC-H', 'FITC-A', 'FSC-H^2', 'FSC-H SSC-H', 'FSC-H FITC-A', 'SSC-H^2', 'SSC-H FITC-A', 'FSC-A^2', 'FSC-A SSC-A', 'FSC-A FITC-A', 'SSC-A^2', 'SSC-A FITC-A', 'FITC-A^2', 'feat_sum', 'feat_min', 'FSC-H_to_FSC-A', 'FSC-A_to_FSC-H', 'SSC-A_to_FSC-H', 'FITC-A_to_FSC-H', 'FSC-H_to_FSC-H^2', 'FSC-H_to_FSC-H SSC-H', 'FSC-H_to_FSC-H FSC-A', 'FSC-H FSC-A_to_FSC-H', 'FSC-H_to_FSC-H SSC-A', 'FSC-H_to_FSC-H FITC-A', 'FSC-H FITC-A_to_FSC-H', 'FSC-H_to_SSC-H^2', 'SSC-H^2_to_FSC-H', 'FSC-H_to_SSC-H FSC-A', 'SSC-H FSC-A_to_FSC-H', 'FSC-H_to_SSC-H SSC-A', 'SSC-H SSC-A_to_FSC-H', 'FSC-H_to_SSC-H FITC-A', 'SSC-H FITC-A_to_FSC-H', 'FSC-A^2_to_FSC-H', 'FSC-H_to_FSC-A SSC-A', 'FSC-A SSC-A_to_FSC-H', 'FSC-A FITC-A_to_FSC-H', 'FSC-H_to_SSC-A^2', 'SSC-A^2_to_FSC-H', 'FSC-H_to_SSC-A FITC-A', 'SSC-A FITC-A_to_FSC-H', 'FITC-A^2_to_FSC-H', 'FSC-H_to_feat_sum', 'feat_sum_to_FSC-H', 'FSC-H_to_feat_std', 'feat_std_to_FSC-H', 'FSC-H_to_feat_min', 'feat_min_to_FSC-H', 'FSC-H_to_feat_max', 'feat_max_to_

## С Y. Supervised выбор (mutual info, RF, L1)

In [33]:
sel_mi, _ = select_k_best_mutual_info(Xr, y_train, k=60)
sel_rf, _, importances = select_via_model_importance(Xr, y_train, n_features_to_select=60)
sel_l1, _ = select_via_l1(Xr, y_train, n_features_to_select=60)

# объединяем с приоритетом RF -> MI -> L1
combined = []
for s in [list(importances.index), sel_mi, sel_l1]:
    for f in s:
        if f not in combined and f in Xr.columns:
            combined.append(f)
selected_supervised = combined[:40]   # оставить top-40 (пример)
X_sup = Xr[selected_supervised]



## Валидация выбранного набора

In [34]:
X_final[selected_supervised]

Unnamed: 0,feat_max,feat_std,FITC-A_to_feat_cv,FSC-A_to_feat_std,feat_cv,FSC-A_to_feat_sum,feat_min_to_SSC-H,FSC-A_to_feat_cv,log_FSC-H SSC-A_to_feat_min,FITC-A_to_feat_sum,...,SSC-H SSC-A,log_FSC-H_to_FSC-H SSC-H,log_FSC-H_to_SSC-H SSC-A,SSC-H FSC-A,SSC-H_to_SSC-H^2,FSC-H SSC-A_to_SSC-A^2,log_FSC-H_to_SSC-H FSC-A,FSC-H SSC-A,FSC-A_to_feat_min,log_SSC-A_to_SSC-H
0,0.991729,0.352818,0.777198,1.993361,1.276031,0.127180,0.001752,0.551156,5.364572,0.179339,...,0.001690,5.716439,0.885268,0.002322,302.831316,0.004701,0.710910,0.001231,121529.588230,5.049720
1,0.991849,0.467881,0.878528,1.935376,1.128990,0.109251,0.002838,0.802068,5.512814,0.119666,...,0.004956,5.260945,0.574606,0.004724,191.666019,0.004051,0.595775,0.003655,61161.385765,5.209818
2,0.991809,0.353494,0.775293,2.006285,1.279269,0.128329,0.001927,0.554387,5.389671,0.179464,...,0.001404,5.887718,0.973960,0.001972,359.590989,0.004584,0.776469,0.001169,132342.178449,5.207402
3,0.991702,0.351522,0.771896,1.992200,1.284762,0.127975,0.002076,0.545083,5.377428,0.181226,...,0.001297,5.957375,1.025883,0.001816,385.600843,0.004640,0.823192,0.001160,130059.346227,5.267018
4,0.991714,0.351477,0.771203,1.994915,1.285930,0.128266,0.002017,0.545260,5.362383,0.181417,...,0.001354,5.908624,1.004349,0.001909,367.207069,0.004711,0.800837,0.001166,127662.387471,5.213215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81939,0.991969,0.354515,0.777591,2.015453,1.275695,0.128555,0.002974,0.560093,5.123737,0.178476,...,0.001570,5.780052,1.076103,0.002213,322.776785,0.005989,0.863410,0.001538,77556.538448,5.103517
81940,0.992062,0.354384,0.778278,2.011549,1.274688,0.128205,0.002699,0.559243,5.144711,0.178418,...,0.001679,5.717131,1.021393,0.002352,303.034195,0.005864,0.819200,0.001519,80039.206329,5.044949
81941,0.992274,0.357019,0.783103,2.029332,1.267106,0.128569,0.003239,0.571783,5.016721,0.176086,...,0.001884,5.615829,1.038847,0.002646,273.742219,0.006670,0.832782,0.001774,61231.717738,4.957094
81942,0.992165,0.355142,0.779647,2.016644,1.272581,0.128317,0.002827,0.562789,5.110820,0.177761,...,0.001735,5.688297,1.024701,0.002433,294.392561,0.006067,0.821501,0.001583,74571.747387,5.019792


In [None]:
from sklearn.metrics import roc_auc_score

clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
clf.fit(X_train[selected_supervised], y_train)
pred = clf.predict_proba(X_val[selected_supervised])[:,1]

print("AUC:", roc_auc_score(y_val, pred))

AUC: 0.9998673786664836


In [42]:
d = pd.read_csv("E:\\parquet\\parsed_data\\EG.5_57-Fc_1_b80d42dd-ce9a-11f0-aae4-f02f74d321c5_stats.parquet")

In [45]:
d.to_parquet("E:\\parquet\\parsed_data\\test.parquet", compression='zstd', compression_level=10)