In [1]:
import h5py
import numpy as np
import os
import xgboost as xgb

In [2]:
H5_PATH = r"C:/Users/Administrator/Desktop/lung_multi_omics_aligned_v2.h5"

In [3]:
CONFIGS = {
    'task1': {'mrna': 300, 'mirna': 100}, 
    'task2': {'mrna': 500, 'mirna': 200}  
}

In [4]:
def select_by_variance(task, modality, top_n):
    print(f"  > Processing {task} - {modality} (Top {top_n} Variance)...")
    with h5py.File(H5_PATH, "r") as f:
        group = "processed_data"
        mat = f[f"{group}/expr_matrix_{modality}"][:]
        feat = f[f"processed_data/{modality}_ids"][:].astype(str)
        split = f[f"{group}/split"][:].astype(str)
        if f"{group}/{task}_label" in f:
            lbl = f[f"{group}/{task}_label"][:].astype(int)
        else:
            lbl = f["processed_data/task1_label"][:].astype(int) 
    if task == 'task1':
        valid_mask = (lbl != -1) 
    else:
        valid_mask = (lbl != -1)
    train_mask = (split == 'train') & valid_mask
    X_train = mat[train_mask]
    print(f"    - Training samples used: {X_train.shape[0]}")
    variances = np.var(X_train, axis=0)
    sorted_idx = np.argsort(variances)[::-1][:top_n]
    sorted_idx = np.sort(sorted_idx) 
    selected_feats = feat[sorted_idx]
    with h5py.File(H5_PATH, "a") as f:
        grp = f.require_group("feature_selected")
        dn = f"{task}_selected_{modality}"
        if dn in grp: del grp[dn]
        grp.create_dataset(dn, data=np.array([m.encode('utf-8') for m in selected_feats]))
        prep_grp = f.require_group("preprocessing_params")
        mean_dn = f"{task}_{modality}_mean"
        std_dn = f"{task}_{modality}_std"
        if mean_dn in prep_grp: del prep_grp[mean_dn]
        if std_dn in prep_grp: del prep_grp[std_dn]
        mean_val = X_train.mean(axis=0)
        std_val = X_train.std(axis=0)
        std_val[std_val == 0] = 1.0 
        prep_grp.create_dataset(mean_dn, data=mean_val[sorted_idx])
        prep_grp.create_dataset(std_dn, data=std_val[sorted_idx])
        grp.attrs['selection_method'] = 'unsupervised_variance'
if __name__ == "__main__":
    for task, cfg in CONFIGS.items():
        print(f"\n=== Task: {task} ===")
        select_by_variance(task, 'mRNA', cfg['mrna'])
        select_by_variance(task, 'miRNA', cfg['mirna'])


=== Task: task1 ===
  > Processing task1 - mRNA (Top 300 Variance)...
    - Training samples used: 704
  > Processing task1 - miRNA (Top 100 Variance)...
    - Training samples used: 704

=== Task: task2 ===
  > Processing task2 - mRNA (Top 500 Variance)...
    - Training samples used: 671
  > Processing task2 - miRNA (Top 200 Variance)...
    - Training samples used: 671


In [5]:
task = "task2" 
SKIP_TOP_MRNA = 300
TAKE_MRNA = 100
SKIP_TOP_MIRNA = 200
TAKE_MIRNA = 50

In [10]:
def select_weak_features(modality, skip_n, take_n):
    print(f"\n>>> Selecting WEAK features for {modality} (Skipping top {skip_n})...")
    with h5py.File(H5_PATH, "r") as f:
        group = "processed_data"
        mat = f[f"{group}/expr_matrix_{modality}"][:]
        feat = f[f"processed_data/{modality}_ids"][:].astype(str)
        lbl = f[f"{group}/{task}_label"][:].astype(int)
        split = f[f"{group}/split"][:].astype(str)
    train_mask = (lbl != -1) & (split == 'train')
    X_train = mat[train_mask]
    y_train = lbl[train_mask]
    mean_train = X_train.mean(axis=0)
    std_train = X_train.std(axis=0)
    std_train[std_train == 0] = 1
    X_train_norm = (X_train - mean_train) / std_train
    model = xgb.XGBClassifier(
        n_estimators=100, 
        max_depth=4, 
        eval_metric='logloss', 
        use_label_encoder=False, 
        n_jobs=-1,
        random_state=42
    )
    model.fit(X_train_norm, y_train)
    importances = model.feature_importances_
    sorted_idx = np.argsort(importances)[::-1] 

    if len(sorted_idx) < skip_n + take_n:
        print(f"Warning: Insufficient number of features, take {take_n} weakest features from the tail")
        weak_idx = sorted_idx[-take_n:]
    else:
        weak_idx = sorted_idx[skip_n : skip_n + take_n]
    weak_idx = np.sort(weak_idx)
    selected_feats = feat[weak_idx]
    print(f"Skip Top {skip_n} and select the features that rank {skip_n+1}-{skip_n+take_n}")
    print(f"- Example: {selected_feats[:5]}")
    with h5py.File(H5_PATH, "a") as f:
        grp_name = "feature_selected2" 
        grp = f.require_group(grp_name)
        dn = f"{task}_selected_{modality}"
        if dn in grp: del grp[dn]
        grp.create_dataset(dn, data=np.array([m.encode('utf-8') for m in selected_feats]))
        prep_grp_name = "preprocessing_params2"
        prep_grp = f.require_group(prep_grp_name)
        mean_dn = f"{task}_{modality}_mean"
        std_dn = f"{task}_{modality}_std"
        if mean_dn in prep_grp: del prep_grp[mean_dn]
        if std_dn in prep_grp: del prep_grp[std_dn]
        prep_grp.create_dataset(mean_dn, data=mean_train[weak_idx])
        prep_grp.create_dataset(std_dn, data=std_train[weak_idx])
        grp.attrs['feature_type'] = 'weak_features'
        grp.attrs['selection_method'] = f'skip_top_{skip_n}_take_{take_n}'

In [11]:
if __name__ == "__main__":
    select_weak_features("mRNA", SKIP_TOP_MRNA, TAKE_MRNA)
    select_weak_features("miRNA", SKIP_TOP_MIRNA, TAKE_MIRNA)


>>> Selecting WEAK features for mRNA (Skipping top 300)...
Skip Top 300 and select the features that rank 301-400
- Example: ['ENSG00000131730.16' 'ENSG00000131732.12' 'ENSG00000131791.8'
 'ENSG00000131979.20' 'ENSG00000132386.11']

>>> Selecting WEAK features for miRNA (Skipping top 200)...
Skip Top 200 and select the features that rank 201-250
- Example: ['hsa-mir-296' 'hsa-mir-30b' 'hsa-mir-30c-1' 'hsa-mir-30c-2' 'hsa-mir-30d']
