In [1]:
import os
import h5py
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit

In [2]:
# Root directory holding the raw inputs and receiving the output file.
# It defaults to the original hard-coded location; set the environment variable
# LUNG_OMICS_DATA_DIR to run the notebook elsewhere without editing this cell.
DATA_ROOT   = os.environ.get('LUNG_OMICS_DATA_DIR', r'C:\Users\Administrator\Desktop')
MIRNA_ROOT  = os.path.join(DATA_ROOT, 'mirna')  # miRNA inputs, one sub-folder per FOLDER_CONFIG key
MRNA_ROOT   = os.path.join(DATA_ROOT, 'rna')    # mRNA inputs, same folder layout
MAP_PATH    = DATA_ROOT                         # directory containing LUAD.tsv and LUSC.tsv
DST_H5      = os.path.join(DATA_ROOT, 'lung_multi_omics_aligned_v2.h5')  # output HDF5 file
RANDOM_SEED = 42                                # seed passed to the group shuffle splits

In [3]:
# One row per input folder: (folder name, t1 label, t2 label).
#   t1 is 1 for the *_T folders and 0 for the *_N folders
#       (presumably tumour vs. normal -- confirm against the data source).
#   t2 is 0 for LUAD_T, 1 for LUSC_T and -1 for both *_N folders.
_FOLDER_LABELS = (
    ('LUAD_T', 1, 0),
    ('LUAD_N', 0, -1),
    ('LUSC_T', 1, 1),
    ('LUSC_N', 0, -1),
)

# Folder name -> metadata attached to every sample read from that folder.
FOLDER_CONFIG = {
    name: {'type': name, 't1': t1, 't2': t2}
    for name, t1, t2 in _FOLDER_LABELS
}

In [4]:
# Load one miRNA quantification table per sample directory into long format
# (one row per sample/miRNA pair), tagged with the folder's labels.

# Source column -> name used by the rest of the notebook.
MIRNA_COLUMNS = {'miRNA_ID': 'feature_id', 'reads_per_million_miRNA_mapped': 'value'}

mirna_chunks = []
mirna_skipped = []  # sample directories that held no usable table
for folder, cfg in FOLDER_CONFIG.items():
    fpath = os.path.join(MIRNA_ROOT, folder)
    if not os.path.exists(fpath):
        print(f'[!] skip {fpath}')
        continue
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for sample_dir in tqdm(sorted(os.listdir(fpath)), desc=f'miRNA-{folder}'):
        sp = os.path.join(fpath, sample_dir)
        if not os.path.isdir(sp):
            continue

        # Choose the file by its header rather than taking whichever .txt is
        # listed first, so an unrelated text file in the directory is never
        # parsed as expression data.
        # NOTE(review): a table that repeats the same miRNA_ID on several rows
        # also passes this check and would be averaged per miRNA by the later
        # pivot -- confirm the folders only hold one-row-per-miRNA tables.
        quant_path = None
        for name in sorted(os.listdir(sp)):
            if not name.endswith('.txt'):
                continue
            candidate = os.path.join(sp, name)
            try:
                header = pd.read_csv(candidate, sep='\t', nrows=0).columns
            except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError):
                continue
            if set(MIRNA_COLUMNS).issubset(header):
                quant_path = candidate
                break
        if quant_path is None:
            mirna_skipped.append(sp)
            continue

        # Only the two needed columns are read; the others were never used.
        tmp = pd.read_csv(quant_path, sep='\t', usecols=list(MIRNA_COLUMNS))
        tmp = tmp.rename(columns=MIRNA_COLUMNS)
        tmp['file_id'] = sample_dir
        tmp['sample_type'] = cfg['type']
        tmp['t1'] = cfg['t1']
        tmp['t2'] = cfg['t2']
        mirna_chunks.append(tmp)

if mirna_skipped:
    print(f'[!] {len(mirna_skipped)} miRNA sample folders had no usable .txt table')
if not mirna_chunks:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError(f'No miRNA quantification tables found under {MIRNA_ROOT}')
mirna_long = pd.concat(mirna_chunks, ignore_index=True)

miRNA-LUAD_T: 100%|███████████████████████████████████████████████████████████████| 1038/1038 [00:06<00:00, 161.10it/s]
miRNA-LUAD_N: 100%|███████████████████████████████████████████████████████████████████| 92/92 [00:00<00:00, 182.99it/s]
miRNA-LUSC_T: 100%|█████████████████████████████████████████████████████████████████| 970/970 [00:05<00:00, 161.86it/s]
miRNA-LUSC_N: 100%|███████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 165.26it/s]


In [5]:
# Load one gene-expression table per sample directory and turn it into a
# single-row wide frame (genes as columns) tagged with the folder's labels.
mrna_wide_list = []
for folder, cfg in FOLDER_CONFIG.items():
    fpath = os.path.join(MRNA_ROOT, folder)
    if not os.path.exists(fpath):
        print(f'[!] skip {fpath}')
        continue
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for sample_dir in tqdm(sorted(os.listdir(fpath)), desc=f'mRNA-{folder}'):
        sp = os.path.join(fpath, sample_dir)
        if not os.path.isdir(sp):
            continue
        tsv = sorted(t for t in os.listdir(sp) if t.endswith('.tsv'))
        if not tsv:
            continue
        file_id = sample_dir

        # The first line is skipped as before. The previous skipfooter=4 was
        # dropped: in the GDC STAR-counts layout the four N_* summary rows
        # (N_unmapped, N_multimapping, N_noFeature, N_ambiguous) come right
        # after the header, so skipping the footer discarded four real genes
        # and kept the summary rows as features. They are now removed by name,
        # wherever they appear, which also allows the faster default C parser.
        tmp = pd.read_csv(os.path.join(sp, tsv[0]), sep='\t', skiprows=1, header=0,
                          usecols=['gene_id', 'tpm_unstranded'])
        tmp = tmp[['gene_id', 'tpm_unstranded']].rename(
            columns={'gene_id': 'feature_id', 'tpm_unstranded': 'value'})
        tmp = tmp.dropna(subset=['feature_id'])
        tmp = tmp[~tmp['feature_id'].astype(str).str.startswith('N_')]

        # Transpose to one row per sample, indexed by the directory name.
        wide = tmp.set_index('feature_id').T
        wide.index = [file_id]
        wide['file_id'] = file_id
        wide['sample_type'] = cfg['type']
        wide['t1'] = cfg['t1']
        wide['t2'] = cfg['t2']
        mrna_wide_list.append(wide)

if not mrna_wide_list:
    # Fail here with a clear message instead of later inside pd.concat.
    raise ValueError(f'No mRNA .tsv tables found under {MRNA_ROOT}')

mRNA-LUAD_T: 100%|███████████████████████████████████████████████████████████████████| 534/534 [03:03<00:00,  2.92it/s]
mRNA-LUAD_N: 100%|█████████████████████████████████████████████████████████████████████| 58/58 [00:20<00:00,  2.85it/s]
mRNA-LUSC_T: 100%|███████████████████████████████████████████████████████████████████| 494/494 [02:52<00:00,  2.87it/s]
mRNA-LUSC_N: 100%|█████████████████████████████████████████████████████████████████████| 51/51 [00:18<00:00,  2.78it/s]


In [6]:
# Read the two mapping sheets (only the 'File ID' and 'Sample ID' columns)
# and build a plain dict: file_id -> sample_id.
map_frames = [
    pd.read_csv(os.path.join(MAP_PATH, sheet_name), sep='\t', usecols=['File ID', 'Sample ID'])
    for sheet_name in ('LUAD.tsv', 'LUSC.tsv')
]
luad_map, lusc_map = map_frames

combined_map = pd.concat(map_frames, ignore_index=True)
combined_map = combined_map.rename(columns={'File ID': 'file_id', 'Sample ID': 'sample_id'})
# If a file_id occurs more than once, the last occurrence wins.
id_map = combined_map.set_index('file_id')['sample_id'].to_dict()

In [7]:
# Translate each chunk's file_id into a sample_id (NaN when the file is not in
# the mapping), rebuild the long table, and keep only the mapped rows.
for df in mirna_chunks:
    mapped_ids = df['file_id'].map(id_map)
    df['sample_id'] = mapped_ids

mirna_long = (
    pd.concat(mirna_chunks, ignore_index=True)
      .dropna(subset=['sample_id'])
)

In [8]:
# Translate each one-row mRNA frame's file_id into a sample_id (NaN when the
# file is not in the mapping), stack the rows, and keep only the mapped ones.
for df in mrna_wide_list:
    mapped_ids = df['file_id'].map(id_map)
    df['sample_id'] = mapped_ids

mrna_dummy = (
    pd.concat(mrna_wide_list, sort=False)
      .dropna(subset=['sample_id'])
)

In [9]:
# Per-sample labels: the distinct (sample_id, t1, t2, sample_type) rows.
meta_columns = ['sample_id', 't1', 't2', 'sample_type']
meta_mir = (
    mirna_long.loc[:, meta_columns]
              .drop_duplicates()
              .set_index('sample_id')
)

# Samples x miRNA matrix. Repeated (sample, miRNA) entries are averaged
# (pivot_table's default aggregation) and missing cells become 0.
mirna_raw = pd.pivot_table(
    mirna_long,
    index='sample_id',
    columns='feature_id',
    values='value',
    aggfunc='mean',
    fill_value=0,
)

In [10]:
# Re-index the stacked mRNA rows by sample_id once and derive both views from it.
mrna_by_sample = mrna_dummy.set_index('sample_id')

# Label columns only.
meta_mrna = mrna_by_sample[['t1', 't2', 'sample_type']]

# All numeric columns with gaps filled by 0. This still includes t1 and t2;
# they are removed in the next cell.
mrna_raw = mrna_by_sample.select_dtypes(include='number').fillna(0)

In [11]:
# Remove the label columns from the expression matrix if they are present.
cols_to_drop = ['t1', 't2']
mrna_raw = mrna_raw.drop(columns=cols_to_drop, errors='ignore')

# Collapse rows that share a sample_id: expression values are averaged,
# metadata keeps the first non-null entry per column.
mirna_raw, mrna_raw = (
    matrix.groupby(level=0).mean() for matrix in (mirna_raw, mrna_raw)
)
meta_mir = meta_mir.groupby(level=0).first()

In [12]:
# Samples present in both modalities, in sorted order.
shared_ids = set(mirna_raw.index).intersection(mrna_raw.index)
common_samples = sorted(shared_ids)
print(f'Samples aligned. miRNA: {mirna_raw.shape[0]}, mRNA: {mrna_raw.shape[0]}, Common: {len(common_samples)}')

Samples aligned. miRNA: 1040, mRNA: 1117, Common: 1009


In [13]:
# Put both matrices and the labels into the same (sorted) sample order.
X_samples = np.asarray(common_samples)
mirna_raw, mrna_raw = mirna_raw.loc[X_samples], mrna_raw.loc[X_samples]

# Labels are taken from the miRNA metadata only.
meta_common = meta_mir.loc[X_samples]
t1_l = meta_common['t1'].astype(int)
t2_l = meta_common['t2'].astype(int)
sample_types = meta_common['sample_type']

In [14]:
# Grouping key: the first 12 characters of each sample ID. Samples sharing
# this prefix are always assigned to the same partition.
patient_ids = np.array([sample_id[:12] for sample_id in X_samples])

# Outer split: test_size=0.15 of the groups is held out as the test set.
gss_outer = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=RANDOM_SEED)
outer_splits = gss_outer.split(X_samples, t1_l, groups=patient_ids)
trainval_idx, test_idx = next(outer_splits)
X_trainval, X_test = X_samples[trainval_idx], X_samples[test_idx]
y_trainval = t1_l.iloc[trainval_idx]
p_trainval = patient_ids[trainval_idx]

# Inner split: 0.1765 of the remaining groups (about 0.15 / 0.85) becomes
# the validation set.
gss_inner = GroupShuffleSplit(n_splits=1, test_size=0.1765, random_state=RANDOM_SEED)
inner_splits = gss_inner.split(X_trainval, y_trainval, groups=p_trainval)
train_idx_inner, val_idx_inner = next(inner_splits)
X_train, X_val = X_trainval[train_idx_inner], X_trainval[val_idx_inner]

# Every sample starts as 'train'; validation and test members are overwritten.
split_map = pd.Series('train', index=X_samples)
for partition, members in (('val', X_val), ('test', X_test)):
    split_map.loc[members] = partition

print("Split Summary (Count by Set):")
print(split_map.value_counts())

Split Summary (Count by Set):
train    704
val      153
test     152
dtype: int64


In [15]:
# Verify that no patient (12-character sample-ID prefix) appears in more than
# one partition. Previously the train/test overlap was computed but never
# inspected, and validation was not checked, so the "leakage_check: Passed"
# note written to the output file was not backed by an actual check.
def _patient_set(samples):
    """Return the set of 12-character ID prefixes found in `samples`."""
    return {s[:12] for s in samples}

train_p = _patient_set(X_train)
val_p   = _patient_set(X_val)
test_p  = _patient_set(X_test)
overlap = train_p.intersection(test_p)

partition_overlaps = {
    'train/test': overlap,
    'train/val':  train_p.intersection(val_p),
    'val/test':   val_p.intersection(test_p),
}
for pair_name, shared in partition_overlaps.items():
    if shared:
        raise ValueError(
            f'Patient leakage between {pair_name}: {len(shared)} shared IDs, '
            f'e.g. {sorted(shared)[:5]}'
        )
print('Leakage check passed: train, val and test share no patient IDs.')

In [16]:
def process_modality(raw_df, train_samples, modality_name, min_expr=1, max_low_frac=0.5):
    """Filter, log-transform and z-score one expression matrix.

    All statistics (the feature filter and the scaler's mean and scale) are
    computed on the rows in `train_samples` only, then applied to every row
    of `raw_df`.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Samples (rows) x features (columns) of raw expression values.
    train_samples : sequence
        Index labels of the training rows in `raw_df`.
    modality_name : str
        Label used in the progress messages.
    min_expr : float, default 1
        A value strictly below this counts as "low" for the feature filter.
    max_low_frac : float, default 0.5
        A feature is kept only if its fraction of low training samples is
        strictly below this value.

    Returns
    -------
    pandas.DataFrame
        Z-scored log2(x + 1) values for the retained features, with the same
        row index as `raw_df`. Non-finite results are replaced by 0.

    Raises
    ------
    ValueError
        If `train_samples` is empty or no feature survives the filter.
    """
    print(f"Processing {modality_name}...")
    if len(train_samples) == 0:
        raise ValueError(f"{modality_name}: train_samples is empty; cannot fit filter or scaler")

    # Feature filter, decided on the training rows only.
    train_df = raw_df.loc[train_samples]
    keep_mask = (train_df < min_expr).mean(axis=0) < max_low_frac
    filtered_df = raw_df.loc[:, keep_mask]
    print(f"  - Features filtered: {raw_df.shape[1]} -> {filtered_df.shape[1]}")
    if filtered_df.shape[1] == 0:
        raise ValueError(f"{modality_name}: no feature passed the expression filter")

    # Negative inputs are clipped to 0 so the logarithm is always defined.
    log_df = np.log2(filtered_df.clip(lower=0) + 1)

    # Fit on the training rows, then transform every row.
    scaler = StandardScaler()
    scaler.fit(log_df.loc[train_samples])
    z_df = pd.DataFrame(scaler.transform(log_df),
                        index=log_df.index,
                        columns=log_df.columns)
    z_df = z_df.replace([np.inf, -np.inf, np.nan], 0)
    return z_df

In [17]:
# Normalise both modalities using statistics from the training samples only.
mirna_z = process_modality(mirna_raw, X_train, "miRNA")
mrna_z  = process_modality(mrna_raw,  X_train, "mRNA")

# Number of samples per sample_type, as a plain dict.
sample_type_map = sample_types.value_counts().to_dict()

# Longest string in each ID collection (20 when the collection is empty);
# used as the width of the fixed-length byte strings written to HDF5.
max_len_sample = max(map(len, X_samples), default=20)
max_len_mir    = max(map(len, mirna_z.columns), default=20)
max_len_mrna   = max(map(len, mrna_z.columns), default=20)

Processing miRNA...
  - Features filtered: 1881 -> 298
Processing mRNA...
  - Features filtered: 60660 -> 16800


In [18]:
# Write the aligned matrices, labels, split assignment and ID lists to HDF5.
# Rows of every per-sample dataset follow the order of X_samples; columns of
# each expression matrix follow the matching *_ids dataset.
with h5py.File(DST_H5, 'w') as h:
    g = h.create_group('processed_data')
    datasets = {
        'sample_ids':        X_samples.astype(f'S{max_len_sample}'),
        'expr_matrix_miRNA': mirna_z.values.astype(np.float32),
        'expr_matrix_mRNA':  mrna_z.values.astype(np.float32),
        'task1_label':       t1_l.values.astype(np.int8),
        'task2_label':       t2_l.values.astype(np.int8),
        'split':             split_map.values.astype('S5'),
        'miRNA_ids':         np.array(mirna_z.columns).astype(f'S{max_len_mir}'),
        'mRNA_ids':          np.array(mrna_z.columns).astype(f'S{max_len_mrna}'),
    }
    for dataset_name, payload in datasets.items():
        g.create_dataset(dataset_name, data=payload, compression='gzip')

    # Free-text provenance notes stored as attributes of the 'metadata' group.
    meta = h.create_group('metadata')
    attributes = {
        'description':   'Aligned Multi-omics data (LUAD/LUSC) - Patient Split',
        'leakage_check': 'Passed: Split by Patient ID; Norm fitted on TRAIN only.',
        'filter':        'RPM/TPM < 1 in >50% of TRAIN samples deleted',
        'norm':          'log2(x+1) -> Z-score (fitted on TRAIN)',
        'split':         'train/val/test = 70/15/15 (approx, group split)',
        'sample_counts': str(sample_type_map),
    }
    for attr_name, attr_value in attributes.items():
        meta.attrs[attr_name] = attr_value
print('>>> Done. Final file:', DST_H5)

>>> Done. Final file: C:\Users\Administrator\Desktop\lung_multi_omics_aligned_v2.h5
