In [1]:
import pandas as pd
import numpy as np
import os

# Human-readable name of each omics layer, keyed by omic id (used in the log messages below).
dict_omic_id = {1: "Gene expression lv3",
                2: "CNV threshold",
                3: "DNA Methylation 27",
#                 3: "miRNA"
               }

# Short tag of each omics layer, keyed by the same omic ids as dict_omic_id.
dict_file_name = {1: "GE",
                  2: "CNA",
                  3: "Meth",
#                   3: "miRNA"
                 }

# Alternative input configuration (LUAD), kept for reference only.
# input_raw_dir = '/kaggle/input/xena-luad'

# omic_data_dir = {
#     1: f"{input_raw_dir}/HiSeqV2",
# #     2: f"{input_raw_dir}/Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes",
#     2: f"{input_raw_dir}/HumanMethylation450",
#     3: f"{input_raw_dir}/miRNA_HiSeq_gene"
# }

# Root folder of the raw input files used by this run.
input_raw_dir = '/kaggle/input/xena-gbm'

# Raw data file of each omics layer, keyed by omic id.
omic_data_dir = {
    1: f"{input_raw_dir}/HT_HG-U133A",
    2: f"{input_raw_dir}/Gistic2_CopyNumber_Gistic2_all_thresholded.by_genes",
    3: f"{input_raw_dir}/HumanMethylation27"
}

# Clinical matrix file and the column in it that holds the subtype label.
subtype_dir = f"{input_raw_dir}/TCGA.GBM.sampleMap_GBM_clinicalMatrix"
subtype_col_name = 'GeneExp_Subtype'
# subtype_dir = f"{input_raw_dir}/TCGASubtype.20170308.tsv"
# subtype_col_name = 'Subtype_other'
# subtype_dir = f"{input_raw_dir}/TCGA.LUAD.sampleMap_LUAD_clinicalMatrix"
# subtype_col_name = 'Expression_Subtype'

# Excel sheet read later for its "Name" and "Symbol" columns (CpG site -> gene symbol).
CpG_sites_dir = f"{input_raw_dir}/illumina_humanmethylation27_content.xlsx"

In [2]:
def read_csv_file(file_path, transpose=False):
    """Load a header-less tab-separated file and promote its first row to column labels.

    Parameters
    ----------
    file_path : path of the tab-separated file.
    transpose : when True, the table is transposed first and every column that
        holds at least one missing value after the transposition is discarded.

    Returns
    -------
    pandas.DataFrame labelled by the former first row; that row itself is removed.
    """
    # Everything is read as data (header=None); the header row is promoted below.
    table = pd.read_csv(file_path, sep="\t", header=None, low_memory=False)

    if transpose:
        table = table.transpose().dropna(axis=1)

    header_row = table.iloc[0]
    table.columns = header_row

    return table.drop(index=0)

def read_xlxs_file(file_path):
    """Load an Excel sheet without a header and promote its first row to column labels.

    Returns the sheet as a pandas.DataFrame with the former first row removed.
    """
    sheet = pd.read_excel(file_path, header=None)

    header_row = sheet.iloc[0]
    sheet.columns = header_row

    return sheet.drop(index=0)

In [3]:
def inner_join_subtypes(subtype_data, df, str_df):
    """Inner-join the subtype labels onto ``df``.

    Parameters
    ----------
    subtype_data : DataFrame holding "sampleID" and the notebook-level
        ``subtype_col_name`` column.
    df : DataFrame to be labelled.
    str_df : name of the column of ``df`` that is matched against "sampleID".

    Returns
    -------
    The joined DataFrame with the ``str_df`` key column removed.
    """
    label_columns = subtype_data[["sampleID", subtype_col_name]]
    joined = label_columns.merge(df, how="inner", left_on="sampleID", right_on=str_df)

    # The right-hand key repeats the "sampleID" values after the join.
    return joined.drop(columns=str_df)

### **STEP 1:** Get subtypes and omics data

In [4]:
# Load the clinical matrix and keep only the sample ID and the subtype label column.
subtype_data = read_csv_file(subtype_dir)[["sampleID", subtype_col_name]]

In [5]:
# concerned_subtypes = {
# # # STAD
# #               'CIN': 'CIN',
# #               'EBV': 'EBV',
# #               'GS': 'GS',
# #               'HM-SNV': 'HM',
# #               'HM-indel': 'HM',
                      
# # # SARC
# #               'Dedifferentiated liposarcoma': 'DDLPS',
# #               'Leiomyosarcoma (LMS)': 'LMS',
# #               'Myxofibrosarcoma': 'MFS',
# #               'Pleomorphic MFH / Undifferentiated pleomorphic sarcoma': 'UPS', 
# #               'Synovial Sarcoma - Biphasic': 'SS', 
# #               'Synovial Sarcoma - Monophasic': 'SS', 
# #               'Undifferentiated Pleomorphic Sarcoma (UPS)': 'UPS', 
# #               'Sarcoma; synovial; poorly differentiated': 'SS', 
# #               'Giant cell MFH / Undifferentiated pleomorphic sarcoma with giant cells': 'UPS', 
# #               'Malignant Peripheral Nerve Sheath Tumors (MPNST)': 'MPNST', 
# #               'Desmoid Tumor': 'DT'
#              }
# subtype_data[subtype_col_name] = subtype_data[subtype_col_name].apply(lambda x: concerned_subtypes.get(x, np.nan))

In [6]:
# Discard samples that have no subtype label, then list the labels that remain.
subtype_data = subtype_data.dropna(subset=[subtype_col_name])
print(subtype_data[subtype_col_name].unique())

['Classical' 'Neural' 'Proneural' 'Mesenchymal']


In [7]:
# omic_data: one table per omic id, filled by the loop below.
# unlabeled_omic_data: filled later by get_common_samples().
omic_data = {}
unlabeled_omic_data = {}
# Name of the header cell of each raw file that ends up labelling the sample-ID
# column after the file is transposed; it is renamed to "sampleID" below.
sample_str = {
    1: "sample",
    2: "Gene Symbol",
#     2: "sample",
    3: "sample",
}

for omic_id in dict_omic_id.keys():

    # transpose=True: the raw files are read transposed, and columns that contain
    # a missing value after the transposition are dropped by read_csv_file.
    data = read_csv_file(omic_data_dir[omic_id], transpose=True)
    data.rename(columns={sample_str[omic_id]: "sampleID"}, inplace=True)

    omic_data[omic_id] = data

    print(f"Omic data {dict_omic_id[omic_id]}: {omic_data[omic_id].shape}")

Omic data Gene expression lv3: (539, 12043)


Omic data CNV threshold: (577, 24777)


Omic data DNA Methylation 27: (288, 22978)


In [8]:
# CpG site to gene-symbol mapping for the DNA methylation table.
# Each probe "Name" is mapped to the label "<Symbol>|<Name>"; rows missing either
# field are skipped.
df2 = read_xlxs_file(CpG_sites_dir)[["Name", "Symbol"]]
df2 = df2.dropna(subset=["Name", "Symbol"])
cpg_site_mapping = dict(zip(df2["Name"], df2["Symbol"] + "|" + df2["Name"]))

# Relabel the columns of every omics table whose display name contains "Methylation";
# columns without an entry in the mapping keep their original label.
for omic_id, omic_name in dict_omic_id.items():
    if "Methylation" in omic_name:
        # print(f"Omic data {omic_name} before mapping: {omic_data[omic_id].shape}")
        omic_data[omic_id] = omic_data[omic_id].rename(columns=cpg_site_mapping)

### **STEP 2:** Get the samples common to all omics data and label them

In [9]:
def get_common_samples(omic_data):
    """Restrict every omics table to the samples it shares with the subtype table.

    Reads the notebook-level ``subtype_data`` and ``subtype_col_name`` and writes
    the notebook-level ``unlabeled_omic_data``.

    Parameters
    ----------
    omic_data : dict of DataFrames, each with a "sampleID" column. Updated in
        place: every entry is replaced by its rows whose sample ID is shared by
        the subtype table and all omics tables, sorted by "sampleID". The
        remaining rows of each table are stored, sorted the same way, in
        ``unlabeled_omic_data`` under the same key.

    Returns
    -------
    DataFrame with "sampleID" and the subtype column for the shared samples,
    sorted by "sampleID" with a fresh index.
    """
    labels = subtype_data[["sampleID", subtype_col_name]].copy()

    # Intersect the labelled sample IDs with the sample IDs of every omics table.
    shared_ids = set(labels["sampleID"])
    for table in omic_data.values():
        shared_ids &= set(table["sampleID"])

    # Label table restricted to the shared samples.
    common_samples = (
        labels[labels["sampleID"].isin(shared_ids)]
        .sort_values("sampleID")
        .reset_index(drop=True)
    )

    print(f"Common samples: {common_samples.shape[0]}")
    print("Shape of omic data: ")

    for key in list(omic_data):
        table = omic_data[key]
        is_shared = table["sampleID"].isin(shared_ids)

        # Rows outside the shared set are kept aside as unlabeled data.
        unlabeled_omic_data[key] = (
            table[~is_shared].sort_values("sampleID").reset_index(drop=True)
        )
        omic_data[key] = table[is_shared].sort_values("sampleID").reset_index(drop=True)

        print(f"{key}: {omic_data[key].shape}, {unlabeled_omic_data[key].shape}")

    # Re-check that every table lists exactly the shared sample IDs, in the same order.
    for table in omic_data.values():
        if not table["sampleID"].equals(common_samples["sampleID"]):
            print("Error: Common samples are not equal")

    return common_samples

In [10]:
# Align all omics tables on the shared labelled samples (omic_data is updated in place)
# and keep the shared sample IDs as an array.
common_samples_df = get_common_samples(omic_data)
common_samples_array = common_samples_df["sampleID"].to_numpy()

Common samples: 270
Shape of omic data: 


1: (270, 12043), (269, 12043)


2: (270, 24777), (307, 24777)


3: (270, 22978), (18, 22978)


In [11]:
# Subtype names of the shared samples, in the same (sampleID-sorted) order as common_samples_df.
labels_array = common_samples_df[subtype_col_name].to_numpy()
print(labels_array[:10])
# labels_array = np.vectorize(map_label1.get)(labels_array)
# print(labels_array)
# print(map_label)
# print("Labels: ", labels_array.shape)
# print("Labels: ", labels_array[:20])

['Classical' 'Proneural' 'Proneural' 'Classical' 'Proneural' 'Proneural'
 'Proneural' 'Classical' 'Proneural' 'Classical']


In [12]:
# Encode the subtype names as integers: the sorted unique names are numbered from 1.
unique_subtypes = np.unique(labels_array)
map_label = dict(zip(unique_subtypes, range(1, len(unique_subtypes) + 1)))
labels_array = np.vectorize(map_label.get)(labels_array)

print(map_label)
for preview in (labels_array.shape, labels_array[:20]):
    print("Labels: ", preview)

{'Classical': 1, 'Mesenchymal': 2, 'Neural': 3, 'Proneural': 4}
Labels:  (270,)
Labels:  [1 4 4 1 4 4 4 1 4 1 4 2 2 1 1 4 4 3 2 3]


### **STEP 3:** Add the subtype labels to the common samples of each omics dataset

In [13]:
print("Shape of samples after isin:")
# Put the integer subtype label right after the "sampleID" column of every omics table.
# The tables and labels_array are both ordered by sampleID at this point.
for key in omic_data:
    labeled_table = omic_data[key]
    labeled_table.insert(loc=1, column=subtype_col_name, value=labels_array)
    print(labeled_table.shape)

Shape of samples after isin:
(270, 12044)
(270, 24778)
(270, 22979)


In [14]:
# Preview each labelled omics table: its shape and the top-left 5x5 corner.
for omic_id, omic_name in dict_omic_id.items():
    print(f"Omic data {dict_omic_id[omic_id]}:")
    print(omic_data[omic_id].shape)
    print(omic_data[omic_id].iloc[:5, :5])
    print()

Omic data Gene expression lv3:
(270, 12044)
0         sampleID  GeneExp_Subtype             RNF14            UBE2Q1  \
0  TCGA-02-0001-01                1  6.49791955133042  8.49434620281726   
1  TCGA-02-0003-01                4  6.84261390916503  9.61918147566407   
2  TCGA-02-0007-01                4  6.47919385070934  10.0666908297196   
3  TCGA-02-0009-01                1  7.21399161939877  9.29024304653481   
4  TCGA-02-0010-01                4   7.2000207793856  9.57749740336243   

0             RNF17  
0  4.81908126516905  
1   4.6079529999682  
2  4.62369766743086  
3  4.44427585272261  
4  4.52320574349238  

Omic data CNV threshold:
(270, 24778)
0         sampleID  GeneExp_Subtype ACAP3 ACTRT2 AGRN
0  TCGA-02-0001-01                1     1      1    1
1  TCGA-02-0003-01                4     0      0    0
2  TCGA-02-0007-01                4     0      0    0
3  TCGA-02-0009-01                1     0      0    0
4  TCGA-02-0010-01                4     0      0    0

Omic data

In [15]:
# Output folder for the raw tables (only created here; every to_csv call below is disabled).
# NOTE(review): the folder name says "luad" while the inputs configured above are GBM — confirm.
save_raw_dir = '/kaggle/working/raw_tcga_luad'
os.makedirs(save_raw_dir, exist_ok=True)
# The label column is renamed to "disease_subtypes", the name the later cells read.
common_samples_df = common_samples_df.rename(columns={subtype_col_name: "disease_subtypes"})
#common_samples_df.to_csv(f"{save_raw_dir}/df_labeled.csv", index=False)
for key in dict_file_name.keys():
    # Remove the integer label column that was inserted into each omics table earlier.
    # NOTE(review): re-running this cell raises, because the column is already gone.
    omic_data[key] = omic_data[key].drop(subtype_col_name, axis=1)
    #omic_data[key].to_csv(f"{save_raw_dir}/df_{dict_file_name[key]}_labeled.csv", index=False)
    # No-op assignment: the drop on the unlabeled tables is commented out.
    unlabeled_omic_data[key] = unlabeled_omic_data[key] #.drop(subtype_col_name, axis=1)
    #unlabeled_omic_data[key].to_csv(f"{save_raw_dir}/df_{dict_file_name[key]}_unlabeled.csv", index=False)

# Code by Hoang

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# RANDOM_STATE_1 seeds the train/test split, RANDOM_STATE_2 the stratified k-fold split.
RANDOM_STATE_1 = 42
RANDOM_STATE_2 = 42

# Per-omic variance threshold passed to the variance filter.
dct_var_threshold = {'GE': 0.01,
                     'CNA': 0.1,
                     'Meth' : 0.001,
#                      'miRNA': 0.1
                    }
# Per-omic maximum number of features kept by the ANOVA pre-selection (None = no cap).
dct_pre_selection_top = {'GE': 2000,
                         'CNA': 2000,
                         'Meth' : 2000,
#                          'miRNA': 2000
                        }
# dct_pre_selection_top['GE'] = None  # only do variance filtering
# dct_pre_selection_top['CNA'] = None # only do variance filtering

# Per-omic significance threshold passed to the ANOVA pre-selection.
dct_adjusted_p_value_threshold = {'GE': 0.05,
                                  'CNA': 0.05,
                                  'Meth' : 0.05,
#                                   'miRNA': 0.05
                                 }

# Upper bound for the variance ratio explained by the first principal component
# of the pre-selected features (checked with an assert in the split cell).
pca_exp_var_first_comp_cond = 0.5

# Scaler choice: 'z_scaler' -> StandardScaler, 'min_max_0_1' -> MinMaxScaler((0,1)), anything else -> no scaling.
type_scale = 'min_max_0_1'
# type_scale = 'z_scaler'
scaler = StandardScaler() if type_scale == 'z_scaler' else MinMaxScaler((0,1)) if type_scale == 'min_max_0_1' else None

dict_bool_scale = {'GE': True,
                   'CNA': False,
                   'Meth': True,
#                    'miRNA': True
                  } # apply scaler or not

# Number of stratified folds; also the minimum number of samples a subtype needs to be kept.
n_folds = 4

In [17]:
import pandas as pd

default_dir = save_raw_dir
lst_cohort = ['TCGA_GBM']
lst_omics = ['GE',
             'CNA',
             'Meth',
#              'miRNA'
            ]
# 1-based omic id -> omic tag, written to JSON next to every fold later on.
dict_id_omics = {idx+1: omic for idx, omic in enumerate(lst_omics)}

lst_dataset = ['train', 'val', 'test']

# dct_df[cohort]: 'labeled' (label table) plus one numeric table per omic tag, all indexed by sampleID.
# unlabeled[cohort]: one numeric table per omic tag for the samples outside the labelled set.
dct_df= {}
unlabeled = {}

for cohort in lst_cohort:
#     print(f'Cohort: {cohort}')
    dir_cohort = f'{default_dir}/'
    
    dct_df[cohort] = {
        'labeled': common_samples_df.set_index('sampleID'), #.apply(pd.to_numeric, errors='coerce'),
    }
    
    unlabeled[cohort] = {}   
    
    for key, value in dict_file_name.items():
        # Values were read as text; convert column-wise to numbers (unparseable cells become NaN).
        dct_df[cohort][value] = omic_data[key].set_index('sampleID').apply(pd.to_numeric, errors='coerce')
        unlabeled[cohort][value] = unlabeled_omic_data[key].set_index('sampleID').apply(pd.to_numeric, errors='coerce')
    
#     for df in dct_df[cohort].values():
#         print(df.iloc[0, 0])
    # Drop the references to the intermediate tables (presumably to free memory).
    # NOTE(review): after this the cell cannot be re-run, and a second cohort in
    # lst_cohort would fail on common_samples_df being None — confirm this is acceptable.
    common_samples_df = None
    for key in dict_file_name.keys():
        omic_data[key] = None
        unlabeled_omic_data[key] = None
#     lst_data_name = ['labeled'] + lst_omics
#     lst_data_csv_name = ['df_labeled'] + [f'df_{omic}_labeled' for omic in lst_omics]
#     for data_name, data_csv_name in zip(lst_data_name, lst_data_csv_name):
#         loc_data_csv = dir_cohort + f'{data_csv_name}.csv'
#         print(f'\t{data_name}: {loc_data_csv}')
#         dct_df[cohort][data_name] = pd.read_csv(loc_data_csv)
#         dct_df[cohort][data_name] = dct_df[cohort][data_name].set_index('sampleID')

In [18]:
# Every pair of consecutive omics tables must list the same sample IDs in the same order.
for cohort in lst_cohort:
    print(f'Cohort: {cohort}')
    for omic, next_omic in zip(lst_omics, lst_omics[1:]):
        same_index = (dct_df[cohort][omic].index == dct_df[cohort][next_omic].index).all()
        assert same_index, f'Sample IDs are NOT consistent between {omic} and {next_omic}'
        print(f'Sample IDs are consistent between {omic} and {next_omic}')

Cohort: TCGA_GBM
Sample IDs are consistent between GE and CNA
Sample IDs are consistent between CNA and Meth


In [19]:
# Number of GE features whose minimum value is negative.
# `cohort` is the variable left over from the loop in the previous cell.
sum(dct_df[cohort]['GE'].min() < 0)

0

In [20]:
from sklearn.feature_selection import VarianceThreshold

def get_indices_feat_sel_var_threshold(X, threshold=0):
    """Return the column indices of ``X`` kept by ``VarianceThreshold(threshold)``.

    The indices are returned as a list of ints in ascending order.
    """
    selector = VarianceThreshold(threshold).fit(X)
    return np.flatnonzero(selector.get_support()).tolist()


from sklearn.feature_selection import SelectFdr, f_classif
def get_indices_feat_sel_fdr_anova_fval_alpha(X, y, adjusted_p_value_threshold=0.05, top_k_score=None):
    """Pre-select features with an FDR-controlled ANOVA F-test, ranked by F-statistic.

    Parameters
    ----------
    X : 2-D array, one column per feature.
    y : class label of every row of ``X``.
    adjusted_p_value_threshold : ``alpha`` handed to ``SelectFdr``.
    top_k_score : maximum number of features to return; ``None`` returns every
        feature that passes the FDR filter.

    Returns
    -------
    list of int : column indices that pass the FDR filter, ordered by decreasing
    F-statistic and truncated to ``top_k_score`` entries.

    Note: a later cell of this notebook defines a function with the same name.
    """
    sel = SelectFdr(score_func=f_classif, alpha=adjusted_p_value_threshold)
    sel.fit(X, y)

    # Features accepted by the FDR procedure (smaller p-value is better).
    passed_fdr = sel.get_support()

    # Rank all features by F-statistic, best first (bigger is better) ...
    order_desc = np.argsort(sel.scores_)[::-1]
    # ... and keep only the accepted ones. Indexing the boolean mask is a single
    # vectorised lookup instead of one list search per feature.
    order_desc = order_desc[passed_fdr[order_desc]]

    if top_k_score is not None:
        order_desc = order_desc[:top_k_score]

    return order_desc.tolist()

import numpy as np
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

def bool_check_exp_var_first_comp_cond(X, threshold=0.5, n_components=None, batch_size=1000):
    """Tell whether the first principal component of ``X`` explains less than ``threshold`` of the variance.

    Rationale: for each classification task the ANOVA F-value is computed on the
    training data, with FDR control for multiple testing, to find features that
    differ across classes. Keeping too few features may leave only highly
    correlated ones, which prevents a model from using complementary
    information. To guard against that, the number of pre-selected features per
    omics type must also satisfy this rule: the first principal component of the
    pre-selected data should explain less than 50% of the variance.

    ``n_components`` defaults to ``min(n_rows, n_columns)``; the PCA is fitted
    with ``IncrementalPCA`` using ``batch_size``.
    """
    if n_components is None:
        n_components = min(X.shape[0], X.shape[-1])

    pca_model = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    pca_model.fit(X)

    first_component_ratio = pca_model.explained_variance_ratio_.cumsum()[0]
    return first_component_ratio < threshold

In [21]:
import numpy as np
from joblib import Parallel, delayed
from sklearn.feature_selection import VarianceThreshold, SelectFdr, f_classif
from sklearn.utils import resample
from sklearn.decomposition import IncrementalPCA

def parallel_variance_threshold(X, threshold, n_jobs=-1):
    """Return the column indices of ``X`` kept by ``VarianceThreshold(threshold)``.

    ``n_jobs`` is accepted for signature compatibility but is not used.
    The indices are returned as a list of ints in ascending order.
    """
    selector = VarianceThreshold(threshold).fit(X)
    kept_mask = selector.get_support()
    return np.flatnonzero(kept_mask).tolist()

def parallel_fdr_anova_fval(X_chunk, y_chunk, adjusted_p_value_threshold):
    """Fit ``SelectFdr`` with the ANOVA F-test on one chunk of features.

    Returns the per-feature F-statistics and p-values of the fitted selector.
    """
    selector = SelectFdr(score_func=f_classif, alpha=adjusted_p_value_threshold)
    selector.fit(X_chunk, y_chunk)
    f_scores, p_vals = selector.scores_, selector.pvalues_
    return f_scores, p_vals

def get_indices_feat_sel_var_threshold(X, threshold=0, n_jobs=-1):
    """Return the column indices of ``X`` whose variance passes ``threshold``.

    Thin wrapper that delegates to ``parallel_variance_threshold``.
    """
    kept_indices = parallel_variance_threshold(X, threshold=threshold, n_jobs=n_jobs)
    return kept_indices

def get_indices_feat_sel_fdr_anova_fval_alpha(X, y, adjusted_p_value_threshold=0.05, top_k_score=None, n_jobs=-1):
    """Pre-select features with an FDR-controlled ANOVA F-test, ranked by F-statistic.

    The F-statistics and p-values are computed per chunk of columns (optionally
    in parallel); the false-discovery-rate decision is then taken once over the
    p-values of all features.

    Parameters
    ----------
    X : 2-D array, one column per feature.
    y : class label of every row of ``X``.
    adjusted_p_value_threshold : FDR level (Benjamini-Hochberg ``alpha``).
    top_k_score : maximum number of features to return; ``None`` returns every
        feature that passes the FDR filter.
    n_jobs : forwarded to ``joblib.Parallel``. A positive value is also the
        number of column chunks; any other value uses a single chunk.

    Returns
    -------
    list of int : column indices that pass the FDR filter, ordered by decreasing
    F-statistic and truncated to ``top_k_score`` entries.
    """
    n_features = X.shape[1]
    if n_features == 0:
        return []

    # Never create more chunks than there are columns (an empty chunk cannot be fitted).
    n_chunks = min(n_jobs, n_features) if n_jobs > 0 else 1
    # Integer edges that cover every column, including the remainder when
    # n_features is not a multiple of n_chunks.
    edges = [n_features * i // n_chunks for i in range(n_chunks + 1)]
    chunks = [(X[:, start:stop], y) for start, stop in zip(edges[:-1], edges[1:])]

    results = Parallel(n_jobs=n_jobs)(
        delayed(parallel_fdr_anova_fval)(X_chunk, y_chunk, adjusted_p_value_threshold) for X_chunk, y_chunk in chunks
    )

    f_statistic = np.hstack([res[0] for res in results])
    p_values = np.hstack([res[1] for res in results])

    # Benjamini-Hochberg step-up procedure over ALL features. A per-chunk decision
    # or a raw `p < alpha` cut would not control the false discovery rate.
    sorted_p = np.sort(p_values)
    bh_limits = float(adjusted_p_value_threshold) / n_features * np.arange(1, n_features + 1)
    accepted_p = sorted_p[sorted_p <= bh_limits]
    if accepted_p.size == 0:
        return []
    bool_sels = p_values <= accepted_p.max()

    # Rank by F-statistic, best first, then drop the rejected features with a mask
    # so that the ranking order is preserved.
    f_statistic_desc_indices = np.argsort(f_statistic)[::-1]
    f_statistic_desc_indices = f_statistic_desc_indices[bool_sels[f_statistic_desc_indices]]

    if top_k_score is not None:
        f_statistic_desc_indices = f_statistic_desc_indices[:top_k_score]

    return f_statistic_desc_indices.tolist()

def bool_check_exp_var_first_comp_cond(X, threshold=0.5, n_components=None, batch_size=1000):
    """Tell whether the first principal component of ``X`` explains less than ``threshold`` of the variance.

    ``n_components`` defaults to ``min(n_rows, n_columns)``; the PCA is fitted
    with ``IncrementalPCA`` using ``batch_size``.
    """
    if n_components is None:
        n_rows, n_cols = X.shape[0], X.shape[1]
        n_components = min(n_rows, n_cols)

    fitted_pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    fitted_pca.fit(X)

    cumulative_ratio = fitted_pca.explained_variance_ratio_.cumsum()
    return cumulative_ratio[0] < threshold

In [22]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import joblib
import json 

# For every cohort: encode the labels, hold out a stratified test set, pre-select
# features on the training part, then write one folder per stratified CV fold with
# (optionally scaled) train / val / test / unlabeled tables.
for cohort in lst_cohort:
    print(f'Cohort: {cohort}')
    # All outputs of this cohort are written relative to its own folder.
    # NOTE(review): mkdir without -p reports an error when the folder already exists (re-run).
    !mkdir '{cohort}'
    %cd '{cohort}'
    
    # Convert label in name format to label in numeric id format ~ as asc order of name
    arr_subtype, arr_num_ex_subtype = np.unique(dct_df[cohort]['labeled']['disease_subtypes'].values, return_counts=True)
    
    # Keep only the subtypes that have at least n_folds samples.
    # NOTE(review): only the label table is filtered here; the omics tables that are
    # indexed by position below are not. Positions would no longer line up if a
    # subtype were actually removed — TODO confirm / filter the omics tables too.
    valid_subtypes = arr_subtype[arr_num_ex_subtype >= n_folds]
    dct_df[cohort]['labeled'] = dct_df[cohort]['labeled'][dct_df[cohort]['labeled']['disease_subtypes'].isin(valid_subtypes)]
    
    # Recalculate the unique subtypes and their counts after filtering
    arr_subtype, arr_num_ex_subtype = np.unique(dct_df[cohort]['labeled']['disease_subtypes'].values, return_counts=True)
    
    n_examples = len(dct_df[cohort]['labeled'])
    lst_subtype = list(arr_subtype)
    lst_num_ex_subtype = list(arr_num_ex_subtype)
    # 0-based ids assigned in the sorted order of the subtype names.
    dct_index_subtype = {index : subtype for index, subtype in enumerate(lst_subtype)} 
    dct_subtype_index = {subtype : index for index, subtype in enumerate(lst_subtype)} 
    dct_df[cohort]['labeled']['disease_subtype_ids'] = dct_df[cohort]['labeled'].replace({'disease_subtypes': dct_subtype_index})['disease_subtypes'].values
    print(dct_df[cohort]['labeled'].head())
    print(dct_subtype_index)
    print()
    
    print(f"Before train/test split: \n\tTotals {n_examples}-{list(zip(list(dct_index_subtype.keys()), lst_subtype, lst_num_ex_subtype))}")
    y = dct_df[cohort]['labeled']['disease_subtype_ids'].values
    # The split is done on row positions; the omics tables are sliced with .iloc below.
    X_idx = np.arange(n_examples)

    X_train_idx, \
    X_test_idx, \
    y_train, \
    y_test = train_test_split(
        X_idx, y, 
        test_size=0.2, 
        stratify=y,
        shuffle=True,
        random_state=RANDOM_STATE_1)
    print(f"All train: \n\tTotals {len(y_train)}-{np.unique(y_train, return_counts=True)}")
    print(f"All test: \n\tTotals {len(y_test)}-{np.unique(y_test, return_counts=True)}")
    

    # Unscaled train/test tables of the original split go to this folder.
    train_test_split_org = 'train_test_split_org'
    !mkdir '{train_test_split_org}'
    pd.DataFrame(y[X_train_idx]).to_csv(f'{train_test_split_org}/labels_tr.csv',
                                        index=False, header=False)
    pd.DataFrame(y[X_test_idx]).to_csv(f'{train_test_split_org}/labels_te.csv',
                                                index=False, header=False) 
    
    # Feature names kept per omic; fitted on the training part of the split only.
    dct_name_feat_to_keep = {}
    for omic_id, omic in enumerate(lst_omics):
        omic_id = omic_id+1
        print(omic_id,omic)
        
        tmp_X_tr = dct_df[cohort][omic].iloc[X_train_idx]
        print(f'len train org: {len(tmp_X_tr)}')
        
        
        # Step 1: variance filter.
        sel_idx_var = get_indices_feat_sel_var_threshold(tmp_X_tr.values, dct_var_threshold[omic])
        tmp_X_tr = tmp_X_tr.iloc[:,sel_idx_var]
        
        # Step 2: ANOVA-based pre-selection, capped at dct_pre_selection_top[omic] features.
        sel_idx_anova_f = get_indices_feat_sel_fdr_anova_fval_alpha(X=tmp_X_tr.values, y=y_train, 
                                                                    adjusted_p_value_threshold=dct_adjusted_p_value_threshold[omic], 
                                                                    top_k_score=dct_pre_selection_top[omic])
        sel_idx_anova_f = sorted(sel_idx_anova_f) # sorted (return in order of score decreasing, top => idx not in order => sort in order to not change original order)
        tmp_X_tr = tmp_X_tr.iloc[:,sel_idx_anova_f]
        
        # Step 3: the first principal component must explain less than the configured ratio.
        assert bool_check_exp_var_first_comp_cond(tmp_X_tr.values, pca_exp_var_first_comp_cond), f'Not pass first component explained threshold with omic {omic}, preselection = {dct_pre_selection_top[omic]}'
        
        dct_name_feat_to_keep[omic] = tmp_X_tr.columns.tolist()
        
        tmp_X_te = dct_df[cohort][omic].iloc[X_test_idx] # is exactly independent test set
        tmp_X_te = tmp_X_te[dct_name_feat_to_keep[omic]]
        # The unlabeled table is reduced to the same selected features.
        unlabeled[cohort][omic] = unlabeled[cohort][omic][dct_name_feat_to_keep[omic]]
        print(f'len test idp: {len(tmp_X_te)}')

        tmp_X_tr.to_csv(f'{train_test_split_org}/{omic_id}_tr.csv',
                                              index=False, header=False)
        tmp_X_te.to_csv(f'{train_test_split_org}/{omic_id}_te.csv',
                                              index=False, header=False)
        
        #save features name or genes names
        tmp_X_tr.head(0).T.to_csv(f'{train_test_split_org}/{omic_id}_featname.csv',header=False)
        
    # Stratified k-fold over the training part; the features selected above are reused in every fold.
    skf = StratifiedKFold(n_splits=n_folds, random_state=RANDOM_STATE_2, shuffle=True)
    result_skf = skf.split(X_train_idx,y_train)

    for idx_fold, (train_index, val_index) in enumerate(result_skf):
        idx_fold = idx_fold+1
        print(f'\nFold {idx_fold}:')
        
        # One output folder per fold, named by the 1-based fold number.
        !mkdir '{idx_fold}'

        with open(f"{idx_fold}/dict_id_omics.json", "w") as outfile: 
            json.dump(dict_id_omics, outfile)
        with open(f"{idx_fold}/dct_index_subtype.json", "w") as outfile: 
            json.dump(dct_index_subtype, outfile)

        pd.DataFrame(y_train[train_index]).to_csv(f'{idx_fold}/labels_tr.csv',
                                            index=False, header=False)
        print(f'dist train fold: {np.unique(y_train[train_index],return_counts=True)}')

        pd.DataFrame(y_train[val_index]).to_csv(f'{idx_fold}/labels_val.csv',
                                                    index=False, header=False)
        print(f'dist val fold: {np.unique(y_train[val_index],return_counts=True)}')
        
        # The held-out test labels are the same in every fold.
        pd.DataFrame(y[X_test_idx]).to_csv(f'{idx_fold}/labels_te.csv',
                                                    index=False, header=False)
        print(f'dist test fold: {np.unique(y[X_test_idx],return_counts=True)}')

        
        for omic_id, omic in enumerate(lst_omics):
            omic_id = omic_id+1
            print(omic_id,omic)
            
            #save features name or genes names
            dct_df[cohort][omic][dct_name_feat_to_keep[omic]].head(0).T.to_csv(f'{idx_fold}/{omic_id}_featname.csv',header=False)

            
            # train_index / val_index are positions inside X_train_idx, hence the double indexing.
            tmp_X_tr = dct_df[cohort][omic].iloc[X_train_idx[train_index]]
            tmp_X_tr = tmp_X_tr[dct_name_feat_to_keep[omic]]
            print(f'len train fold: {len(tmp_X_tr)}')
            
            tmp_X_val = dct_df[cohort][omic].iloc[X_train_idx[val_index]] # is exactly test of kfolds != independent test set
            tmp_X_val = tmp_X_val[dct_name_feat_to_keep[omic]]
            print(f'len val fold: {len(tmp_X_val)}')
            
            tmp_X_te = dct_df[cohort][omic].iloc[X_test_idx] # is exactly test of kfolds != independent test set
            tmp_X_te = tmp_X_te[dct_name_feat_to_keep[omic]]
            print(f'len test fold: {len(tmp_X_te)}')
            
            if scaler is not None and dict_bool_scale[omic]:
                # The shared scaler object is re-fitted on this fold's training rows
                # and saved before being applied to train, val and test.
                scaler.fit(tmp_X_tr.values)
                joblib.dump(scaler, f'{idx_fold}/{scaler.__class__.__name__}_{omic}.pkl')
                
                tmp_X_tr = pd.DataFrame(scaler.transform(tmp_X_tr.values), index=tmp_X_tr.index, columns=tmp_X_tr.columns)
                tmp_X_val = pd.DataFrame(scaler.transform(tmp_X_val.values), index=tmp_X_val.index, columns=tmp_X_val.columns)
                tmp_X_te = pd.DataFrame(scaler.transform(tmp_X_te.values), index=tmp_X_te.index, columns=tmp_X_te.columns)
                
            tmp_X_tr.to_csv(f'{idx_fold}/{omic_id}_tr.csv',
                                                  index=False, header=False)
            tmp_X_val.to_csv(f'{idx_fold}/{omic_id}_val.csv',
                                                  index=False, header=False)
            tmp_X_te.to_csv(f'{idx_fold}/{omic_id}_te.csv',
                                                  index=False, header=False)
            # NOTE(review): the unlabeled table is written without the scaler applied,
            # unlike train/val/test above — confirm this is intended.
            unlabeled[cohort][omic].to_csv(f'{idx_fold}/{omic_id}_unlabeled.csv',
                                              index=False, header=False)
    # Return to the parent working directory.
    %cd ..

Cohort: TCGA_GBM


/kaggle/working/TCGA_GBM
0               disease_subtypes  disease_subtype_ids
sampleID                                             
TCGA-02-0001-01        Classical                    0
TCGA-02-0003-01        Proneural                    3
TCGA-02-0007-01        Proneural                    3
TCGA-02-0009-01        Classical                    0
TCGA-02-0010-01        Proneural                    3
{'Classical': 0, 'Mesenchymal': 1, 'Neural': 2, 'Proneural': 3}

Before train/test split: 
	Totals 270-[(0, 'Classical', 71), (1, 'Mesenchymal', 81), (2, 'Neural', 46), (3, 'Proneural', 72)]
All train: 
	Totals 216-(array([0, 1, 2, 3]), array([57, 65, 37, 57]))
All test: 
	Totals 54-(array([0, 1, 2, 3]), array([14, 16,  9, 15]))


1 GE
len train org: 216


len test idp: 54


2 CNA
len train org: 216


len test idp: 54


3 Meth
len train org: 216


len test idp: 54



Fold 1:


dist train fold: (array([0, 1, 2, 3]), array([43, 49, 27, 43]))
dist val fold: (array([0, 1, 2, 3]), array([14, 16, 10, 14]))
dist test fold: (array([0, 1, 2, 3]), array([14, 16,  9, 15]))
1 GE
len train fold: 162
len val fold: 54
len test fold: 54


2 CNA
len train fold: 162
len val fold: 54
len test fold: 54


3 Meth
len train fold: 162
len val fold: 54
len test fold: 54



Fold 2:


dist train fold: (array([0, 1, 2, 3]), array([43, 49, 28, 42]))
dist val fold: (array([0, 1, 2, 3]), array([14, 16,  9, 15]))
dist test fold: (array([0, 1, 2, 3]), array([14, 16,  9, 15]))
1 GE
len train fold: 162
len val fold: 54
len test fold: 54


2 CNA
len train fold: 162
len val fold: 54
len test fold: 54


3 Meth
len train fold: 162
len val fold: 54
len test fold: 54



Fold 3:


dist train fold: (array([0, 1, 2, 3]), array([43, 48, 28, 43]))
dist val fold: (array([0, 1, 2, 3]), array([14, 17,  9, 14]))
dist test fold: (array([0, 1, 2, 3]), array([14, 16,  9, 15]))
1 GE
len train fold: 162
len val fold: 54
len test fold: 54


2 CNA
len train fold: 162
len val fold: 54
len test fold: 54


3 Meth
len train fold: 162
len val fold: 54
len test fold: 54



Fold 4:


dist train fold: (array([0, 1, 2, 3]), array([42, 49, 28, 43]))
dist val fold: (array([0, 1, 2, 3]), array([15, 16,  9, 14]))
dist test fold: (array([0, 1, 2, 3]), array([14, 16,  9, 15]))
1 GE
len train fold: 162
len val fold: 54
len test fold: 54


2 CNA
len train fold: 162
len val fold: 54
len test fold: 54


3 Meth
len train fold: 162
len val fold: 54
len test fold: 54


/kaggle/working


In [23]:
ls

[0m[01;34mTCGA_GBM[0m/  __notebook__.ipynb  [01;34mraw_tcga_luad[0m/


In [24]:
# Archive base name: cohort, train/test split seed, scaler class (if any) and the
# per-omic scaling flags. `cohort` is the variable left over from the last loop.
output_path=\
    f'{cohort}_RS{RANDOM_STATE_1}{"__" + scaler.__class__.__name__ if scaler is not None else ""}' \
    + '_' \
    + '_'.join([f'{key}_{value}' for key, value in dict_bool_scale.items()])
output_path

'TCGA_GBM_RS42__MinMaxScaler_GE_True_CNA_False_Meth_True'

In [None]:
import shutil
# Zip the output folder of each cohort.
for cohort in lst_cohort:
    # base_dir limits the archive to the cohort folder. Without it make_archive
    # packs the whole current directory (notebook, raw folder and the archive
    # file that is being written). Entries keep their "<cohort>/..." prefix.
    # NOTE(review): output_path is a single name built in the previous cell, so
    # with several cohorts each iteration would overwrite the same archive.
    shutil.make_archive(output_path, 'zip', root_dir='.', base_dir=cohort)