In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold

import pickle

# Random seed shared by every stochastic step so the folds are reproducible.
SEED = 42

# Per-tissue configuration: display name plus the paths of the age-annotation
# table ("age_data") and of the held-out test-set metadata ("test_data").
tissue_param = {
    "lung": {
        "name": "Lung",
        "age_data": "../../metadata/lung_annotation_meth.csv",
        "test_data": "../../metadata/lung_test_metadata.csv",
    },
    "ovary": {
        "name": "Ovary",
        "age_data": "../../metadata/ovary_annotation_meth.csv",
        "test_data": "../../metadata/ovary_test_metadata.csv",
    },
}



def load_lung_data(path: str = "../../data/methylation_lung.csv") -> pd.DataFrame:
    """Read the lung methylation table from a CSV file.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the project's lung methylation
        file, so existing zero-argument calls behave exactly as before.

    Returns
    -------
    pd.DataFrame
        The table exactly as parsed by ``pd.read_csv`` (no index column is set).
    """
    return pd.read_csv(path)

def load_ovary_data(path: str = "../../data/methylation_ovary.csv") -> pd.DataFrame:
    """Read the ovary methylation table from a CSV file.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the project's ovary methylation
        file, so existing zero-argument calls behave exactly as before.

    Returns
    -------
    pd.DataFrame
        The table exactly as parsed by ``pd.read_csv`` (no index column is set).
    """
    return pd.read_csv(path)


# Tissues to process; each entry is used as a key into `tissue_param`.
tissues_to_run = ["lung", "ovary"]

In [2]:

def split_in_train_test(meth, age_data, test_set):
    """Partition methylation data and ages into train and test sets.

    Rows of ``age_data`` whose ``tissue_sample_id`` appears in
    ``test_set["sample_id"]`` form the test set; all remaining rows form the
    training set.

    Parameters
    ----------
    meth : pd.DataFrame
        Methylation table whose columns are labelled by ``Sample_ID``; it is
        transposed so that samples become rows.
    age_data : pd.DataFrame
        Metadata with ``tissue_sample_id``, ``Sample_ID`` and ``AGE`` columns.
    test_set : pd.DataFrame
        Table with a ``sample_id`` column listing the held-out samples.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``: the X values are DataFrames
        with one row per sample, the y values are lists of ages in the same
        order.
    """
    # Membership mask, computed once and negated for the training rows.
    is_test = age_data["tissue_sample_id"].isin(test_set["sample_id"])
    test_meta = age_data.loc[is_test]
    train_meta = age_data.loc[~is_test]

    # Samples as rows so they can be selected by label.
    samples_as_rows = meth.transpose()
    X_train = samples_as_rows.loc[train_meta.Sample_ID]
    X_test = samples_as_rows.loc[test_meta.Sample_ID]

    y_train = list(train_meta["AGE"])
    y_test = list(test_meta["AGE"])

    return X_train, X_test, y_train, y_test


In [12]:
# For every tissue: pick the samples that have all data modalities, split them
# into train/test, build stratified CV folds on the training part and write the
# folds (plus the lists of single-modality "exclusive" samples) to disk.

# Table flagging which modalities exist for each sample. It is identical for
# every tissue, so it is read once instead of once per loop iteration.
multi_modal_table = pd.read_csv("../../metadata/sample_ids_multiomics_updated_tl_data.csv")

for tissue in tissues_to_run:
    if tissue == "lung":
        meth_data = load_lung_data()
    elif tissue == "ovary":
        meth_data = load_ovary_data()
    else:
        print("Error.. No tissue with name, ", tissue)
        # Skip this tissue; a bare `next` here would be a no-op expression and
        # execution would fall through to the code below.
        continue
    tissue_name = tissue_param[tissue]["name"]

    # makedirs also creates missing parent directories and tolerates reruns.
    res_dir = f"../../results/3.feature_selection_multimodal/{tissue}"
    os.makedirs(res_dir, exist_ok=True)

    # Load age annotations and the held-out test set definition.
    age_data = pd.read_csv(tissue_param[tissue]["age_data"])
    test_set = pd.read_csv(tissue_param[tissue]["test_data"])

    # Keep only samples of this tissue for which every modality is available.
    # Each comparison is parenthesised because `&` binds tighter than `==`.
    # (Column names, including their spelling, are those used in the CSV.)
    # NOTE(review): assumes the modality columns are 0/1 flags — confirm.
    tissue_samples = multi_modal_table[multi_modal_table["tissue"] == tissue_name]
    is_complete = (
        (tissue_samples["metadata"] == 1)
        & (tissue_samples["gene_expression"] == 1)
        & (tissue_samples["metilation"] == 1)
        & (tissue_samples["telemore"] == 1)
    )
    complete_samples = tissue_samples[is_complete]

    # Histology features exist only for the samples that could be processed.
    # NOTE: pickle.load executes arbitrary code from the file; only use this
    # on trusted, project-generated pickles.
    hist_path = f"../../data/features_histology/{tissue}/{tissue}_features_mean256_cls4k.pkl"
    with open(hist_path, "rb") as hist_file:
        hist_data = pickle.load(hist_file)
    # Drop samples whose histology was not processed.
    # NOTE(review): presumably hist_data is indexed by sample id — confirm.
    complete_samples = complete_samples[complete_samples.sample_id.isin(hist_data.index)]

    # Divide the complete samples into train and test sets.
    age_data_multi_modal = age_data.loc[age_data["tissue_sample_id"].isin(complete_samples["sample_id"])]
    X_train, X_test, y_train, y_test = split_in_train_test(meth_data, age_data_multi_modal, test_set)

    # Define folds for CV, stratified on 5-year age bins.
    n_folds = 5
    age_bins = np.arange(18, 76, 5)
    # NOTE(review): pd.cut uses right-closed bins and the last edge is 73, so
    # ages <= 18 or > 73 get a NaN bin — confirm the cohort stays within range.
    y_bins = pd.cut(y_train, age_bins, labels=False)
    skf_gen = StratifiedKFold(n_folds, shuffle=True, random_state=SEED).split(X_train, y_bins)
    folds = [[train_idx, val_idx] for train_idx, val_idx in skf_gen]

    # Save each fold as two CSVs holding positional indices and sample labels.
    for i, (train_idx, val_idx) in enumerate(folds):
        fold_train = pd.DataFrame({"index": train_idx, "sample": X_train.index[train_idx]})
        fold_test = pd.DataFrame({"index": val_idx, "sample": X_train.index[val_idx]})
        fold_train.to_csv(f"{res_dir}/fold_{i}_train.csv", index=False)
        fold_test.to_csv(f"{res_dir}/fold_{i}_test.csv", index=False)

    # The exclusive-sample tables do not depend on the fold, so they are
    # written once per tissue rather than once per fold.

    # Samples with DNA methylation but not in the complete set (used during
    # feature selection).
    exclusive_methylation_samples = age_data[~age_data["tissue_sample_id"].isin(complete_samples.sample_id)]
    exclusive_methylation_samples.to_csv(f"{res_dir}/exclusive_methylation_train.csv", index=False)

    # Samples with gene expression but not in the complete set (used during
    # feature selection).
    gexp_metadata = pd.read_csv(f"../../metadata/gene_expression_metadata/metadata_{tissue}.tsv", sep="\t")
    gexp_metadata_exclusive = gexp_metadata[~gexp_metadata["tissue_sample_id"].isin(complete_samples.sample_id)]
    gexp_metadata_exclusive.to_csv(f"{res_dir}/exclusive_gene_expression_train.csv", index=False)
            

