In [53]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold

import pickle

SEED = 42

tissue_param = {"lung": {
                "name": "Lung", 
                "age_data": r"../../metadata/gene_expression_metadata/metadata_lung.tsv",
                "test_data": "../../metadata/lung_test_metadata.csv"
            }, 
            "ovary": {
                "name": "Ovary", 
                "age_data": r"../../metadata/gene_expression_metadata/metadata_ovary.tsv",
                "test_data": "../../metadata/ovary_test_metadata.csv"
            },
        }



def load_lung_data() -> pd.DataFrame:
    return pd.read_csv("../../data/X_coding_lung_log2.csv")

def load_ovary_data() -> pd.DataFrame:
    return pd.read_csv("../../data/X_coding_ovary_log2.csv")


tissues_to_run = ["lung", "ovary"]

In [54]:
def split_in_train_test(gexp, age_data, test_set):
    """Split the dataset into train and test"""
    #Metadata
    metadata_test = age_data.loc[age_data["tissue_sample_id"].isin(test_set["sample_id"])]
    metadata_train = age_data.loc[~age_data["tissue_sample_id"].isin(test_set["sample_id"])]
    #Age data
    y_test = list(metadata_test["age"])
    y_train = list(metadata_train["age"])
    #Gene Expression data
    X_train = gexp.loc[metadata_train.tissue_sample_id]
    X_test = gexp.loc[metadata_test.tissue_sample_id]

    return(X_train, X_test, y_train, y_test)


In [67]:
for tissue in tissues_to_run:
    res_dir = f"../../results/3.feature_selection_gexp_hist/{tissue}"
    if not os.path.exists(res_dir): 
        os.mkdir(res_dir)

    if tissue == "lung":
        gexp_data = load_lung_data()
        tissue_name = "Lung"
    elif tissue == "ovary": 
        gexp_data = load_ovary_data()
        tissue_name = "Ovary"
    else: 
        print("Error.. No tissue with name, ", tissue)
        next
    
    gexp_data = gexp_data.set_index("tissue_sample_id")

    # Load age data
    age_data = tissue_param[tissue]["age_data"]
    age_data = pd.read_csv(age_data, sep = "\t")

    # Load test set
    test_data = tissue_param[tissue]["test_data"]
    test_set = pd.read_csv(test_data)
    
    #Filter for complete samples (with all data types)
    multi_modal_table = "../../metadata/sample_ids_multiomics_updated_tl_data.csv"
    multi_modal_table = pd.read_csv(multi_modal_table)

    filtered_multi_modal_table = multi_modal_table[multi_modal_table['tissue'] == tissue_name]
    common_samples = filtered_multi_modal_table[(filtered_multi_modal_table["metadata"]) == 1 & (filtered_multi_modal_table["gene_expression"])]

    print(f"Number of Common samples: {common_samples.shape[0]} in {tissue}")

    #Load Athos samples (the ones he was able to process)
    if tissue == "lung": 
        hist_data = pickle.load(open("../../data/features_histology/lung/lung_features_mean256_cls4k.pkl", "rb"))
    if tissue == "ovary": 
        hist_data = pickle.load(open("../../data/features_histology/ovary/ovary_features_mean256_cls4k.pkl", "rb"))
    
    # Remove samples with histology that were not processed
    common_samples = common_samples[common_samples.sample_id.isin(hist_data.index)]
    print(f"Number of Common samples after removing histology samples not processed: {common_samples.shape[0]}")
    
    #Divide in train and test set
    age_data_multi_modal = age_data.loc[age_data["tissue_sample_id"].isin(common_samples["sample_id"])]
    print(f"Number of Common samples after removing histology samples with no age: {common_samples.shape[0]}")
    X_train, X_test, y_train, y_test = split_in_train_test(gexp_data, age_data_multi_modal, test_set)
    
    print(f"Train dataset shape: {X_train.shape}")
    print(f"Test dataset shape: {X_test.shape}")

    #Define folds for CV
    n_folds = 5

    age_bins = np.arange(18, 76, 5)
    y_bins = pd.cut(y_train, age_bins, labels=False)
    skf_gen = StratifiedKFold(n_folds, shuffle = True, random_state = SEED).split(X_train, y_bins)

    folds = [[t[0], t[1]] for t in skf_gen]

    # Save folds to a text file
    for i in range(len(folds)): 
        fold = folds[i]
        train_data = fold[0]
        test_data = fold[1] 

        train_samples = X_train.index[train_data]
        test_samples = X_train.index[test_data]
        
        train_data = pd.DataFrame({"index": train_data, "sample": train_samples})
        test_data = pd.DataFrame({"index": test_data, "sample": test_samples})
        
        # Save train and test data to separate CSV files
        train_data.to_csv(f'../../results/3.feature_selection_gexp_hist/{tissue}/fold_{i}_train.csv', index=False)
        test_data.to_csv(f'../../results/3.feature_selection_gexp_hist/{tissue}/fold_{i}_test.csv', index=False)
        
        print(f"Number of samples in fold train {train_data.shape[0]}")
        print(f"Number of samples in fold test {test_data.shape[0]}")
        
        # Save exclusive Gene expression data (used during feature selection)
        gexp_metadata = pd.read_csv(f"../../metadata/gene_expression_metadata/metadata_{tissue}.tsv", sep ="\t")
        gexp_metadata_exclusive = gexp_metadata[~gexp_metadata["tissue_sample_id"].isin(common_samples.sample_id)]
        gexp_metadata_exclusive.to_csv(f'../../results/3.feature_selection_gexp_hist/{tissue}/exclusive_gene_expression_train.v2.csv', index=False)
            

Number of Common samples: 577 in lung
Number of Common samples after removing histology samples not processed: 557
Number of Common samples after removing histology samples with no age: 557
Train dataset shape: (508, 15745)
Test dataset shape: (43, 15745)
Number of samples in fold train 406
Number of samples in fold test 102
Number of samples in fold train 406
Number of samples in fold test 102
Number of samples in fold train 406
Number of samples in fold test 102
Number of samples in fold train 407
Number of samples in fold test 101
Number of samples in fold train 407
Number of samples in fold test 101
Number of Common samples: 180 in ovary
Number of Common samples after removing histology samples not processed: 175
Number of Common samples after removing histology samples with no age: 175
Train dataset shape: (141, 14711)
Test dataset shape: (32, 14711)
Number of samples in fold train 112
Number of samples in fold test 29
Number of samples in fold train 113
Number of samples in fold 

