In [1]:
import os
import re
import glob
import pandas as pd
from collections import defaultdict, Counter

In [2]:
# Find every prediction report that sits exactly two directory levels below the
# working directory (later cells treat those levels as <exposure>/<cell type>).
report_pattern = "*/*/prediction_report.tsv"
prediction_files = glob.glob(report_pattern)

In [3]:
# For each prediction report: keep only the columns whose name contains "auc",
# remove "_auc" from those names, and save the result as roc_auc.tsv in the
# same directory as the report.
for report_path in prediction_files:
    predict_df = pd.read_csv(report_path, sep="\t", index_col=0)
    # Map each AUC column to its shortened name (insertion order == column order).
    renamed = {col: col.replace("_auc", "") for col in predict_df.columns if 'auc' in col}
    auc_df = predict_df[list(renamed)].rename(columns=renamed)
    outdir = os.path.dirname(report_path)
    auc_df.to_csv(f"{outdir}/roc_auc.tsv", sep="\t")

In [4]:
# Collect the roc_auc.tsv tables found two directory levels down.
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# keeps the row/column order of the summary table built from this list
# reproducible between runs and machines.
auc_files = sorted(glob.glob("*/*/roc_auc.tsv"))

In [5]:
# total_dmrs[<exposure>][<cell type>] -> number of rows in DML_All.importance.tsv.
all_dmrs = glob.glob("*/*/DML_All.importance.tsv")
total_dmrs = defaultdict(dict)
for dmrfile in all_dmrs:
    # Read as tab-separated, matching how the other *.importance.tsv files in
    # this notebook are parsed, so 'id' and 'weight' land in their own columns.
    dmr_df = pd.read_csv(dmrfile, sep="\t", names=['id', 'weight'])
    # The first two path components are the exposure directory and the cell type.
    expo, cell = dmrfile.split("/")[:2]
    # Drop the numeric "NN." prefix of the directory name (e.g. "01.HIV" -> "HIV").
    # Raw string: "\." in a plain literal is an invalid escape sequence and
    # triggers a warning on modern Python.
    expo = re.sub(r"[0-9]+\.", "", expo)
    total_dmrs[expo][cell] = len(dmr_df)

In [18]:
# Minimum ROC AUC a model must reach for its DMR subset to be accepted.
AUC_THRESHOLD = 0.95

# minimun_dmrs["<exposure>_<condition>"]["<cell type>"] -> number of DMRs with a
# non-zero weight in the first row (smallest index value) of roc_auc.tsv whose
# AUC for that condition reaches AUC_THRESHOLD. Conditions that never reach the
# threshold get no entry (they show up as NaN in the summary table).
minimun_dmrs = defaultdict(dict)
for aucfile in auc_files:
    auc_df = pd.read_csv(aucfile, sep="\t", index_col=0)
    expo_dir, cell = aucfile.split("/")[:2]
    # Raw string: "\." in a plain literal is an invalid escape sequence and
    # triggers a warning on modern Python.
    expo = re.sub(r"[0-9]+\.", "", expo_dir)

    # Conditions (AUC columns) still waiting for their first passing row,
    # kept in column order so dictionary insertion order is unchanged.
    pending = list(auc_df.columns)
    for percentage, row in auc_df.sort_index().iterrows():
        reached = [cond for cond in pending if row[cond] >= AUC_THRESHOLD]
        if not reached:
            continue
        # Read the importance table once per row instead of once per condition.
        # NOTE(review): the file name is the index value followed by "0.0" --
        # presumably index 1 maps to "DML_Percent10.0"; confirm against the data.
        weight_df = pd.read_csv(f"{expo_dir}/{cell}/DML_Percent{percentage}0.0.importance.tsv", sep="\t", names=['id', 'weight'])
        # Count only DMRs that carry a non-zero weight.
        minimum_dmr = int((weight_df['weight'] != 0).sum())
        for cond in reached:
            minimun_dmrs[f'{expo}_{cond}'][cell] = minimum_dmr
        pending = [cond for cond in pending if cond not in reached]
        if not pending:
            # Every condition is resolved; later rows cannot change anything.
            break

In [19]:
# Turn the nested dict into a table: one row per "<exposure>_<condition>" key,
# one column per cell type; missing combinations become NaN.
minimun_dmrs_df = pd.DataFrame.from_dict(minimun_dmrs, orient='index')

In [20]:
# Show the summary table (rows: exposure_condition, columns: cell types).
minimun_dmrs_df

Unnamed: 0,B-Mem,B-Naive,Monocyte,NK-cell1,NK-cell2,Tc-Mem,Tc-Naive,Th-Mem,Th-Naive
HIV_cro,492.0,371.0,684.0,398.0,394.0,742.0,346.0,491.0,189.0
HIV_pre,492.0,208.0,385.0,398.0,394.0,742.0,346.0,491.0,189.0
HIV_acu,836.0,260.0,230.0,398.0,246.0,742.0,346.0,491.0,189.0
MRSA_Ctrl,289.0,188.0,444.0,230.0,257.0,485.0,485.0,485.0,530.0
MRSA_MRSA,289.0,188.0,444.0,493.0,257.0,485.0,485.0,485.0,530.0
MRSA_MSSA,289.0,188.0,444.0,376.0,650.0,485.0,485.0,485.0,530.0
BA_Ctrl,265.0,100.0,239.0,262.0,146.0,331.0,241.0,256.0,235.0
BA_frequent,265.0,100.0,239.0,,375.0,331.0,241.0,256.0,235.0
BA_unfrequent,643.0,188.0,239.0,262.0,375.0,331.0,241.0,739.0,235.0
OP_Ctrl,468.0,474.0,758.0,479.0,480.0,1214.0,635.0,769.0,808.0


In [21]:
# Load the Flu AUC table. Unlike the other exposures (matched by "*/*/roc_auc.tsv"),
# this file sits directly in the exposure directory.
# NOTE: this rebinds auc_df, replacing the frame left over from the earlier loops.
auc_df = pd.read_csv("06.Flu/roc_auc.tsv", sep="\t", index_col=0)

In [22]:
# Add the Flu results to minimun_dmrs. In the Flu table each column of auc_df
# is used as a cell type, and the DMR count comes from the pre-computed
# minimal-DMR BED file of that cell type.
expo = 'Flu'
expo_dir = '06.Flu'
celltypes = auc_df.columns

for celltype in celltypes:
    # The file read below does not depend on which row passes, so one
    # "does any row reach 0.95" check replaces the row-by-row scan
    # (0.95 is the same AUC cut-off used for the other exposures).
    if not (auc_df[celltype] >= 0.95).any():
        continue
    # Column names follow the five-column layout used for these
    # *_minimal_DMRs_weight.bed files elsewhere in this notebook.
    # NOTE(review): no weight != 0 filter is applied here, unlike the
    # per-condition loop -- presumably the minimal file is already filtered; confirm.
    weight_df = pd.read_csv(
        f"{expo_dir}/{celltype}/Flu_{celltype}_minimal_DMRs_weight.bed",
        sep="\t",
        names=['chrom', 'start', 'end', 'id', 'weight'],
    )
    minimun_dmrs[expo][celltype] = len(weight_df)

In [23]:
# Rebuild the summary table from minimun_dmrs so it includes any 'Flu' entries
# added after the first version of the table was created.
minimun_dmrs_df = pd.DataFrame.from_dict(minimun_dmrs, orient='index')

In [24]:
# Show the rebuilt summary table.
# NOTE(review): the recorded output contains no 'Flu' row -- either no Flu column
# reached the 0.95 AUC cut-off or the saved output is stale; confirm by re-running.
minimun_dmrs_df

Unnamed: 0,B-Mem,B-Naive,Monocyte,NK-cell1,NK-cell2,Tc-Mem,Tc-Naive,Th-Mem,Th-Naive
HIV_cro,492.0,371.0,684.0,398.0,394.0,742.0,346.0,491.0,189.0
HIV_pre,492.0,208.0,385.0,398.0,394.0,742.0,346.0,491.0,189.0
HIV_acu,836.0,260.0,230.0,398.0,246.0,742.0,346.0,491.0,189.0
MRSA_Ctrl,289.0,188.0,444.0,230.0,257.0,485.0,485.0,485.0,530.0
MRSA_MRSA,289.0,188.0,444.0,493.0,257.0,485.0,485.0,485.0,530.0
MRSA_MSSA,289.0,188.0,444.0,376.0,650.0,485.0,485.0,485.0,530.0
BA_Ctrl,265.0,100.0,239.0,262.0,146.0,331.0,241.0,256.0,235.0
BA_frequent,265.0,100.0,239.0,,375.0,331.0,241.0,256.0,235.0
BA_unfrequent,643.0,188.0,239.0,262.0,375.0,331.0,241.0,739.0,235.0
OP_Ctrl,468.0,474.0,758.0,479.0,480.0,1214.0,635.0,769.0,808.0


In [25]:
# Write the summary as tab-separated text so the content matches the .tsv
# extension and the other TSV files handled in this notebook
# (DataFrame.to_csv uses a comma separator unless sep is given).
minimun_dmrs_df.to_csv("minimum_number_of_DMRs.tsv", sep="\t")

In [44]:
# Gather the minimal-DMR BED files found two directory levels down.
# A plain string is enough (the pattern has nothing to interpolate), and
# sorting makes the processing order deterministic, since glob.glob returns
# matches in arbitrary order.
mimimun_dmr_files = sorted(glob.glob('*/*/*_minimal_DMRs_weight.bed'))

In [45]:
for dmrfile in mimimun_dmr_files:
    weight_df = pd.read_csv(dmrfile, sep="\t", names=['chrom', 'start', 'end', 'id', 'weight'])
    expo, cel

['01.HIV/B-Mem/HIV_B-Mem_minimal_DMRs_weight.bed',
 '01.HIV/B-Naive/HIV_B-Naive_minimal_DMRs_weight.bed',
 '01.HIV/Monocyte/HIV_Monocyte_minimal_DMRs_weight.bed',
 '01.HIV/NK-cell1/HIV_NK-cell1_minimal_DMRs_weight.bed',
 '01.HIV/NK-cell2/HIV_NK-cell2_minimal_DMRs_weight.bed',
 '01.HIV/Tc-Mem/HIV_Tc-Mem_minimal_DMRs_weight.bed',
 '01.HIV/Tc-Naive/HIV_Tc-Naive_minimal_DMRs_weight.bed',
 '01.HIV/Th-Mem/HIV_Th-Mem_minimal_DMRs_weight.bed',
 '01.HIV/Th-Naive/HIV_Th-Naive_minimal_DMRs_weight.bed',
 '02.MRSA/B-Mem/MRSA_B-Mem_minimal_DMRs_weight.bed',
 '02.MRSA/B-Naive/MRSA_B-Naive_minimal_DMRs_weight.bed',
 '02.MRSA/Monocyte/MRSA_Monocyte_minimal_DMRs_weight.bed',
 '02.MRSA/NK-cell1/MRSA_NK-cell1_minimal_DMRs_weight.bed',
 '02.MRSA/NK-cell2/MRSA_NK-cell2_minimal_DMRs_weight.bed',
 '02.MRSA/Tc-Mem/MRSA_Tc-Mem_minimal_DMRs_weight.bed',
 '02.MRSA/Tc-Naive/MRSA_Tc-Naive_minimal_DMRs_weight.bed',
 '02.MRSA/Th-Mem/MRSA_Th-Mem_minimal_DMRs_weight.bed',
 '02.MRSA/Th-Naive/MRSA_Th-Naive_minimal_DMRs_w