Compare the result of Checkm1 and Checkm2

In [13]:
import os

def score(com, con):
    return com - 5. * con


def readMetaInfo(info_path: str):
    hq = 0
    mq = 0
    lq = 0
    summedQS = 0.
    meta_set = set()
    name2info = {}
    with open(info_path, "r") as rh:
        for line in rh:
            info = line.strip("\n").split("\t")
            pre, suffix = os.path.splitext(info[0])
            meta_set.add(pre)
            com = float(info[1])
            con = float(info[2])
            summedQS += score(com, con)
            if com >= 90 and con <= 5:
                hq += 1
                name2info[pre] = ("hq", com, con)
            elif com >= 50 and con <= 10:
                mq += 1
                name2info[pre] = ("mq", com, con)
            else:
                lq += 1
                name2info[pre] = ("lq", com, con)
    return hq, mq, lq, summedQS, meta_set, name2info


def readQualityTsv(tsv_file: str, meta_set):
    hq = 0
    mq = 0
    lq = 0
    summedQS = 0.
    name2info = {}
    with open(tsv_file, "r") as rh:
        for i, line in enumerate(rh):
            if i != 0:
                info = line.strip("\n").split("\t")
                if info[0] not in meta_set:
                    continue
                com = float(info[1])
                con = float(info[2])
                summedQS += score(com, con)
                if com >= 90 and con <= 5:
                    hq += 1
                    name2info[info[0]] = ("hq", com, con)
                elif com >= 50 and con <= 10:
                    mq += 1
                    name2info[info[0]] = ("mq", com, con)
                else:
                    lq += 1
                    name2info[info[0]] = ("lq", com, con)
    return hq, mq, lq, summedQS, name2info


f_input_folder = "/media/zbh/Data/Data/Deepurify_data_wsl/RealDataBinningModels/"
f_output_folder = "/media/zbh/Data/Data/Deepurify_review"

binning_tools_folder = ["Concoct_Data", "MaxBin_Data", "Metabeta2_Data", "VAMB_Data"]
datasets = ["CAMI/high", "CAMI/low", "CAMI/medium1", "CAMI/medium2", "HLJ"]
suffix = ["fa", "fasta", "fa", "fasta"]

for i, tool in enumerate(binning_tools_folder):
    for dataset in datasets:
        cur_input_folder = os.path.join(f_input_folder, tool, dataset, "DeepurifyBins")
        cur_output_folder = os.path.join(f_output_folder, tool, dataset, "DeepurifyBins_Checkm2")
        
        input_path_checkm1 = os.path.join(cur_input_folder, "MetaInfo.txt")
        input_path_checkm2 = os.path.join(cur_output_folder, "quality_report.tsv")
        
        print(f"#############  {tool}, {dataset}  #############")
        hq1, mq1, lq1, summQS1, meta_set, name2info_1 = readMetaInfo(input_path_checkm1)
        print(f"Checkm1: high {hq1}; medium {mq1}; low {lq1}; quality score {summQS1}.")
        hq2, mq2, lq2, summQS2, name2info_2 = readQualityTsv(input_path_checkm2, meta_set)
        print(f"Checkm2: high {hq2}; medium {mq2}; low {lq2}; quality score {summQS2}.")
        
        comp_ins_num = 0
        comp_des_num = 0
        cont_ins_num = 0
        cont_des_num = 0
        
        comp_change = 0.
        cont_change = 0.
        
        for key, values_1 in name2info_1.items():
            if key not in name2info_2:
                continue
            values_2 = name2info_2[key]
            if values_1[0] != values_2[0]:
                print(f"{key}, checkm1: {values_1}, checkm2: {values_2}")
            _, comp1, cont1 = values_1
            _, comp2, cont2 = values_2
            comp_c = comp2 - comp1
            cont_c = cont2 - cont1
            if comp_c > 0:
                comp_ins_num += 1
            else:
                comp_des_num += 1
            
            if cont_c > 0:
                cont_ins_num += 1
            else:
                cont_des_num += 1
            
            comp_change += comp_c
            cont_change += cont_c
        
        print(f"completeness changed val: {comp_change}, contamination changed val: {cont_change}")
        print(f"completeness increased val: {comp_ins_num}, decreased val: {comp_des_num}")
        print(f"contamination increased val: {cont_ins_num}, decreased val: {cont_des_num}")



#############  Concoct_Data, CAMI/high  #############
Checkm1: high 59; medium 120; low 95; quality score 12648.99.
Checkm2: high 51; medium 100; low 123; quality score 11458.79999999999.
206_NoCore_3_5, checkm1: ('mq', 50.0, 0.0), checkm2: ('lq', 40.0, 0.27)
229_NoCore_5_5, checkm1: ('mq', 50.0, 0.0), checkm2: ('lq', 36.86, 0.47)
121_NoCore_6_1, checkm1: ('hq', 90.44, 0.29), checkm2: ('mq', 88.18, 0.58)
183_Core_19_4, checkm1: ('mq', 51.68, 0.34), checkm2: ('lq', 47.63, 1.26)
183_NoCore_15_6, checkm1: ('hq', 92.47, 0.04), checkm2: ('mq', 88.11, 0.52)
201_Core_1_1, checkm1: ('mq', 51.22, 3.45), checkm2: ('lq', 60.06, 11.35)
45_Core_2_1, checkm1: ('lq', 48.59, 5.17), checkm2: ('mq', 51.76, 9.01)
163_0, checkm1: ('hq', 97.75, 4.49), checkm2: ('mq', 100.0, 7.15)
114_0, checkm1: ('mq', 87.67, 0.0), checkm2: ('hq', 90.04, 1.1)
185_3, checkm1: ('mq', 83.33, 1.72), checkm2: ('lq', 100.0, 14.26)
109_NoCore_0_6, checkm1: ('mq', 53.45, 0.0), checkm2: ('lq', 35.09, 1.66)
109_NoCore_0_4, checkm1: 

In [None]:
import os
from Deepurify.Utils.IOUtils import writeFasta



def processOneFile(file_path: str, output_path: str):
    name2seq = {}
    k = 0
    with open(file_path, "r") as rh:
        for line in rh:
            cur_line = line.strip("\n")
            name2seq[f">seq_{k}"] = cur_line
            k += 1
    writeFasta(name2seq, output_path)


output_folder = "/media/zbh/Data/Data/Deepurify_review/Data/TrainingData/"
input_folder = "/media/zbh/Data/Data/Deepurify_data/TrainingTestingDataset/GenomesDataFilter_Total_Bigger15"
if os.path.exists(output_folder) is False:
    os.makedirs(output_folder)

files = os.listdir(input_folder)
for i, file in enumerate(files):
    print(i, file)
    pre_suffix, _ = os.path.splitext(file)
    processOneFile(os.path.join(input_folder, file),
                   os.path.join(output_folder, pre_suffix + ".fasta"))




In [8]:
import os
from shutil import copy

f_input_folder = "/media/zbh/Data/Data/Deepurify_data_wsl/RealDataBinningModels/"
f_output_folder = "/media/zbh/Data/Data/Deepurify_review/Data/BinToolsS1CAMI"


binning_tools_folder = ["Concoct_Data", "MaxBin_Data", "Metabeta2_Data", "VAMB_Data"]
datasets = ["CAMI/high", "CAMI/low", "CAMI/medium1", "CAMI/medium2", "HLJ"]
data_folder = ["fasta_bins", "maxbin", "MetaBat2", "bins"]
suffixs = ["fa", "fasta", "fa", "fasta"]

for i, tool in enumerate(binning_tools_folder):
    for dataset in datasets:
        cur_input_folder = os.path.join(f_input_folder, tool, dataset, data_folder[i])
        files = os.listdir(cur_input_folder)
        toolname = tool.split("_")[0]
        dataname = "_".join([toolname] + dataset.split("/"))
        folder_path = os.path.join(f_output_folder, dataname)
        if os.path.exists(folder_path) is False:
            os.mkdir(folder_path)
        for file in files:
            _, suffix = os.path.splitext(file)
            if suffix[1:] == suffixs[i]:
                copy(
                os.path.join(cur_input_folder, file),
                folder_path)
        
        

In [9]:

import os
from Deepurify.Utils.IOUtils import readCheckMResultAndStat


def readCheckm2(file_path: str):
    res = {}
    h = 0
    m = 0
    l = 0
    with open(file_path, "r") as rh:
        for line in rh:
            if "Name" not in line:
                info = line.strip("\n").split("\t")
                comp = float(info[1])
                conta = float(info[2])
                if comp >= 90 and conta <= 5:
                    state = "HighQuality"
                    h += 1
                elif comp >= 50 and conta <= 10:
                    state = "MediumQuality"
                    m += 1
                else:
                    state = "LowQuality"
                    l += 1
                res[info[0]] = (comp, conta, state)
    return res, h, m, l


def readMetaInfo(file_path: str):
    res = {}
    h = 0
    m = 0
    l = 0
    with open(file_path, "r") as rh:
        for line in rh:
            if "Name" not in line:
                info = line.strip("\n").split("\t")
                comp = float(info[1])
                conta = float(info[2])
                if comp >= 90 and conta <= 5:
                    state = "HighQuality"
                    h += 1
                elif comp >= 50 and conta <= 10:
                    state = "MediumQuality"
                    m += 1
                else:
                    state = "LowQuality"
                    l += 1
                res[info[0]] = (comp, conta, state)
    return res, h, m, l


def IQS(res_ori: dict, res_clean: dict):
    qs_ori = 0.
    qs_res = 0.
    for key, values in res_ori.items():
        if values[-1] == "HighQuality" or values[-1] == "MediumQuality":
            cur_qs = values[0] - 5. * values[1]
            qs_ori += cur_qs
    
    for key, values in res_clean.items():
        if values[-1] == "HighQuality" or values[-1] == "MediumQuality":
            cur_qs = values[0] - 5. * values[1]
            qs_res += cur_qs
    return qs_res - qs_ori


def printResult(checkm1_ori_path, checkm1_deepurify_res_path, checkm2_ori_path, checkm2_deepurify_res_path):
    checkm1_ori_res, checkm1_h, checkm1_m, checkm1_l = readCheckMResultAndStat(checkm1_ori_path)
    checkm1_deep_res, checkm1_deep_h, checkm1_deep_m, checkm1_deep_l = readMetaInfo(checkm1_deepurify_res_path)
    print(f"Checkm1: h: {checkm1_h}, m: {checkm1_m}, l: {checkm1_l}.")
    print(f"Checkm1_deepurify: h: {checkm1_deep_h}, m: {checkm1_deep_m}, l: {checkm1_deep_l}.")
    print(f"Checkm 1 IQS: {IQS(checkm1_ori_res, checkm1_deep_res)}")
    i_h_ch1 = checkm1_deep_h - checkm1_h
    i_m_ch1 = checkm1_deep_m - checkm1_m

    checkm2_ori_res, checkm2_h, checkm2_m, checkm2_l = readCheckm2(checkm2_ori_path)
    checkm2_deep_res, checkm2_deep_h, checkm2_deep_m, checkm2_deep_l = readCheckm2(checkm2_deepurify_res_path)
    print(f"Checkm2: h: {checkm2_h}, m: {checkm2_m}, l: {checkm2_l}.")
    print(f"Checkm2_deepurify: h: {checkm2_deep_h}, m: {checkm2_deep_m}, l: {checkm2_deep_l}.")
    print(f"Checkm2 IQS: {IQS(checkm2_ori_res, checkm2_deep_res)}")
    
    i_h_ch2 = checkm2_deep_h - checkm2_h
    i_m_ch2 = checkm2_deep_m - checkm2_m
    
    return i_h_ch1, i_m_ch1, i_h_ch2, i_m_ch2, IQS(checkm1_ori_res, checkm1_deep_res), IQS(checkm2_ori_res, checkm2_deep_res)
    

f_input_folder = "/media/zbh/Data/Data/Deepurify_data_wsl/RealDataBinningModels/"
f_output_folder = "/media/zbh/DataFast/Deepurify_review"

binning_tools_folder = ["Concoct_Data", "MaxBin_Data", "Metabeta2_Data", "VAMB_Data"]
datasets = ["CAMI/high", "CAMI/low", "CAMI/medium1", "CAMI/medium2", "HLJ"]
folderNames = ["fasta_bins", "maxbin", "MetaBat2", "bins"]
suffix = ["fa", "fasta", "fa", "fasta"]

ch1_h = []
ch2_h = []
ch1_m = []
ch2_m = []
ch1_iqs = []
ch2_iqs = []

for i, tool in enumerate(binning_tools_folder):
    for dataset in datasets:
        checkm1_ori_path = os.path.join(f_input_folder, tool, dataset, "original_checkm.txt")
        checkm1_deepurify_res_path = os.path.join(f_input_folder, tool, dataset, "DeepurifyBins", "MetaInfo.txt")
        checkm2_ori_path = os.path.join(f_output_folder, tool, dataset, "OriBins_Checkm2", "quality_report.tsv")
        checkm2_deepurify_res_path = os.path.join(f_output_folder, tool, dataset, "DeepurifyBins_Checkm2", "quality_report.tsv")
        print("##################################### " + "_".join([tool, dataset]))
        i_h_ch1, i_m_ch1, i_h_ch2, i_m_ch2,iqs_ch1, iqs_ch2 = printResult(checkm1_ori_path, checkm1_deepurify_res_path, checkm2_ori_path, checkm2_deepurify_res_path)
        print(i_h_ch1,i_h_ch2)
        ch1_h.append(i_h_ch1)
        ch2_h.append(i_h_ch2)
        ch1_m.append(i_m_ch1)
        ch2_m.append(i_m_ch2)
        ch1_iqs.append(iqs_ch1)
        ch2_iqs.append(iqs_ch2)


print("###########################")
print(f"Checkm1 increased high quality bins: {sum(ch1_h)}")
print(f"Checkm2 increased high quality bins: {sum(ch2_h)}")
print(f"Checkm1 increased medium quality bins: {sum(ch1_m)}")
print(f"Checkm2 increased medium quality bins: {sum(ch2_m)}")
print(f"Checkm1 increased quality scores: {sum(ch1_iqs)}")
print(f"Checkm2 increased quality scores: {sum(ch2_iqs)}")

print("In 8 datasets, checkm1 has more nc bins than checkm2.")
print("In 9 datasets, checkm2 has more nc bins than checkm1.")


##################################### Concoct_Data_CAMI/high
Checkm1: h: 29, m: 29, l: 178.
Checkm1_deepurify: h: 59, m: 120, l: 95.
Checkm 1 IQS: 6944.57
Checkm2: h: 24, m: 38, l: 174.
Checkm2_deepurify: h: 59, m: 143, l: 228.
Checkm2 IQS: 8271.39
30 35
##################################### Concoct_Data_CAMI/low
Checkm1: h: 11, m: 5, l: 19.
Checkm1_deepurify: h: 11, m: 7, l: 17.
Checkm 1 IQS: 129.46000000000004
Checkm2: h: 9, m: 6, l: 20.
Checkm2_deepurify: h: 11, m: 7, l: 36.
Checkm2 IQS: 241.24000000000024
0 2
##################################### Concoct_Data_CAMI/medium1
Checkm1: h: 28, m: 9, l: 38.
Checkm1_deepurify: h: 30, m: 11, l: 35.
Checkm 1 IQS: 298.2799999999993
Checkm2: h: 26, m: 12, l: 37.
Checkm2_deepurify: h: 28, m: 16, l: 68.
Checkm2 IQS: 379.47999999999956
2 2
##################################### Concoct_Data_CAMI/medium2
Checkm1: h: 34, m: 14, l: 34.
Checkm1_deepurify: h: 36, m: 19, l: 29.
Checkm 1 IQS: 495.8700000000008
Checkm2: h: 31, m: 16, l: 35.
Checkm2_deepur

In [16]:
class HmmModel(object):
    """Store HMM parameters."""

    def __init__(self, keys):
        self.acc = keys["acc"]
        self.ga = keys["ga"]
        self.tc = keys["tc"]
        self.nc = keys["nc"]
        

def getHMMModels(input_hmm_file: str):
    hmmName2model = {}
    cur_keys = None
    with open(input_hmm_file, "r") as rh:
        for line in rh:
            info = line.strip("\n").split(" ")
            if "HMMER3/f" == info[0]:
                if cur_keys is not None:
                    hmmName2model[cur_keys["acc"]] = HmmModel(cur_keys)
                cur_keys = {}
            if "ACC" == info[0]:
                cur_keys["acc"] = info[-1]
            if "GA" == info[0]:
                cur_keys["ga"] = (float(info[-2]), float(info[-1]))
            if "TC" == info[0]:
                cur_keys["tc"] = (float(info[-2]), float(info[-1]))
            if "NC" == info[0]:
                cur_keys["nc"] = (float(info[-2]), float(info[-1]))
    
    hmmName2model[cur_keys["acc"]] = HmmModel(cur_keys)
    return hmmName2model

input_file = "./DeepurifyInfoFiles/HMM/hmm_925.hmm"
res = getHMMModels(input_file)
print(len(res), res["PF13190.1"].tc, res["TIGR03625"].nc)


925 (27.3, 27.3) (107.4, 107.4)


In [8]:
from Deepurify.Utils.IOUtils import readFasta
import os


def readDiamond(file_path: str):
    res = {}
    with open(file_path, "r") as rh:
        for line in rh:
            thisline = line.strip("\n").split("\t")
            bin_name, contig_name = thisline[0].split("Ω")
            if bin_name not in res:
                cur_dict = {}
                cur_dict[contig_name] = thisline[1:]
                res[bin_name] = cur_dict
            else:
                cur_dict = res[bin_name]
                cur_dict[contig_name] = thisline[1:]
                res[bin_name] = cur_dict
    return res


def buildCheckm2TmpFiles(
    original_checkm2_res_folder, 
    modified_bins_folder, 
    modified_checkm2_tmp_folder):
    
    if os.path.exists(modified_checkm2_tmp_folder) is False:
        os.mkdir(modified_checkm2_tmp_folder)
    
    output_faa_folder = os.path.join(modified_checkm2_tmp_folder, "protein_files")
    if os.path.exists(output_faa_folder) is False:
        os.mkdir(output_faa_folder)
    output_dimond_folder = os.path.join(modified_checkm2_tmp_folder, "diamond_output")
    if os.path.exists(output_dimond_folder) is False:
        os.mkdir(output_dimond_folder)
    output_dimond_file = os.path.join(output_dimond_folder, "DIAMOND_RESULTS.tsv")
    modified_bin_names = os.listdir(modified_bins_folder)
    
    faa_files_folder = os.path.join(original_checkm2_res_folder, "protein_files")
    diam_file = os.path.join(original_checkm2_res_folder, "diamond_output", "DIAMOND_RESULTS.tsv")
    diamond_info = readDiamond(diam_file)
    
    wdh = open(output_dimond_file, "w")
    for modified_bin_name in modified_bin_names:
        bin_name, _ = os.path.splitext(modified_bin_name)
        ori_bin_name = bin_name.split("___")[0]
        modified_contig2seq = readFasta(os.path.join(modified_bins_folder, modified_bin_name))
        contig_names = set(list(modified_contig2seq.keys()))
        
        if os.path.exists(os.path.join(faa_files_folder, ori_bin_name + ".faa")):
            faa_contig2seq = readFasta(os.path.join(faa_files_folder, ori_bin_name + ".faa"))
            with open(os.path.join(output_faa_folder, bin_name + ".faa"), "w") as wfh:
                for faa_contig_name, seq in faa_contig2seq.items():
                    cur_name = "_".join(faa_contig_name.split(" ")[0].split("_")[0:-1])
                    if cur_name in contig_names:
                        wfh.write(faa_contig_name + "\n")
                        wfh.write(seq + "\n")
        
        if ori_bin_name in diamond_info:
            cur_diamond_info = diamond_info[ori_bin_name]
            for dia_contig_name, dia_info in cur_diamond_info.items():
                cur_name = ">" + "_".join(dia_contig_name.split(" ")[0].split("_")[0:-1])
                if cur_name in contig_names:
                    wdh.write("\t".join([bin_name + "Ω" + dia_contig_name] + dia_info) + "\n")
    wdh.close()

    
original_bins_folder = "/media/zbh/DataFast/Deepurify_review/Data/BinToolsS1CAMI/Concoct_CAMI_high/"
original_checkm2_res_folder = "/media/zbh/DataFast/Deepurify_review/Data/BinToolsS1CAMI_Deepurify/Concoct_CAMI_high/DeepurifyTmpFiles/FilterOutput/original_checkm2_res/"

modified_bins_folder = "/media/zbh/DataFast/Deepurify_review/Data/BinToolsS1CAMI_Deepurify/Concoct_CAMI_high/DeepurifyTmpFiles/FilterOutput/T5_filter/0/"
modified_checkm2_tmp_folder = "/media/zbh/DataFast/Deepurify_review/Data/BinToolsS1CAMI_Deepurify/Concoct_CAMI_high/DeepurifyTmpFiles/FilterOutput/T5_0_checkm2_modified_tmp"

buildCheckm2TmpFiles(original_checkm2_res_folder, modified_bins_folder, modified_checkm2_tmp_folder)



In [3]:
from Deepurify.Utils.IOUtils import readCheckMResultAndStat
import os
from shutil import copy

f_input_folder = "D:\\AllData\\Deepurify_data\\Data\\RealDataBinningModels"
output_folder = "D:\\AllData\\Deepurify_review\\CAMI_ALIGNMENT"

binning_tools_folder = ["Concoct_Data", "MaxBin_Data", "Metabeta2_Data", "VAMB_Data"]
datasets = ["CAMI/high", "CAMI/low", "CAMI/medium1", "CAMI/medium2"]
folderNames = ["fasta_bins", "maxbin", "MetaBat2", "bins"]
suffix = ["fa", "fasta", "fa", "fasta"]

index = 0
for i, tool in enumerate(binning_tools_folder):
    for dataset in datasets:
        checkm1_ori_path = os.path.join(f_input_folder, tool, dataset, "original_checkm.txt")
        cur_checkm1_res = readCheckMResultAndStat(checkm1_ori_path)
        if "high" in dataset:
            cur_output = os.path.join(output_folder, "high")
        elif "medium1" in dataset:
            cur_output = os.path.join(output_folder, "medium1")
        elif "medium2" in dataset:
            cur_output = os.path.join(output_folder, "medium2")
        elif "low" in dataset:
            cur_output = os.path.join(output_folder, "low")
        
        for name, res in cur_checkm1_res[0].items():
            if res[0] > 90 and res[1] > 10:
                bin_name = tool.split("_")[0]
                copy(
                    os.path.join(f_input_folder, tool, dataset, folderNames[i], f"{name}.{suffix[i]}"),
                    os.path.join(cur_output, f"{bin_name}_bin_{index}.fasta")
                )
                index += 1
        
        

In [11]:

import os


def readTaxaMarker(marker_file):
    taxaname2set = {}
    with open(marker_file, "r") as rh:
        for line in rh:
            info = line.strip("\n").split("\t")
            if info[0] == "life":
                accs = info[6].replace("[", "").replace("]", "").replace("set", "").replace("(", "").replace(")", "").replace("'", "").replace(" ", "").split(",")
                cur_set = set()
                with open(f"./GTDB_Taxa_Info/hmm/life_{info[1]}", "w") as wh:
                    for acc in accs:
                        wh.write(acc + "\n")
                


readTaxaMarker("D:\\Download\\checkm_data_2015_01_16.tar\\checkm_data_2015_01_16\\taxon_marker_sets.tsv")



In [15]:
import os
phys_vocab = []

with open("./GTDB_Taxa_Info/phy2count.txt", "r") as rh:
    for line in rh:
        info = line.strip("\n").split("\t")[0]
        phys_vocab.append(info)
        

phy_hmm = os.listdir("./GTDB_Taxa_Info/hmm/")

with open("./phy2hmm", "w") as wh:
    for element in phys_vocab:
        cur_name = element.split("_")[2]
        sign = True
        for ele in phy_hmm:
            cur_ele = ele.split("_")[1]
            if cur_name == cur_ele:
                wh.write(element + "\t" + ele + "\n")
                sign = False
                break
        if sign:
            wh.write(element + "\t" + "dom_Bacteria" + "\n")




In [16]:
import os

pres_set = set()
phy_hmm = os.listdir("./GTDB_Taxa_Info/hmm/")
with open("./phy2hmm", "r") as rh:
    for line in rh:
        info = line.strip("\n").split("\t")
        pres_set.add(info[-1])

for file in phy_hmm:
    if file not in pres_set:
        os.remove(os.path.join("./GTDB_Taxa_Info/hmm/", file))



In [17]:
import os


final_hmms = "./GTDB_Taxa_Info/all_hmms.txt"
phy_hmm = os.listdir("./GTDB_Taxa_Info/hmm/")

all_hmms = set()
with open(final_hmms, "w") as wh:
    for file in phy_hmm:
        with open(os.path.join("./GTDB_Taxa_Info/hmm/", file), "r") as rh:
            for line in rh:
                all_hmms.add(line.strip("\n"))

    for ele in all_hmms:
        wh.write(ele + "\n")



In [39]:


def getHMMModels(input_hmm_file: str):
    hmmAcc2model = {}
    cur_line = None
    acc = None
    with open(input_hmm_file, "r") as rh:
        for line in rh:
            info = line.strip("\n").split(" ")
            if "HMMER3/f" == info[0]:
                if cur_line is None:
                    cur_line = ""
                    acc = ""
                else:
                    hmmAcc2model[acc] = cur_line
                    cur_line = ""
                    acc = ""
            if "ACC" == info[0]:
                acc = info[-1]

            cur_line += line

    return hmmAcc2model

accs = set()
hmm_file = "./GTDB_Taxa_Info/HMM/hmm_models.hmm"
with open("./GTDB_Taxa_Info/useless/hmm/dom_Bacteria", "r") as rh:
    for line in rh:
        accs.add(line.strip("\n"))

print(len(accs), accs)
hmmacc2lines = getHMMModels(hmm_file)
with open("./GTDB_Taxa_Info/Simple-HMM/hmm_models_simple.hmm", "w") as wh:
    for acc in accs:
        wh.write(hmmacc2lines[acc])
    


104 {'PF01193.19', 'PF06421.7', 'PF00410.14', 'PF00237.14', 'TIGR02432', 'PF02978.14', 'TIGR00755', 'PF03946.9', 'PF00466.15', 'PF00453.13', 'PF00252.13', 'PF00828.14', 'PF00298.14', 'PF01018.17', 'PF01649.13', 'PF00203.16', 'PF11987.3', 'PF13184.1', 'PF00411.14', 'PF02130.12', 'TIGR00810', 'TIGR02075', 'PF00673.16', 'PF00312.17', 'PF00366.15', 'TIGR00019', 'PF05000.12', 'PF04565.11', 'TIGR00392', 'PF00347.18', 'PF00623.15', 'PF00861.17', 'PF02033.13', 'PF01746.16', 'PF03948.9', 'TIGR01079', 'PF08529.6', 'PF03719.10', 'TIGR00084', 'PF00831.18', 'PF02367.12', 'PF01195.14', 'TIGR00967', 'PF01795.14', 'PF00238.14', 'PF01016.14', 'PF00562.23', 'TIGR00250', 'TIGR03594', 'TIGR00329', 'PF01000.21', 'PF00886.14', 'PF03947.13', 'PF01765.14', 'PF01668.13', 'PF00189.15', 'PF10385.4', 'PF00338.17', 'PF13603.1', 'PF00573.17', 'PF01281.14', 'PF00829.16', 'PF01245.15', 'PF00380.14', 'PF04563.10', 'TIGR00615', 'TIGR03723', 'PF01196.14', 'PF00281.14', 'PF01250.12', 'PF01509.13', 'PF04997.7', 'PF05491.8

KeyError: 'PF01196.14'

In [1]:
import os
import pickle
phy2phyhmms = {}

with open("./GTDB_Taxa_Info/phy2hmms", "r") as rh:
    for line in rh:
        info = line.strip("\n").split("\t")
        phy2phyhmms[info[0]] = info[1]


phyhmms2accs = {}
for file in os.listdir("./GTDB_Taxa_Info/hmm/"):
    cur_list = []
    with open(os.path.join("./GTDB_Taxa_Info/hmm/", file), "r") as rh:
        for line in rh:
            cur_list.append(line.strip("\n"))
    
    phyhmms2accs[file] = cur_list
    

phy2accs = {}
for phy, hmm_name in phy2phyhmms.items():
    phy2accs[phy] = phyhmms2accs[hmm_name]
    

def readPickle(readPath: str) -> object:
    with open(readPath, "rb") as rh:
        obj = pickle.load(rh)
    return obj


def writePickle(writePath: str, obj: object) -> None:
    with open(writePath, "wb") as wh:
        pickle.dump(obj, wh, pickle.HIGHEST_PROTOCOL)
        wh.flush()


writePickle("./GTDB_Taxa_Info/phy2accs.pkl", phy2accs)




In [2]:
phy2accs = readPickle("./GTDB_Taxa_Info/phy2accs.pkl")

for phy, accs_list in phy2accs.items():
    print(phy, accs_list)

p__Acidobacteriota ['PF00410.14', 'PF00673.16', 'PF00831.18', 'PF00861.17', 'PF00281.14', 'PF00238.14', 'PF00828.14', 'PF00253.16', 'PF00344.15', 'PF00203.16', 'PF00347.18', 'PF00327.15', 'PF00297.17', 'PF00276.15', 'PF00338.17', 'PF00252.13', 'PF00237.14', 'PF00573.17', 'PF00189.15', 'PF03947.13', 'PF00181.18', 'PF03719.10', 'PF00333.15', 'PF00366.15', 'TIGR01079', 'PF00298.14', 'PF00584.15', 'PF03946.9', 'TIGR00922', 'PF01250.12', 'PF01084.15', 'PF01782.13', 'PF00886.14', 'PF02843.11', 'PF01071.14', 'PF02844.10', 'PF00318.15', 'PF00572.13', 'PF00889.14', 'PF00380.14', 'PF00627.26', 'PF11967.3', 'PF02565.10', 'PF05697.8', 'PF05698.9', 'PF01783.18', 'PF02504.10', 'PF02620.12', 'PF01000.21', 'PF00416.17', 'PF01196.14', 'PF00444.13', 'PF00411.14', 'PF01193.19', 'PF07479.9', 'PF02660.10', 'PF01210.18', 'TIGR00536', 'TIGR03534', 'PF02491.15', 'PF08478.5', 'PF14450.1', 'PF04997.7', 'PF04983.13', 'PF04998.12', 'PF10385.4', 'PF04560.15', 'PF04565.11', 'PF00562.23', 'PF00623.15', 'PF05000.12',

In [3]:
import os

def readFasta(path: str, output_folder: str):
    """This function is used to read fasta file and
    it will return a dict, which key is the name of seq and the value is the sequence.

    Args:
        path (str): _description_

    Returns:
        Dict[str, str]: _description_
    """
    curContig = None
    curSeq = None
    with open(path, mode="r") as rh:
        for line in rh:
            curLine = line.strip("\n")
            if curLine[0] == ">":
                if curContig is None:
                    curContig = ""
                    curSeq = ""
                else:
                    name = curContig.split("|")[0][1:]
                    wh = open(os.path.join(output_folder, name), "a")
                    wh.write(curContig + "\n")
                    wh.write(curSeq + "\n")
                    wh.close()
                curContig = curLine
                curSeq = ""
            else:
                curSeq += curLine

    name = curContig.split("|")[0][1:]
    wh = open(os.path.join(output_folder, name), "a")
    wh.write(curContig + "\n")
    wh.write(curSeq + "\n")
    wh.close()


clus_fasta_path = "D:\\AllData\\Deepurify_review\\clu_rep_seq.fasta"
output_folder = "D:\\AllData\\Deepurify_review\\GTDB_clu_rep_seqs"

readFasta(clus_fasta_path, output_folder)




In [2]:
from Deepurify.Utils.IOUtils import readCheckMResultAndStat


input_path = "/media/zbh/DataFast/Deepurify_review/Data/BinToolsS1CAMI_Deepurify/Concoct_CAMI_high_checkm1-2.0.2.txt"


_, h, m, l = readCheckMResultAndStat(input_path)
print(h, m, l)

input_path = "/media/zbh/DataFast/Deepurify_review/Data/BinToolsS1CAMI_Deepurify/Concoct_CAMI_high_checkm1-1.3.2.txt"
_, h, m, l = readCheckMResultAndStat(input_path)
print(h, m, l)

83 99 134
58 98 139


In [1]:
from Deepurify.Utils.IOUtils import readFasta, writeFasta


input_fasta = "G:\\1.fa"
output_fasta = "G:\\1_out.fa"

writeFasta(readFasta(input_fasta), output_fasta, True)


print(list(range(0, 100, 11)))


[0, 11, 22, 33, 44, 55, 66, 77, 88, 99]


In [11]:
import numpy as np

a = [1,2,4,5,3,34]

np.random.seed(316)
np.random.shuffle(a)
print(a)



[4, 2, 5, 3, 1, 34]


In [1]:
num_gpu = 8
num_threads_per_device = 2
totalNum = 4
gpus_work_ratio = [0.125, 0.125,0.125,0.125,0.125,0.125,0.125,0.125]
binFilesList = [1,2,3,4]
nextIndex = 0

for i in range(num_gpu * num_threads_per_device):
    if i != (num_gpu * num_threads_per_device) - 1:
        cutFileLength = int(totalNum * gpus_work_ratio[i // num_threads_per_device] / num_threads_per_device + 0.0) + 1
        curDataFilesList = binFilesList[nextIndex: nextIndex + cutFileLength]
        nextIndex += cutFileLength
    else:
        curDataFilesList = binFilesList[nextIndex:]
    print(curDataFilesList)

[1]
[2]
[3]
[4]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


In [4]:

def foo(res):
    res[1] = 1
    res[2] = 2
    res[0] = 0
    
    
res = {}
foo(res)
print(res)






{1: 1, 2: 2, 0: 0}


In [5]:


import os
from shutil import copy

input_folder = "/home/datasets/ZOUbohao/MetaBAT2_IBS/"
output_folder = "/home/datasets/ZOUbohao/MetaBAT2_IBS_Sample_Split"


folders = os.listdir(input_folder)
for folder in folders:
    print(folder)
    for file in os.listdir(os.path.join(input_folder, folder)):
        s_name = file.split("_")[0]
        cur_output_folder = os.path.join(output_folder, s_name)
        if os.path.exists(cur_output_folder) is False:
            os.mkdir(cur_output_folder)
        copy(
            os.path.join(input_folder, folder, file),
            cur_output_folder
        )











0
1000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
2000
20000
21000
22000
3000
4000
5000
6000
7000
8000
9000


In [None]:
import os


input_folder = "/home/datasets/ZOUbohao/Semibin2_IBS_Deepurify"


for folder in os.listdir(input_folder):
    for file in os.listdir(os.path.join(input_folder, folder)):
        print(file)
        if "Deepurify_Bin_" in file:
            os.rename(
                os.path.join(input_folder,folder, file),
                os.path.join(input_folder, folder,file + ".fasta")
            )





In [37]:
import os 
from Deepurify.Utils.IOUtils import readCheckMResultAndStat

def IQS(res_ori: dict, res_clean: dict):
    qs_ori = 0.
    qs_res = 0.
    for key, values in res_ori.items():
        if values[-1] == "HighQuality" or values[-1] == "MediumQuality":
            cur_qs = values[0] - 5. * values[1]
            qs_ori += cur_qs
    
    for key, values in res_clean.items():
        if values[-1] == "HighQuality" or values[-1] == "MediumQuality":
            cur_qs = values[0] - 5. * values[1]
            qs_res += cur_qs
    return qs_res - qs_ori, qs_ori, qs_res


def printResult(checkm1_ori_path, checkm1_deepurify_res_path, wh, file):
    checkm1_ori_res, checkm1_h, checkm1_m, checkm1_l = readCheckMResultAndStat(checkm1_ori_path)
    checkm1_deep_res, checkm1_deep_h, checkm1_deep_m, checkm1_deep_l = readCheckMResultAndStat(checkm1_deepurify_res_path)
    iqs, qs_ori, qs_res = IQS(checkm1_ori_res, checkm1_deep_res)
    wh.write(file + "\t" + str(checkm1_h) + "\t" + str(checkm1_deep_h) + "\t" + str(checkm1_m) + "\t" + str(checkm1_deep_m) + "\t" + \
        str(checkm1_l) + "\t" + str(checkm1_deep_l) + "\t" + str(qs_ori) + "\t" + str(qs_res) + "\t" + str(iqs) + "\n" )
    

original_checkm_folder = "/home/datasets/ZOUbohao/Bins_Data_checkm1"
purified_checkm_folder = "/home/datasets/ZOUbohao/Bins_Data_Deepurify-2.0.3/"
output_path  = "./CAMI_Real_2.0.3.tsv"

wh = open(output_path, "w")
wh.write("Result" + "\t" + "Ori-High-Num" + "\t" + "Deepurify-High-Num" + "\t" + "Ori-Medium-Num" + "\t" + "Deepurify-Medium-Num" + "\t" +\
        "Ori-Low-Num" + "\t" + "Deepurify-Low-Num" + "\t" + "Ori-Quality-Score" + "\t" + "Deepurify-Quality-Score" + "\t" + "IQS" + "\n")
for file in os.listdir(original_checkm_folder):
    print(file)
    data_name = file.split(".")[0]
    printResult(
        os.path.join(original_checkm_folder, file),
        os.path.join(purified_checkm_folder, file),
        wh,
        data_name
    )
wh.close()








Concoct_CAMI_high.checkm1.txt
Concoct_CAMI_low.checkm1.txt
Concoct_CAMI_medium1.checkm1.txt
Concoct_CAMI_medium2.checkm1.txt
Concoct_HLJ.checkm1.txt
MaxBin_CAMI_high.checkm1.txt
MaxBin_CAMI_low.checkm1.txt
MaxBin_CAMI_medium1.checkm1.txt
MaxBin_CAMI_medium2.checkm1.txt
MaxBin_HLJ.checkm1.txt
Metabeta2_CAMI_high.checkm1.txt
Metabeta2_CAMI_low.checkm1.txt
Metabeta2_CAMI_medium1.checkm1.txt
Metabeta2_CAMI_medium2.checkm1.txt
Metabeta2_HLJ.checkm1.txt
Metadecoder_freshwater.checkm1.txt
Metadecoder_plant.checkm1.txt
Metadecoder_soil.checkm1.txt
Semibin2_freshwater.checkm1.txt
Semibin2_marine.checkm1.txt
Semibin2_plant.checkm1.txt
Semibin2_soil.checkm1.txt
VAMB_CAMI_high.checkm1.txt
VAMB_CAMI_low.checkm1.txt
VAMB_CAMI_medium1.checkm1.txt
VAMB_CAMI_medium2.checkm1.txt
VAMB_HLJ.checkm1.txt


In [38]:
import os
from shutil import copytree

source_folder = "/home/datasets/ZOUbohao/Bins_Data_Deepurify/"
target_folder = "/home/datasets/ZOUbohao/Bins_Data_Deepurify-2.0.3"

for folder in os.listdir(source_folder):
    if "checkm" not in folder:
        print(folder)
        cur_source = os.path.join(source_folder, folder, "DeepurifyTmpFiles")
        cur_target = os.path.join(target_folder, folder, "DeepurifyTmpFiles")
        copytree(
            cur_source,
            cur_target
        )



Concoct_CAMI_high
Concoct_CAMI_low
Concoct_CAMI_medium1
Concoct_CAMI_medium2
Concoct_HLJ
MaxBin_CAMI_high
MaxBin_CAMI_low
MaxBin_CAMI_medium1
MaxBin_CAMI_medium2
MaxBin_HLJ
Metabeta2_CAMI_high
Metabeta2_CAMI_low
Metabeta2_CAMI_medium1
Metabeta2_CAMI_medium2
Metabeta2_HLJ
Metadecoder_freshwater
Metadecoder_plant
Metadecoder_soil
Semibin2_freshwater
Semibin2_marine
Semibin2_plant
Semibin2_soil
VAMB_CAMI_high
VAMB_CAMI_low
VAMB_CAMI_medium1
VAMB_CAMI_medium2
VAMB_HLJ


In [35]:
out_name = "a___s___5.fa"
print("a".split("___"))


['a']


In [36]:
a = {}
a.update()

TypeError: object of type 'int' has no len()

In [42]:
from Deepurify.Utils.HmmUtils import HmmModel
def getHMMModels(input_hmm_file: str):
    hmmAcc2model = {}
    cur_keys = None
    with open(input_hmm_file, "r") as rh:
        for line in rh:
            info = line.strip("\n").split(" ")
            if "HMMER3/f" == info[0]:
                if cur_keys is not None:
                    hmmAcc2model[cur_keys["acc"]] = HmmModel(cur_keys)
                cur_keys = {}
            if "ACC" == info[0]:
                cur_keys["acc"] = info[-1]
            if "GA" == info[0]:
                cur_keys["ga"] = (float(info[-2]), float(info[-1]))
            if "TC" == info[0]:
                cur_keys["tc"] = (float(info[-2]), float(info[-1]))
            if "NC" == info[0]:
                cur_keys["nc"] = (float(info[-2]), float(info[-1]))

    hmmAcc2model[cur_keys["acc"]] = HmmModel(cur_keys)
    return hmmAcc2model

a = getHMMModels("./GTDB_Taxa_Info/HMM/hmm_models.hmm")
print(a["PF01196.14"].acc)

PF01196.14


In [43]:
from Deepurify.Utils.IOUtils import readPickle, writePickle
from copy import deepcopy

phy2accs_list = readPickle("./GTDB_Taxa_Info/HMM/phy2accs.pkl")

phy2accs_list["UNK"] = deepcopy(phy2accs_list["p__UBP7_A"])

writePickle("./GTDB_Taxa_Info/HMM/phy2accs_new.pkl", phy2accs_list)

In [1]:
from Deepurify.Utils.HmmUtils import getHMMModels


a = getHMMModels("./GTDB_Taxa_Info/HMM/hmm_models.hmm")

for acc, val in a.items():
    print(acc, val.ga, val.tc, val.nc)


PF00813.15 (23.7, 23.7) (24.4, 24.1) (23.6, 23.6)
TIGR00485 (522.4, 522.4) (522.4, 522.4) (135.3, 135.3)
PF01820.16 (21.6, 21.6) (23.0, 21.6) (21.3, 21.5)
TIGR00246 (149.7, 149.7) (149.7, 149.7) (49.1, 49.1)
PF03631.10 (25.6, 25.6) (25.6, 25.8) (25.5, 25.5)
PF13277.1 (27.0, 27.0) (27.0, 27.0) (26.8, 26.9)
TIGR01966 (141.5, 141.5) (141.5, 141.5) (117.35, 117.35)
PF08275.6 (21.4, 21.4) (21.5, 21.5) (21.3, 21.2)
TIGR01933 (159.4, 159.4) (159.4, 159.4) (133.1, 133.1)
PF01208.12 (23.9, 23.9) (23.9, 23.9) (23.7, 23.8)
PF02576.12 (20.2, 20.2) (21.0, 20.2) (19.9, 19.5)
TIGR02023 (343.3, 343.3) (343.3, 343.3) (215.7, 215.7)
PF00912.17 (20.7, 20.7) (20.8, 20.9) (20.5, 20.0)
PF01668.13 (20.2, 20.2) (23.2, 25.9) (19.7, 18.3)
TIGR01473 (210.3, 210.3) (210.3, 210.3) (138.3, 138.3)
PF13482.1 (21.7, 21.7) (21.7, 21.7) (21.6, 21.6)
TIGR00459 (373.85, 373.85) (373.85, 373.85) (317.5, 317.5)
PF04468.7 (21.1, 21.1) (22.6, 21.9) (19.8, 19.4)
TIGR01405 (1043.45, 1043.45) (1043.45, 1043.45) (618.45, 618.45)


In [3]:


path = "/home/datasets/ZOUbohao/Bins_Data_Deepurify-SCG-Changed/VAMB_HLJ/DeepurifyTmpFiles/FilterOutput/original_checkm2_res/diamond_output/DIAMOND_RESULTS.tsv"
with open(path, "r") as rh:
    for line in rh:
        thisline = line.strip("\n").split("\t")
        bin_name, contig_name = thisline[0].split("Ω")
        cur_contig_name = ">" + "_".join(contig_name.split("_")[0: -1])
        b = ">" + "_".join(contig_name.split(" ")[0].split("_")[0: -1])
        print(cur_contig_name, len(cur_contig_name))
        print(b, len(b))
        break



>NODE_5333_length_17663_cov_12.526011 37
>NODE_5333_length_17663_cov_12.526011 37


In [1]:
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity

a = [[0.5, 0.5, 0], [0.2, 1.1, 0], [0.8, 0, 1.3], [0.5, -1.5, -2.1]]
a = np.array(a)

print(cdist(a, a, metric="euclidean"))
print(cdist(a, a, metric="cosine"))
print(cosine_similarity(a, a))

[[0.         0.67082039 1.42478068 2.9       ]
 [0.67082039 0.         1.80554701 3.35559235]
 [1.42478068 1.80554701 0.         3.72827038]
 [2.9        3.35559235 3.72827038 0.        ]]
[[2.22044605e-16 1.77807808e-01 6.29407156e-01 1.26899610e+00]
 [1.77807808e-01 0.00000000e+00 9.06246602e-01 1.52739702e+00]
 [6.29407156e-01 9.06246602e-01 0.00000000e+00 1.58068277e+00]
 [1.26899610e+00 1.52739702e+00 1.58068277e+00 1.11022302e-16]]
[[ 1.          0.82219219  0.37059284 -0.2689961 ]
 [ 0.82219219  1.          0.0937534  -0.52739702]
 [ 0.37059284  0.0937534   1.         -0.58068277]
 [-0.2689961  -0.52739702 -0.58068277  1.        ]]


In [1]:

from Deepurify.Utils.KMeans import COPKMeans
import numpy as np

input_matrix = np.random.rand(13000, 1024)
must_link = []
cannot_link = [(1, 2), (3, 4), (5, 6), (0, 9)]
cop_kmenas = COPKMeans(150, max_iter=100)
cop_kmenas.fit(input_matrix, ml = [], cl = cannot_link)


print(cop_kmenas.labels_)
print(cop_kmenas.cluster_centers_)


0
1
2
3
4
5
6
7
8
9
10


In [27]:
from Deepurify.Utils.IOUtils import readPickle
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity


test_path = "/home/datasets/ZOUbohao/Metabeta2_CAMI_high_test_code/DeepurifyTmpFiles/First_tmp/AnnotOutput/bin.57.pkl"


contigName2repV = readPickle(test_path)
a = []

i = 0
for name, repv in contigName2repV.items():
    if i <= 5:
        a.append(repv * 2.0)
        i += 1
    else:
        break
a = np.array(a)

print(cdist(a, a, metric="euclidean"))
print("#########")
print(cdist(a, a, metric="cosine"))
print("############")
print(1. - cdist(a, a, metric="cosine"))
print("#########")
print(cosine_similarity(a, a))


[[0.         2.62933347 0.71704957 0.73020403 2.04771582 0.87087017]
 [2.62933347 0.         2.49259133 2.65390027 2.6954951  2.66774236]
 [0.71704957 2.49259133 0.         0.98547555 1.88164636 0.95231155]
 [0.73020403 2.65390027 0.98547555 0.         2.20491343 0.87424312]
 [2.04771582 2.6954951  1.88164636 2.20491343 0.         1.80185803]
 [0.87087017 2.66774236 0.95231155 0.87424312 1.80185803 0.        ]]
#########
[[0.         0.86417436 0.06427001 0.06664974 0.52414252 0.09480186]
 [0.86417436 0.         0.77662648 0.88039837 0.90821177 0.88960618]
 [0.06427001 0.77662648 0.         0.12139526 0.44257413 0.11336216]
 [0.06664974 0.88039837 0.12139526 0.         0.60770541 0.09553763]
 [0.52414252 0.90821177 0.44257413 0.60770541 0.         0.40583654]
 [0.09480186 0.88960618 0.11336216 0.09553763 0.40583654 0.        ]]
############
[[1.         0.13582564 0.93572999 0.93335026 0.47585748 0.90519814]
 [0.13582564 1.         0.22337352 0.11960163 0.09178823 0.11039382]
 [0.93572

In [23]:
import numpy as np

nums = [3,2,1,4,5]

np.percentile(nums, [10, 50, 90])

array([1.4, 3. , 4.6])

In [24]:
a = ["a", "b", "c", "d"]
res = []
for i in range(len(a)):
    for j in range(i, len(a)):
        if i != j:
            res.append((a[i], a[j]))

print(res)

[('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]


In [5]:
from igraph.clustering import Clustering
import numpy as np

a = [0,1,2,0,1,2]
a = np.array(a)
cl = Clustering(a)
label = np.zeros(6)
for i, r in enumerate(list(cl)):
    print(r, type(r))
    label[r] = i

print(label)

print(list(cl))



[0, 3] <class 'list'>
[1, 4] <class 'list'>
[2, 5] <class 'list'>
[0. 1. 2. 0. 1. 2.]
[[0, 3], [1, 4], [2, 5]]


In [2]:
from Deepurify.Recluster import get_TNF_normlized_vec

seq = "ATCCTACGGTATTCCCGGGAATACCC"
print(len(seq))
print(get_TNF_normlized_vec(seq), sum(get_TNF_normlized_vec(seq)))

26
[0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 0
 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1] 23
[0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 0
 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1] 23
[4.3475688e-07 4.3475688e-07 4.3475688e-07 4.3475688e-07 8.6951815e-02
 4.3475688e-07 4.3475688e-07 4.3475688e-07 4.3475688e-07 4.3475688e-07
 4.3475688e-07 4.3475688e-07 4.3475688e-07 4.3475688e-07 4.3475688e-07
 4.3475688e-07 4.3475688e-07 4.3475688e-07 4.3475688e-07 8.6951815e-02
 4.3475688e-07 4.3475688e-07 8.6951815e-02 4.3475688e-07 4.3475688e-07
 4.3475688e-07 4.3475688e-07 4.3475688e-07 4.3475688e-07 4.3475688e-07
 4.3476127e

In [3]:
import numpy as np

a = np.array([1,2,3,4])
a.fill(0)
print(a)



[0 0 0 0]


In [4]:


threshold = 0
max_axis1 = np.arange(0, 2., step = 0.01)
while threshold < 2.:
    threshold += 0.05
    n_above = np.sum(max_axis1 > threshold)
    print(round(n_above / len(max_axis1), 2), threshold)
    if round(n_above / len(max_axis1), 2) < 1.:
        break

0.97 0.05


In [24]:


from scipy.sparse import csr_matrix, find

a = np.array([[0,1,2,0,3], [0,1,2,0,3],[0,1,2,0,3],[0,1,2,0,3],[0,1,2,0,3]], dtype=np.float32)
b = csr_matrix(a)
X, Y, V = find(b)
above_diag = Y > X
print(above_diag, X, Y)
X = X[above_diag]
Y = Y[above_diag]
print(X, Y)

[ True  True  True False  True  True False False  True False False  True
 False False False] [0 0 0 1 1 1 2 2 2 3 3 3 4 4 4] [1 2 4 1 2 4 1 2 4 1 2 4 1 2 4]
[0 0 0 1 1 2 3] [1 2 4 2 4 4 4]


In [None]:

import os

os.rename(
    "/home/datasets/ZOUbohao/Metabeta2_CAMI_high_test_code/a",
    "/home/datasets/ZOUbohao/Metabeta2_CAMI_high_test_code/b"
)



In [3]:
import numpy as np

a = [1,2,3,4,5,6]
np.random.seed(1024)
np.random.shuffle(a)
print(a)


[3, 6, 1, 5, 2, 4]


In [4]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans

a = np.random.randn(100, 100)
m = MiniBatchKMeans(3)
m.fit(a)
print(m.labels_)

  super()._check_params_vs_input(X, default_n_init=3)


[2 0 2 0 2 1 0 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 0 1 1 2 2 2 2 0 2 2 1 0 2 2
 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 1 0 2 2 2 2 2 2 2 1 1 2 2 2 2 2 1 1 1 2 2
 2 2 1 1 2 1 2 2 2 2 2 2 0 0 2 1 2 1 1 1 2 2 1 1 2 2]


In [9]:
import numpy as np

a = np.random.randn(2, 5)
b =  np.random.randn(2, 5)
print(a)
print(b)
print(np.concatenate([a, b], axis=-1))

[[-0.35528623 -0.03330367  1.57491731  1.18280528 -0.31004235]
 [ 1.36431392 -0.91092126 -0.52323241  0.42613115 -1.58047145]]
[[ 0.73573957  1.89056838  0.00775935 -0.34350401 -0.55572134]
 [-0.21251982 -1.81719468  1.093978    0.60559871 -0.95733602]]
[[-0.35528623 -0.03330367  1.57491731  1.18280528 -0.31004235  0.73573957
   1.89056838  0.00775935 -0.34350401 -0.55572134]
 [ 1.36431392 -0.91092126 -0.52323241  0.42613115 -1.58047145 -0.21251982
  -1.81719468  1.093978    0.60559871 -0.95733602]]


In [12]:
import numpy as np

collected_list = []
for i in range(10):
    for j in range(10):
        collected_list.append((j, j))
print(collected_list, len(collected_list))

[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9)] 100


In [13]:
from Deepurify.Utils.IOUtils import readCheckMResultAndStat

input_file = "/home/datasets/ZOUbohao/Metabeta2_CAMI_high_test_code/test_output_099.txt"
res, h, m, l = readCheckMResultAndStat(input_file)
print(h, m, l)



117 189 322


In [5]:
start_time = 6
for re_times in range(start_time + 1, start_time + 3):
    print((re_times - start_time) // 2, re_times, start_time)
    print((re_times - start_time) // 2 == 1)

0 7 6
False
1 8 6
True


GUNC

In [9]:
from Deepurify.Utils.IOUtils import readMetaInfo, readCheckm2Res
import os

gunc_path = "/home/datasets/ZOUbohao/Metabat2_CAMI_high_GUNC/GUNC.progenomes_2.1.maxCSS_level.tsv"
checkm_path = "/home/datasets/ZOUbohao/Metabat2_CAMI_high_Checkm2/quality_report.tsv"

gunc_info = {}
i = 0
with open(gunc_path, "r") as rh:
    for line in rh:
        if i == 0:
            i += 1
        else:
            info = line.strip("\n").split("\t")
            gunc_info[info[0][0:-1]] = str(info[-1])
checkm_info = readCheckm2Res(checkm_path)[0]

h = 0
m = 0
for name, info in checkm_info.items():
    if info[-1] == "HighQuality" and gunc_info[name] == "True":
        h += 1
    elif info[-1] == "MediumQuality" and gunc_info[name] == "True":
        m += 1

print(h, m)


108 98


In [14]:
import time
from func_timeout import func_set_timeout

class A:
    
    def __init__(self) -> None:
        pass
    
    @func_set_timeout(5)
    def fit(self):
        time.sleep(4)
        print("OK")

a = A()
a.fit()


OK


In [22]:

from Deepurify.Utils.KMeans import COPKMeans
import numpy as np
from func_timeout import func_timeout

X = np.random.randn(150000, 1024)

model = COPKMeans(100)
print("Start to run")
# model.fit(X)
func_timeout(10, model.fit, args=(X,))





Start to run


FunctionTimedOut: Function fit (args=(array([[-0.1705878 ,  0.95742698,  1.17887526, ...,  1.04534391,
         1.88968404,  0.12345239],
       [ 0.17621742,  0.42862044,  0.23764173, ...,  0.11527229,
         1.14077738,  0.19540715],
       [-1.17496249,  0.23875415,  1.06658677, ..., -1.05232714,
        -0.84414047, -0.20662176],
       ...,
       [-0.40269866, -0.37409293, -0.7851084 , ...,  0.45274861,
         1.78056168, -0.06007282],
       [ 0.68999228,  0.1953582 , -0.34419648, ..., -0.97503862,
        -0.81299059, -0.62387409],
       [-0.64634489, -0.47818741, -2.36276172, ...,  2.2564965 ,
         0.57759295,  0.50274295]]),)) (kwargs={}) timed out after 10.000000 seconds.


In [21]:
import numpy as np
from scipy.spatial.distance import cdist

a = np.random.randn(300000, 3000)

print(cdist(a, a, metric="euclidean"))

MemoryError: Unable to allocate 671. GiB for an array with shape (300000, 300000) and data type float64

In [28]:
X = [[0], [3], [1]]
from sklearn.neighbors import radius_neighbors_graph
A = radius_neighbors_graph(X, 1.5, mode='connectivity',
                           include_self=False)
sum(A.data) // 3


0.0

In [31]:
import numpy as np

a = [2,3,1,4,6,5]
np.percentile(a, 25)



2.25

In [35]:

from func_timeout import func_timeout, FunctionTimedOut 

def foo(i):
    if i == 0:
        raise ValueError("value error")
    else:
        raise FunctionTimedOut("time out")
    
try:
    foo(1)
except ValueError:
    print("ValueError")
except FunctionTimedOut:
    print("time out")
    try:
        foo(0)
    except:
        print("Value error")

time out
Value error


In [37]:
import os

a = "/home/datasets/ZOUbohao/"
b = "Deepurify/*.fasta"

print(os.path.join(a,b))

/home/datasets/ZOUbohao/Deepurify/*.fasta


In [None]:
names = []
visRepVectorList = []
batchList = []
k = 0
name2seq = {}
batch_size = 5
for i in range(100):
    name2seq[i] = i
for i, (name, seq) in enumerate(name2seq.items()):
    names.append(name)
    batchList.append(seq)
    if (i + 1) % batch_size == 0:
        for repVector in batchList:
            visRepVectorList.append(repVector)
        batchList = []
if len(batchList) != 0:
    for repVector in batchList:
        visRepVectorList.append(repVector)

print(names)
print(visRepVectorList)

: 

In [2]:
from Deepurify.Utils.RunCMDUtils import convertMetabat2CONCOCT


in_path = "/home/datasets/ZOUbohao/Proj1-Deepurify/1021520_test_concoct_metabat2/contigs.fasta.depth.txt"
o_path = "/home/datasets/ZOUbohao/Proj1-Deepurify/1021520_test_concoct_metabat2/contigs.cococt.tsv"
convertMetabat2CONCOCT(in_path, o_path)

In [5]:
u = u'hello\u03a9world'

print(u.encode("utf-8"))

b'hello\xce\xa9world'


In [1]:
import os

input_folder = "/home/datasets/ZOUbohao/Proj1-Deepurify/IBS_reads"

files = os.listdir(input_folder)
ids = set()

with open("./IBS_ids.txt", 'w') as wh:
    for file in files:
        id_name = file.split(".")[0]
        ids.add(id_name)
    for cur_id in ids:
        wh.write(cur_id + "\n")



In [6]:
import os
from Deepurify.Utils.IOUtils import readFasta

s = ">NODE_42381_length_3514_cov_15.464585"
input_path = "/home/datasets/ZOUbohao/Proj1-Deepurify/marine-original-reculster/1102218/DeepurifyTmpFiles/deconta_tmp/re_cluster_-1"

for file in os.listdir(input_path):
    _, bin_suffix = os.path.splitext(file)
    if bin_suffix[1:] == "fasta":
        cur_name2seq = readFasta(os.path.join(input_path, file))
        if s in cur_name2seq:
            print(file, len(cur_name2seq))



Deepurify_Bin_152.fasta 315


In [26]:

from Deepurify.Utils.IOUtils import readCheckm2Res, readMetaInfo
import os


def readGUNCRes(gunc_path: str):
    res = {}
    with open(gunc_path, "r") as rh:
        for i, line in enumerate(rh):
            if i != 0:
                info = line.strip("\n").split("\t")
                res[info[0][0:-1]] = info[-1]
    return res

input_folder = "/home/datasets/ZOUbohao/Proj1-Deepurify/CAMI-original-deepurify-only-clean"

# datasets = ["CAMI_medium1", "CAMI_medium2"]
# binnings = ["Semibin2", "Metabeta2", "Concoct"]
# binnings = ["Metabeta2"]

# datasets = ["ERR4195020_nodrep", "ERR9631077_nodrep", "SRR26420192_nodrep"]
datasets = os.listdir(input_folder)
# datasets = ["CAMI_high_norebin", "CAMI_medium1_norebin", "CAMI_medium2_norebin", "CAMI_low_norebin"]

for data in datasets:
    # for b in binnings:
        # print(data)
        if "checkm2" in data or "gunc" in data or "_drep" in data:
            continue
        # checkm2_path = os.path.join(input_folder, f"{b}_{data}_checkm2", "quality_report.tsv")
        # checkm2_res = readCheckm2Res(checkm2_path)[0]
        id_name = data.split("_")[0]
        gunc_path = os.path.join(input_folder, f"{data}_gunc", "GUNC.progenomes_2.1.maxCSS_level.tsv")
        gunc_res = readGUNCRes(gunc_path)
        # print(len(gunc_res),gunc_path)
        metainfo_path = os.path.join(input_folder,  f"{data}", "MetaInfo.tsv")
        checkm2_res = readMetaInfo(metainfo_path)[0]
        qs = 0.
        h = 0
        m_80 = 0
        m_70 = 0
        m_60 = 0
        m_50 = 0
        for name, qua in checkm2_res.items():
            profix, suffix = os.path.splitext(name)
            name = profix
            # print(name)
            # if qua[-1] == "HighQuality":
            #     print(name, qua, gunc_res[name])
            if gunc_res[name] == "True":
                if qua[-1] == "HighQuality":
                    # print(name, qua, gunc_res[name])
                    h += 1
                    qs += qua[0] - 5. * qua[1]
                elif qua[-1] == "MediumQuality":
                    qs += qua[0] - 5. * qua[1]
                    if qua[0] >= 80:
                        m_80 += 1
                    elif qua[0] >= 70:
                        m_70 += 1
                    elif qua[0] >= 60:
                        m_60 += 1 
                    else:
                        m_50 += 1
        
        print(f"{data},{h},{m_80},{m_70},{m_60},{m_50},{qs}")



Concoct_CAMI_high,126,54,51,41,38,22659.129999999994
Concoct_CAMI_low,14,1,2,3,4,1835.5500000000002
Concoct_CAMI_medium1,28,5,4,7,6,3880.9600000000005
Concoct_CAMI_medium2,36,9,5,7,6,4946.929999999999
Metabeta2_CAMI_high,120,52,54,63,72,25516.74000000002
Metabeta2_CAMI_low,13,3,2,5,4,2087.7599999999998
Metabeta2_CAMI_medium1,22,7,6,7,4,3547.510000000001
Metabeta2_CAMI_medium2,38,5,2,11,5,4950.04
Semibin2_CAMI_high,127,52,53,66,79,26994.34000000002
Semibin2_CAMI_low,15,1,1,3,4,1945.3899999999999
Semibin2_CAMI_medium1,26,11,5,8,8,4455.209999999999
Semibin2_CAMI_medium2,39,2,4,8,8,5001.710000000001


In [124]:
import os
from shutil import copy


input_folder = "/home/datasets/ZOUbohao/Proj1-Deepurify/plant-ensemble-Deepurify/"
output_folder = "/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/"
bin_suffix = "fasta"
folders = os.listdir(input_folder)
if os.path.exists(output_folder) is False:
    os.mkdir(output_folder)

for folder in folders:
    print(f"{folder}")
    if "gunc" in folder or "checkm2" in folder:
        continue
    cur_input_folder = os.path.join(input_folder, folder, "DeepurifyTmpFiles/deconta_tmp/re_cluster_-1")
    files = os.listdir(cur_input_folder)
    for i, file in enumerate(files):
        _, suff = os.path.splitext(file)
        if suff[1:] == bin_suffix:
            output_path = os.path.join(output_folder, f"{folder}_{i}.fasta")
            print(output_path)
            copy(
                os.path.join(cur_input_folder, file),
                output_path
            )



# input_folder = "/home/datasets/ZOUbohao/Proj1-Deepurify/CAMI-original/"
# output_folder = "/home/datasets/ZOUbohao/Proj1-Deepurify/CAMI-MDMcleaner-bins"

# datasets = ["CAMI_high", "CAMI_medium1", "CAMI_medium2", "CAMI_low"]
# binnings = ["Semibin2", "Metabeta2", "Concoct"]
# for data in datasets:
#     for binning in binnings:
#         if binning == "Semibin2":
#             cur_input_folder = os.path.join(input_folder, f"{binning}_{data}", "output_bins")
#         else:
#             cur_input_folder = os.path.join(input_folder, f"{binning}_{data}")
#         for i, file in enumerate(os.listdir(cur_input_folder)):
#             print(os.path.join(output_folder, f"{binning}_{data}_{i}.fasta"))
#             if os.path.isdir(os.path.join(cur_input_folder, file)):
#                 continue
#             copy(
#                 os.path.join(cur_input_folder, file),
#                 os.path.join(output_folder, f"{binning}_{data}_{i}.fasta")
#             )



SRR10968246
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_4.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_5.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_6.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_7.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_8.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_9.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_10.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_11.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_12.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_13.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_14.fasta
/home/datasets/ZOUbohao/Proj1-Deepurify/plant-MDMcleaner-bins/SRR10968246_15.fast

In [91]:
import os 


input_folder = "/home/datasets/ZOUbohao/Proj1-Deepurify/IBS-ensemble-Deepurify"
folders_list = os.listdir(input_folder)

for folder in folders_list:
    file_path = os.path.join(input_folder, folder, "Deepurify_Bin_0.fasta")
    if os.path.exists(file_path):
        print(folder)






0002
0004
0005
0006
0010
0011
0013
0019
0020
0027
0032
C0001
C0004
C0009
C0010
C0013
C0015
C0016
C0017
C0019
C0020
C0021
C0022
C0024
C0027
C0029
C0031
C0032
C0033
C0034
C0038
C0041
C0043
C0049
C0051
C0052
C0057
C0058
C0060
C0061
C0063
C0064
C0066
C0068
C0071
C0073
C0075
C0076
C0079
C0093
C0094
C0095
C0098
C0100
C0101
C0105
C0111
C0113
C0117
C0119
C0122
C0125
C0126
C0131
C0132
C0141
C0142
C0144
C0147
C0150
C0152
C0157
C0160
C0164
HC0002
HC0003
HC0005
HC0006
HK0001
HK0006
HK0019
HK0021
HK0023
HK0028
HK0029
HK0034
HK0036
HK0037
HK0038
HK0041
HK0042
HK0043
HK0044
HK0046
HK0048
HK0049
HK0051
HK0052
HK0053
HK0054
HK0059
HK0061
HK0065
HK0070
HK0071
HK0074
HK0075
HK0076
HK0078
HK0079
HK0080
HK0082
HK0084
HK0085
HK0087
HK0088
HK0089
HK0090
HK0092
HK0094
HK0095
K0003
K0004
K0005
K0006
K0009
K0011
K0012
K0013
K0014
K0016
K0017
K0021
K0025
K0031
K0032
K0034
K0035
K0037
K0040
K0041
K0043
K0045
K0046
K0047
K0048
K0058
K0059
K0063
K0064
K0066
K0067
K0069
K0071
K0074
K0076
K0077
K0082
K0083
K0084
K008

In [122]:
a = ">k141_2277971 flag=1 multi=1.0000 len=587"

print(a.split())

['>k141_2277971', 'flag=1', 'multi=1.0000', 'len=587']


In [142]:
from scipy.spatial.distance import pdist, squareform
import numpy as np

a = np.array([0,0,0,1])
b = np.array([1,0,0,0])

c = [a, b]
for i in range(3):
    c.append(np.random.randn(4))

mean_point = np.mean(c, axis=0)
print(mean_point)
Y = squareform(pdist(c, metric="cosine"))
print(Y)
ind = np.unravel_index(np.argmax(Y, axis=None), Y.shape)
print(ind)

[-0.33278172 -0.25381949 -0.69184662  0.4279178 ]
[[0.         1.         1.00752506 0.88683176 0.49216389]
 [1.         0.         1.80380844 1.62395645 0.68610837]
 [1.00752506 1.80380844 0.         0.3247307  0.78336825]
 [0.88683176 1.62395645 0.3247307  0.         0.98320848]
 [0.49216389 0.68610837 0.78336825 0.98320848 0.        ]]
(1, 2)


In [15]:
import os 

ids_set = set()
with open("./IBS_ids_completed.txt", "r") as rh:
    for line in rh:
        ids_set.add(line.strip("\n"))

files = os.listdir("/home/datasets/ZOUbohao/Proj1-Deepurify/IBS-ensemble-MAGpurify")
for file in files:
        pre_name = file.split("_")[0]
        if os.path.exists(os.path.join("/home/datasets/ZOUbohao/Proj1-Deepurify/IBS-ensemble-MAGpurify", f"{pre_name}_gunc")) is False:
            print(file)





HK0059_nodrep
HK0061_nodrep
HK0065_nodrep
HK0070_nodrep
HK0071_nodrep
HK0074_nodrep
HK0075_nodrep
HK0076_nodrep
HK0078_nodrep
HK0079_nodrep
HK0080_nodrep
HK0082_nodrep
HK0084_nodrep
HK0085_nodrep


In [9]:

from Deepurify.Utils.IOUtils import readCheckm2Res

input_path = "/home/datasets/ZOUbohao/Proj1-Deepurify/CAMI-Ensemble-Deepurify/CAMI_high_old_4/DeepurifyTmpFiles/deconta_tmp/de_temp_-1/FilterOutput/original_checkm2_res/quality_report.tsv"
print(readCheckm2Res(input_path)[1:])



(220, 307, 668)


Time collect

In [25]:
import os

def read(path):
    with open(path, "r") as rh:
        for line in rh:
            info = line.strip("\n")
            return info

input_tmp_folder = "/home/datasets/ZOUbohao/Proj1-Deepurify/IBS-time/K0273/DeepurifyTmpFiles/deconta_tmp"

for i in range(-1, 3):
    print(f"####{i}####")
    for folder in ["re_cluster_", "de_temp_"]:
        cur_input_folder = os.path.join(input_tmp_folder, f"{folder}{i}")
        if os.path.exists(cur_input_folder) is False:
            continue
        if "re_cluster_" in folder:
            time = read(os.path.join(cur_input_folder, "binning.time"))
            print(time)
        else:
            for time_file in ["Annot.time", "CallGene.time", "Filter.time", "Checkm2_first.time", "Reuse.time", "Checkm2_second.time", "Select_Res.time"]:
                print(read(os.path.join(cur_input_folder, time_file)))
                





####-1####
2525.4167518615723
2287.7271151542664
480.24342584609985
310.06171774864197
699.4887223243713
34.77270579338074
318.88691329956055
3.2820887565612793
####0####
2462.7762475013733
4.571408271789551
391.9069263935089
343.52013325691223
435.5968840122223
40.477916955947876
287.3455636501312
2.488719940185547
####1####
2217.7146558761597
4.420710802078247
381.0069246292114
305.88248920440674
423.4501531124115
28.8607656955719
269.450692653656
2.1030614376068115
####2####
2250.1360790729523
4.581784248352051
337.51256227493286
241.04442763328552
654.3899643421173
46.82252311706543
292.1406145095825
2.3189423084259033
