In [13]:
import os
import subprocess
from collections import defaultdict

import pandas as pd
from Bio import SeqIO

In [14]:
def calculate_depth(assembly_id, min_depth=3):
    """Compute breadth and mean depth of coverage for one reference assembly.

    Reads ``<reference_genome_path>/<assembly_id>.fasta`` to obtain the total
    genome length (summed over every FASTA record) and
    ``<depth_file_path>/<assembly_id>.depth`` for per-position depths.
    ``reference_genome_path`` and ``depth_file_path`` are module-level
    globals that must be defined before this function is called.

    The depth file is parsed as tab-separated text with the depth in the
    third column (presumably ``samtools depth`` output -- TODO confirm).

    Parameters
    ----------
    assembly_id : str
        Basename (without extension) shared by the FASTA and depth files.
    min_depth : int, default 3
        A position counts as covered only when its depth is >= ``min_depth``.

    Returns
    -------
    tuple of (float, float)
        ``breadth_coverage``: covered positions divided by genome length.
        ``depth_coverage``: mean depth over the covered positions only.
        Either value is ``0.0`` when its denominator is zero (empty
        reference, or no position reaching ``min_depth``) instead of
        raising ``ZeroDivisionError``.
    """
    reference_fasta = os.path.join(reference_genome_path, f'{assembly_id}.fasta')
    depth_file = os.path.join(depth_file_path, f"{assembly_id}.depth")

    with open(reference_fasta, "r") as handle:
        genome_length = sum(len(record.seq) for record in SeqIO.parse(handle, "fasta"))

    covered_positions = 0
    covered_depth_total = 0

    # Stream the file line by line; depth files have one row per position and
    # can be far larger than is sensible to hold in memory at once.
    with open(depth_file, "r") as depth_handle:
        for line in depth_handle:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing empty lines

            position_depth = int(line.split("\t")[2])
            if position_depth >= min_depth:
                covered_positions += 1
                covered_depth_total += position_depth

    breadth_coverage = covered_positions / genome_length if genome_length else 0.0
    depth_coverage = covered_depth_total / covered_positions if covered_positions else 0.0

    return breadth_coverage, depth_coverage

In [15]:
# Root directory holding this run's inputs (reference metadata, genomes,
# depth files) and receiving the CSV outputs. Defaults to the original
# location; set SEQSCREEN_NANO_OUTPUT_DIR to run against another directory
# without editing the notebook.
output_directory = os.environ.get(
    "SEQSCREEN_NANO_OUTPUT_DIR",
    "/home/Users/yl181/seqscreen_nano/ZymoBIOMICS.STD.Even.ont.minimap2",
)

In [16]:
# Load the per-assembly metadata table, then resolve the two input
# sub-directories that calculate_depth reads as module-level globals.
reference_metadata = pd.read_csv(os.path.join(output_directory, 'reference_metadata.csv'))
reference_genome_path, depth_file_path = (
    os.path.join(output_directory, subdirectory)
    for subdirectory in ('reference_genomes', 'depth_files')
)

In [55]:
# Minimum per-position depth for a position to count as covered; passed to
# calculate_depth below, overriding that function's default of 3.
min_depth = 1

In [56]:
# Keep only the assemblies flagged as downloaded. Take an explicit copy:
# later cells add columns to this frame, and writing into a boolean-filtered
# selection of reference_metadata triggers pandas' SettingWithCopyWarning.
downloaded_assemblies = reference_metadata[reference_metadata['Downloaded']].copy()

# Compute coverage for each assembly, in row order, so the two lists line up
# with the rows of downloaded_assemblies.
breadth_coverage_list = []
depth_coverage_list = []
for assembly_id in downloaded_assemblies['Assembly Accession ID']:
    breadth_coverage, depth_coverage = calculate_depth(assembly_id, min_depth=min_depth)
    breadth_coverage_list.append(breadth_coverage)
    depth_coverage_list.append(depth_coverage)

In [57]:
# Attach the per-assembly coverage metrics as new columns. assign() returns a
# new frame rather than writing into the existing one in place, which avoids
# SettingWithCopyWarning on a filtered selection and keeps this cell safe to
# re-run.
downloaded_assemblies = downloaded_assemblies.assign(**{
    'Breadth Coverage': breadth_coverage_list,
    'Depth Coverage': depth_coverage_list,
})

In [58]:
# Persist the full per-assembly coverage table (row index omitted).
downloaded_assemblies.to_csv(
    path_or_buf=os.path.join(output_directory, 'coverage.csv'),
    index=False,
)

In [59]:
# Display the downloaded-assembly table, including the 'Breadth Coverage'
# and 'Depth Coverage' columns.
downloaded_assemblies

Unnamed: 0,Taxonomy ID,Species,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Downloaded,Breadth Coverage,Depth Coverage
0,1280,Staphylococcus aureus,GCF_000013425.1,University of Oklahoma Health Sciences Center,True,Complete Genome,Staphylococcus aureus subsp. aureus NCTC 8325,True,0.923425,28.811838
1,1613,Limosilactobacillus fermentum,GCF_022819245.1,NCBI RefSeq,True,Complete Genome,Limosilactobacillus fermentum,True,0.887391,37.477806
2,1351,Enterococcus faecalis,GCF_001598635.1,NCBI RefSeq,False,Complete Genome,Enterococcus faecalis,True,0.895596,28.436297
3,1423,Bacillus subtilis,GCF_000009045.1,BSNR,True,Complete Genome,Bacillus subtilis subsp. subtilis str. 168,True,0.871755,28.929011
4,287,Pseudomonas aeruginosa,GCF_000006765.1,PathoGenesis Corporation,True,Complete Genome,Pseudomonas aeruginosa PAO1,True,0.969221,9.803418
5,5207,Cryptococcus neoformans,GCA_022832995.1,Broad Institute,False,Complete Genome,Cryptococcus neoformans,True,0.434962,1.476958
6,1639,Listeria monocytogenes,GCF_000196035.1,European Consortium,True,Complete Genome,Listeria monocytogenes EGD-e,True,0.941636,31.114196
7,1642,Listeria innocua,GCF_009648575.1,NCBI RefSeq,True,Complete Genome,Listeria innocua,True,0.909327,29.911235
8,562,Escherichia coli,GCF_000008865.2,GIRC,True,Complete Genome,Escherichia coli O157:H7 str. Sakai,True,0.794427,21.601007
9,1638,Listeria ivanovii,GCF_000763515.1,NCBI RefSeq,True,Complete Genome,Listeria ivanovii subsp. ivanovii,True,0.849922,27.91754


In [60]:
# Build a mapping of species name -> expected abundance as a fraction.
# Each non-empty line of the ground-truth file has the form
# "<species name> - <percentage>"; the percentage is divided by 100.
zymo_theoretical_abundance = dict()
with open('/home/Users/yl181/seqscreen_nano/ZymoBIOMICS.STD.refseq.v2/theoretical_composition.txt', 'r') as ground_truth_f:
    for line in ground_truth_f:
        line = line.strip()
        if not line:
            continue  # a blank or trailing empty line would otherwise raise IndexError

        fields = line.split(" - ")
        tax_name = fields[0]
        abundance = float(fields[1])/100
        zymo_theoretical_abundance[tax_name] = abundance

In [61]:
# Display the parsed species -> fractional-abundance mapping.
zymo_theoretical_abundance

{'Listeria monocytogenes': 0.12,
 'Pseudomonas aeruginosa': 0.12,
 'Bacillus subtilis': 0.12,
 'Escherichia coli': 0.12,
 'Salmonella enterica': 0.12,
 'Limosilactobacillus fermentum': 0.12,
 'Enterococcus faecalis': 0.12,
 'Staphylococcus aureus': 0.12,
 'Saccharomyces cerevisiae': 0.02,
 'Cryptococcus neoformans': 0.02}

In [62]:
# For every row's species, look up its expected abundance. Species present in
# the ground-truth mapping are labelled "TP"; any other species gets an
# abundance of 0 and the label "FP".
theoretical_abundance = []
labels = []
for taxname in downloaded_assemblies['Species']:
    if taxname in zymo_theoretical_abundance:
        theoretical_abundance.append(zymo_theoretical_abundance[taxname])
        labels.append("TP")
    else:
        theoretical_abundance.append(0)
        labels.append("FP")

In [63]:
# Attach the expected abundance and TP/FP label to each row. assign() returns
# a new frame rather than writing into the existing one in place, which
# avoids SettingWithCopyWarning on a filtered selection and keeps this cell
# safe to re-run.
downloaded_assemblies = downloaded_assemblies.assign(**{
    'Theoretical Abundance': theoretical_abundance,
    'Labels': labels,
})

In [64]:
# Narrow the table to the columns used for reporting.
result_df = downloaded_assemblies[
    [
        'Taxonomy ID',
        'Species',
        'Breadth Coverage',
        'Depth Coverage',
        'Theoretical Abundance',
        'Labels',
    ]
]

In [65]:
# Show the results ranked by breadth of coverage, ties broken by theoretical
# abundance; ascending=False applies to both sort keys.
result_df.sort_values(['Breadth Coverage', 'Theoretical Abundance'], ascending=False)

Unnamed: 0,Taxonomy ID,Species,Breadth Coverage,Depth Coverage,Theoretical Abundance,Labels
15,96241,Bacillus spizizenii,1.0,29.770779,0.0,FP
4,287,Pseudomonas aeruginosa,0.969221,9.803418,0.12,TP
6,1639,Listeria monocytogenes,0.941636,31.114196,0.12,TP
10,28901,Salmonella enterica,0.929836,21.580957,0.12,TP
0,1280,Staphylococcus aureus,0.923425,28.811838,0.12,TP
7,1642,Listeria innocua,0.909327,29.911235,0.0,FP
2,1351,Enterococcus faecalis,0.895596,28.436297,0.12,TP
1,1613,Limosilactobacillus fermentum,0.887391,37.477806,0.12,TP
19,1643,Listeria welshimeri,0.886593,29.402895,0.0,FP
3,1423,Bacillus subtilis,0.871755,28.929011,0.12,TP


In [66]:
# Show the results ordered by species name in reverse alphabetical order,
# ties broken by theoretical abundance; ascending=False applies to both keys.
result_df.sort_values(['Species', 'Theoretical Abundance'], ascending=False)

Unnamed: 0,Taxonomy ID,Species,Breadth Coverage,Depth Coverage,Theoretical Abundance,Labels
0,1280,Staphylococcus aureus,0.923425,28.811838,0.12,TP
12,623,Shigella flexneri,0.834539,22.208736,0.0,FP
10,28901,Salmonella enterica,0.929836,21.580957,0.12,TP
14,4932,Saccharomyces cerevisiae,0.630432,1.926546,0.02,TP
18,294,Pseudomonas fluorescens,0.460645,7.494233,0.0,FP
4,287,Pseudomonas aeruginosa,0.969221,9.803418,0.12,TP
19,1643,Listeria welshimeri,0.886593,29.402895,0.0,FP
6,1639,Listeria monocytogenes,0.941636,31.114196,0.12,TP
9,1638,Listeria ivanovii,0.849922,27.91754,0.0,FP
7,1642,Listeria innocua,0.909327,29.911235,0.0,FP


In [67]:
# Sanity check: print every ground-truth species that has no row in
# downloaded_assemblies. Nothing is printed when all of them are present.
species_in_table = set(downloaded_assemblies['Species'])  # built once, not per iteration
for taxname in zymo_theoretical_abundance:
    if taxname not in species_in_table:
        print(taxname)
    

In [68]:
# Rank rows by breadth of coverage, then theoretical abundance (both
# descending), and keep the sorted frame for export.
temp = result_df.sort_values(
    by=['Breadth Coverage', 'Theoretical Abundance'],
    ascending=False,
)

In [71]:
# Export only the rows whose breadth of coverage is at least 0.5 (50%).
temp.loc[temp['Breadth Coverage'] >= 0.5].to_csv(
    os.path.join(output_directory, 'coverage_above_50.csv'),
    index=False,
)