In [2]:
import os
import subprocess
from collections import defaultdict
import math
import pandas as pd
from Bio import SeqIO

In [3]:
def cal_ani(assembly_id, output_directory, consensus_record_dict, ignore_del=False, del_count_as_match=False):
    """Fraction of called consensus positions that agree with a reference assembly.

    The reference is read from
    ``<output_directory>/reference_genomes/<assembly_id>.fasta`` and compared,
    position by position, with the consensus sequence stored under the same
    record id (the notebook labels the resulting value "ANI").

    Args:
        assembly_id: Base name (without ``.fasta``) of the reference file.
        output_directory: Run directory that contains ``reference_genomes``.
        consensus_record_dict: Mapping of record id to the consensus for that
            record. Values may be objects exposing ``.seq`` or plain strings.
        ignore_del: When true, consensus positions equal to ``'*'`` are left
            out of both the numerator and the denominator.
        del_count_as_match: When true (and ``ignore_del`` is false), ``'*'``
            positions are counted as matches instead of mismatches.

    Returns:
        ``matched / total`` over the counted positions, or ``0`` when no
        position was counted.

    Notes:
        * Consensus positions equal to ``'N'`` are never counted.
        * Reference records without an entry in ``consensus_record_dict`` are
          reported on stdout and skipped.
        * Bases are compared case-insensitively, because letter case in a
          FASTA file commonly marks soft-masked sequence, not a different base.
        * If a consensus is shorter than its reference, only the overlapping
          prefix is compared instead of raising ``IndexError``.
    """
    # Build the path from the argument rather than from a notebook-level
    # variable, so the function also works on a fresh kernel and for any run.
    reference_fasta = os.path.join(output_directory, 'reference_genomes', f'{assembly_id}.fasta')

    total_count = 0
    matched_count = 0

    with open(reference_fasta, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            if record.id not in consensus_record_dict:
                print("No alignment found:", record.id, record.description)
                continue

            # Materialise both sequences as plain upper-case strings once;
            # indexing the record object per base is far slower on whole genomes.
            consensus = consensus_record_dict[record.id]
            consensus_seq = str(getattr(consensus, 'seq', consensus)).upper()
            reference_seq = str(record.seq).upper()

            # zip() stops at the shorter sequence, so a truncated consensus
            # cannot index past its end.
            for ref_base, cons_base in zip(reference_seq, consensus_seq):
                if cons_base == 'N':
                    continue
                if cons_base == '*':
                    if ignore_del:
                        continue
                    total_count += 1
                    if del_count_as_match:
                        matched_count += 1
                else:
                    total_count += 1
                    if cons_base == ref_base:
                        matched_count += 1

    if total_count == 0:
        return 0
    return matched_count / total_count

In [4]:
def samtools_merged_consensus(output_directory, threads, consensus_filename='merged_consensus.fasta'):
    """Run ``samtools consensus`` on the merged BAM and load the result.

    The input is ``<output_directory>/bam_files/merged.sorted.bam``; the
    consensus FASTA is written to ``<output_directory>/<consensus_filename>``
    and then parsed.

    Args:
        output_directory: Run directory holding ``bam_files`` and receiving
            the consensus FASTA.
        threads: Value passed to samtools' ``--threads`` option.
        consensus_filename: File name of the consensus FASTA inside
            ``output_directory``. Defaults to the name this function has
            always used, so existing calls are unaffected.

    Returns:
        The dictionary built by ``SeqIO.to_dict`` from the consensus FASTA.

    Raises:
        subprocess.CalledProcessError: If samtools exits with a non-zero
            status (``check=True``).
    """
    merged_bam = os.path.join(output_directory, 'bam_files', 'merged.sorted.bam')
    consensus_fasta = os.path.join(output_directory, consensus_filename)

    # Option values are kept exactly as before; only the output path varies.
    subprocess.run(['samtools', 'consensus',
                    '--show-ins', 'no',
                    '--show-del', 'yes',
                    '-a',
                    '--mode', "simple",
                    '--threads', str(threads),
                    merged_bam,
                    '-o', consensus_fasta],
                   check=True)

    return SeqIO.to_dict(SeqIO.parse(consensus_fasta, "fasta"))

In [5]:
# Thread count passed to samtools via its --threads option.
threads = 20

In [6]:
# Run directory; the other paths used in this notebook are built from it.
output_directory = "/home/Users/yl181/seqscreen_nano/test"
# Folder expected to hold one "<assembly id>.fasta" reference per assembly.
reference_genome_path = os.path.join(
    output_directory,
    "reference_genomes",
)

In [7]:
# Load the per-assembly alignment metadata written to the run directory.
re_alignment_metadata = pd.read_csv(
    os.path.join(output_directory, "alignment.csv"),
)

In [8]:
# Display the table loaded from alignment.csv.
re_alignment_metadata

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Downloaded,Species,Breadth Coverage,Expected Coverage,Coverage Score,Depth Coverage,BC2,EC2,CS2,DC2
0,96241,GCF_006094475.1,NCBI RefSeq,True,Complete Genome,Bacillus subtilis subsp. spizizenii ATCC 6633 ...,ATCC 6633,True,Bacillus spizizenii,1.0,1.0,1.0,28.817028,1.0,1.0,1.0,28.435225
1,287,GCF_000006765.1,PathoGenesis Corporation,True,Complete Genome,Pseudomonas aeruginosa PAO1,PAO1,True,Pseudomonas aeruginosa,0.969225,0.999858,0.969363,9.133267,0.969233,0.999831,0.969397,8.957587
2,5207,GCA_022832995.1,Broad Institute,False,Complete Genome,Cryptococcus neoformans,VNII,True,Cryptococcus neoformans,0.423306,0.441691,0.958376,1.376761,0.42334,0.441638,0.958569,1.376418
3,1639,GCF_000196035.1,European Consortium,True,Complete Genome,Listeria monocytogenes EGD-e,EGD-e,True,Listeria monocytogenes,0.943656,1.0,0.943656,29.246736,0.932411,1.0,0.932411,26.44673
4,28901,GCF_000006945.2,Washington University Genome Sequencing Center,True,Complete Genome,Salmonella enterica subsp. enterica serovar Ty...,LT2,True,Salmonella enterica,0.929463,1.0,0.929463,20.590929,0.925799,0.999995,0.925804,13.176773
5,1280,GCF_000013425.1,University of Oklahoma Health Sciences Center,True,Complete Genome,Staphylococcus aureus subsp. aureus NCTC 8325,NCTC 8325,True,Staphylococcus aureus,0.923386,1.0,0.923386,27.298416,0.923337,1.0,0.923337,26.221774
6,4932,GCF_000146045.2,Saccharomyces Genome Database,True,Complete Genome,Saccharomyces cerevisiae S288C,S288C,True,Saccharomyces cerevisiae,0.619563,0.678052,0.91374,1.829052,0.619487,0.677297,0.914646,1.825484
7,1351,GCF_001598635.1,NCBI RefSeq,False,Complete Genome,Enterococcus faecalis,LD33,True,Enterococcus faecalis,0.894675,1.0,0.894675,26.467055,0.894216,1.0,0.894216,25.709091
8,1613,GCF_022819245.1,NCBI RefSeq,True,Complete Genome,Limosilactobacillus fermentum,SCB0035,True,Limosilactobacillus fermentum,0.863003,1.0,0.863003,34.935137,0.863029,1.0,0.863029,34.607012
9,562,GCF_000008865.2,GIRC,True,Complete Genome,Escherichia coli O157:H7 str. Sakai,Sakai substr. RIMD 0509952,True,Escherichia coli,0.783966,1.0,0.783966,20.741795,0.651192,0.990895,0.657175,7.212012


In [9]:
# Build a "simple"-mode consensus from the merged, sorted BAM and write it to
# simple_merged_consensus.fasta inside the run directory.
merged_bam = os.path.join(output_directory, 'bam_files', 'merged.sorted.bam')
consensus_cmd = [
    'samtools', 'consensus',
    '--show-ins', 'no',
    '--show-del', 'yes',
    '-a',
    '--mode', "simple",
    '--threads', str(threads),
    merged_bam,
    '-o', os.path.join(output_directory, 'simple_merged_consensus.fasta'),
]
# Kept as the cell's final expression so the CompletedProcess is displayed;
# check=True turns a non-zero samtools exit status into CalledProcessError.
subprocess.run(consensus_cmd, check=True)

CompletedProcess(args=['samtools', 'consensus', '--show-ins', 'no', '--show-del', 'yes', '-a', '--mode', 'simple', '--threads', '20', '/home/Users/yl181/seqscreen_nano/test/bam_files/merged.sorted.bam', '-o', '/home/Users/yl181/seqscreen_nano/test/simple_merged_consensus.fasta'], returncode=0)

In [10]:
# Parse the consensus FASTA into a dictionary of records for lookup by id.
consensus_record_dict = SeqIO.to_dict(
    SeqIO.parse(
        os.path.join(output_directory, 'simple_merged_consensus.fasta'),
        "fasta",
    )
)

In [None]:
# ANI per assembly using cal_ani's defaults, i.e. '*' consensus positions are
# counted and treated as mismatches. Rows with a CS2 of zero get 0 without
# reading their reference file.
ani_list = []
for idx, row in re_alignment_metadata.iterrows():
    if row['CS2'] != 0:
        assembly_id = row['Assembly Accession ID']
        # cal_ani requires the run directory and the parsed consensus as
        # positional arguments; passing only the assembly id raises TypeError.
        ani_list.append(cal_ani(assembly_id, output_directory, consensus_record_dict))
    else:
        ani_list.append(0)

In [None]:
# Attach the values as a column; ani_list was built in the frame's row order.
re_alignment_metadata['Simple Consensus ANI'] = ani_list

In [None]:
#re_alignment_metadata.sort_values(['Coverage Score'], ascending=False).to_csv(os.path.join(output_directory, 're-alignment.csv'), index=False)

In [12]:
# ANI per assembly with '*' consensus positions left out of the tally
# (ignore_del=True). Rows whose CS2 is zero are assigned 0 directly.
ani_list = [
    cal_ani(assembly_id, output_directory, consensus_record_dict, ignore_del=True)
    if cs2 != 0
    else 0
    for cs2, assembly_id in zip(
        re_alignment_metadata['CS2'],
        re_alignment_metadata['Assembly Accession ID'],
    )
]

No alignment found: NC_002127.1 NC_002127.1 Escherichia coli O157:H7 str. Sakai plasmid pOSAK1, complete sequence


In [13]:
# Attach the ignore-deletion values as a column; ani_list follows the frame's row order.
re_alignment_metadata['Simple Consensus ANI (Ignore DEL)'] = ani_list

In [None]:
# ANI per assembly with '*' consensus positions counted as matches
# (del_count_as_match=True). Rows with a CS2 of zero get 0 without reading
# their reference file.
ani_list = []
for idx, row in re_alignment_metadata.iterrows():
    if row['CS2'] != 0:
        assembly_id = row['Assembly Accession ID']
        # cal_ani requires the run directory and the parsed consensus as
        # positional arguments; omitting them raises TypeError.
        ani_list.append(
            cal_ani(assembly_id, output_directory, consensus_record_dict, del_count_as_match=True)
        )
    else:
        ani_list.append(0)

In [None]:
# Attach the deletion-as-match values as a column; ani_list follows the frame's row order.
re_alignment_metadata['Simple Consensus ANI (DEL count as match)'] = ani_list

In [None]:
# Side-by-side view of the ANI variants, highest CS2 first.
# Only columns that exist in the frame are selected: a label that was never
# computed in this session (for example 'Consensus ANI', which no cell in this
# notebook creates) would otherwise make the whole cell fail with KeyError.
re_alignment_metadata.sort_values(['CS2'], ascending=False)[
    [
        column
        for column in (
            'Species',
            'CS2',
            'Consensus ANI',
            'Simple Consensus ANI',
            'Simple Consensus ANI (Ignore DEL)',
            'Simple Consensus ANI (DEL count as match)',
        )
        if column in re_alignment_metadata.columns
    ]
]

In [14]:
# Display the metadata table together with the ANI columns added so far.
re_alignment_metadata

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Downloaded,Species,Breadth Coverage,Expected Coverage,Coverage Score,Depth Coverage,BC2,EC2,CS2,DC2,Simple Consensus ANI (Ignore DEL)
0,96241,GCF_006094475.1,NCBI RefSeq,True,Complete Genome,Bacillus subtilis subsp. spizizenii ATCC 6633 ...,ATCC 6633,True,Bacillus spizizenii,1.0,1.0,1.0,28.817028,1.0,1.0,1.0,28.435225,0.999993
1,287,GCF_000006765.1,PathoGenesis Corporation,True,Complete Genome,Pseudomonas aeruginosa PAO1,PAO1,True,Pseudomonas aeruginosa,0.969225,0.999858,0.969363,9.133267,0.969233,0.999831,0.969397,8.957587,0.99579
2,5207,GCA_022832995.1,Broad Institute,False,Complete Genome,Cryptococcus neoformans,VNII,True,Cryptococcus neoformans,0.423306,0.441691,0.958376,1.376761,0.42334,0.441638,0.958569,1.376418,0.86417
3,1639,GCF_000196035.1,European Consortium,True,Complete Genome,Listeria monocytogenes EGD-e,EGD-e,True,Listeria monocytogenes,0.943656,1.0,0.943656,29.246736,0.932411,1.0,0.932411,26.44673,0.962211
4,28901,GCF_000006945.2,Washington University Genome Sequencing Center,True,Complete Genome,Salmonella enterica subsp. enterica serovar Ty...,LT2,True,Salmonella enterica,0.929463,1.0,0.929463,20.590929,0.925799,0.999995,0.925804,13.176773,0.992369
5,1280,GCF_000013425.1,University of Oklahoma Health Sciences Center,True,Complete Genome,Staphylococcus aureus subsp. aureus NCTC 8325,NCTC 8325,True,Staphylococcus aureus,0.923386,1.0,0.923386,27.298416,0.923337,1.0,0.923337,26.221774,0.993944
6,4932,GCF_000146045.2,Saccharomyces Genome Database,True,Complete Genome,Saccharomyces cerevisiae S288C,S288C,True,Saccharomyces cerevisiae,0.619563,0.678052,0.91374,1.829052,0.619487,0.677297,0.914646,1.825484,0.976131
7,1351,GCF_001598635.1,NCBI RefSeq,False,Complete Genome,Enterococcus faecalis,LD33,True,Enterococcus faecalis,0.894675,1.0,0.894675,26.467055,0.894216,1.0,0.894216,25.709091,0.993251
8,1613,GCF_022819245.1,NCBI RefSeq,True,Complete Genome,Limosilactobacillus fermentum,SCB0035,True,Limosilactobacillus fermentum,0.863003,1.0,0.863003,34.935137,0.863029,1.0,0.863029,34.607012,0.995154
9,562,GCF_000008865.2,GIRC,True,Complete Genome,Escherichia coli O157:H7 str. Sakai,Sakai substr. RIMD 0509952,True,Escherichia coli,0.783966,1.0,0.783966,20.741795,0.651192,0.990895,0.657175,7.212012,0.980826
