In [None]:
import os
import subprocess
from collections import defaultdict
import math
import pandas as pd
from Bio import SeqIO

In [None]:
def cal_ani(assembly_id, output_directory, consensus_record_dict, ignore_del=False, del_count_as_match=False):
    """Return the fraction of called consensus positions that match the reference.

    The reference FASTA is read from
    ``<output_directory>/reference_genomes/<assembly_id>.fasta``. Every record
    in it is compared position by position with the consensus entry stored
    under the same record id.

    Args:
        assembly_id: File stem of the reference FASTA (without ``.fasta``).
        output_directory: Directory containing the ``reference_genomes`` folder.
        consensus_record_dict: Mapping from record id to an indexable
            consensus sequence (e.g. the result of ``SeqIO.to_dict``).
        ignore_del: If true, positions whose consensus symbol is ``*`` are
            left out of both the numerator and the denominator.
        del_count_as_match: If true (and ``ignore_del`` is false), ``*``
            positions are counted as matches; otherwise they are counted as
            mismatches.

    Returns:
        ``matched / total`` over all counted positions, or ``0`` when no
        position was counted.

    Raises:
        IndexError: if a consensus sequence is shorter than its reference
            record.
    """
    # Build the path from the argument instead of relying on a notebook-level
    # global, so the function works regardless of cell execution order.
    reference_fasta = os.path.join(output_directory, 'reference_genomes', f'{assembly_id}.fasta')

    total_count = 0
    matched_count = 0

    with open(reference_fasta, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            if record.id not in consensus_record_dict:
                print("No alignment found:", record.id, record.description)
                continue

            # Look the consensus up once per record rather than once per base.
            consensus = consensus_record_dict[record.id]
            for idx, base in enumerate(record.seq):
                called = consensus[idx]

                # 'N' positions are excluded from the calculation entirely.
                if called == 'N':
                    continue

                if called == '*':
                    if ignore_del:
                        continue
                    total_count += 1
                    if del_count_as_match:
                        matched_count += 1
                    continue

                # NOTE(review): the comparison is case-sensitive; if the
                # reference FASTA is soft-masked (lowercase) those bases will
                # count as mismatches -- confirm this is intended.
                total_count += 1
                if called == base:
                    matched_count += 1

    if total_count != 0:
        return matched_count / total_count
    return 0

In [None]:
def samtools_merged_consensus(output_directory, threads):
    """Run ``samtools consensus`` on the merged BAM and load its FASTA output.

    Reads ``<output_directory>/bam_files/merged.sorted.bam``, writes
    ``<output_directory>/merged_consensus.fasta`` and returns the records of
    that file as a dict keyed by record id.

    Raises:
        subprocess.CalledProcessError: if samtools exits with a non-zero status.
    """
    bam_path = os.path.join(output_directory, 'bam_files', 'merged.sorted.bam')
    consensus_fasta = os.path.join(output_directory, 'merged_consensus.fasta')

    # Assemble the command piece by piece; argument order matches the CLI call.
    command = ['samtools', 'consensus']
    command += ['--show-ins', 'no']
    command += ['--show-del', 'yes']
    command += ['-a']
    command += ['--mode', "simple"]
    command += ['--threads', str(threads)]
    command += [bam_path]
    command += ['-o', consensus_fasta]

    subprocess.run(command, check=True)

    records = SeqIO.parse(consensus_fasta, "fasta")
    return SeqIO.to_dict(records)

In [None]:
# Thread count handed to `samtools consensus --threads`.
threads = 20

In [None]:
# Root folder of the run being analysed; the paths used below are derived from it.
output_directory = "/home/Users/yl181/seqscreen_nano/test"

# Folder holding the per-assembly reference FASTA files (<assembly_id>.fasta).
reference_genome_path = os.path.join(output_directory, "reference_genomes")

In [None]:
# Load the alignment metadata table written to the output directory.
re_alignment_metadata = pd.read_csv(
    os.path.join(output_directory, 'alignment.csv')
)

In [None]:
# Display the alignment metadata table that was just loaded.
re_alignment_metadata

In [None]:
# Call a "simple"-mode consensus from the merged BAM and write it to
# simple_merged_consensus.fasta; check=True makes a samtools failure raise.
merged_bam = os.path.join(output_directory, 'bam_files', 'merged.sorted.bam')
subprocess.run(
    [
        'samtools', 'consensus',
        '--show-ins', 'no',
        '--show-del', 'yes',
        '-a',
        '--mode', "simple",
        '--threads', str(threads),
        merged_bam,
        '-o', os.path.join(output_directory, 'simple_merged_consensus.fasta'),
    ],
    check=True,
)

In [None]:
# Load the simple-mode consensus FASTA into a dict keyed by record id.
consensus_record_dict = SeqIO.to_dict(
    SeqIO.parse(
        os.path.join(output_directory, 'simple_merged_consensus.fasta'),
        "fasta",
    )
)

In [None]:
# ANI per metadata row with cal_ani's defaults: '*' positions are counted in
# the denominator but not as matches.
ani_list = []
for idx, row in re_alignment_metadata.iterrows():
    if row['CS2'] != 0:
        assembly_id = row['Assembly Accession ID']
        # cal_ani requires the output directory and the consensus records;
        # calling it with only the assembly id raises TypeError.
        ani_list.append(cal_ani(assembly_id, output_directory, consensus_record_dict))
    else:
        # Rows with CS2 == 0 are not scored and get an ANI of 0.
        ani_list.append(0)

In [None]:
# Attach the current ani_list (one value per metadata row) as a new column.
re_alignment_metadata['Simple Consensus ANI'] = ani_list

In [None]:
#re_alignment_metadata.sort_values(['Coverage Score'], ascending=False).to_csv(os.path.join(output_directory, 're-alignment.csv'), index=False)

In [None]:
# ANI per metadata row with '*' positions excluded from the calculation.
ani_list = []
for idx, row in re_alignment_metadata.iterrows():
    if row['CS2'] == 0:
        # Rows with CS2 == 0 are not scored and get an ANI of 0.
        ani_list.append(0)
        continue
    assembly_id = row['Assembly Accession ID']
    ani_list.append(
        cal_ani(assembly_id, output_directory, consensus_record_dict, ignore_del=True)
    )

In [None]:
# Attach the current ani_list (computed with ignore_del=True) as a new column.
re_alignment_metadata['Simple Consensus ANI (Ignore DEL)'] = ani_list

In [None]:
# ANI per metadata row with '*' positions counted as matches.
ani_list = []
for idx, row in re_alignment_metadata.iterrows():
    if row['CS2'] != 0:
        assembly_id = row['Assembly Accession ID']
        # cal_ani requires the output directory and the consensus records;
        # calling it with only the assembly id raises TypeError.
        ani_list.append(
            cal_ani(assembly_id, output_directory, consensus_record_dict, del_count_as_match=True)
        )
    else:
        # Rows with CS2 == 0 are not scored and get an ANI of 0.
        ani_list.append(0)

In [None]:
# Attach the current ani_list (intended: del_count_as_match=True) as a new column.
re_alignment_metadata['Simple Consensus ANI (DEL count as match)'] = ani_list

In [None]:
# Side-by-side view of the ANI variants, rows ordered by CS2 descending.
(
    re_alignment_metadata
    .sort_values(by=['CS2'], ascending=False)
    [[
        'Species',
        'CS2',
        'Consensus ANI',
        'Simple Consensus ANI',
        'Simple Consensus ANI (Ignore DEL)',
        'Simple Consensus ANI (DEL count as match)',
    ]]
)

In [None]:
# Display the full metadata table, including the ANI columns added above.
re_alignment_metadata