# Dependencies

In [1]:
import os
import argparse
import pathlib
import sys
import subprocess
import math
import warnings
from collections import defaultdict
from multiprocessing import Pool, Manager
from itertools import repeat
warnings.filterwarnings("ignore")

import pandas as pd
from ete3 import NCBITaxa
from Bio import SeqIO

In [2]:
from utils.reference_finder import prepare_reference_genomes
from utils.alignment import run_minimap2, run_bwa, sort_samfile, samtools_calculate_coverage
from utils.summary import alignment_summary, merge_reference_fasta, call_present_absent
from utils.ani import samtools_merged_consensus, ani_summary
from utils.input_parsing import parsing_input_f, filter_input_df, get_seq2assembly_dict

In [3]:
import numpy as np

In [4]:
from sklearn.cluster import AgglomerativeClustering

In [5]:
from ast import literal_eval

In [6]:
def samtools_calculate_coverage(output_dir, include_supp=False):
    """Run ``samtools coverage`` on the merged BAM and write the table to a TSV file.

    This notebook-local definition replaces the function of the same name
    imported from ``utils.alignment`` at the top of the notebook.

    Parameters
    ----------
    output_dir : str or path-like
        Working directory; the input is ``<output_dir>/bam_files/merged.sorted.bam``
        and results are written under ``<output_dir>/coverage_files``.
    include_supp : bool, default False
        If True, run with ``--ff UNMAP,QCFAIL,DUP -q 0`` and write
        ``secondary_coverage.tsv``. If False, run with ``-q 20`` and write
        ``primary_coverage.tsv``.

    Raises
    ------
    subprocess.CalledProcessError
        If ``samtools`` exits with a non-zero status (``check=True``).
    """
    coverage_files = os.path.join(output_dir, "coverage_files")
    bam_files = os.path.join(output_dir, "bam_files")

    # Make the function usable on its own; exist_ok keeps this safe when the
    # two coverage runs are launched concurrently.
    os.makedirs(coverage_files, exist_ok=True)

    command = ["samtools",
               "coverage",
               os.path.join(bam_files, "merged.sorted.bam")]

    if include_supp:
        # samtools coverage --ff UNMAP,QCFAIL,DUP -q 0 merged.sorted.bam > secondary_coverage.tsv
        # NOTE(review): per the samtools manual the default excluded flags also
        # contain SECONDARY, so this override keeps secondary alignments - confirm.
        coverage_file = os.path.join(coverage_files, "secondary_coverage.tsv")
        command += ['--ff', 'UNMAP,QCFAIL,DUP', '-q', str(0)]
    else:
        # samtools coverage merged.sorted.bam -q 20 > primary_coverage.tsv
        coverage_file = os.path.join(coverage_files, "primary_coverage.tsv")
        command += ['-q', str(20)]

    # The context manager guarantees the output handle is flushed and closed
    # even when samtools fails (the original leaked the handle).
    with open(coverage_file, "w") as coverage_out:
        subprocess.run(command,
                       check=True,
                       stdout=coverage_out)

In [7]:
def parse_fastani_line(line):
    """Turn one tab-separated matrix row into a list of floats.

    The leading field of the row is skipped; every remaining field is
    converted with ``float``, except the literal ``'NA'`` which becomes 0.0.
    """
    fields = line.strip().split('\t')
    return [float(0) if value == 'NA' else float(value) for value in fields[1:]]

In [8]:
def find_representative_genome(fastani_path, fastani_assemblies, downloaded_assemblies):
    """Cluster assemblies by pairwise ANI and pick one representative per cluster.

    Reads ``<fastani_path>/pairwise_ani.matrix``: the first line is parsed as
    the number of genomes, and row ``idx`` of the remaining lines is padded on
    the right with ``num_seqs - idx`` entries of 100 to form a square matrix
    (this assumes row ``idx`` carries ``idx`` ANI values, i.e. a lower
    triangle - confirm against the fastANI version in use). ANI is converted
    to a distance ``1 - ANI/100`` and symmetrised.

    Assemblies are grouped by complete-linkage agglomerative clustering with
    a distance threshold of 0.05 (i.e. 95% ANI). Within each cluster the
    representative is:

    * the only 'Complete Genome' assembly, if there is exactly one;
    * otherwise, among the complete genomes (or among all members when there
      is no complete genome), the one with the smallest summed distance to
      the other candidates.

    Parameters
    ----------
    fastani_path : str or path-like
        Directory containing ``pairwise_ani.matrix``.
    fastani_assemblies : sequence of str
        Accession IDs, in the same order as the rows of the matrix.
    downloaded_assemblies : pandas.DataFrame
        Must provide the columns 'Assembly Accession ID' and 'Assembly Level'.

    Returns
    -------
    representative_genomes : defaultdict(list)
        Representative accession -> list of accessions in its cluster.
    member2representative : dict
        Accession -> accession of its cluster representative.
    """
    fastani_result = os.path.join(fastani_path, 'pairwise_ani.matrix')

    with open(fastani_result, 'r') as fastani_out_f:
        lines = fastani_out_f.readlines()
    num_seqs = int(lines[0].strip())

    # Ignore blank trailing lines so they cannot produce a ragged matrix.
    matrix_rows = [line for line in lines[1:] if line.strip()]
    ani_matrix = []
    for idx, line in enumerate(matrix_rows):
        # Pad the diagonal and upper triangle with 100 (distance 0); the
        # transpose-and-add below then yields the symmetric distance matrix.
        ani_matrix.append(parse_fastani_line(line) + [100.0] * (num_seqs - idx))

    dist_nparray = 1 - np.array(ani_matrix) / 100
    dist_df = pd.DataFrame(dist_nparray.T + dist_nparray,
                           columns=fastani_assemblies,
                           index=fastani_assemblies)

    if dist_df.shape[0] < 2:
        # The clusterer needs at least two samples; a lone assembly is
        # trivially its own cluster.
        labels = np.zeros(dist_df.shape[0], dtype=int)
    else:
        clustering_kwargs = dict(n_clusters=None,
                                 compute_full_tree=True,
                                 linkage='complete',
                                 distance_threshold=0.05)
        try:
            # scikit-learn >= 1.2 names this parameter `metric`.
            model = AgglomerativeClustering(metric='precomputed', **clustering_kwargs)
        except TypeError:
            # Older scikit-learn only knows the (since removed) `affinity` name.
            model = AgglomerativeClustering(affinity='precomputed', **clustering_kwargs)
        labels = model.fit(dist_df).labels_

    cluster_labels = pd.Series(labels, index=dist_df.index)

    representative_genomes = defaultdict(list)
    member2representative = dict()
    for cluster_idx in cluster_labels.unique():
        cluster_members = cluster_labels[cluster_labels == cluster_idx].index.to_list()
        selected_df = downloaded_assemblies[downloaded_assemblies['Assembly Accession ID'].isin(cluster_members)]
        complete_genomes = list(
            selected_df.loc[selected_df['Assembly Level'] == 'Complete Genome',
                            'Assembly Accession ID'].values)

        if len(complete_genomes) == 1:
            representative_genome = complete_genomes[0]
        else:
            candidates = complete_genomes if complete_genomes else cluster_members
            # Restrict BOTH axes to the candidates. The original summed over
            # the candidate rows only and took idxmin across every assembly,
            # so the "representative" could be a genome outside the cluster.
            representative_genome = dist_df.loc[candidates, candidates].sum().idxmin()

        representative_genomes[representative_genome] = cluster_members
        for member in cluster_members:
            member2representative[member] = representative_genome

    return representative_genomes, member2representative

In [9]:
def samtools_merged_consensus(output_directory, threads):
    """Call ``samtools consensus`` on the merged BAM and load the result.

    This notebook-local definition replaces the function of the same name
    imported from ``utils.ani`` at the top of the notebook.

    The consensus is written to ``<output_directory>/merged_consensus.fasta``
    and then parsed with ``SeqIO``; the returned dict is whatever
    ``SeqIO.to_dict`` builds from those FASTA records.

    Raises ``subprocess.CalledProcessError`` if samtools exits non-zero.
    """
    merged_bam = os.path.join(output_directory, 'bam_files', 'merged.sorted.bam')
    consensus_fasta = os.path.join(output_directory, 'merged_consensus.fasta')

    consensus_cmd = ['samtools', 'consensus']
    consensus_cmd += ['--show-ins', 'no']
    consensus_cmd += ['--show-del', 'yes']
    consensus_cmd += ['--min-MQ', str(20)]
    consensus_cmd += ['-a']
    consensus_cmd += ['--mode', "simple"]
    consensus_cmd += ['--threads', str(threads)]
    consensus_cmd += [merged_bam, '-o', consensus_fasta]

    subprocess.run(consensus_cmd, check=True)

    records = SeqIO.parse(consensus_fasta, "fasta")
    return SeqIO.to_dict(records)

# Full Run

In [10]:
def run_magnet(cmd_args):
    """Run the full verification pipeline for one sample.

    Steps, in order:

    1. Parse ``cmd_args`` with the module-level ``parser``.
    2. Read the classification report and select the taxids to verify.
    3. Prepare reference genomes and keep the ones marked as downloaded.
    4. Run all-vs-all fastANI, cluster the assemblies and flag one
       representative per cluster.
    5. Align the reads against the merged representative references, sort the
       alignment and compute the two coverage tables.
    6. Summarise alignments, compute consensus ANI, call presence/absence and
       write ``cluster_representative.csv`` (sorted by 'Primary Score',
       descending) into the output directory.

    Parameters
    ----------
    cmd_args : list of str
        Command-line style arguments understood by ``parser``.

    Raises
    ------
    subprocess.CalledProcessError
        If fastANI (or a samtools call made by a helper) exits non-zero.
    """
    args = parser.parse_args(cmd_args)

    input_tsv = args.classification
    input_fastq = args.fastq
    input_fastq2 = args.fastq2
    mode = args.mode
    working_directory = args.output

    taxid_col_idx = args.taxid_idx
    abundance_col_idx = args.abundance_idx
    min_abundance = args.min_abundance
    # NOTE: args.min_mapq is accepted by the parser but not used in this
    # function; sort_samfile below is called with min_mapq=0.
    min_coverage_score = args.min_covscore
    threads = args.threads
    valid_kingdom = {int(taxid) for taxid in args.kingdom.split(',')}

    mag_flag = 'all' if args.include_mag else 'exclude'

    accession_flag = args.accession
    call_subspecies = args.subspecies

    # Files whose name ends in "csv" are comma-separated; everything else is
    # treated as tab-separated.
    sep = '\t'
    if str(input_tsv)[-3:] == 'csv':
        sep = ','

    ncbi_taxa_db = NCBITaxa()
    os.makedirs(working_directory, exist_ok=True)

    if accession_flag:
        # In accession mode the abundance column and threshold are ignored.
        abundance_col_idx = None
        min_abundance = 0
        input_df, min_abundance = parsing_input_f(input_tsv, sep, taxid_col_idx, abundance_col_idx, min_abundance)
        valid_taxids = list(input_df['tax_id'].values)
    else:
        input_df, min_abundance = parsing_input_f(input_tsv, sep, taxid_col_idx, abundance_col_idx, min_abundance)
        valid_taxids = filter_input_df(input_df, min_abundance, ncbi_taxa_db, valid_kingdom=valid_kingdom, ret_subspecies=call_subspecies)

    reference_metadata = prepare_reference_genomes(valid_taxids, working_directory, ncbi_taxa_db, accession_flag=accession_flag, mag_flag=mag_flag)
    # Work on an explicit copy: columns are added below, and assigning onto a
    # filtered view of reference_metadata is unreliable in pandas.
    downloaded_assemblies = reference_metadata[reference_metadata['Downloaded']].copy()

    reference_genome_path = os.path.join(working_directory, 'reference_genomes')

    fastani_path = os.path.join(working_directory, 'fastANI')
    os.makedirs(fastani_path, exist_ok=True)

    fastani_assemblies = downloaded_assemblies['Assembly Accession ID'].values
    assemblie_list_f = os.path.join(fastani_path, "assemblie_list.txt")
    with open(assemblie_list_f, "w") as rl_f:
        for accession in fastani_assemblies:
            reference_genome = os.path.join(reference_genome_path, f'{accession}.fasta')
            rl_f.write(f"{reference_genome}\n")

    # All-vs-all comparison: the same list is used as reference and query.
    # The log handles are closed as soon as fastANI finishes.
    with open(os.path.join(fastani_path, "fastani.log"), "a") as fastani_log, \
         open(os.path.join(fastani_path, "fastani.err"), "a") as fastani_err:
        subprocess.run(['fastANI',
                        '--rl', assemblie_list_f,
                        '--ql', assemblie_list_f,
                        "--matrix",
                        '--threads', str(threads),
                        '-o', os.path.join(fastani_path, 'pairwise_ani')],
                       check=True,
                       stdout=fastani_log,
                       stderr=fastani_err)

    representative_genomes, member2representative = find_representative_genome(fastani_path, fastani_assemblies, downloaded_assemblies)

    representative_labels = []
    cluster_members = []
    for accession in downloaded_assemblies['Assembly Accession ID']:
        is_representative = accession in representative_genomes
        cluster_key = accession if is_representative else member2representative[accession]
        representative_labels.append(is_representative)
        cluster_members.append(','.join(representative_genomes[cluster_key]))
    downloaded_assemblies['Cluster Representative'] = representative_labels
    downloaded_assemblies['Cluster Members'] = cluster_members

    representative_df = downloaded_assemblies[downloaded_assemblies['Cluster Representative']].copy()

    seq2assembly_dict = get_seq2assembly_dict(working_directory, representative_df)
    reference_fasta = merge_reference_fasta(list(representative_df['Assembly Accession ID']), working_directory)

    if mode == 'ont':
        aligner_output = run_minimap2(input_fastq, reference_fasta, 'merged', working_directory, threads=threads)
    elif mode == 'illumina':
        # The original called run_bowtie2, which is never imported and raised
        # NameError; run_bwa is the only other aligner imported from
        # utils.alignment.
        # NOTE(review): assumes run_bwa takes the same arguments - confirm.
        aligner_output = run_bwa(input_fastq, input_fastq2, reference_fasta, 'merged', working_directory, threads=threads)
    sort_samfile('merged', aligner_output, working_directory, min_mapq=0, threads=threads)

    coverage_files = os.path.join(working_directory, "coverage_files")
    os.makedirs(coverage_files, exist_ok=True)

    # Only two coverage jobs exist (with and without the relaxed filter), so
    # never start more than two workers; the context manager tears the pool
    # down even if a job raises.
    with Pool(processes=min(threads, 2)) as pool:
        pool.starmap(samtools_calculate_coverage, zip(repeat(working_directory), [True, False]))

    representative_df = alignment_summary(representative_df,
                                          working_directory,
                                          seq2assembly_dict,
                                          include_supp=True)

    representative_df = alignment_summary(representative_df,
                                          working_directory,
                                          seq2assembly_dict,
                                          include_supp=False)

    consensus_record_dict = samtools_merged_consensus(working_directory, threads)
    representative_df = ani_summary(representative_df, consensus_record_dict, working_directory, threads)
    representative_df = call_present_absent(representative_df, min_coverage_score)

    report_path = os.path.join(working_directory, 'cluster_representative.csv')
    representative_df.sort_values(['Primary Score'], ascending=False).to_csv(report_path, index=False)

# Argparse

In [11]:
# Command-line interface consumed by run_magnet(); options are registered in
# the order they should appear in --help.
parser = argparse.ArgumentParser(
    description="Universal Taxonomic Classification Verifier.",
)

# --- Inputs and output location ---
parser.add_argument(
    "-c", "--classification",
    type=pathlib.Path,
    required=True,
    help="Path to the Taxonomic Classification Report. Accepting csv/tsv file format, other text formats are treated as tsv.",
)
parser.add_argument(
    "-i", "--fastq",
    type=pathlib.Path,
    required=True,
    help="Path to the first fastq file.",
)
parser.add_argument(
    "-I", "--fastq2",
    type=pathlib.Path,
    help="Path to the second fastq file for paired-end reads.",
)
parser.add_argument(
    "-m", "--mode",
    type=str,
    choices=['ont', 'illumina'],
    default='ont',
    help="Modes for different sequencing platforms [ont, illumina]. Default:[ont]",
)
parser.add_argument(
    "-o", "--output",
    type=pathlib.Path,
    required=True,
    help="Path to the output directory.",
)

# --- Layout of the classification report (note: -t is the taxid column, not threads) ---
parser.add_argument(
    "-t", "--taxid-idx",
    type=int,
    default=0,
    help="The column index (0-based) of the taxids. Default:[0]",
)
parser.add_argument(
    "-a", "--abundance-idx",
    type=int,
    help="The column index (0-based) of the abundance. Default:[None]",
)

# --- Thresholds and resources ---
parser.add_argument(
    "--min-abundance",
    type=float,
    default=0,
    help="Minimum abundance (0-1) for pre-filtering, exclude taxa below the threshold.",
)
parser.add_argument(
    "--min-mapq",
    type=int,
    default=20,
    help="Minimum MAPQ for primary alignments. Default:[20]",
)
parser.add_argument(
    "--min-covscore",
    type=float,
    default=0.7,
    help="Minimum Coverage Score for supplementary alignments. Default:[0.7]",
)
parser.add_argument(
    "--threads",
    type=int,
    default=1,
    help="Number of threads for Multi-threading. Default:[1]",
)
parser.add_argument(
    "--kingdom",
    type=str,
    default='2,4751,2157,10239',
    help="A comma separated list of taxids of valid kingdoms. Default:[2,4751,2157,10239]",
)

# --- Boolean switches (store_true already defaults to False) ---
parser.add_argument(
    "--include-mag",
    action='store_true',
    help="Include metagenomic assemble genomes. Default:[off]",
)
parser.add_argument(
    "--subspecies",
    action='store_true',
    help="Verify taxonomic classification at subspecies rank. Default:[off]",
)
parser.add_argument(
    "--accession",
    action='store_true',
    help="Take accession ids as taxids. Does not work with min-abundance. Default:[off]",
)

## MetaMaps-sim

In [16]:
# Build the MAGnet argument list for the single MetaMaps-sim sample.
cmd_records = []

magnet_output = '/home/Users/yl181/memu/magnet_v222_MetaMapsSim'
classification_output = '/home/Users/ns58/Mob-experiments/Lemur-outputs/MetaMaps-sim/nof-i100-relative_abundance.tsv'
input_fastq = '/home/Users/ns58/Mob-experiments/Data/Metamaps-sim/simulations_i100_specifiedFrequencies/0/reads.fastq'
magnet_output_prefix = 'nof-i100'

# Missing inputs are reported but do not stop the cell; the command is still recorded.
if not os.path.exists(classification_output):
    print("classification_output missing:", classification_output)
if not os.path.exists(input_fastq):
    print("input_fastq missing:", input_fastq)

# One idempotent call creates both magnet_output and the per-run
# sub-directory, and is safe to re-run.
os.makedirs(os.path.join(magnet_output, magnet_output_prefix), exist_ok=True)
magnet_sample_output = os.path.join(magnet_output, magnet_output_prefix, magnet_output_prefix)

# -t / -a are the taxid and abundance column indices of the report.
cmd_args = ['-c', classification_output,
            '-i', input_fastq,
            '-m', 'ont',
            '-t', str(4),
            '-a', str(0),
            '--min-abundance', str(0.01),
            '-o', magnet_sample_output,
            '--threads', '40']

print(" ".join(cmd_args))
cmd_records.append(cmd_args)

-c /home/Users/ns58/Mob-experiments/Lemur-outputs/MetaMaps-sim/nof-i100-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data/Metamaps-sim/simulations_i100_specifiedFrequencies/0/reads.fastq -m ont -t 4 -a 0 --min-abundance 0.01 -o /home/Users/yl181/memu/magnet_v222_MetaMapsSim/nof-i100/nof-i100 --threads 40


## Kraken

In [12]:
# Classifier output directories and the read directories they pair with
# (matched by list position).
mob_output_paths = ['/home/Users/ns58/Mob-experiments/Kraken2-outputs/Zymo-Log-0.10']
input_fastq_paths = ['/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-Log-p0.10']

magnet_output = '/home/Users/yl181/memu/magnet_v222_ZYMO_Log'
# Idempotent: creates missing parents and is a no-op if the directory exists.
os.makedirs(magnet_output, exist_ok=True)
    

In [17]:
# Build one MAGnet argument list per Kraken report found in mob_output_paths.
cmd_records = []

for idx, mob_output_path in enumerate(mob_output_paths):
    # os.listdir returns entries in arbitrary order; sort so the recorded
    # commands come out in the same order on every run.
    for f in sorted(os.listdir(mob_output_path)):
        if f.startswith('Zymo-Log-') and 'report' in f:
            # Sample id is everything before the first dot of the file name.
            sample_id = f.split('.')[0]
            input_fastq = os.path.join(input_fastq_paths[idx], f'{sample_id}.fastq')
            if not os.path.exists(input_fastq):
                # Report the path that was actually checked.
                print("input_fastq missing:", input_fastq)
            classification_output = os.path.join(mob_output_path, f)
            if not os.path.exists(classification_output):
                print("classification_output missing:", classification_output)

            magnet_output_prefix = input_fastq_paths[idx].split('/')[-1]+'-kraken'
            os.makedirs(os.path.join(magnet_output, magnet_output_prefix), exist_ok=True)
            magnet_sample_output = os.path.join(magnet_output, magnet_output_prefix, sample_id)

            # -t / -a are the taxid and abundance column indices of the report.
            cmd_args = ['-c', classification_output,
                        '-i', input_fastq,
                        '-m', 'ont',
                        '-t', str(4),
                        '-a', str(0),
                        '--min-abundance', str(0.01),
                        '-o', magnet_sample_output,
                        '--threads', '40']

            print(" ".join(cmd_args))
            cmd_records.append(cmd_args)

-c /home/Users/ns58/Mob-experiments/Kraken2-outputs/Zymo-Log-0.10/Zymo-Log-1.report.txt -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-Log-p0.10/Zymo-Log-1.fastq -m ont -t 4 -a 0 --min-abundance 0.01 -o /home/Users/yl181/memu/magnet_v222_ZYMO_Log/Zymo-Log-p0.10-kraken/Zymo-Log-1 --threads 40
-c /home/Users/ns58/Mob-experiments/Kraken2-outputs/Zymo-Log-0.10/Zymo-Log-5.report.txt -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-Log-p0.10/Zymo-Log-5.fastq -m ont -t 4 -a 0 --min-abundance 0.01 -o /home/Users/yl181/memu/magnet_v222_ZYMO_Log/Zymo-Log-p0.10-kraken/Zymo-Log-5 --threads 40
-c /home/Users/ns58/Mob-experiments/Kraken2-outputs/Zymo-Log-0.10/Zymo-Log-2.report.txt -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-Log-p0.10/Zymo-Log-2.fastq -m ont -t 4 -a 0 --min-abundance 0.01 -o /home/Users/yl181/memu/magnet_v222_ZYMO_Log/Zymo-Log-p0.10-kraken/Zymo-Log-2 --threads 40
-c /home/Users/ns58/Mob-experiments/Kraken2-outputs/Zymo-Log-0.10/Zymo-Log-3.report.txt -i /home/Use

## SRR17687125

In [14]:
# Classifier output directories and the read directories they pair with
# (matched by list position).
mob_output_paths = ['/home/Users/ns58/Mob-experiments/Mob-outputs/SRR17687125']
input_fastq_paths = ['/home/Users/ns58/Mob-experiments/Data/Gut']

magnet_output = '/home/Users/yl181/memu/magnet_v222_SRR17687125'
# Idempotent: creates missing parents and is a no-op if the directory exists.
os.makedirs(magnet_output, exist_ok=True)

In [19]:
# Build one MAGnet argument list per relative-abundance report. Reports
# without 'nof' in their name are processed first, then the 'nof' ones,
# matching the order of the two original loops.
cmd_records = []

for use_nof in (False, True):
    for idx, mob_output_path in enumerate(mob_output_paths):
        # os.listdir returns entries in arbitrary order; sort so the recorded
        # commands come out in the same order on every run.
        for f in sorted(os.listdir(mob_output_path)):
            if 'relative_abundance.tsv' not in f or ('nof' in f) != use_nof:
                continue

            name_parts = f.split('-')
            if use_nof:
                # "nof-<sample>-relative_abundance.tsv": sample is the 2nd
                # field, and output goes under "nof-<sample>".
                sample_id = name_parts[1]
                magnet_output_prefix = '-'.join(name_parts[0:2])
            else:
                # "<sample>-relative_abundance.tsv": output goes under the
                # name of the read directory.
                sample_id = name_parts[0]
                magnet_output_prefix = input_fastq_paths[idx].split('/')[-1]

            input_fastq = os.path.join(input_fastq_paths[idx], f'{sample_id}.fastq')
            if not os.path.exists(input_fastq):
                # Report the path that was actually checked.
                print("input_fastq missing:", input_fastq)
            classification_output = os.path.join(mob_output_path, f)
            if not os.path.exists(classification_output):
                print("classification_output missing:", classification_output)

            os.makedirs(os.path.join(magnet_output, magnet_output_prefix), exist_ok=True)
            magnet_sample_output = os.path.join(magnet_output, magnet_output_prefix, sample_id)

            cmd_args = ['-c', classification_output,
                        '-i', input_fastq,
                        '-m', 'ont',
                        '-a', str(12),
                        '--min-abundance', str(0.000),
                        '-o', magnet_sample_output,
                        '--threads', '40']

            print(" ".join(cmd_args))
            cmd_records.append(cmd_args)

-c /home/Users/ns58/Mob-experiments/Mob-outputs/SRR17687125/SRR17687125-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data/Gut/SRR17687125.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_SRR17687125/Gut/SRR17687125 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/SRR17687125/nof-SRR17687125-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data/Gut/SRR17687125.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_SRR17687125/nof-SRR17687125/SRR17687125 --threads 40


## Zymo TM

In [13]:
# Classifier output directories and the read directories they pair with
# (matched by list position).
mob_output_paths = ['/home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-TM']
input_fastq_paths = ['/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-TM']

magnet_output = '/home/Users/yl181/memu/magnet_v222_ZYMO_TM'
# Idempotent: creates missing parents and is a no-op if the directory exists.
os.makedirs(magnet_output, exist_ok=True)

In [15]:
# Build one MAGnet argument list per relative-abundance report whose name
# does not contain 'nof'.
cmd_records = []

for idx, mob_output_path in enumerate(mob_output_paths):
    # os.listdir returns entries in arbitrary order; sort so the recorded
    # commands come out in the same order on every run.
    for f in sorted(os.listdir(mob_output_path)):
        if 'relative_abundance.tsv' in f and 'nof' not in f:
            # Sample id is the first three dash-separated fields of the file name.
            sample_id = '-'.join(f.split('-')[0:3])
            input_fastq = os.path.join(input_fastq_paths[idx], f'{sample_id}.fastq')
            if not os.path.exists(input_fastq):
                # Report the path that was actually checked.
                print("input_fastq missing:", input_fastq)
            classification_output = os.path.join(mob_output_path, f)
            if not os.path.exists(classification_output):
                print("classification_output missing:", classification_output)

            magnet_output_prefix = input_fastq_paths[idx].split('/')[-1]
            os.makedirs(os.path.join(magnet_output, magnet_output_prefix), exist_ok=True)
            magnet_sample_output = os.path.join(magnet_output, magnet_output_prefix, sample_id)

            cmd_args = ['-c', classification_output,
                        '-i', input_fastq,
                        '-m', 'ont',
                        '-a', str(12),
                        '--min-abundance', str(0.000),
                        '-o', magnet_sample_output,
                        '--threads', '40']

            print(" ".join(cmd_args))
            cmd_records.append(cmd_args)
            

-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-TM/Zymo-TM-3-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-TM/Zymo-TM-3.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_ZYMO_TM/Zymo-TM/Zymo-TM-3 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-TM/Zymo-TM-1-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-TM/Zymo-TM-1.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_ZYMO_TM/Zymo-TM/Zymo-TM-1 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-TM/Zymo-TM-2-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-TM/Zymo-TM-2.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_ZYMO_TM/Zymo-TM/Zymo-TM-2 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-TM/Zymo-TM-5-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-TM/Zymo-TM-5.fastq -m ont -a 12 --min-abu

In [17]:
# Add the 'nof' relative-abundance reports. cmd_records is NOT reset here:
# this cell appends to whatever list already exists in the kernel.
for idx, mob_output_path in enumerate(mob_output_paths):
    # os.listdir returns entries in arbitrary order; sort so the recorded
    # commands come out in the same order on every run.
    for f in sorted(os.listdir(mob_output_path)):
        if 'relative_abundance.tsv' in f and 'nof' in f:
            name_parts = f.split('-')
            # Fields 1..3 of the file name form the sample id; fields 0..2
            # name the output sub-directory.
            sample_id = '-'.join(name_parts[1:4])
            input_fastq = os.path.join(input_fastq_paths[idx], f'{sample_id}.fastq')
            if not os.path.exists(input_fastq):
                # Report the path that was actually checked.
                print("input_fastq missing:", input_fastq)
            classification_output = os.path.join(mob_output_path, f)
            if not os.path.exists(classification_output):
                print("classification_output missing:", classification_output)

            magnet_output_prefix = '-'.join(name_parts[0:3])
            os.makedirs(os.path.join(magnet_output, magnet_output_prefix), exist_ok=True)
            magnet_sample_output = os.path.join(magnet_output, magnet_output_prefix, sample_id)

            cmd_args = ['-c', classification_output,
                        '-i', input_fastq,
                        '-m', 'ont',
                        '-a', str(12),
                        '--min-abundance', str(0.000),
                        '-o', magnet_sample_output,
                        '--threads', '40']

            print(" ".join(cmd_args))
            cmd_records.append(cmd_args)

-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-TM/nof-Zymo-TM-1-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-TM/Zymo-TM-1.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_ZYMO_TM/nof-Zymo-TM/Zymo-TM-1 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-TM/nof-Zymo-TM-2-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-TM/Zymo-TM-2.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_ZYMO_TM/nof-Zymo-TM/Zymo-TM-2 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-TM/nof-Zymo-TM-4-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-TM/Zymo-TM-4.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_ZYMO_TM/nof-Zymo-TM/Zymo-TM-4 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-TM/nof-Zymo-TM-5-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-TM/Zymo-TM-5.

## Simulations

In [12]:
# Classifier output directories and the read directories they pair with
# (matched by list position).
mob_output_paths = ['/home/Users/ns58/Mob-experiments/Mob-outputs/Sim-Even-s20', '/home/Users/ns58/Mob-experiments/Mob-outputs/Sim-Log-s12']
input_fastq_paths = ['/home/Users/ns58/Mob-experiments/Data-grouped/Sim-Even-s20', '/home/Users/ns58/Mob-experiments/Data-grouped/Sim-Log-s12']

magnet_output = '/home/Users/yl181/memu/magnet_v222_simulations'
# Idempotent: creates missing parents and is a no-op if the directory exists.
os.makedirs(magnet_output, exist_ok=True)

In [13]:
# NOTE: this cell re-binds mob_output_paths, input_fastq_paths and
# magnet_output, replacing the values set by the configuration cell directly
# above it. Whichever of the two ran last decides what the command-building
# cell below iterates over.
mob_output_paths = ['/home/Users/ns58/Mob-experiments/Mob-outputs/Sim-MAG-Even-s20']
input_fastq_paths = ['/home/Users/ns58/Mob-experiments/Data-grouped/Sim-MAG-Even-s20']

magnet_output = '/home/Users/yl181/memu/magnet_v222_simulations_mags'
# Idempotent: creates missing parents and is a no-op if the directory exists.
os.makedirs(magnet_output, exist_ok=True)

In [14]:
# Build one MAGnet argument list per relative-abundance report.
cmd_records = []

for idx, mob_output_path in enumerate(mob_output_paths):
    # os.listdir returns entries in arbitrary order; sort so the recorded
    # commands come out in the same order on every run.
    for f in sorted(os.listdir(mob_output_path)):
        if 'relative_abundance.tsv' in f:
            # Sample id is the first three dash-separated fields of the file name.
            sample_id = '-'.join(f.split('-')[0:3])
            input_fastq = os.path.join(input_fastq_paths[idx], f'{sample_id}.fastq')
            if not os.path.exists(input_fastq):
                # Report the path that was actually checked.
                print("input_fastq missing:", input_fastq)
            classification_output = os.path.join(mob_output_path, f)
            if not os.path.exists(classification_output):
                print("classification_output missing:", classification_output)

            magnet_output_prefix = input_fastq_paths[idx].split('/')[-1]
            os.makedirs(os.path.join(magnet_output, magnet_output_prefix), exist_ok=True)
            magnet_sample_output = os.path.join(magnet_output, magnet_output_prefix, sample_id)

            cmd_args = ['-c', classification_output,
                        '-i', input_fastq,
                        '-m', 'ont',
                        '-a', str(12),
                        '--min-abundance', str(0.000),
                        '-o', magnet_sample_output,
                        '--threads', '40']

            print(" ".join(cmd_args))
            cmd_records.append(cmd_args)

-c /home/Users/ns58/Mob-experiments/Mob-outputs/Sim-MAG-Even-s20/Sim-Even-4-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Sim-MAG-Even-s20/Sim-Even-4.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_simulations_mags/Sim-MAG-Even-s20/Sim-Even-4 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Sim-MAG-Even-s20/Sim-Even-3-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Sim-MAG-Even-s20/Sim-Even-3.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_simulations_mags/Sim-MAG-Even-s20/Sim-Even-3 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Sim-MAG-Even-s20/Sim-Even-1-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Sim-MAG-Even-s20/Sim-Even-1.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_simulations_mags/Sim-MAG-Even-s20/Sim-Even-1 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Sim-MAG-Even-s20/Si

## Zymo Log

In [7]:
# Classifier output directories and the read directories they pair with
# (matched by list position).
mob_output_paths = ['/home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Log-0.10', '/home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Log-0.25']
input_fastq_paths = ['/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-Log-p0.10', '/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-Log-p0.25']

magnet_output = '/home/Users/yl181/memu/magnet_v222_ZYMO_Log'
# Idempotent: creates missing parents and is a no-op if the directory exists.
os.makedirs(magnet_output, exist_ok=True)

In [8]:
# Build one MAGnet argument list per Zymo-Log relative-abundance report.
cmd_records = []

for idx, mob_output_path in enumerate(mob_output_paths):
    # os.listdir returns entries in arbitrary order; sort so the recorded
    # commands come out in the same order on every run.
    for f in sorted(os.listdir(mob_output_path)):
        if f.startswith('Zymo-Log-') and 'relative_abundance.tsv' in f:
            # Sample id is the first three dash-separated fields of the file name.
            sample_id = '-'.join(f.split('-')[0:3])
            input_fastq = os.path.join(input_fastq_paths[idx], f'{sample_id}.fastq')
            if not os.path.exists(input_fastq):
                # Report the path that was actually checked.
                print("input_fastq missing:", input_fastq)
            classification_output = os.path.join(mob_output_path, f)
            if not os.path.exists(classification_output):
                print("classification_output missing:", classification_output)

            magnet_output_prefix = input_fastq_paths[idx].split('/')[-1]
            os.makedirs(os.path.join(magnet_output, magnet_output_prefix), exist_ok=True)
            magnet_sample_output = os.path.join(magnet_output, magnet_output_prefix, sample_id)

            cmd_args = ['-c', classification_output,
                        '-i', input_fastq,
                        '-m', 'ont',
                        '-a', str(12),
                        '--min-abundance', str(0.000),
                        '-o', magnet_sample_output,
                        '--threads', '40']

            print(" ".join(cmd_args))
            cmd_records.append(cmd_args)

-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Log-0.10/Zymo-Log-2-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-Log-p0.10/Zymo-Log-2.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_ZYMO_Log/Zymo-Log-p0.10/Zymo-Log-2 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Log-0.10/Zymo-Log-5-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-Log-p0.10/Zymo-Log-5.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_ZYMO_Log/Zymo-Log-p0.10/Zymo-Log-5 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Log-0.10/Zymo-Log-1-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-Log-p0.10/Zymo-Log-1.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_ZYMO_Log/Zymo-Log-p0.10/Zymo-Log-1 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Log-0.10/Zymo-Log-4-relative_abundance.tsv -i /home/Users/n

## Cheese

In [20]:
# Cheese dataset: Mob classification outputs and the matching FASTQ directory.
# The two lists are paired by index (entry i of one belongs to entry i of the other).
mob_output_paths = ['/home/Users/ns58/Mob-experiments/Mob-outputs/Cheese']
input_fastq_paths = ['/home/Users/ns58/Mob-experiments/Data-grouped/Cheese']

magnet_output = '/home/Users/yl181/memu/magnet_v222_cheese'
# exist_ok=True replaces the racy "check, then mkdir" pattern and also creates
# missing parent directories, so the cell is safe to re-run.
os.makedirs(magnet_output, exist_ok=True)

In [21]:
# Build one MAGnet argument list per Cheese sample found in the Mob output folders.
cmd_records = []

for idx, mob_output_path in enumerate(mob_output_paths):
    for f in os.listdir(mob_output_path):
        # Only the per-sample relative-abundance tables are of interest.
        if not (f.startswith('Cheese-') and 'relative_abundance.tsv' in f):
            continue

        # First two '-'-separated fields, e.g. "Cheese-4-relative_abundance.tsv" -> "Cheese-4".
        sample_id = '-'.join(f.split('-')[:2])

        input_fastq = os.path.join(input_fastq_paths[idx], f'{sample_id}.fastq')
        if not os.path.exists(input_fastq):
            print("input_fastq missing:", mob_output_path, sample_id)

        classification_output = os.path.join(mob_output_path, f)
        if not os.path.exists(classification_output):
            print("classification_output missing:", input_fastq_paths[idx], sample_id)

        # Results are grouped under a folder named after the FASTQ directory.
        magnet_output_prefix = input_fastq_paths[idx].rsplit('/', 1)[-1]
        prefix_dir = os.path.join(magnet_output, magnet_output_prefix)
        if not os.path.exists(prefix_dir):
            os.mkdir(prefix_dir)
        magnet_sample_output = os.path.join(prefix_dir, sample_id)

        cmd_args = [
            '-c', classification_output,
            '-i', input_fastq,
            '-m', 'ont',
            '-a', '12',
            '--min-abundance', '0.0',
            '-o', magnet_sample_output,
            '--threads', '40',
        ]

        print(" ".join(cmd_args))
        cmd_records.append(cmd_args)

-c /home/Users/ns58/Mob-experiments/Mob-outputs/Cheese/Cheese-4-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Cheese/Cheese-4.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_cheese/Cheese/Cheese-4 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Cheese/Cheese-2-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Cheese/Cheese-2.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_cheese/Cheese/Cheese-2 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Cheese/Cheese-1-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Cheese/Cheese-1.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_cheese/Cheese/Cheese-1 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Cheese/Cheese-8-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Cheese/Cheese-8.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users

## iterate ZYMO

In [12]:
# Mob classification output folders for the Zymo-Even runs.
# The command-building loop pairs entry i with input_fastq_paths[i], so the two
# lists must stay in the same order.
# NOTE(review): a later cell rebinds mob_output_paths to the 0.75 folder only.
mob_output_paths = ['/home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Even',
                    '/home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Even-0.25',
                    '/home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Even-0.50',
                    '/home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Even-0.75']

In [13]:
# FASTQ folders for the Zymo-Even runs, index-aligned with mob_output_paths.
# NOTE(review): a later cell rebinds input_fastq_paths to the p0.75 folder only.
input_fastq_paths =['/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-EVEN-p0.01',
                    '/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-EVEN-p0.25',
                    '/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-EVEN-p0.50',
                    '/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-EVEN-p0.75']

In [14]:
# Restrict the run to the 0.75 folder; this replaces any longer list bound to
# mob_output_paths earlier. Must stay index-aligned with input_fastq_paths.
mob_output_paths = ['/home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Even-0.75']

In [15]:
# Restrict the run to the p0.75 FASTQ folder; this replaces any longer list bound
# to input_fastq_paths earlier. Must stay index-aligned with mob_output_paths.
input_fastq_paths =['/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-EVEN-p0.75']

In [16]:
# Root folder for the MAGnet results of the Zymo-Even runs.
magnet_output = '/home/Users/yl181/memu/magnet_v222_zymo_even_full'
# exist_ok=True replaces the racy "check, then mkdir" pattern and also creates
# missing parent directories, so the cell is safe to re-run.
os.makedirs(magnet_output, exist_ok=True)

In [17]:
# Build one MAGnet argument list per Zymo-Even sample found in the Mob output folders.
cmd_records = []

for idx, mob_output_path in enumerate(mob_output_paths):
    for f in os.listdir(mob_output_path):
        # Only the per-sample relative-abundance tables are of interest.
        if not (f.startswith('Zymo-Even-') and 'relative_abundance.tsv' in f):
            continue

        # First three '-'-separated fields, e.g. "Zymo-Even-3-relative_abundance.tsv" -> "Zymo-Even-3".
        sample_id = '-'.join(f.split('-')[:3])

        input_fastq = os.path.join(input_fastq_paths[idx], f'{sample_id}.fastq')
        if not os.path.exists(input_fastq):
            print("input_fastq missing:", mob_output_path, sample_id)

        classification_output = os.path.join(mob_output_path, f)
        if not os.path.exists(classification_output):
            print("classification_output missing:", input_fastq_paths[idx], sample_id)

        # Results are grouped under a folder named after the FASTQ directory.
        magnet_output_prefix = input_fastq_paths[idx].rsplit('/', 1)[-1]
        prefix_dir = os.path.join(magnet_output, magnet_output_prefix)
        if not os.path.exists(prefix_dir):
            os.mkdir(prefix_dir)
        magnet_sample_output = os.path.join(prefix_dir, sample_id)

        cmd_args = [
            '-c', classification_output,
            '-i', input_fastq,
            '-m', 'ont',
            '-a', '12',
            '--min-abundance', '0.0',
            '-o', magnet_sample_output,
            '--threads', '40',
        ]

        print(" ".join(cmd_args))
        cmd_records.append(cmd_args)

-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Even-0.75/Zymo-Even-3-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-EVEN-p0.75/Zymo-Even-3.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_zymo_even_full/Zymo-EVEN-p0.75/Zymo-Even-3 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Even-0.75/Zymo-Even-1-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-EVEN-p0.75/Zymo-Even-1.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_zymo_even_full/Zymo-EVEN-p0.75/Zymo-Even-1 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Even-0.75/Zymo-Even-2-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Zymo-EVEN-p0.75/Zymo-Even-2.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/magnet_v222_zymo_even_full/Zymo-EVEN-p0.75/Zymo-Even-2 --threads 40
-c /home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-Even-0.75/Zymo-Even-5-r

In [32]:
# Run MAGnet once per queued argument list (run_magnet is defined outside this section).
# NOTE(review): the loop variable `cmd_args` stays bound after the loop ends or is
# interrupted, and the "construct parameters" cell passes `cmd_args` to
# parser.parse_args — do not rename it without updating that cell.
for cmd_args in cmd_records:
    run_magnet(cmd_args)

min_abundance: 0.0
num_species: 82
2923522    	 GCF_031142765.1 	 Enterococcus sp. IsoGale005    	 IsoGale0.. 	 Contig
2555396    	 GCF_013350625.1 	 Salmonella sp. SG220           	 SG220      	 Contig
1715212    	 GCF_001837115.1 	 Staphylococcus sp. HMSC063H1.. 	 HMSC063H.. 	 Scaffold
562        	 GCF_000008865.2 	 Escherichia coli O157:H7 str.. 	 Sakai su.. 	 Complete Genome
1739314    	 GCF_001811595.1 	 Enterococcus sp. HMSC078F03    	 HMSC078F.. 	 Scaffold


Process ForkPoolWorker-61:
Process ForkPoolWorker-63:
Process ForkPoolWorker-58:
Process ForkPoolWorker-46:
Process ForkPoolWorker-43:
Exception ignored in: <Finalize object, dead>
Traceback (most recent call last):
  File "/home/Users/yl181/miniconda3/envs/magnet/lib/python3.9/multiprocessing/util.py", line 224, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/Users/yl181/miniconda3/envs/magnet/lib/python3.9/multiprocessing/pool.py", line 692, in _terminate_pool
    cls._help_stuff_finish(inqueue, task_handler, len(pool))
  File "/home/Users/yl181/miniconda3/envs/magnet/lib/python3.9/multiprocessing/pool.py", line 672, in _help_stuff_finish
    inqueue._rlock.acquire()
KeyboardInterrupt: 
Process ForkPoolWorker-77:
Process ForkPoolWorker-74:
Process ForkPoolWorker-80:
Process ForkPoolWorker-70:
Process ForkPoolWorker-75:
Process ForkPoolWorker-50:
Process ForkPoolWorker-71:
Process ForkPoolWorker-79:
Process ForkPoolWorker-69:
Process ForkPoolWorker-72:


KeyboardInterrupt: 

  File "/home/Users/yl181/miniconda3/envs/magnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/Users/yl181/miniconda3/envs/magnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/Users/yl181/miniconda3/envs/magnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/Users/yl181/miniconda3/envs/magnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/Users/yl181/miniconda3/envs/magnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/Users/yl181/miniconda3/envs/magnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Trac

## select zymo sample

In [101]:
# Single-sample Zymo-EVEN setup: Mob outputs, FASTQ folder and MAGnet result folder.
mob_output_path = '/home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-EVEN-subsample'
input_fastq_path = '/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-EVEN-p0.01'
magnet_output = '/home/Users/yl181/memu/dev_magnet_v222_zymo_even_strains'

# exist_ok=True replaces the racy "check, then mkdir" pattern and also creates
# missing parent directories, so the cell is safe to re-run.
os.makedirs(magnet_output, exist_ok=True)

In [102]:
# Walk the Mob outputs and stop at sample 'Zymo-EVEN-004', leaving sample_id,
# input_fastq, classification_output and strain_mob_output bound to that sample.
# NOTE(review): if that sample is never listed, the variables silently keep the
# values of the last matching file; `records` is created but never filled here.
records = []
for f in os.listdir(mob_output_path):
    if f.startswith('Zymo-EVEN-') and 'relative_abundance.tsv' in f:
        # First 13 characters: the 'Zymo-EVEN-' prefix plus the next three characters.
        sample_id = f[:13]
        input_fastq = os.path.join(input_fastq_path, f'{sample_id}.fastq')
        # A bare sample id is printed when an expected file is missing.
        if not os.path.exists(input_fastq):
            print(sample_id)
        classification_output = os.path.join(mob_output_path, f)
        if not os.path.exists(classification_output):
            print(sample_id)
            
        strain_mob_output = os.path.join(mob_output_path, f'{sample_id}_P_rgs_df.tsv')
        
        if sample_id == 'Zymo-EVEN-004':
            break

## select zymo gut sample

In [103]:
# Zymo-GUT setup: Mob outputs, FASTQ folder and MAGnet result folder.
mob_output_path = '/home/Users/ns58/Mob-experiments/Mob-outputs/Zymo-GUT'
input_fastq_path = '/home/Users/ns58/Mob-experiments/Data-grouped/Zymo-GUT'
magnet_output = '/home/Users/yl181/memu/dev_magnet_v222_zymo_gut'

# exist_ok=True replaces the racy "check, then mkdir" pattern and also creates
# missing parent directories, so the cell is safe to re-run.
os.makedirs(magnet_output, exist_ok=True)

In [104]:
# Walk the Mob outputs and stop at sample '6', leaving sample_id, input_fastq,
# classification_output and strain_mob_output bound to that sample.
# NOTE(review): if that sample is never listed, the variables silently keep the
# values of the last matching file; `records` is created but never filled here.
records = []
for f in os.listdir(mob_output_path):
    if f.startswith('nof-Zymo-gut-') and 'relative_abundance.tsv' in f:
        # Single character right after the 13-character 'nof-Zymo-gut-' prefix,
        # so only one-character sample numbers are supported.
        sample_id = f[13]
        input_fastq = os.path.join(input_fastq_path, f'Zymo-GUT-00{sample_id}.fastq')
        # A bare sample id is printed when an expected file is missing.
        if not os.path.exists(input_fastq):
            print(sample_id)
        classification_output = os.path.join(mob_output_path, f)
        if not os.path.exists(classification_output):
            print(sample_id)
            
        # NOTE(review): this path uses the bare one-character id (e.g. '6_P_rgs_df.tsv'),
        # unlike the FASTQ name above — confirm the strain file is really named this way.
        strain_mob_output = os.path.join(mob_output_path, f'{sample_id}_P_rgs_df.tsv')
        
        if sample_id == '6':
            print(sample_id)
            break

## Soil

In [105]:
# Soil setup: Mob and Melon classification outputs, FASTQ folder and MAGnet result folder.
mob_output_path = '/home/Users/ns58/Mob-experiments/Mob-outputs/Soil'
melon_output_path = '/home/Users/ns58/Mob-experiments/Melon-outputs/Soil'
input_fastq_path = '/home/Users/ns58/Mob-experiments/Data-grouped/Soil'
magnet_output = '/home/Users/yl181/memu/dev_magnet_v222_soil'

# exist_ok=True replaces the racy "check, then mkdir" pattern and also creates
# missing parent directories, so the cell is safe to re-run.
os.makedirs(magnet_output, exist_ok=True)

### Mob

In [106]:
# Walk the Mob outputs and stop at sample 'Soil-5', leaving sample_id, input_fastq,
# classification_output and strain_mob_output bound to that sample; the cells that
# build cmd_args below read those variables.
# NOTE(review): if 'Soil-5' is never listed, the variables silently keep the values
# of the last matching file.
for f in os.listdir(mob_output_path):
    if 'relative_abundance.tsv' in f:
        # First two '-'-separated fields of the file name, e.g. 'Soil-5'.
        sample_id = "-".join(f.split('-')[0:2])
        input_fastq = os.path.join(input_fastq_path, f'{sample_id}.fastq')
        if not os.path.exists(input_fastq):
            print("input_fastq missing:", sample_id)
        classification_output = os.path.join(mob_output_path, f)
        if not os.path.exists(classification_output):
            print('classification_output missing:', sample_id)
            
        strain_mob_output = os.path.join(mob_output_path, f'{sample_id}_P_rgs_df.tsv')
        
        if sample_id == 'Soil-5':
            print(sample_id)
            break

Soil-5


In [107]:
# MAGnet arguments for the selected sample, driven by the Mob classification table.
mob_sample_output = os.path.join(magnet_output, f"{sample_id}_mob")
cmd_args = [
    '-c', classification_output,
    '-i', input_fastq,
    '-m', 'ont',
    '-a', '12',
    '--min-abundance', '0.0',
    '-o', mob_sample_output,
    '--threads', '40',
]
" ".join(cmd_args)

'-c /home/Users/ns58/Mob-experiments/Mob-outputs/Soil/Soil-5-relative_abundance.tsv -i /home/Users/ns58/Mob-experiments/Data-grouped/Soil/Soil-5.fastq -m ont -a 12 --min-abundance 0.0 -o /home/Users/yl181/memu/dev_magnet_v222_soil/Soil-5_mob --threads 40'

### Melon

In [69]:
# Melon result table for the currently selected sample: <melon_output_path>/<sample_id>/<sample_id>.tsv
# NOTE(review): relies on `sample_id` left bound by the sample-selection loop.
melon_output_f = os.path.join(melon_output_path, sample_id, f'{sample_id}.tsv')

In [70]:
# Keep only the leading '|'-separated field of every entry in the 'species' column.
melon_species = pd.read_csv(melon_output_f, sep='\t')['species'].values
melon_taxids = [entry.split('|')[0] for entry in melon_species]

In [71]:
# Write the Melon identifiers, one per line, next to the MAGnet outputs.
melon_taxid_path = os.path.join(magnet_output, f'{sample_id}_melon_taxids.txt')
with open(melon_taxid_path, 'w') as melon_taxid_f:
    melon_taxid_f.writelines(f"{item}\n" for item in melon_taxids)

In [72]:
# MAGnet arguments for the selected sample, driven by the Melon identifier list.
melon_taxid_list = os.path.join(magnet_output, f'{sample_id}_melon_taxids.txt')
melon_sample_output = os.path.join(magnet_output, f'{sample_id}_melon')
cmd_args = [
    '-c', melon_taxid_list,
    '-i', input_fastq,
    '-m', 'ont',
    '-o', melon_sample_output,
    '--threads', '40',
]
" ".join(cmd_args)

'-c /home/Users/yl181/memu/dev_magnet_v222_soil/Soil-5_melon_taxids.txt -i /home/Users/ns58/Mob-experiments/Data-grouped/Soil/Soil-5.fastq -m ont -o /home/Users/yl181/memu/dev_magnet_v222_soil/Soil-5_melon --threads 40'

## Strain magnet

In [12]:
# top_k = 5

In [13]:
# valid_species_df = pd.read_csv(classification_output, sep='\t')
# valid_species_df = valid_species_df[valid_species_df['F'] > 0.00]
# valid_species_taxids = set(valid_species_df['Target_ID'].values)

# strain_df = pd.read_csv(strain_mob_output, sep='\t')
# filtered_strain_df = strain_df[strain_df['Target_ID'].isin(valid_species_taxids)]

# taxid2accession_record = []
# for idx, row in filtered_strain_df.iterrows():
#     genome_list = literal_eval(row['Genome'])
#     for genome in genome_list:
#         accession = genome.split('|')[0]
#         taxid2accession_record.append({'taxid': row['Target_ID'],
#                                        'accession': accession})
# taxid2accession_df = pd.DataFrame(taxid2accession_record)
# taxid2accession_df = taxid2accession_df.groupby(['taxid', 'accession']).size().reset_index(name='counts')

# selected_genomes = []
# for taxid in valid_species_taxids:
#     selected_df = taxid2accession_df[taxid2accession_df['taxid'] == taxid].sort_values(['counts'], ascending=False)
#     # selected_genomes += list(selected_df.head(top_k)['accession'].values)
#     min_count = selected_df.iloc[0]['counts']
#     if len(selected_df[selected_df['counts'] >= min_count]) >= top_k:
#         candidate_selected_genomes = list(selected_df[selected_df['counts'] >= min_count]['accession'].values)
#         # if len(candidate_selected_genomes) > top_k:
#         #     print(taxid, candidate_selected_genomes, set(candidate_selected_genomes).intersection(ground_truth))
#     else:
#         candidate_selected_genomes = list(selected_df.head(top_k)['accession'].values)

#     for genome in candidate_selected_genomes:
#         selected_genomes.append({
#             'assembly': genome,
#             'taxid': taxid
#         })
        
# len(selected_genomes)

In [14]:
# pd.DataFrame(selected_genomes).set_index('assembly').to_csv(f'{sample_id}.top{top_k}-strains.csv')

In [15]:
# cmd_args = ['-c', f'{sample_id}.top{top_k}-strains.csv',
#             '-i', input_fastq,
#             '-m', 'ont',
#             '-o', os.path.join(magnet_output, sample_id),
#             '--threads', '40',
#             '--accession']

## construct parameters

In [30]:
# Parse the MAGnet argument list into the plain variables the pipeline cells read.
# `parser` and `cmd_args` must already be bound by earlier cells.
args = parser.parse_args(cmd_args)

# Inputs and output location.
input_tsv, input_fastq, input_fastq2 = args.classification, args.fastq, args.fastq2
mode = args.mode
working_directory = args.output

# Column indices and thresholds.
taxid_col_idx, abundance_col_idx = args.taxid_idx, args.abundance_idx
min_abundance = args.min_abundance
min_mapq = args.min_mapq
min_coverage_score = args.min_covscore
threads = args.threads

# Comma-separated integers -> set of ints.
valid_kingdom_str = args.kingdom
valid_kingdom = {int(kingdom_id) for kingdom_id in valid_kingdom_str.split(',')}

mag_flag = 'all' if args.include_mag else 'exclude'

accession_flag = args.accession
call_subspecies = args.subspecies

# Comma-separated when the input path ends in "csv", tab-separated otherwise.
sep = ',' if str(input_tsv).endswith('csv') else '\t'

ncbi_taxa_db = NCBITaxa()
if not os.path.exists(working_directory):
    os.mkdir(working_directory)

# Refseq Downloader

In [33]:
# In accession mode the abundance column is ignored and no abundance cut-off applies.
if accession_flag:
    abundance_col_idx, min_abundance = None, 0

input_df, min_abundance = parsing_input_f(input_tsv, sep, taxid_col_idx, abundance_col_idx, min_abundance)

if accession_flag:
    # Every listed entry is used as-is.
    valid_taxids = list(input_df['tax_id'].values)
else:
    # make valid_kingdom a variable?
    valid_taxids = filter_input_df(input_df,
                                   min_abundance,
                                   ncbi_taxa_db,
                                   valid_kingdom=valid_kingdom,
                                   ret_subspecies=call_subspecies)

min_abundance: 0.0
num_species: 82


## TODO: Missing genomes need to be reported to the user.

In [34]:
# Prepare reference genomes for valid_taxids under working_directory (helper from
# utils.reference_finder). NOTE(review): the result is used below as a table with at
# least 'Downloaded' and 'Assembly Accession ID' columns — schema not visible here.
reference_metadata = prepare_reference_genomes(valid_taxids, working_directory, ncbi_taxa_db, accession_flag=accession_flag, mag_flag=mag_flag)

2923522    	 GCF_031142765.1 	 Enterococcus sp. IsoGale005    	 IsoGale0.. 	 Contig
2555396    	 GCF_013350625.1 	 Salmonella sp. SG220           	 SG220      	 Contig
1715212    	 GCF_001837115.1 	 Staphylococcus sp. HMSC063H1.. 	 HMSC063H.. 	 Scaffold
562        	 GCF_000008865.2 	 Escherichia coli O157:H7 str.. 	 Sakai su.. 	 Complete Genome
1739314    	 GCF_001811595.1 	 Enterococcus sp. HMSC078F03    	 HMSC078F.. 	 Scaffold
2315836    	 GCF_008082385.1 	 Enterococcus sp. N041.A-2      	 N041.A-2   	 Contig
2502216    	 GCF_005503545.1 	 Pseudomonas sp. 3PA37B6        	 3PA37B6    	 Contig
1613       	 GCF_029961225.1 	 Limosilactobacillus fermentu.. 	 EFEL6800   	 Complete Genome
2665553    	 GCF_014492625.1 	 Salmonella sp. S048_01045      	 S048_010.. 	 Contig
5207       	 GCA_020975435.1 	 Cryptococcus neoformans        	 C36        	 Complete Genome
2582616    	 GCF_009902665.1 	 Salmonella sp. zj-f50          	 zj-f50     	 Scaffold
2665564    	 GCF_014492365.1 	 Salmonella s

In [35]:
# Keep only the rows whose 'Downloaded' flag is truthy.
# NOTE(review): columns are later assigned onto this filtered frame in place; taking an
# explicit .copy() here would avoid pandas' chained-assignment warning — confirm and apply.
downloaded_assemblies = reference_metadata[reference_metadata['Downloaded']]

# FastANI and Clustering

## Run FastANI, Clustering, and Choose Representative Genomes

In [36]:
# Folder holding the downloaded reference FASTA files.
reference_genome_path = os.path.join(working_directory, 'reference_genomes')

# Folder for FastANI inputs, outputs and logs.
fastani_path = os.path.join(working_directory, 'fastANI')
# exist_ok=True replaces the racy "check, then mkdir" pattern, so the cell is safe to re-run.
os.makedirs(fastani_path, exist_ok=True)

## Within-genus ANI Clustering (deprecated)

In [37]:
# genus_taxids = []
# for idx, row in downloaded_assemblies.iterrows():
#     species_taxid = row['Taxonomy ID']
#     genus_taxid = get_genus_taxid(species_taxid, ncbi_taxa_db, valid_kingdom)
#     genus_taxids.append(genus_taxid)
# downloaded_assemblies['Genus Taxid'] = genus_taxids

In [38]:
# def get_genus_taxid(taxid, ncbi_taxa_db, valid_kingdom):    
#     try:
#         lineage = ncbi_taxa_db.get_lineage(taxid)
#     except ValueError:
#         return 0 
    
#     if bool(set(lineage) & valid_kingdom):
#         taxid2rank_dict = ncbi_taxa_db.get_rank(lineage)
#         for lineage_taxid in lineage:
#             if taxid2rank_dict[lineage_taxid] == 'genus':
#                 return lineage_taxid
            
#     return 0

In [39]:
# genus_count_df = pd.DataFrame(downloaded_assemblies.groupby(['Genus Taxid'])['Taxonomy ID'].count())

# multi_species_genera = genus_count_df[genus_count_df['Taxonomy ID'] > 1].index.to_list()
# single_species_genera = genus_count_df[genus_count_df['Taxonomy ID'] == 1].index.to_list()+[0]

# genus2represetitive_mapping = dict()

# for genus_taxid in multi_species_genera:
#     if genus_taxid != 0:
#         fastani_assemblies = downloaded_assemblies[downloaded_assemblies['Genus Taxid'] == genus_taxid]['Assembly Accession ID'].values
#         genus_assemblie_list_f = os.path.join(fastani_path, f"genus_{genus_taxid}.txt")
#         with open(genus_assemblie_list_f, "w") as rl_f:
#             for accession in fastani_assemblies:
#                 reference_genome = os.path.join(reference_genome_path, f'{accession}.fasta')
#                 rl_f.write(f"{reference_genome}\n")

#         subprocess.run(['fastANI',
#                         '--rl', genus_assemblie_list_f,
#                         '--ql', genus_assemblie_list_f,
#                         "--matrix",
#                         '-o', os.path.join(fastani_path, f'genus_{genus_taxid}')],
#                        check=True,
#                        stdout=open(os.path.join(fastani_path, "fastani.log"), "a"),
#                        stderr=open(os.path.join(fastani_path, "fastani.err"), "a"))
        
#         representative_genomes = find_representative_within_genus(genus_taxid, fastani_path, downloaded_assemblies)
#         genus2represetitive_mapping[genus_taxid] = representative_genomes

# representative_labels = []
# for idx, row in downloaded_assemblies.iterrows():
#     genus_idx = row['Genus Taxid']
#     if genus_idx in single_species_genera:
#         representative_labels.append(True)
#     elif row['Assembly Accession ID'] in genus2represetitive_mapping[genus_idx]:
#         representative_labels.append(True)
#     else:
#         representative_labels.append(False)
        
# downloaded_assemblies['Cluster Representative'] = representative_labels

## Cross Genus ANI Clustering

In [40]:
# All-vs-all FastANI over every downloaded assembly: the same list file is passed as
# both the reference list (--rl) and the query list (--ql).
fastani_assemblies = downloaded_assemblies['Assembly Accession ID'].values
assemblie_list_f = os.path.join(fastani_path, "assemblie_list.txt")
with open(assemblie_list_f, "w") as rl_f:
    for accession in fastani_assemblies:
        reference_genome = os.path.join(reference_genome_path, f'{accession}.fasta')
        rl_f.write(f"{reference_genome}\n")

# Open the log files in a context manager so the handles are closed even when
# fastANI fails (previously they were opened inline and never closed).
with open(os.path.join(fastani_path, "fastani.log"), "a") as fastani_log, \
        open(os.path.join(fastani_path, "fastani.err"), "a") as fastani_err:
    fastani_result = subprocess.run(['fastANI',
                                     '--rl', assemblie_list_f,
                                     '--ql', assemblie_list_f,
                                     "--matrix",
                                     '--threads', str(threads),
                                     '-o', os.path.join(fastani_path, 'pairwise_ani')],
                                    check=True,
                                    stdout=fastani_log,
                                    stderr=fastani_err)

# Keep the CompletedProcess as the cell's displayed value.
fastani_result

CompletedProcess(args=['fastANI', '--rl', '/home/Users/yl181/memu/magnet_v222_zymo_even_full/Zymo-EVEN-p0.75/Zymo-Even-3/fastANI/assemblie_list.txt', '--ql', '/home/Users/yl181/memu/magnet_v222_zymo_even_full/Zymo-EVEN-p0.75/Zymo-Even-3/fastANI/assemblie_list.txt', '--matrix', '--threads', '40', '-o', '/home/Users/yl181/memu/magnet_v222_zymo_even_full/Zymo-EVEN-p0.75/Zymo-Even-3/fastANI/pairwise_ani'], returncode=0)

In [42]:
# Cluster the assemblies from the FastANI results and pick one representative per cluster.
# Below, representative_genomes is indexed by representative accession to get that cluster's
# member list, and member2representative is indexed by a non-representative accession.
# NOTE(review): find_representative_genome is neither imported nor defined in the visible
# part of the notebook — presumably defined in an earlier cell; confirm for Run All.
representative_genomes, member2representative = find_representative_genome(fastani_path, fastani_assemblies, downloaded_assemblies)

In [43]:
# Inspect the representative -> cluster members mapping.
representative_genomes

defaultdict(list,
            {'GCF_001598635.1': ['GCF_031142765.1',
              'GCF_001811595.1',
              'GCF_008082385.1',
              'GCF_001813275.1',
              'GCF_018920115.1',
              'GCF_018920065.1',
              'GCF_018919985.1',
              'GCF_018919865.1',
              'GCF_018919805.1',
              'GCF_018919785.1',
              'GCF_018919685.1',
              'GCF_030499395.1',
              'GCF_030499465.1',
              'GCF_001598635.1',
              'GCF_001056095.1',
              'GCF_001053935.1',
              'GCF_031144525.1',
              'GCF_031144375.1',
              'GCF_023539285.1',
              'GCF_031144365.1',
              'GCF_031144325.1',
              'GCF_031144215.1',
              'GCF_031144155.1',
              'GCF_031144105.1',
              'GCF_031144145.1',
              'GCF_031143985.1',
              'GCF_031143855.1',
              'GCF_031143745.1',
              'GCF_031143485.1',
      

In [44]:
# Tag every downloaded assembly with (a) whether it is its cluster's representative and
# (b) the comma-joined member list of the cluster it belongs to.
representative_labels = []
cluster_members = []
for accession in downloaded_assemblies['Assembly Accession ID']:
    is_representative = accession in representative_genomes
    # Non-representatives are looked up through their representative.
    cluster_key = accession if is_representative else member2representative[accession]
    representative_labels.append(is_representative)
    cluster_members.append(','.join(representative_genomes[cluster_key]))

downloaded_assemblies['Cluster Representative'] = representative_labels
downloaded_assemblies['Cluster Members'] = cluster_members

# Only the representatives are carried forward into alignment.
representative_df = downloaded_assemblies[downloaded_assemblies['Cluster Representative']]

seq2assembly_dict = get_seq2assembly_dict(working_directory, representative_df)
representative_accessions = list(representative_df['Assembly Accession ID'])
reference_fasta = merge_reference_fasta(representative_accessions, working_directory)

In [46]:
def sort_samfile(assembly_id, aligner_output, output_dir, min_mapq, threads=20,
                 samtools_bin="/home/Users/yl181/tools/samtools-1.19.2/samtools"):
    '''Convert an aligner's SAM stream to a sorted, indexed BAM file.

    The stream is piped through ``samtools view`` (MAPQ filter, BAM output) into
    ``samtools sort``; the result is written to
    ``<output_dir>/bam_files/<assembly_id>.sorted.bam`` and then indexed.

    Args:
        assembly_id: name used for the output BAM file.
        aligner_output: running aligner process whose ``stdout`` pipe carries SAM.
        output_dir: directory in which ``bam_files`` is created if needed.
        min_mapq: value passed to ``samtools view --min-MQ``.
        threads: value passed to ``-@`` for ``view`` and ``sort``.
        samtools_bin: samtools executable to run; defaults to the previously
            hard-coded 1.19.2 build so existing calls behave the same.

    Raises:
        subprocess.CalledProcessError: if ``view``, ``sort`` or ``index`` exits non-zero.
    '''
    bam_files = os.path.join(output_dir, "bam_files")
    os.makedirs(bam_files, exist_ok=True)
    sorted_bam = os.path.join(bam_files, f"{assembly_id}.sorted.bam")

    # convert the SAM stream to binary BAM
    samtools_view_res = subprocess.Popen([
        samtools_bin,
        "view",
        "-@", str(threads),
        "--min-MQ", str(min_mapq),
        "-bS"],
        stdin=aligner_output.stdout,
        stdout=subprocess.PIPE)

    # The child now owns the read end; closing our copy lets the aligner receive
    # SIGPIPE if `samtools view` exits early instead of blocking on a full pipe.
    if aligner_output.stdout is not None:
        aligner_output.stdout.close()

    try:
        # sort the BAM stream
        subprocess.run([
            samtools_bin,
            "sort",
            "-@", str(threads),
            "-o", sorted_bam,
            "-O", "BAM"],
            stdin=samtools_view_res.stdout,
            stderr=subprocess.DEVNULL,
            check=True)
    finally:
        # Always release the pipe and reap `samtools view` (previously never waited on).
        samtools_view_res.stdout.close()
        view_returncode = samtools_view_res.wait()

    # A failed `view` can still yield a truncated but sortable stream, so check it explicitly.
    if view_returncode != 0:
        raise subprocess.CalledProcessError(view_returncode, samtools_view_res.args)

    # index the sorted BAM file
    subprocess.run([samtools_bin, "index", sorted_bam], check=True)

In [47]:
# Align the reads against the merged representative FASTA with the aligner matching `mode`.
if mode == 'ont':
    aligner_output = run_minimap2(input_fastq, reference_fasta, 'merged', working_directory, threads=threads)
elif mode == 'illumina':
    # NOTE(review): run_bowtie2 is not among the names imported from utils.alignment at the
    # top of the notebook (run_minimap2, run_bwa, ...) — confirm it is defined before use.
    aligner_output = run_bowtie2(input_fastq, input_fastq2, reference_fasta, 'merged', working_directory, threads=threads)
else:
    # Fail loudly instead of hitting a NameError or reusing a stale aligner_output
    # left over from an earlier run of this cell.
    raise ValueError(f"Unsupported mode {mode!r}; expected 'ont' or 'illumina'")

# min_mapq=0 keeps every alignment in the merged BAM; the MAPQ cut-offs are applied later
# by samtools_calculate_coverage (-q 0 / -q 20).
sort_samfile('merged', aligner_output, working_directory, min_mapq=0, threads=threads)

In [48]:
coverage_files = os.path.join(working_directory, "coverage_files")
os.makedirs(coverage_files, exist_ok=True)

# One coverage run with supplementary/secondary alignments (True) and one without (False).
include_supp_modes = [True, False]

# Only two tasks are submitted, so cap the pool at two workers instead of `threads`.
# The context manager tears the pool down even when a samtools call raises
# (previously a failure skipped close()/join() and left the workers behind).
with Pool(processes=min(threads, len(include_supp_modes))) as pool:
    pool.starmap(samtools_calculate_coverage,
                 zip(repeat(working_directory), include_supp_modes))

# Add the 'Secondary ...' columns first, then the 'Primary ...' columns.
for include_supp in include_supp_modes:
    representative_df = alignment_summary(representative_df,
                                          working_directory,
                                          seq2assembly_dict,
                                          include_supp=include_supp)

CalledProcessError: Command '['samtools', 'coverage', '/home/Users/yl181/memu/magnet_v222_zymo_even_full/Zymo-EVEN-p0.75/Zymo-Even-3/bam_files/merged.sorted.bam', '--ff', 'UNMAP,QCFAIL,DUP', '-q', '0']' died with <Signals.SIGSEGV: 11>.

In [130]:
def samtools_merged_consensus(output_directory, threads):
    """Build a consensus FASTA from the merged BAM and load it into memory.

    Runs ``samtools consensus`` on ``<output_directory>/bam_files/merged.sorted.bam``
    (``--mode simple``, ``--min-MQ 20``, ``-a``, ``--show-ins no``, ``--show-del yes``)
    and writes ``<output_directory>/merged_consensus.fasta``.

    Args:
        output_directory: working directory that contains ``bam_files``.
        threads: value passed to ``--threads``.

    Returns:
        dict mapping each FASTA record id to its record, as built by ``SeqIO.to_dict``.

    Raises:
        subprocess.CalledProcessError: if samtools exits with a non-zero status.
    """
    merged_bam = os.path.join(output_directory, 'bam_files', 'merged.sorted.bam')
    consensus_fasta = os.path.join(output_directory, 'merged_consensus.fasta')

    consensus_cmd = [
        'samtools', 'consensus',
        '--show-ins', 'no',
        '--show-del', 'yes',
        '--min-MQ', '20',
        '-a',
        '--mode', "simple",
        '--threads', str(threads),
        merged_bam,
        '-o', consensus_fasta,
    ]
    subprocess.run(consensus_cmd, check=True)

    return SeqIO.to_dict(SeqIO.parse(consensus_fasta, "fasta"))

In [131]:
# Consensus-based ANI per representative, then the presence/absence call.
consensus_record_dict = samtools_merged_consensus(working_directory, threads)
representative_df = ani_summary(representative_df, consensus_record_dict, working_directory, threads)
representative_df = call_present_absent(representative_df, min_coverage_score)

# Persist the table, highest 'Primary Score' first.
cluster_representative_csv = os.path.join(working_directory, 'cluster_representative.csv')
ranked_representatives = representative_df.sort_values(['Primary Score'], ascending=False)
ranked_representatives.to_csv(cluster_representative_csv, index=False)

In [132]:
# List the columns accumulated on the representative table so far.
representative_df.columns

Index(['Taxonomy ID', 'Assembly Accession ID', 'Source Database',
       'Is Representative', 'Assembly Level', 'Organism of Assembly', 'Strain',
       'Total Length', 'Downloaded', 'Species', 'Cluster Representative',
       'Cluster Members', 'Secondary Breadth', 'Secondary Expected',
       'Secondary Score', 'Secondary Depth', 'Primary Breadth',
       'Primary Expected', 'Primary Score', 'Primary Depth', 'Consensus ANI',
       'Combined PS and ANI (Sqrt(ANI)xPSx100)', 'Presence/Absence'],
      dtype='object')

In [133]:
# Show only the representatives whose 'Presence/Absence' value is 'Present'.
representative_df[representative_df['Presence/Absence'] == 'Present']

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Total Length,Downloaded,Species,...,Secondary Expected,Secondary Score,Secondary Depth,Primary Breadth,Primary Expected,Primary Score,Primary Depth,Consensus ANI,Combined PS and ANI (Sqrt(ANI)xPSx100),Presence/Absence
1,3056648,GCF_030413175.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Bradyrhizobium roseus,S12-14-2,7319411.0,True,Bradyrhizobium sp. S12-14-2,...,0.143660,0.870010,1.240736,0.066733,0.073593,0.906780,1.145335,0.827142,82.47,Present
3,2954771,GCF_031202265.1,SOURCE_DATABASE_REFSEQ,False,Complete Genome,Bradyrhizobium sp. Ash2021,Ash2021,9704287.0,True,Bradyrhizobium sp. Ash2021,...,0.230135,0.841104,1.351045,0.107387,0.118962,0.902699,1.179293,0.803551,80.92,Present
4,3063329,GCF_030546645.1,SOURCE_DATABASE_REFSEQ,False,Contig,Flavitalea sp. BT771,BT771,8732077.0,True,Flavitalea sp. BT771,...,0.370252,0.795410,1.570058,0.248169,0.303316,0.818186,1.456170,0.799718,73.17,Present
5,2219043,GCF_003574315.2,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Capsulimonas corticalis,AX-7,7645051.0,True,Capsulimonas corticalis,...,0.085282,0.816503,1.280021,0.050279,0.055528,0.905472,1.136095,0.790988,80.53,Present
7,722472,GCF_900141755.1,SOURCE_DATABASE_REFSEQ,False,Chromosome,Bradyrhizobium lablabi,GAS499,7910099.0,True,Bradyrhizobium lablabi,...,0.207450,0.871973,1.285189,0.099097,0.109425,0.905620,1.169292,0.796864,80.84,Present
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,2480626,GCF_007827045.1,SOURCE_DATABASE_REFSEQ,False,Scaffold,Trebonia kvetii,15TR583,9254646.0,True,Trebonia kvetii,...,0.628096,0.733255,2.147493,0.403171,0.541646,0.744344,1.934744,0.852379,68.72,Present
140,652787,GCF_900105165.1,SOURCE_DATABASE_REFSEQ,False,Chromosome,Mucilaginibacter mallensis,MP1X4,6014056.0,True,Mucilaginibacter mallensis,...,0.279387,0.931456,1.258843,0.232672,0.243372,0.956035,1.198360,0.820509,86.60,Present
141,1792502,GCF_014642875.1,SOURCE_DATABASE_REFSEQ,False,Scaffold,Puia dinghuensis,CGMCC 1.15448,7182087.0,True,Puia dinghuensis,...,0.424826,0.788081,1.651759,0.287182,0.355743,0.807273,1.530674,0.798597,72.14,Present
143,2080758,GCF_004661485.1,SOURCE_DATABASE_REFSEQ,False,Contig,Methylotenera oryzisoli,La3113,2664030.0,True,Methylotenera oryzisoli,...,0.111636,0.830866,1.275716,0.078677,0.090951,0.865047,1.211409,0.764349,75.63,Present


# Within-Cluster Classification

In [428]:
def get_expected_coverage(genome_length, reads_mapped, genome_totol_count):
    """Estimate the expected breadth of coverage for a genome and its spread.

    The genome is modelled as ``N = genome_length / mean_mapping_length`` bins,
    where ``mean_mapping_length = genome_totol_count / reads_mapped``, and each
    of the ``reads_mapped`` reads is treated as an independent, uniformly random
    hit on one bin. The expected number of bins hit at least once is
    ``N * (1 - (1 - 1/N) ** reads_mapped)``.

    Args:
        genome_length: Total reference length in bases.
        reads_mapped: Number of reads mapped to the reference.
        genome_totol_count: Total number of aligned bases on the reference.

    Returns:
        tuple: ``(expected_coverage, std)``. ``expected_coverage`` is the
        expected fraction of bins covered. ``std`` is the standard deviation of
        the *number* of covered bins (it is not divided by ``N``). Both are 0
        when any input is zero or negative, because the model is undefined
        there and the formula would otherwise divide by zero.
    """
    # Without mapped reads, aligned bases or a reference length, the mean
    # mapping length and/or N would be zero and the divisions below would fail.
    if genome_length <= 0 or reads_mapped <= 0 or genome_totol_count <= 0:
        return 0.0, 0.0

    mean_mapping_length = genome_totol_count/reads_mapped

    N = genome_length/mean_mapping_length
    x = reads_mapped

    expected_M = N*(1-((1-1/N)**x))
    variance = N*((1-1/N)**x) + (N**2)*(1-1/N)*((1-2/N)**x)-(N**2)*((1-1/N)**(2*x))

    expected_coverage = expected_M/N
    try:
        std = math.sqrt(variance)
    except ValueError:
        # Cancellation between the large N**2 terms can push the variance
        # slightly below zero; treat that as no spread.
        std = 0
    return expected_coverage, std

In [424]:
def calculate_depth(assembly_id, taxa_records):
    """Derive breadth, depth and expected breadth for one assembly.

    Args:
        assembly_id: Key of the assembly inside ``taxa_records``.
        taxa_records: Mapping of assembly id to a mapping holding the
            accumulated ``genome_length``, ``covbases``, ``genome_totol_count``
            and ``reads_mapped`` values.

    Returns:
        tuple: ``(breadth_coverage, depth_coverage, expected_breadth_coverage)``.
        All three are 0 when the assembly has no aligned bases or no mapped
        reads.
    """
    record = taxa_records[assembly_id]
    genome_length = record['genome_length']
    covbases = record['covbases']
    genome_totol_count = record['genome_totol_count']
    reads_mapped = record['reads_mapped']

    # Nothing aligned: report zeros instead of dividing by zero below.
    if genome_totol_count == 0 or reads_mapped == 0:
        return 0, 0, 0

    # Only the expected breadth is needed here; the spread is discarded.
    expected_breadth_coverage, _ = get_expected_coverage(genome_length, reads_mapped, genome_totol_count)
    return covbases/genome_length, genome_totol_count/covbases, expected_breadth_coverage

In [425]:
def alignment_summary(downloaded_assemblies, output_directory, seq2assembly_dict, include_supp=True):
    """Add per-assembly coverage statistics to ``downloaded_assemblies``.

    Reads the header-less ``samtools coverage`` table
    ``<output_directory>/coverage_files/{secondary,primary}_coverage.tsv``,
    sums the per-sequence rows per assembly (via ``seq2assembly_dict``) and
    appends four columns prefixed with ``Secondary`` or ``Primary``:
    ``Breadth``, ``Expected``, ``Score`` and ``Depth``. ``Score`` is
    ``min(breadth / expected, 1)``, or 0 when the expected breadth is 0.

    Args:
        downloaded_assemblies: DataFrame with an ``Assembly Accession ID``
            column. It is modified in place.
        output_directory: Directory holding ``coverage_files``; also receives
            ``alignment.csv``.
        seq2assembly_dict: Mapping from reference sequence name (``rname``) to
            assembly accession id.
        include_supp: Selects the secondary (True) or primary (False) table.

    Returns:
        The same DataFrame object, with the four new columns, after it has
        been written to ``<output_directory>/alignment.csv``.
    """
    column_prefix = 'Secondary' if include_supp else 'Primary'
    coverage_file_name = 'secondary_coverage.tsv' if include_supp else 'primary_coverage.tsv'

    coverage_path = os.path.join(output_directory, 'coverage_files', coverage_file_name)
    coverage_df = pd.read_csv(
        coverage_path,
        sep='\t',
        header=None,
        names=['rname', 'startpos', 'endpos', 'numreads', 'covbases',
               'coverage', 'meandepth', 'meanbaseq', 'meanmapq'],
    )

    # Sum the per-sequence rows into one record per assembly. Assemblies with
    # no rows read back as all zeros thanks to the nested defaultdict.
    taxa_records = defaultdict(lambda: defaultdict(int))
    for _, contig in coverage_df.iterrows():
        record = taxa_records[seq2assembly_dict[contig['rname']]]
        record['genome_length'] += contig['endpos']
        record['reads_mapped'] += contig['numreads']
        # meandepth * endpos recovers the aligned bases on this sequence.
        record['genome_totol_count'] += int(contig['meandepth'] * contig['endpos'])
        record['covbases'] += contig['covbases']

    breadths, expecteds, scores, depths = [], [], [], []
    for assembly_id in downloaded_assemblies['Assembly Accession ID']:
        breadth, depth, expected = calculate_depth(assembly_id, taxa_records)
        breadths.append(breadth)
        depths.append(depth)
        expecteds.append(expected)
        # Observed over expected breadth, capped at 1.
        scores.append(min(breadth/expected, 1) if expected != 0 else 0)

    downloaded_assemblies[f'{column_prefix} Breadth'] = breadths
    downloaded_assemblies[f'{column_prefix} Expected'] = expecteds
    downloaded_assemblies[f'{column_prefix} Score'] = scores
    downloaded_assemblies[f'{column_prefix} Depth'] = depths

    downloaded_assemblies.to_csv(os.path.join(output_directory, 'alignment.csv'), index=False)

    return downloaded_assemblies

In [383]:
# Print the first two clusters that have more than one member and stop there.
# `cluster_members` stays bound to the last cluster examined, so after the
# break it holds the second multi-member cluster.
counter = 0
for idx, row in representative_df.iterrows():
    cluster_members = row['Cluster Members'].split(",")

    # Singleton clusters are skipped.
    if len(cluster_members) <= 1:
        continue

    print(cluster_members)
    counter += 1
    if counter == 2:
        break

['GCF_000013425.1', 'GCF_001838925.1', 'GCF_016893225.1']
['GCF_001598635.1', 'GCF_001053935.1', 'GCF_031143435.1', 'GCF_018919785.1']


In [384]:
# Show the cluster left bound by the selection loop (the last one it examined).
cluster_members

['GCF_001598635.1', 'GCF_001053935.1', 'GCF_031143435.1', 'GCF_018919785.1']

In [421]:
# Keep only the assemblies of the selected cluster. An explicit copy is taken
# because later cells add columns to `members_df`; without it those writes
# land on a filtered selection of `downloaded_assemblies` (chained assignment).
members_df = downloaded_assemblies[downloaded_assemblies['Assembly Accession ID'].isin(cluster_members)].copy()

In [422]:
# Build the sequence-name -> assembly lookup used by the coverage summary, and
# the reference FASTA for the aligner (presumably the members' genomes merged
# into one file -- verify against utils.summary.merge_reference_fasta).
seq2assembly_dict = get_seq2assembly_dict(working_directory, members_df)
reference_fasta = merge_reference_fasta(
    members_df['Assembly Accession ID'].tolist(),
    working_directory,
)

In [423]:
# Align the reads against the merged cluster reference with the aligner that
# matches the sequencing platform, then sort the result.
# NOTE(review): `run_bowtie2` is not among the imports visible at the top of the
# notebook (only `run_minimap2` and `run_bwa` are) -- confirm it is imported or
# defined before this cell runs.
if mode == 'ont':
    aligner_output = run_minimap2(input_fastq, reference_fasta, 'merged', working_directory, threads=threads)
elif mode == 'illumina':
    aligner_output = run_bowtie2(input_fastq, input_fastq2, reference_fasta, 'merged', working_directory, threads=threads)
else:
    # Fail loudly instead of sorting a stale `aligner_output` left in the
    # kernel by an earlier run (or hitting a NameError on a fresh kernel).
    raise ValueError(f"Unsupported mode {mode!r}; expected 'ont' or 'illumina'")
sort_samfile('merged', aligner_output, working_directory, min_mapq=0, threads=threads)

In [431]:
# Presumably builds consensus sequences from the merged alignment in
# `working_directory` -- TODO confirm against utils.ani.samtools_merged_consensus.
consensus_record_dict = samtools_merged_consensus(working_directory, threads)

In [432]:
# Presumably adds the consensus-ANI columns and then the presence/absence call
# thresholded by `min_coverage_score` -- verify against utils.ani.ani_summary
# and utils.summary.call_present_absent.
members_df = ani_summary(members_df, consensus_record_dict, working_directory, threads)
members_df = call_present_absent(members_df, min_coverage_score)

In [533]:
def samtools_calculate_coverage(output_dir, include_supp=False):
    """Run ``samtools coverage`` on the merged BAM and save a header-less TSV.

    This definition shadows both the function of the same name imported from
    ``utils.alignment`` and the earlier notebook definition; it differs from
    the latter by passing ``--no-header``.

    Args:
        output_dir: Working directory containing ``bam_files/merged.sorted.bam``.
            The table is written into ``<output_dir>/coverage_files``, which is
            created if missing.
        include_supp: If True, write ``secondary_coverage.tsv`` using
            ``--ff UNMAP,QCFAIL,DUP -q 0``. If False, write
            ``primary_coverage.tsv`` using ``-q 20`` and samtools' default
            flag filter.

    Raises:
        subprocess.CalledProcessError: If ``samtools`` exits with a non-zero
            status.
    """
    coverage_files = os.path.join(output_dir, "coverage_files")
    bam_files = os.path.join(output_dir, "bam_files")
    # Do not rely on the caller having created the output folder; exist_ok
    # keeps this safe when several workers run at once.
    os.makedirs(coverage_files, exist_ok=True)

    command = ["samtools",
               "coverage",
               "--no-header",
               os.path.join(bam_files, "merged.sorted.bam")]

    if include_supp:
        # samtools coverage --no-header merged.sorted.bam --ff UNMAP,QCFAIL,DUP -q 0 > secondary_coverage.tsv
        coverage_file = os.path.join(coverage_files, "secondary_coverage.tsv")
        command += ['--ff', 'UNMAP,QCFAIL,DUP', '-q', str(0)]
    else:
        # samtools coverage --no-header merged.sorted.bam -q 20 > primary_coverage.tsv
        coverage_file = os.path.join(coverage_files, "primary_coverage.tsv")
        command += ['-q', str(20)]

    # Close the output file deterministically, even if samtools fails.
    with open(coverage_file, "w") as coverage_handle:
        subprocess.run(command,
                       check=True,
                       stdout=coverage_handle)

In [534]:
# Make sure the output folder exists; exist_ok avoids the check-then-create race.
coverage_files = os.path.join(working_directory, "coverage_files")
os.makedirs(coverage_files, exist_ok=True)

# Compute the secondary (True) and primary (False) coverage tables in parallel.
# starmap blocks until both runs finish; the with-block releases the worker
# processes even if one of them raises.
with Pool(processes=threads) as pool:
    pool.starmap(samtools_calculate_coverage, zip(repeat(working_directory), [True, False]))

In [535]:
# Add the 'Secondary *' coverage columns (read from secondary_coverage.tsv).
members_df = alignment_summary(
    members_df,
    working_directory,
    seq2assembly_dict,
    include_supp=True,
)

In [536]:
# Add the 'Primary *' coverage columns (read from primary_coverage.tsv).
members_df = alignment_summary(
    members_df,
    working_directory,
    seq2assembly_dict,
    include_supp=False,
)

In [537]:
# Compare secondary and primary coverage metrics and consensus ANI for each
# member of the cluster.
members_df[[
    'Assembly Accession ID',
    'Species',
    'Assembly Level',
    'Cluster Representative',
    'Total Length',
    'Secondary Breadth',
    'Primary Breadth',
    'Secondary Expected',
    'Primary Expected',
    'Secondary Score',
    'Primary Score',
    'Consensus ANI',
]]

Unnamed: 0,Assembly Accession ID,Species,Assembly Level,Cluster Representative,Total Length,Secondary Breadth,Primary Breadth,Secondary Expected,Primary Expected,Secondary Score,Primary Score,Consensus ANI
4,GCF_001598635.1,Enterococcus faecalis,Complete Genome,True,2803429.0,0.876527,0.238773,0.996414,0.411209,0.879681,0.580661,0.938553
13,GCF_001053935.1,Enterococcus sp. 1140_ESPC,Contig,False,2713742.0,0.890697,0.170908,0.990717,0.346761,0.899043,0.49287,0.932959
14,GCF_031143435.1,Enterococcus sp. FR192,Contig,False,3090693.0,0.802494,0.268412,0.984906,0.606032,0.814792,0.4429,0.952038
16,GCF_018919785.1,Enterococcus sp. S171_ASV_20,Contig,False,3284381.0,0.756196,0.156462,0.978218,0.266662,0.773034,0.586741,0.929656


In [521]:
# List every column currently present on the cluster-member table.
members_df.columns

Index(['Taxonomy ID', 'Assembly Accession ID', 'Source Database',
       'Is Representative', 'Assembly Level', 'Organism of Assembly', 'Strain',
       'Total Length', 'Downloaded', 'Species', 'Genus Taxid',
       'Cluster Representative', 'Cluster Members', 'Secondary Breadth',
       'Secondary Expected', 'Secondary Score', 'Secondary Depth',
       'Primary Breadth', 'Primary Expected', 'Primary Score', 'Primary Depth',
       'Consensus ANI', 'Combined PS and ANI (Sqrt(ANI)xPSx100)',
       'Presence/Absence'],
      dtype='object')