In [None]:
import warnings
# NOTE(review): this silences every warning category for the whole kernel
# session, so deprecation and pandas chained-assignment warnings raised by the
# cells below will not be shown. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

import os
import argparse
import sys
import subprocess
import math
import pickle
from collections import defaultdict
import json
import pandas as pd
import math
from ete3 import NCBITaxa
from Bio import SeqIO

In [None]:
from utils.reference_finder import prepare_reference_genomes
from utils.alignment import run_minimap2, sort_samfile, run_bwa, samtools_calculate_depth
from utils.summary import alignment_1_summary, alignment_2_summary, merge_reference_fasta, call_present_absent
from utils.ani import samtools_merged_consensus, ani_summary

In [None]:
def get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom):
    """Resolve a taxid to the species-rank taxid found in its lineage.

    Args:
        taxid: Taxonomy identifier to resolve.
        ncbi_taxa_db: Object exposing ``get_lineage(taxid)`` and
            ``get_rank(taxids)``; this notebook passes an ``ete3.NCBITaxa``.
        valid_kingdom: Set of taxids. The lineage must contain at least one
            of them for the taxid to be accepted.

    Returns:
        The lineage member whose rank is ``'species'``, or ``None`` when the
        taxid cannot be resolved, when its lineage shares no member with
        ``valid_kingdom``, or when no lineage member has rank ``'species'``.
    """
    try:
        lineage = ncbi_taxa_db.get_lineage(taxid)
    except ValueError:
        # ete3 raises ValueError for a taxid that is absent from the local
        # taxonomy database (e.g. the classifier database is newer than the
        # taxonomy dump). One unknown taxid should be skipped, not abort the
        # whole filtering pass.
        return None

    if not lineage:
        # get_lineage returns None for a falsy taxid; building a set from it
        # would raise TypeError.
        return None

    if valid_kingdom.isdisjoint(lineage):
        return None

    rank_by_taxid = ncbi_taxa_db.get_rank(lineage)
    for lineage_taxid, rank in rank_by_taxid.items():
        if rank == 'species':
            return lineage_taxid
    return None

In [None]:
def filter_kraken2_taxonomy(kraken2_report, min_frac, ncbi_taxa_db, valid_kingdom):
    """Select species taxids from a Kraken2 report that pass the abundance cut.

    The report is read as a headerless tab-separated table with six columns.
    Rows are kept when their 'Rank' is exactly 'S' and their 'Abundance' is at
    least ``min_frac*100`` (so ``min_frac`` is a fraction and the column is
    treated as a percentage). Each surviving taxid is then resolved with
    ``get_species_taxid``; taxids for which that returns ``None`` are dropped.

    Args:
        kraken2_report: Path (or file-like object) accepted by ``pd.read_csv``.
        min_frac: Minimum abundance, expressed as a fraction.
        ncbi_taxa_db: Taxonomy database handle forwarded to ``get_species_taxid``.
        valid_kingdom: Set of accepted taxids forwarded to ``get_species_taxid``.

    Returns:
        List of species taxids, in report order.
    """
    column_names = ['Abundance', 'Cumulative Count', 'Count', 'Rank', 'Taxid', 'Taxname']
    report_df = pd.read_csv(kraken2_report, sep='\t', names=column_names)

    is_species_row = report_df['Rank'] == 'S'
    is_abundant = report_df['Abundance'] >= min_frac*100
    candidate_taxids = report_df.loc[is_species_row & is_abundant, 'Taxid'].tolist()

    resolved_taxids = (
        get_species_taxid(candidate, ncbi_taxa_db, valid_kingdom)
        for candidate in candidate_taxids
    )
    return [resolved for resolved in resolved_taxids if resolved is not None]

In [None]:
# Taxonomy database handle shared by the taxid filter and by
# prepare_reference_genomes below.
# NOTE(review): per the ete3 docs the first instantiation may download and
# build a local NCBI taxonomy database — confirm before running offline.
ncbi_taxa_db = NCBITaxa()

In [None]:
# Two FASTQ inputs handed to run_bwa (file names suggest paired-end mates).
input_fastq_1 = '../test_data/Zymo_illumina/SRR11207265_1.downsampled.fastq'
input_fastq_2 = '../test_data/Zymo_illumina/SRR11207265_2.downsampled.fastq'
# Kraken2 report parsed by filter_kraken2_taxonomy.
kraken2_report = "../test_data/Zymo_illumina/SRR11207265.downsampled.report"
# Thread count passed to run_bwa, sort_samfile and samtools_merged_consensus.
threads = 40
# Third positional argument of sort_samfile; presumably a minimum mapping
# quality — confirm in utils.alignment.
min_mapq = 20
# Assemblies need 'Coverage Score' >= this value to enter the merged reference.
min_coverage_score = 0.7
# Minimum abundance as a fraction; it is multiplied by 100 before being
# compared with the report's 'Abundance' column.
min_frac = 0.0005

In [None]:
# Output root for every file produced by the cells below.
working_directory = '../test_output_downsampled'
# makedirs(..., exist_ok=True) creates missing parent directories and is a
# no-op when the directory already exists, which removes the check-then-create
# race. It raises FileExistsError if the path exists but is not a directory.
os.makedirs(working_directory, exist_ok=True)

In [None]:
# Species taxids from the Kraken2 report that pass the abundance cut and whose
# lineage contains one of the accepted top-level taxids.
# NOTE(review): in NCBI Taxonomy these should be 2 = Bacteria, 4751 = Fungi,
# 2157 = Archaea, 10239 = Viruses — verify against the taxonomy database.
taxid_queries = filter_kraken2_taxonomy(kraken2_report,
                                        min_frac=min_frac, 
                                        ncbi_taxa_db=ncbi_taxa_db, 
                                        valid_kingdom={2, 4751, 2157, 10239})

In [None]:
# Returns a table indexable by a 'Downloaded' column (used as a row mask in
# the next cell). Presumably this also fetches the reference genomes into
# working_directory/reference_genomes — confirm in utils.reference_finder.
reference_metadata = prepare_reference_genomes(taxid_queries, working_directory, ncbi_taxa_db)

In [None]:
# Keep only the rows flagged in the 'Downloaded' column (used as a boolean mask).
# NOTE(review): this is a boolean-mask slice that later cells pass to summary
# helpers; if those helpers assign columns in place, consider adding .copy()
# here, since the chained-assignment warning is silenced globally.
downloaded_assemblies = reference_metadata[reference_metadata['Downloaded']]

In [None]:
# First pass: process every downloaded assembly on its own, using the reads
# and the assembly's FASTA under <working_directory>/reference_genomes.
reference_genome_dir = os.path.join(working_directory, 'reference_genomes')
for assembly_id in downloaded_assemblies['Assembly Accession ID']:
    reference_fasta = os.path.join(reference_genome_dir, f'{assembly_id}.fasta')
    # Same three helper calls, in the same order, as the merged pass later on.
    run_bwa(
        input_fastq_1,
        input_fastq_2,
        reference_fasta,
        assembly_id,
        working_directory,
        threads=threads,
    )
    sort_samfile(assembly_id, working_directory, min_mapq, threads)
    samtools_calculate_depth(assembly_id, working_directory)

In [None]:
# Rebinds downloaded_assemblies to the helper's return value. The next cell
# filters on a 'Coverage Score' column, so presumably it is added here —
# confirm in utils.summary.
downloaded_assemblies = alignment_1_summary(downloaded_assemblies, working_directory)

In [None]:
# Accession IDs of the assemblies whose 'Coverage Score' reaches the threshold.
filtered_assemblies = list(
    downloaded_assemblies.loc[
        downloaded_assemblies['Coverage Score'] >= min_coverage_score,
        'Assembly Accession ID',
    ]
)

In [None]:
# Second pass: build one reference from the assemblies that passed the
# coverage filter, then repeat the align / sort / depth steps against it,
# using the label 'merged' in place of an assembly accession.
reference_fasta = merge_reference_fasta(filtered_assemblies, working_directory)
run_bwa(input_fastq_1, input_fastq_2, reference_fasta, 'merged', working_directory, threads=threads)
sort_samfile('merged', working_directory, min_mapq, threads)
samtools_calculate_depth('merged', working_directory)

In [None]:
# Rebinds downloaded_assemblies to the helper's return value; presumably it
# adds statistics from the merged alignment — confirm in utils.summary.
downloaded_assemblies = alignment_2_summary(downloaded_assemblies, working_directory)

In [None]:
# The consensus helper's return value is handed straight to ani_summary, whose
# result replaces downloaded_assemblies. Presumably the dict maps sequence IDs
# to consensus records from the merged alignment — confirm in utils.ani.
consensus_record_dict = samtools_merged_consensus(working_directory, threads)
downloaded_assemblies = ani_summary(downloaded_assemblies, consensus_record_dict, working_directory)

In [None]:
# Rebinds downloaded_assemblies to the helper's return value; presumably it
# adds a presence/absence call per assembly — confirm in utils.summary.
downloaded_assemblies = call_present_absent(downloaded_assemblies)

In [None]:
# Write the summary table, ordered by 'CS2' from highest to lowest, to
# <working_directory>/reference_alignment.csv without the index column.
(
    downloaded_assemblies
    .sort_values(by=['CS2'], ascending=False)
    .to_csv(
        os.path.join(working_directory, 'reference_alignment.csv'),
        index=False,
    )
)

In [None]:
# Show the final table as the cell output. It is displayed in its current row
# order, not the 'CS2' order used for the CSV, because sort_values above does
# not modify the frame in place.
downloaded_assemblies