In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import argparse
import sys
import subprocess
import math
import pickle
from collections import defaultdict
import json
import pandas as pd
import math
from ete3 import NCBITaxa
from Bio import SeqIO

In [2]:
from utils.reference_finder import prepare_reference_genomes
from utils.alignment import run_minimap2, sort_samfile, run_bwa, samtools_calculate_depth
from utils.summary import alignment_1_summary, alignment_2_summary, merge_reference_fasta, call_present_absent
from utils.ani import samtools_merged_consensus, ani_summary

In [3]:
def get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom):
    lineage = ncbi_taxa_db.get_lineage(taxid)
    if bool(set(lineage) & valid_kingdom):
        taxid2rank_dict = ncbi_taxa_db.get_rank(lineage)
        for lineage_taxid in taxid2rank_dict:
            if taxid2rank_dict[lineage_taxid] == 'species':
                return lineage_taxid
    return None

In [4]:
def filter_kraken2_taxonomy(kraken2_report, min_frac, ncbi_taxa_db, valid_kingdom):
    classification_result_df = pd.read_csv(kraken2_report, sep='\t', names=['Abundance', 'Cumulative Count', 'Count', 'Rank', 'Taxid', 'Taxname']) 
    taxids = classification_result_df[(classification_result_df['Rank'] == 'S') & (classification_result_df['Abundance'] >= min_frac*100)]['Taxid'].tolist()
    valid_taxids = []
    for taxid in taxids:
        species_taxid = get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom)
        if species_taxid is not None:
            valid_taxids.append(species_taxid)
    return valid_taxids

In [5]:
def get_pathogen_taxids(pathogen_list_f, ncbi_taxa_db, valid_kingdom):
    pathogen_df = pd.read_csv(pathogen_list_f)
    pathogen_df = pathogen_df.set_index('PathogenTaxID')
    valid_taxids = []
    for taxid in pathogen_df.index.to_list():
        species_taxid = get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom)
        if species_taxid is not None:
            valid_taxids.append(species_taxid)
    return valid_taxids

In [6]:
def unzip_fastq(input_fastq, working_directory):
    temp_input_dir = os.path.join(working_directory, 'input_reads')
    if not os.path.exists(temp_input_dir):
        os.mkdir(temp_input_dir)
        
    subprocess.run(['cp',
                    input_fastq,
                    temp_input_dir],
                   check=True)
    
    filename = input_fastq.split('/')[-1]
    subprocess.run(['gunzip', '-d', os.path.join(temp_input_dir, filename)], check=True)
    
    filename = input_fastq.split('/')[-1][:-3]
    return os.path.join(temp_input_dir, filename)

In [7]:
ncbi_taxa_db = NCBITaxa()

In [8]:
input_fastq_1 = '/data/ww_data/final_ww_data/fastp/wastewater metagenome/SRR11088368_1.fastq.gz'
input_fastq_2 = '/data/ww_data/final_ww_data/fastp/wastewater metagenome/SRR11088368_2.fastq.gz'
kraken2_report = '/data/ww_data/final_ww_data/kraken/wastewater metagenome/SRR11088368_report.txt'
pathogen_list_f = '/home/Users/yl181/ww_pathogen/metadata/filtered_pathogen.csv'
working_directory = '/scratch0/yl181/ww_pathogen/test_run'
threads = 40
min_mapq = 20
min_coverage_score = 0.7
min_frac = 0.0005

In [9]:
if not os.path.exists(working_directory):
    os.mkdir(working_directory)

In [10]:
input_fastq_1 = unzip_fastq(input_fastq_1, working_directory)

gzip: /scratch0/yl181/ww_pathogen/test_run/input_reads/SRR11088368_1.fastq already exists;	not overwritten


CalledProcessError: Command '['gunzip', '-d', '/scratch0/yl181/ww_pathogen/test_run/input_reads/SRR11088368_1.fastq.gz']' returned non-zero exit status 2.

In [None]:
input_fastq_2 = unzip_fastq(input_fastq_2, working_directory)

In [9]:
taxid_queries = filter_kraken2_taxonomy(kraken2_report,
                                        min_frac=min_frac, 
                                        ncbi_taxa_db=ncbi_taxa_db, 
                                        valid_kingdom={2, 4751, 2157, 10239})

In [10]:
len(taxid_queries)

107

In [11]:
pathogen_taxids = get_pathogen_taxids(pathogen_list_f,
                                      ncbi_taxa_db=ncbi_taxa_db, 
                                      valid_kingdom={2, 4751, 2157, 10239})

In [12]:
taxid_queries = list(set(taxid_queries).intersection(set(pathogen_taxids)))

In [13]:
len(taxid_queries)

30

In [14]:
reference_metadata = prepare_reference_genomes(taxid_queries, working_directory, ncbi_taxa_db)

40324      	 GCF_900475405.1 	 Stenotrophomonas maltophilia   	 NCTC1025.. 	 Complete Genome
645        	 GCF_012931585.1 	 Aeromonas salmonicida          	 SRW-OG1    	 Complete Genome
651        	 GCF_020423125.1 	 Aeromonas media                	 TR3_1      	 Complete Genome
33039      	 GCA_023674525.1 	 [Ruminococcus] torques         	 NB2A-14-.. 	 Chromosome
1680       	 GCF_003030905.1 	 Bifidobacterium adolescentis   	 1-11       	 Complete Genome
40214      	 GCF_003335165.1 	 Acinetobacter johnsonii        	 LXL_C1     	 Complete Genome
40215      	 GCF_018336855.1 	 Acinetobacter junii            	 YR7        	 Complete Genome
2479767    	 GCF_005848555.1 	 Blautia sp. SC05B48            	 SC05B48    	 Complete Genome
2743575    	 GCF_013374795.1 	 Acinetobacter sp. NEB 394      	 NEB 394    	 Complete Genome
1307       	 GCF_000026745.1 	 Streptococcus suis BM407       	 BM407      	 Complete Genome
649756     	 GCF_001998765.1 	 Anaerostipes hadrus            	 BPB5       


30 archives were successfully processed.


In [15]:
reference_metadata

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Total Length,Downloaded,Species
0,40324,GCF_900475405.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Stenotrophomonas maltophilia,NCTC10258,4481118,True,Stenotrophomonas maltophilia
1,645,GCF_012931585.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Aeromonas salmonicida,SRW-OG1,4621725,True,Aeromonas salmonicida
2,651,GCF_020423125.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Aeromonas media,TR3_1,4531033,True,Aeromonas media
3,33039,GCA_023674525.1,SOURCE_DATABASE_GENBANK,False,Chromosome,[Ruminococcus] torques,NB2A-14-FMU,2838928,True,[Ruminococcus] torques
4,1680,GCF_003030905.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Bifidobacterium adolescentis,1-11,2192428,True,Bifidobacterium adolescentis
5,40214,GCF_003335165.1,SOURCE_DATABASE_REFSEQ,False,Complete Genome,Acinetobacter johnsonii,LXL_C1,3398706,True,Acinetobacter johnsonii
6,40215,GCF_018336855.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Acinetobacter junii,YR7,3438557,True,Acinetobacter junii
7,2479767,GCF_005848555.1,SOURCE_DATABASE_REFSEQ,False,Complete Genome,Blautia sp. SC05B48,SC05B48,3731576,True,Blautia sp. SC05B48
8,2743575,GCF_013374795.1,SOURCE_DATABASE_REFSEQ,False,Complete Genome,Acinetobacter sp. NEB 394,NEB 394,3842079,True,Acinetobacter sp. NEB 394
9,1307,GCF_000026745.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Streptococcus suis BM407,BM407,2170808,True,Streptococcus suis


In [63]:
downloaded_assemblies = reference_metadata[reference_metadata['Downloaded']]

In [64]:
for assembly_id in downloaded_assemblies['Assembly Accession ID']:
    reference_fasta = os.path.join(working_directory, 'reference_genomes', f'{assembly_id}.fasta')
    run_bwa(input_fastq_1, input_fastq_2, reference_fasta, assembly_id, working_directory, threads=threads)
    sort_samfile(assembly_id, working_directory, min_mapq=min_mapq, threads=threads)
    samtools_calculate_depth(assembly_id, working_directory)

In [65]:
downloaded_assemblies = alignment_1_summary(downloaded_assemblies, working_directory)

In [66]:
filtered_assemblies = list(downloaded_assemblies[downloaded_assemblies['Coverage Score'] >= min_coverage_score]['Assembly Accession ID'])

In [67]:
reference_fasta = merge_reference_fasta(filtered_assemblies, working_directory)
run_bwa(input_fastq_1, input_fastq_2, reference_fasta, 'merged', working_directory, threads=threads)
sort_samfile('merged', working_directory, min_mapq=0, threads=threads)
samtools_calculate_depth('merged', working_directory)

In [68]:
downloaded_assemblies = alignment_2_summary(downloaded_assemblies, working_directory)

In [69]:
consensus_record_dict = samtools_merged_consensus(working_directory, threads)
downloaded_assemblies = ani_summary(downloaded_assemblies, consensus_record_dict, working_directory)

In [70]:
downloaded_assemblies = call_present_absent(downloaded_assemblies)

In [71]:
downloaded_assemblies.sort_values(['CS2'], ascending=False).to_csv(os.path.join(working_directory, 'reference_alignment.csv'), index=False)

In [72]:
downloaded_assemblies

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Downloaded,Species,Breadth Coverage,Expected Coverage,Coverage Score,Depth Coverage,BC2,EC2,CS2,DC2,Consensus ANI,Combined CS2 and ANI (Sqrt(ANI)xCS2x100),Presence/Absence
0,40324,GCF_900475405.1,NCBI RefSeq,True,Complete Genome,Stenotrophomonas maltophilia,NCTC10258,True,Stenotrophomonas maltophilia,0.839573,1.0,0.839573,34.563671,0.814941,1.0,0.814941,23.577827,0.981539,80.74,Present
1,645,GCF_012931585.1,NCBI RefSeq,True,Complete Genome,Aeromonas salmonicida,SRW-OG1,True,Aeromonas salmonicida,0.841355,0.999998,0.841356,15.705711,0.71889,0.986014,0.729087,5.939255,0.98722,72.44,Present
2,651,GCF_020423125.1,NCBI RefSeq,True,Complete Genome,Aeromonas media,TR3_1,True,Aeromonas media,0.871588,1.0,0.871588,18.324055,0.842731,0.99833,0.844141,7.588203,0.9832,83.7,Present
3,33039,GCA_023674525.1,NCBI,False,Chromosome,[Ruminococcus] torques,NB2A-14-FMU,True,[Ruminococcus] torques,0.638248,0.999997,0.63825,20.186668,0.0,0.0,0.0,0.0,0.0,0.0,Absent
4,1680,GCF_003030905.1,NCBI RefSeq,True,Complete Genome,Bifidobacterium adolescentis,1-11,True,Bifidobacterium adolescentis,0.969784,1.0,0.969784,49.626209,0.970498,1.0,0.970498,39.889433,0.995705,96.84,Present
5,40214,GCF_003335165.1,NCBI RefSeq,False,Complete Genome,Acinetobacter johnsonii,LXL_C1,True,Acinetobacter johnsonii,0.955982,1.0,0.955982,79.342853,0.974916,1.0,0.974916,33.517974,0.989357,96.97,Present
6,40215,GCF_018336855.1,NCBI RefSeq,True,Complete Genome,Acinetobacter junii,YR7,True,Acinetobacter junii,0.703081,1.0,0.703081,32.917163,0.671086,0.999798,0.671221,12.675992,0.955793,65.62,Present
7,2479767,GCF_005848555.1,NCBI RefSeq,False,Complete Genome,Blautia sp. SC05B48,SC05B48,True,Blautia sp. SC05B48,0.939978,1.0,0.939978,28.135163,0.954549,1.0,0.954549,23.922561,0.983677,94.67,Present
8,2743575,GCF_013374795.1,NCBI RefSeq,False,Complete Genome,Acinetobacter sp. NEB 394,NEB 394,True,Acinetobacter sp. NEB 394,0.959521,1.0,0.959521,79.441439,0.990409,1.0,0.990409,56.647163,0.995534,98.82,Present
9,1307,GCF_000026745.1,NCBI RefSeq,True,Complete Genome,Streptococcus suis BM407,BM407,True,Streptococcus suis,0.641112,0.999732,0.641284,12.830374,0.0,0.0,0.0,0.0,0.0,0.0,Absent
