## Test reading in kraken2 output (generated without --use-mpa-style)
### From the results folder from nf-core https://nf-co.re/mag/results#mag/results-3b4dd3469725654d67e06b3853ba460d64c80788/


In [68]:
import pandas as pd
import altair as alt

In [73]:
# The report is read without a header row, so the column names are supplied here.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
kraken_columns = ['pct', 'reads_clade', 'reads_taxon', 'rank', 'ID', 'name']

kraken_raw = pd.read_csv(
    '/Users/williamrosenbaum/Downloads/kraken2_report.txt',
    sep='\t',
    header=None,
    names=kraken_columns,
)

In [149]:
# Row label of the first entry whose name mentions 'Viruses'; the frame is
# sliced from that label to the end before any further filtering.
mentions_viruses = kraken_raw['name'].str.contains('Viruses')
first_virus_label = kraken_raw[mentions_viruses].index[0]

kraken = (
    kraken_raw
    .loc[first_virus_label:, :]
    # Keep taxa with more than 10 directly assigned reads ...
    .loc[lambda report: report['reads_taxon'] > 10]
    # ... whose rank code contains 'S' (this also matches codes such as 'S1').
    .loc[lambda report: report['rank'].str.contains('S')]
    # Share of clade reads among the rows that survived the filters above.
    .assign(pct_reads=lambda report: report['reads_clade'] / report['reads_clade'].sum())
)

kraken


Unnamed: 0,pct,reads_clade,reads_taxon,rank,ID,name,pct_reads
13879,1.52,214053,214053,S,1211417,uncultured crAssphage,0.994393
14472,0.01,1117,1117,S1,1090134,Salmonella phage SPN3US,0.005189
14522,0.0,16,16,S,1273755,Halovirus HRTV-8,7.4e-05
15731,0.0,14,14,S,1051631,Streptococcus phage YMC-2011,6.5e-05
17776,0.0,17,17,S,50294,Psittacid alphaherpesvirus 1,7.9e-05
17779,0.0,14,14,S,10317,Cercopithecine alphaherpesvirus 2,6.5e-05
17936,0.0,29,29,S,2107708,Pandoravirus neocaledonia,0.000135


In [170]:
# Horizontal bars: one per taxon, longest bar first, with the read count on x.
read_count_tooltip = alt.Tooltip('reads_taxon:Q', title='Number of reads', format=',')

bar = (
    alt.Chart(kraken)
    .mark_bar()
    .encode(
        x=alt.X('reads_taxon:Q'),
        y=alt.Y('name:N', sort='-x'),
        tooltip=[alt.Tooltip('name:N'), read_count_tooltip],
    )
)

# Text layer reusing the bar encodings; prints pct_reads as a percentage
# just to the right of each bar.
text = bar.mark_text(align='left', dx=3).encode(
    text=alt.Text('pct_reads:Q', format='.1%'),
)

(bar + text)

## Reading in results from CAT from nf-core
### This requires quite a bit of cleaning up

In [276]:
# Positions of the columns to keep and the names assigned to them, in order.
cat_column_positions = [0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15]
cat_column_names = [
    'bin', 'classification', 'num_ORF', 'num_ORF_classification_based',
    'kingdom', 'clade', 'clade_phylum', 'phylum_class', 'class_order',
    'order_family', 'family_genus', 'genus',
]

# The first two lines of the file are skipped and no header row is read.
cat = pd.read_csv(
    'MEGAHIT-MaxBin2-CAPES_S11.bin2classification.names.txt',
    sep='\t',
    skiprows=2,
    header=None,
    usecols=cat_column_positions,
    names=cat_column_names,
)
                 
   

In [289]:
# Preview the first five bins.
cat.head(n=5)

Unnamed: 0,bin,classification,num_ORF,num_ORF_classification_based,kingdom,clade,clade_phylum,phylum_class,class_order,order_family,family_genus,genus
0,MEGAHIT-MaxBin2-CAPES_S11.002.fa,classified,1455,1442,Bacteria (superkingdom): 0.97,FCB group (clade): 0.74,Bacteroidetes/Chlorobi group (clade): 0.74,Bacteroidetes (phylum): 0.74,Bacteroidia (class): 0.74,Bacteroidales (order): 0.74,Bacteroidaceae (family): 0.65,Bacteroides (genus): 0.65
1,MEGAHIT-MaxBin2-CAPES_S11.003.fa,classified,1790,1782,Bacteria (superkingdom): 0.97,Terrabacteria group (clade): 0.82,Actinobacteria (phylum): 0.71,Actinobacteria (class): 0.71,Bifidobacteriales (order): 0.71,Bifidobacteriaceae (family): 0.71,Bifidobacterium (genus): 0.71,
2,MEGAHIT-MaxBin2-CAPES_S11.004.fa,classified,2022,2021,Bacteria (superkingdom): 0.72,,,,,,,
3,MEGAHIT-MaxBin2-CAPES_S11.005.fa,classified,6074,6024,Bacteria (superkingdom): 0.98,FCB group (clade): 0.46,Bacteroidetes/Chlorobi group (clade): 0.46,Bacteroidetes (phylum): 0.46,Bacteroidia (class): 0.46,Bacteroidales (order): 0.46,Bacteroidaceae (family): 0.40,Bacteroides (genus): 0.40
4,MEGAHIT-MaxBin2-CAPES_S11.006.fa,classified,2721,2717,Bacteria (superkingdom): 0.83,,,,,,,


In [385]:
# Reshape the CAT table to one row per (bin, lineage entry) and split each
# entry, which looks like '<taxon name> (<rank>): <score>', into its parts.
(cat
 .melt(id_vars=['bin', 'classification', 'num_ORF', 
                'num_ORF_classification_based'])
 # Bins classified only to a shallow rank leave the deeper columns empty.
 .loc[lambda x: x['value'].notna()]
 .drop(columns='variable')
 .assign(
     # Score: the number after the final colon. Anchoring at the end keeps
     # digits inside a taxon name from being captured (which previously made
     # the float conversion fail).
     certainty=lambda x: (x['value']
                          .str.extract(r':\s*(\d+(?:\.\d+)?)\s*$', expand=False)
                          .astype(float)),
     # Taxon name: everything before the '(<rank>):' part, without the
     # trailing space that separates the two.
     name=lambda x: x['value'].str.extract(r'^(.*?)\s*\([^()]*\):', expand=False),
     # Rank label: the parenthesised text directly before the colon. The
     # column keeps its original name 'order'.
     order=lambda x: x['value'].str.extract(r'\(([^()]*)\):', expand=False))
 # Entries labelled 'clade' are dropped.
 .loc[lambda x: x['order'] != 'clade']
)

Unnamed: 0,bin,classification,num_ORF,num_ORF_classification_based,value,certainty,name,order
0,MEGAHIT-MaxBin2-CAPES_S11.002.fa,classified,1455,1442,Bacteria (superkingdom): 0.97,0.97,Bacteria,superkingdom
1,MEGAHIT-MaxBin2-CAPES_S11.003.fa,classified,1790,1782,Bacteria (superkingdom): 0.97,0.97,Bacteria,superkingdom
2,MEGAHIT-MaxBin2-CAPES_S11.004.fa,classified,2022,2021,Bacteria (superkingdom): 0.72,0.72,Bacteria,superkingdom
3,MEGAHIT-MaxBin2-CAPES_S11.005.fa,classified,6074,6024,Bacteria (superkingdom): 0.98,0.98,Bacteria,superkingdom
4,MEGAHIT-MaxBin2-CAPES_S11.006.fa,classified,2721,2717,Bacteria (superkingdom): 0.83,0.83,Bacteria,superkingdom
...,...,...,...,...,...,...,...,...
231,MEGAHIT-MaxBin2-CAPES_S11.009.fa,classified,4822,4781,Bacteroides (genus): 0.69,0.69,Bacteroides,genus
238,MEGAHIT-MaxBin2-CAPES_S11.016.fa,classified,2318,2280,Acidaminococcus massiliensis (species): 0.49,0.49,Acidaminococcus massiliensis,species
244,MEGAHIT-MaxBin2-CAPES_S11.022.fa,classified,2097,1994,unclassified Olsenella (no rank): 0.82,0.82,unclassified Olsenella,no rank
250,MEGAHIT-MaxBin2-CAPES_S11.028.fa,classified,6336,6098,Bacteroides (genus): 0.59,0.59,Bacteroides,genus


### CAT LCA file

### Maybe the LCA file is not needed? It seems to be redundant if you have the bin2classification file.

## Reading in results from gtdb-tk from nf-core (the summary tsv)

In [492]:
# Tab-separated summary table; the file's first row supplies the column names.
gtdb_raw = pd.read_csv('gtdbtk_summary.tsv', sep='\t')

def massage_gtdb(gtwb_raw: pd.DataFrame, 
                 order: str, 
                 sample_name: str, 
                 assembly_type: str) -> pd.DataFrame:
    """Pull one taxonomic rank out of a GTDB-Tk summary for selected genomes.

    Parameters
    ----------
    gtwb_raw : pd.DataFrame
        Summary table with at least the columns 'user_genome' and
        'classification', the latter being a ';'-separated lineage of
        prefixed taxa (e.g. 'd__Bacteria;p__...;o__...'). The parameter name
        is a misspelling of "gtdb_raw" that is kept so existing keyword
        callers keep working.
    order : str
        Name of the rank to extract: 'domain', 'phylum', 'class', 'order',
        'family', 'genus' or 'species'.
    sample_name : str
        Pattern (regular expression, as understood by ``Series.str.contains``)
        that 'user_genome' must contain.
    assembly_type : str
        Second pattern that 'user_genome' must also contain.

    Returns
    -------
    pd.DataFrame
        The matching rows of the input, with 'classification' replaced by the
        taxon name at the requested rank (prefix removed). Rows without a
        classification, or whose lineage has no entry for that rank, are
        dropped. The original row labels are preserved.

    Raises
    ------
    KeyError
        If ``order`` is not one of the supported rank names.
    """
    rank = {'domain': 'd__', 'phylum': 'p__', 'class': 'c__', 'order': 'o__',
            'family': 'f__', 'genus': 'g__', 'species': 's__'}
    # Resolve the prefix up front so an unsupported rank fails immediately
    # rather than inside a lambda further down the chain.
    prefix = rank[order]

    # Work on the frame that was passed in, not on a notebook-level global.
    return (gtwb_raw
     # Both patterns must occur somewhere in the genome name; genomes with a
     # missing name are treated as non-matching.
     .loc[lambda x: x['user_genome'].str.contains(sample_name, na=False)
                    & x['user_genome'].str.contains(assembly_type, na=False)]
     .loc[lambda x: x['classification'].notna()]
     # One row per lineage entry.
     .assign(classification=lambda x: x['classification'].str.split(';'))
     .explode(column='classification')
     # Keep only the entry that starts with the requested rank prefix ...
     .loc[lambda x: x['classification'].str.startswith(prefix)]
     # ... and strip exactly that prefix from it.
     .assign(classification=lambda x: x['classification'].str[len(prefix):])
    )
    
# Order-level taxa for the genomes whose name matches both '_S11' and 'MEGA'.
gtdb = massage_gtdb(
    gtdb_raw,
    'order',
    '_S11',
    'MEGA',
)


In [489]:
# Number of rows per classification, drawn as horizontal bars with the most
# frequent classification first.
(
    alt.Chart(gtdb)
    .mark_bar()
    .encode(
        y=alt.Y('classification:N', sort='-x'),
        x=alt.X('count(classification):Q'),
    )
)

### Centrifuge classification (right after preprocessing)

### Needs a way to identify which entries are viruses. Maybe this is sorted out if one uses another database for classification?

In [185]:
# Tab-separated report; the file's first row supplies the column names.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
centrifuge = pd.read_csv(
    '/Users/williamrosenbaum/Downloads/report.txt',
    sep='\t',
)

In [406]:
# Taxa with a strictly positive abundance, most abundant first.
# A name-based virus filter was tried here and is currently disabled:
#   .loc[lambda x: x['name'].str.contains(r'virus|phage')]
(
    centrifuge[centrifuge['abundance'] > 0]
    .sort_values(by='abundance', ascending=False)
)

Unnamed: 0,name,taxID,taxRank,genomeSize,numReads,numUniqueReads,abundance
186,Klebsiella pneumoniae,573,species,15713353,4403776,3921236,2.082890e-01
179,Escherichia coli,562,species,12319210,2998448,1302439,2.055870e-01
1913,Bifidobacterium longum,216816,species,3655960,735601,695952,1.486330e-01
1513,Streptococcus infantarius,102684,species,1913271,309029,264686,7.275100e-02
269,Bacteroides thetaiotaomicron,818,species,6390542,977349,942527,7.028640e-02
...,...,...,...,...,...,...,...
2420,Dehalogenimonas lykanthroporepellens,552810,species,1686510,77,49,1.403740e-177
1983,Fictibacillus arsenicus,255247,species,4055461,49,39,1.128970e-188
2824,Fictibacillus phosphorivorans,1221500,species,4230665,95,73,1.057440e-189
1451,Bacillus weihenstephanensis,86662,species,5740546,135,42,1.123280e-204


## Genome binning summary (/results/binning/summary)

In [407]:
# Tab-separated bin summary; the file's first row supplies the column names.
binning = pd.read_csv('bin_summary.tsv', sep='\t')

In [217]:
# Show every column except the first one.
binning.iloc[:, 1:]

Unnamed: 0,bin,Depth CAPES_S11,Depth CAPES_S21,Depth CAPES_S7,GenomeBin,Domain,%Complete (domain),%Complete and single-copy (domain),%Complete and duplicated (domain),%Fragmented (domain),...,closest_placement_ani,closest_placement_af,pplacer_taxonomy,classification_method,note,"other_related_references(genome_id,species_name,radius,ANI,AF)",msa_percent,translation_table,red_value,warnings
0,MEGAHIT-MaxBin2-CAPES_S7.002.fa,12.13450,165.35450,65.374600,MEGAHIT-MaxBin2-CAPES_S7.002.fa,bacteria_odb10,92.7,91.1,1.6,0.8,...,98.73,0.89,d__Bacteria;p__Actinobacteriota;c__Actinomycet...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,"GCF_000269965.1, s__Bifidobacterium infantis, ...",90.27,11.0,,
1,MEGAHIT-MaxBin2-CAPES_S7.003.fa,23.78450,77.88510,77.397200,MEGAHIT-MaxBin2-CAPES_S7.003.fa,bacteria_odb10,97.6,83.1,14.5,1.6,...,,,,,,,,,,
2,MEGAHIT-MaxBin2-CAPES_S7.005.fa,1.24615,3.89888,123.401000,MEGAHIT-MaxBin2-CAPES_S7.005.fa,bacteria_odb10,58.8,54.8,4.0,2.4,...,99.03,0.88,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,"GCF_000828055.2, s__Klebsiella variicola, 95.0...",62.16,11.0,,
3,MEGAHIT-MaxBin2-CAPES_S7.001.fa,4.20057,113.81500,122.391000,MEGAHIT-MaxBin2-CAPES_S7.001.fa,bacteria_odb10,90.3,77.4,12.9,2.4,...,,,,,,,,,,
4,MEGAHIT-MaxBin2-CAPES_S7.004.fa,48.09430,4.50323,115.534000,MEGAHIT-MaxBin2-CAPES_S7.004.fa,bacteria_odb10,63.7,52.4,11.3,10.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578,SPAdes-MaxBin2-CAPES_S7.008.fa,1.16163,0.00000,29.996900,SPAdes-MaxBin2-CAPES_S7.008.fa,bacteria_odb10,95.1,91.1,4.0,1.6,...,,,d__Bacteria;p__Firmicutes_C;c__Negativicutes;o...,taxonomic classification defined by topology a...,,"GCF_902810435.1, s__Veillonella parvula_A, 95....",88.90,11.0,0.996764,Genome not assigned to closest species as it f...
579,SPAdes-MaxBin2-CAPES_S7.010.fa,0.00000,0.00000,13.498000,SPAdes-MaxBin2-CAPES_S7.010.fa,bacteria_odb10,42.7,41.9,0.8,8.9,...,95.44,0.59,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,ANI,topological placement and ANI have incongruent...,"GCF_900478025.1, s__Streptococcus pasteurianus...",50.94,11.0,,
580,SPAdes-MaxBin2-CAPES_S7.011.fa,0.00000,0.00000,10.866700,SPAdes-MaxBin2-CAPES_S7.011.fa,bacteria_odb10,26.6,24.2,2.4,8.9,...,,,,,,,,,,
581,SPAdes-MaxBin2-CAPES_S7.012.fa,0.00000,0.00000,8.468365,SPAdes-MaxBin2-CAPES_S7.012.fa,bacteria_odb10,76.6,75.0,1.6,11.3,...,98.64,0.71,d__Bacteria;p__Firmicutes_A;c__Clostridia;o__L...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,"GCA_900546325.1, s__Faecalimonas sp900546325, ...",71.71,11.0,,


### data from Genomebinning/Quast/quast_summary

In [219]:
# Tab-separated QUAST summary; the file's first row supplies the column names.
quast = pd.read_csv('quast_summary.tsv', sep='\t')

In [220]:
# Display the QUAST summary table loaded in the previous cell.
quast

Unnamed: 0,Assembly,# contigs (>= 0 bp),# contigs (>= 1000 bp),# contigs (>= 5000 bp),# contigs (>= 10000 bp),# contigs (>= 25000 bp),# contigs (>= 50000 bp),Total length (>= 0 bp),Total length (>= 1000 bp),Total length (>= 5000 bp),...,# contigs,Largest contig,Total length,GC (%),N50,N75,L50,L75,# N's per 100 kbp,# predicted rRNA genes
0,MEGAHIT-MetaBAT2-CAPES_S7.1.fa,4,4,4,4,2,2,201428,201428,201428,...,4,102222,201428,48.22,102222,68750,1,2,0.00,0 + 0 part
1,MEGAHIT-MetaBAT2-CAPES_S7.11.fa,4,4,4,4,4,3,411934,411934,411934,...,4,183221,411934,44.58,109964,70155,2,3,0.00,0 + 0 part
2,MEGAHIT-MetaBAT2-CAPES_S7.10.fa,243,243,127,53,4,0,1728819,1728819,1377526,...,243,49968,1728819,39.43,10178,5643,53,112,0.00,0 + 0 part
3,MEGAHIT-MetaBAT2-CAPES_S7.13.fa,5,5,5,5,4,3,403950,403950,403950,...,5,143029,403950,43.63,100706,91934,2,3,0.00,0 + 0 part
4,MEGAHIT-MetaBAT2-CAPES_S7.12.fa,5,5,5,4,4,2,364182,364182,364182,...,5,225856,364182,57.26,225856,54475,1,2,0.00,0 + 0 part
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578,SPAdesHybrid-MetaBAT2-CAPES_S11.55.fa,472,472,93,10,0,0,1729859,1729859,690291,...,472,24414,1729859,59.52,4101,2734,132,261,0.00,0 + 0 part
579,SPAdesHybrid-MetaBAT2-CAPES_S11.59.fa,12,12,12,11,10,8,1019881,1019881,1019881,...,12,257271,1019881,44.75,110185,80799,3,6,1023.55,1 + 0 part
580,SPAdesHybrid-MetaBAT2-CAPES_S11.57.fa,6,6,6,6,5,4,742576,742576,742576,...,6,235546,742576,42.00,187388,144928,2,3,472.41,1 + 0 part
581,SPAdesHybrid-MetaBAT2-CAPES_S11.58.fa,4,4,4,3,3,3,322085,322085,322085,...,4,123230,322085,41.58,95563,95374,2,3,0.00,0 + 0 part


In [None]:
nextflow run nf-core/taxprofiler --input input.csv --databases databases.csv --outdir testtaxout -profile docker --run_kraken2 --run_kaiju
