In [1]:
import pandas as pd
import altair as alt
import numpy as np
from altair_saver import save
import altair
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# Bracken

## Bracken raw df

In [143]:
def df_bracken_species_raw(file: str,
                           level: str = 'species',
                           cutoff: float = 0.05,
                           virus_only: bool = True) -> pd.DataFrame:
    '''
    Returns a df for the taxonomy found in the cleaned raw bracken report.
    Used to generate bar plots of the different taxonomies.

    Rows are kept when their ``level`` column equals the code of the requested
    level and their ``percent`` is strictly greater than ``cutoff``. The result
    is sorted by ``percent``, highest first.

    :param str file: Path to the bracken report (csv). The columns level and percent are read, plus domain when virus_only is True.
    :param str level: Level of taxonomy. [domain, phylum, class, order, family, genus, species]. Default = 'species'
    :param float cutoff: Cutoff of percent the taxonomy level is present in. Default = 0.05
    :param bool virus_only: Only include rows whose domain is 'Virus'. Default = True
    :return: pd.DataFrame
    :raises KeyError: If level is not one of the known levels. Raised before the file is read.
    '''

    # Maps the human-readable level to the code stored in the `level` column.
    # NOTE(review): Kraken-style reports use 'C' for class and 'K' for kingdom;
    # confirm which code the cleaned report uses for class.
    taxonomy = {'domain': 'D',
                'phylum': 'P',
                'class': 'K',
                'order': 'O',
                'family': 'F',
                'genus': 'G',
                'species': 'S'}

    # Fail fast with a readable message instead of a bare KeyError raised from
    # inside the filter after the file has already been loaded.
    if level not in taxonomy:
        raise KeyError(f'Unknown taxonomy level {level!r}; expected one of {list(taxonomy)}')
    level_code = taxonomy[level]

    df = (
            pd.read_csv(file)
              .loc[lambda x: x['level'] == level_code]
              .loc[lambda x: x['percent'] > cutoff]
              .sort_values('percent', ascending=False)
           )

    if virus_only:
        return df.loc[lambda x: x['domain'] == 'Virus']
    return df


## Bracken bar plot

In [448]:
def bar_chart_bracken_raw(file: str,
                          level: str = 'species',
                          cutoff: float = 0.05,
                          number: int = 10,
                          virus_only: bool = True) -> alt.Chart:
    '''
    Returns a bar chart of the taxonomies from the bracken species file.

    The data comes from df_bracken_species_raw, which filters by level, cutoff
    and (optionally) domain and sorts by percent, highest first.

    :param str file: Path to the bracken report.
    :param str level: Level of taxonomy. [domain, phylum, class, order, family, genus, species]. Default = 'species'
    :param float cutoff: Cutoff of percent the taxonomy level is present in. Default = 0.05
    :param int number: The number bars to plot. Default = 10
    :param bool virus_only: Only include Viruses. Default = True
    :return: alt.Chart
    '''

    df = df_bracken_species_raw(file, level, cutoff, virus_only)

    # df is already sorted by percent (descending), so head() keeps the largest bars.
    top_rows = df.head(number)

    # The '.1%' format multiplies by 100 -- presumably `percent` is stored as a
    # fraction (0.05 == 5%); TODO confirm against the cleaned report.
    return (
        alt.Chart(top_rows)
        .mark_bar()
        .encode(
            alt.X('percent:Q', axis=alt.Axis(format='.1%'), title='Percent of reads'),
            alt.Y('name:N', sort='-x', title=None),
            alt.Color('name:N', title=None),
            tooltip=[alt.Tooltip('domain:N'), alt.Tooltip('percent:Q', format='.1%')],
        )
        .properties(width=500, height=500)
    )
  

# Top 6 viral species above a 0.1% cutoff for the Bat-Guano-15 sample.
plot = bar_chart_bracken_raw(
    file='results/cleaned_files/Bat-Guano-15_S6_L001_R_bracken_raw.csv',
    level='species',
    cutoff=0.001,
    number=6,
    virus_only=True,
)

plot

# Kaiju

In [401]:
def bar_chart_kaiju_raw(file: str,
                        cutoff: float = 0.05,
                        number: int = 10) -> alt.Chart:
    '''
    Returns a bar chart of the taxa from the kaiju file.

    Rows sharing the same taxon_id, percent and taxon_name are collapsed into
    one row whose taxonomy values are collected in a list (shown in the tooltip).
    Only taxa with percent strictly greater than cutoff are kept, sorted by
    percent, highest first.

    :param str file: Path to the kaiju csv. The columns taxon_id, percent, taxon_name and taxonomy are read.
    :param float cutoff: Cutoff of percent the taxon is present in. Default = 0.05
    :param int number: The number bars to plot. Default = 10
    :return: alt.Chart
    '''

    df = (pd.read_csv(file)
          .groupby(['taxon_id', 'percent', 'taxon_name'], as_index=False)
          .agg(taxonomy=('taxonomy', list))
          .sort_values('percent', ascending=False)
          .loc[lambda x: x['percent'] > cutoff]
         )

    # Already sorted by percent (descending), so head() keeps the largest bars.
    top_rows = df.head(number)

    return (
        alt.Chart(top_rows)
        .mark_bar()
        .encode(
            alt.X('percent:Q', axis=alt.Axis(format='.1%'), title='Percent of reads'),
            alt.Y('taxon_name:N', sort='-x', title=None),
            alt.Color('taxon_name:N', title=None),
            tooltip=[alt.Tooltip('taxonomy:O'), alt.Tooltip('percent:Q', format='.1%')],
        )
        .properties(width=500, height=500)
    )
   

# Kaiju bar chart for the Bat-Guano-15 sample with a very low cutoff.
bar_chart_kaiju_raw(
    file='results/cleaned_files/Bat-Guano-15_S6_L001_R_kaiju_raw.csv',
    cutoff=0.00001,
)

# MEGAHIT contig histogram

In [427]:
import numpy as np

def megahit_contig_histogram(file: str) -> alt.Chart:
    '''
    Plots histogram of the contigs from the megahit assembled contigs.

    Contig lengths are binned in steps of 500 nt on the x axis; the y axis
    shows the number of contigs in each bin.

    :param str file: Path to the csv file for the megahit contigs. The column length is read.
    :return: alt.Chart
    '''
    contigs = pd.read_csv(file)

    length_axis = alt.X('length:Q', bin=alt.Bin(step=500), title='Length (nt)')
    count_axis = alt.Y('count(length):Q', title='Number of contigs')

    return (
        alt.Chart(contigs, title='Megahit contigs size')
        .mark_bar()
        .encode(length_axis, count_axis)
        .properties(width=500, height=500)
    )
 
    
# Contig length histogram for the Bat-Guano-15 sample.
megahit_contig_histogram(file='results/megahit/Bat-Guano-15_S6_L001_R.csv')

# Facet wrap of contig coverage

In [446]:
# works

def plot_contig_coverage_facet(df: str) -> list[alt.FacetChart]:
    '''
    Returns a list of plots for every contig in the csv file generated from samtools mpileup and the
    wrangle_contig_info.py script.

    Contigs are split by length into 'short' (1-500), 'medium' (501-2500) and
    'long' (>2500). One faceted chart is built per category present in the data,
    in that order; each facet shows one contig's coverage line plus a red rule
    encoding mean(coverage).

    :param str df: Path to the csv file (despite the name, this is a path, not a DataFrame). The columns length, position, coverage and contig are read.
    :return: list of faceted charts, one per length category
    '''
    length_labels = ['short', 'medium', 'long']
    # Spacing of the tick values on the position axis for each length category.
    step_size = {'short': 50, 'medium': 150, 'long': 2000}

    # include_lowest=True keeps contigs of length exactly 1 in 'short'; without it
    # they fall outside (1, 500], get a NaN category and break the step_size lookup.
    # pd.cut already returns a categorical with the labels in this order, so
    # sorting by it yields short -> medium -> long.
    contigs_coverage = (pd.read_csv(df)
        .assign(category=lambda x: pd.cut(x.length, bins=[1, 500, 2500, np.inf],
                                          labels=length_labels, include_lowest=True))
        .sort_values('category'))

    plots = []

    for category in contigs_coverage.category.unique():

        # Tick values are fixed up to position 20000, spaced per category.
        tick_values = np.arange(0, 20000, step_size[category])

        base = alt.Chart().mark_line().encode(
            alt.X('position:N', axis=alt.Axis(values=tick_values)),
            alt.Y('coverage:Q')
        ).properties(width=125, height=125)

        rule = alt.Chart().mark_rule(color='red').encode(
            alt.Y('mean(coverage):Q')
        ).properties(width=125, height=125)

        category_rows = contigs_coverage.loc[lambda x: x.category == category]

        # Each facet gets its own x and y scale so contigs of different
        # size/coverage remain readable side by side.
        layered = (alt.layer(base, rule, data=category_rows)
            .encode(alt.Y(title='Coverage'),
                    alt.X(title='Position'))
            .facet('contig:N', columns=6, title=f'Coverage of contigs with {category} length')
            .resolve_scale(y='independent', x='independent'))

        plots.append(layered)

    return plots


    
# Build the per-category coverage facet charts for the Bat-Guano-15 sample.
test = plot_contig_coverage_facet(
    df='results/cleaned_files/Bat-Guano-15_S6_L001_R_contigs_coverage_mpileup.csv'
)

In [447]:
# Display the second chart in the list returned by plot_contig_coverage_facet.
test[1]

# Merge CAT and Kaiju on MEGAHIT contigs