In [1]:
%matplotlib inline
from biom import Table
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns

In [2]:
import qiime2 as q2, qiime2.plugins.feature_table as ft, qiime2.plugins.taxa as taxa

def load_mf(fn, index='#SampleID'):
    """Read a tab-separated mapping file into an all-string DataFrame.

    Every cell is kept as text and nothing is converted to NaN, so empty
    fields and tokens such as 'NA' come back as literal strings.

    Parameters
    ----------
    fn : path or file-like
        Source handed straight to ``pd.read_csv``.
    index : str, optional
        Column promoted to the index (default ``'#SampleID'``).

    Returns
    -------
    pd.DataFrame
        The parsed table, indexed by ``index``.
    """
    raw = pd.read_csv(
        fn,
        sep='\t',
        dtype='str',
        # An empty NA list plus keep_default_na=False disables NaN detection.
        na_values=[],
        keep_default_na=False,
    )
    return raw.set_index(index)

In [3]:
cd trimmed-150nts/

/Users/yoshikivazquezbaeza/Documents/PDF/KnightLaboratory/HastyWater/trimmed-150nts


In [4]:
# Read the mapping file as an all-string DataFrame indexed by '#SampleID'
# (load_mf's default index column).
mf = load_mf('mapping-file.alpha.tsv')

In [5]:
# Load the same mapping file as a QIIME 2 Metadata object, which is the form
# passed to the q2 plugin calls further down.
mapping_file = q2.Metadata.load('mapping-file.alpha.tsv')

In [6]:
# Load the two QIIME 2 artifacts the rest of the notebook works from.
# `greengenes` holds the taxonomy assignments (presumably Greengenes-based,
# going by the variable name — confirm); `table` is the feature table
# (the file name suggests Deblur output).
greengenes = q2.Artifact.load('taxonomy.qza')
table = q2.Artifact.load('table-deblur.qza')

In [7]:
# View the feature-table artifact as a biom.Table so its ids can be inspected directly.
bt = table.view(Table)

In [8]:
# List every id in the table that contains 'd0'.
[sample_id for sample_id in bt.ids() if 'd0' in sample_id]

['11282.d0', '11282.d0.spike']

In [9]:
# Keep only the features whose taxonomy string mentions Acinetobacter.
# The LIKE pattern is a SQL string literal, so it is single-quoted: SQLite
# reads double quotes as identifiers and only falls back to treating them as
# strings through a legacy quirk that some builds disable.
acinetos = ft.methods.filter_features(table,
                                      metadata=greengenes.view(q2.Metadata),
                                      where="Taxon LIKE '%Acinetobacter%'")

In [10]:
# Write the Acinetobacter-only table to disk; the value returned by save()
# (shown as the cell output) is the path that was written.
acinetos.filtered_table.save('feature-table.acinetos.qza')

'feature-table.acinetos.qza'

In [11]:
# Build the taxonomy barplot for the Acinetobacter-only table, then save the
# resulting visualization; the save() call stays last so its return value is
# what the cell displays.
acinetos_barplot = taxa.visualizers.barplot(acinetos.filtered_table,
                                            greengenes,
                                            mapping_file)
acinetos_barplot.visualization.save('taxonomy.barplot.greengenes.only.acinetobacters.qzv')

'taxonomy.barplot.greengenes.only.acinetobacters.qzv'

These dataframes hold the count data, which is useful for visualizations such as heatmaps.

In [12]:
# View the filtered table as a DataFrame. Judging by the lookups and outputs
# further down, rows are sample ids and columns are feature sequences.
acdf = acinetos.filtered_table.view(pd.DataFrame)

Note that although the signal in the rest of the samples appears very clear (see the next notebook), the pre-spike sample included a very small number (compared to the total) of sequences that were classified as Acinetobacter.

In [19]:
# Per-feature counts for sample '11282.d0', largest first.
# NOTE(review): the stored output for this cell is a single number, not the
# Series this expression returns, and the execution counters (In [19] before
# In [18]) are out of order — re-run on a fresh kernel to refresh it.
d0_counts = acdf.loc['11282.d0']
d0_counts.sort_values(ascending=False)

1917.0

From the list of sequences above, we can see that a few sequences dominate the abundance; the others are likely sequencing errors or misclassifications of Acinetobacter.

In [18]:
# Total count of each feature across all samples; show the 20 most abundant.
feature_totals = acdf.sum()
feature_totals.sort_values(ascending=False).head(20)

CCTACGGGGGGCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTATGGTTGTAAAGCACTTTAAGCGAGGAGGAGGCTACTTTAGTTAATACCTAGAGATAGTGGACGTTACT    422060.0
CCTACGGGGGGCTGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTATGGTTGTAAAGCACTTTAAGCGAGGAGGAGGCTACTTTAGTTAATACCTAGAGATAGTGGACGTTACT    291443.0
CCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTATGGTTGTAAAGCACTTTAAGCGAGGAGGAGGCTACTTTAGTTAATACCTAGAGATAGTGGACGTTACT    252974.0
CCTACGGGAGGCTGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTATGGTTGTAAAGCACTTTAAGCGAGGAGGAGGCTACTTTAGTTAATACCTAGAGATAGTGGACGTTACT    230285.0
CCTACGGGTGGCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTATGGTTGTAAAGCACTTTAAGCGAGGAGGAGGCTACTTTAGTTAATACCTAGAGATAGTGGACGTTACT    207584.0
CCTACGGGGGGCAGCAGTGGGGAATATTGGACAATGGGGGGAACCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGCCTTATGGTTGTAAAGCACTTTAAGCGAGGAGGAGGCTTACCTGGTTAATACCCAGGATAAGTGGACGTTACT    154901.0
CCTACGGGTGGCTGCAGTGGGG