In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import qiime2
import seaborn as sns
from qiime2 import Artifact, Metadata
from qiime2.plugins import (
    cutadapt, demux, dada2, feature_table, metadata,
    greengenes2, taxa, feature_classifier,
    vsearch
)
# Second binding of the metadata plugin: it stays valid even when the plain name
# `metadata` is later rebound to a sample-metadata object.
from qiime2.plugins import metadata as metadata_plugin
%matplotlib inline

# Relative locations used throughout the notebook. Every value keeps its
# trailing slash because file names are appended by plain string concatenation.
path = dict(
    art="../data/artifacts/mock/",
    vis="../visualizations/mock/",
    res="../data/resources/",
)

# Create any missing directory up front; directories that already exist are fine.
for filepath in path.values():
    os.makedirs(filepath, exist_ok=True)

In [None]:
# Minimal sample metadata for the mock run: two sample ids and a `mock` column.
# NOTE(review): the name `metadata` assigned below is also the name of the plugin
# imported from `qiime2.plugins`, so that plugin is no longer reachable under the
# plain name `metadata` once this cell has run.
mock_records = {'sampleid': ['2476', '2477'], 'mock': [0, 1]}
metadata_df = pd.DataFrame(mock_records).set_index('sampleid')

metadata = qiime2.Metadata(metadata_df)

# 1. Quality control 
## 1.1. DADA2 
Removing sequencing artifacts from the reads.

In [None]:
# Import the raw paired-end reads described by the manifest file
# (Phred33 V2 paired-end manifest format) as a QIIME 2 artifact.
raw_seqs = Artifact.import_data(
    'SampleData[PairedEndSequencesWithQuality]',
    '../data/manifest-mock.tsv',
    view_type='PairedEndFastqManifestPhred33V2',
)

In [8]:
# Summarise the imported reads and write the resulting visualization to disk.
quality_vis = demux.visualizers.summarize(data=raw_seqs)
quality_vis.visualization.save(f'{path["vis"]}quality-plot.qzv')

'../visualizations/mock/quality-plot.qzv'

<Figure size 640x480 with 0 Axes>

## Interpretation

The combined length of the forward and reverse reads should cover the full V3-V4 hypervariable region (~465 bp), and DADA2 expects at least 12 bp of overlap between them. The quality of the reverse reads is not great (a quality score of 20 means a 1% probability that a base call is wrong). However, this should not hurt species classification much.

In [3]:
# Denoise the paired-end reads with DADA2. Forward reads are truncated at 260 bp
# and reverse reads at 230 bp.
qc_reads = dada2.methods.denoise_paired(
    raw_seqs, trunc_len_f=260, trunc_len_r=230, n_threads=32,
    min_fold_parent_over_abundance=4,
)

# Persist the three DADA2 outputs as artifacts under path["art"].
qc_reads.denoising_stats.save(path["art"] + "denoise-stats.qza")
qc_reads.table.save(path["art"] + "feature-table.qza")
qc_reads.representative_sequences.save(path["art"] + "rep-seqs.qza")

# By the time this cell runs, `metadata` has been rebound to the sample-metadata
# object, so `metadata.visualizers` would raise AttributeError. The plugin is
# therefore reached through its `metadata_plugin` alias.
metadata_plugin.visualizers.tabulate(
    input=qc_reads.denoising_stats.view(Metadata)
).visualization.save(path["vis"] + "denoise-stats.qzv")
feature_table.visualizers.summarize(qc_reads.table).visualization.save(path["vis"] + "feature-table.qzv")

NameError: name 'raw_seqs' is not defined

## Interpretation 

DADA2 again (for the 3rd time) discards too many reads, which is a sign of poor sequencing quality. Only 35% of reads pass the filter. The majority of reads are flagged as chimeric, again. Overall, 29% and 26% of reads passed the quality control step.

# 2. Taxonomical classification - bacterial 16S

In [3]:
# Reload the feature table and the representative sequences from path["art"].
table, rep_seqs = (
    Artifact.load(path["art"] + artifact_file)
    for artifact_file in ("feature-table.qza", "rep-seqs.qza")
)

## Greengenes 2

In [11]:
# Closed-reference clustering of the representative sequences against the
# Greengenes2 backbone reference at 99% identity.
gg2_backbone = Artifact.load(path["res"] + "2022.10.backbone.full-length.fna.qza")
gg2_mapped = vsearch.methods.cluster_features_closed_reference(
    sequences=rep_seqs,
    table=table,
    reference_sequences=gg2_backbone,
    perc_identity=0.99,
    threads=8,
)

# Keep the clustered table and the clustered sequences on disk.
gg2_mapped.clustered_table.save(path["art"] + "feature-table-gg2.qza")
gg2_mapped.clustered_sequences.save(path["art"] + "rep-seqs-gg2.qza")

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: vsearch --usearch_global /tmp/tmp47v01p8h --id 0.99 --db /tmp/qiime2/vbezshapkin/data/a53d9300-5c5c-4774-a2e8-a5e23904f1ae/data/dna-sequences.fasta --uc /tmp/tmpavjp62_x --strand plus --qmask none --notmatched /tmp/tmpquk21cfz --threads 8 --minseqlength 1 --fasta_width 0



vsearch v2.22.1_linux_x86_64, 1007.1GB RAM, 144 cores
https://github.com/torognes/vsearch

Reading file /tmp/qiime2/vbezshapkin/data/a53d9300-5c5c-4774-a2e8-a5e23904f1ae/data/dna-sequences.fasta 100%
494630940 nt in 331269 seqs, min 416, max 4563, avg 1493
Masking 100%
Counting k-mers 100%
Creating k-mer index 100%
Searching 100%
Matching unique query sequences: 475 of 494 (96.15%)


Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: vsearch --sortbysize /tmp/tmpquk21cfz --xsize --output /tmp/q2-DNAFASTAFormat-7f3t_quj --minseqlength 1 --fasta_width 0



vsearch v2.22.1_linux_x86_64, 1007.1GB RAM, 144 cores
https://github.com/torognes/vsearch

Reading file /tmp/tmpquk21cfz 100%
8013 nt in 19 seqs, min 277, max 465, avg 422
Getting sizes 100%
Sorting 100%
Median abundance: 4
Writing output 100%


'../data/artifacts/mock/rep-seqs-gg2.qza'

In [21]:
# Derive the taxonomy for the features of the clustered table from the
# Greengenes2 reference taxonomy stored under path["res"].
gg2_reference_taxonomy = Artifact.load(path["res"] + "gg2.2022.10.taxonomy.asv.nwk.qza")
tax = greengenes2.methods.taxonomy_from_table(
    reference_taxonomy=gg2_reference_taxonomy,
    table=gg2_mapped.clustered_table,
)

In [22]:
# Store the taxonomy classification as an artifact under path["art"].
tax.classification.save(f'{path["art"]}tax-gg2.qza')

'../data/artifacts/mock/tax-gg2.qza'

In [24]:
# Taxonomic bar plot of the clustered table; the taxonomy is read back from the
# artifact file rather than taken from memory.
gg2_taxonomy = Artifact.load(path["art"] + "tax-gg2.qza")
vis = taxa.visualizers.barplot(
    table=gg2_mapped.clustered_table,
    taxonomy=gg2_taxonomy,
    metadata=metadata,
)
vis.visualization.save(path["vis"] + "gg2-barplot.qzv")

'../visualizations/mock/gg2-barplot.qzv'

In [20]:
# Fraction of representative sequences reported as unmatched by the clustering
# step, printed as a percentage.
unmatched_count = len(gg2_mapped.unmatched_sequences.view(pd.Series))
total_count = len(rep_seqs.view(pd.Series))
unmapped_perc = unmatched_count / total_count
print(f'Percentage of unmapped sequences: {unmapped_perc * 100:.2f}%')

Percentage of unmapped sequences: 3.85%


# 3. Taxonomical classification - ITS

In [None]:
# Ran out of memory, so the temp dir has to be reset. The command runs bash scripts internally and doesn't respect the fish shell :/
# export TMPDIR=./tmp/

In [1]:
# # classification command
# qiime feature-classifier classify-sklearn \
#     --i-reads data/artifacts/mock/rep-seqs.qza \
#     --i-classifier data/resources/unite_ver9_99_25.07.2023-Q2-2023.9.qza \
#     --n-jobs 8 \
#     --o-classification data/artifacts/mock/rep-seqs-unite-its.qza

In [None]:
# load results