# Import demultiplexed filtered reads

In [None]:
import pandas as pd
import os

met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# sort reads by sequencing run
for ri in set(met.Run_index):
  tmp = 'Data/Demux/'+met.loc[met.Run_index == ri]['SeqRun'].tolist()[0]
  !mkdir $tmp
  !mv Data/Demux/$ri/* $tmp
  !rm -r Data/Demux/$ri
  for fq in os.listdir(tmp):
    if '.fastq.' in fq:
      continue
    fastq = fq.replace('.fq.','.fastq.')
    !mv $tmp/$fq $tmp/$fastq

In [2]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# import reads
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  deqzv = 'Data/Demux/%s_demux.qzv'%sr
  
  !qiime tools import \
    --type 'SampleData[PairedEndSequencesWithQuality]' \
    --input-path Data/Demux/$sr \
    --input-format CasavaOneEightSingleLanePerSampleDirFmt \
    --output-path $demux
  !qiime demux summarize \
    --i-data $demux \
    --o-visualization $deqzv

[32mImported Data/Demux/SeqRun_2 as CasavaOneEightSingleLanePerSampleDirFmt to Data/Demux/SeqRun_2_demux.qza[0m
[32mSaved Visualization to: Data/Demux/SeqRun_2_demux.qzv[0m
[32mImported Data/Demux/SeqRun_1 as CasavaOneEightSingleLanePerSampleDirFmt to Data/Demux/SeqRun_1_demux.qza[0m
[32mSaved Visualization to: Data/Demux/SeqRun_1_demux.qzv[0m


In [1]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

cutadapt = 'Data/Cutadapt'
!mkdir $cutadapt
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  cutad = cutadapt+'/%s_trim.qza'%sr
  cuqzv = cutadapt+'/%s_trim.qzv'%sr
  
  !qiime cutadapt trim-paired \
    --i-demultiplexed-sequences $demux \
    --o-trimmed-sequences $cutad \
    --p-cores 10 \
    --p-front-f CAAGRGTTHGATYMTGGCTCAG \
    --p-adapter-f GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG \
    --p-front-r TGCTGCCTCCCGTAGGAGT \
    --p-adapter-r GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG \
    --p-error-rate 0.2 \
    --p-match-adapter-wildcards \
    --p-discard-untrimmed \
    --p-match-read-wildcards

  # Visualization
  !qiime demux summarize \
      --i-data $cutad \
      --o-visualization $cuqzv

[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Cutadapt/SeqRun_1_trim.qza[0m
[32mSaved Visualization to: Data/Cutadapt/SeqRun_1_trim.qzv[0m
[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Cutadapt/SeqRun_2_trim.qza[0m
[32mSaved Visualization to: Data/Cutadapt/SeqRun_2_trim.qzv[0m


# Dada2 denoising

In [1]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

outdir = 'Data/Denoised_180-200'
!mkdir $outdir
for sr in set(met.SeqRun):
  cutad = 'Data/Cutadapt/%s_trim.qza'%sr
  table = outdir+'/%s_bac_table.qza'%sr
  taqzv = outdir+'/%s_bac_table.qzv'%sr
  repsq = outdir+'/%s_bac_rep-seqs.qza'%sr
  reqzv = outdir+'/%s_bac_rep-seqs.qzv'%sr
  stats = outdir+'/%s_bac_denoising-stats.qza'%sr
  stqzv = outdir+'/%s_bac_denoising-stats.qzv'%sr
  
  !qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $cutad \
    --p-trunc-len-f 180 \
    --p-trunc-len-r 200 \
    --p-n-threads 6 \
    --o-table $table \
    --o-representative-sequences $repsq \
    --o-denoising-stats $stats

  !qiime feature-table tabulate-seqs \
    --i-data $repsq \
    --o-visualization $reqzv

  !qiime metadata tabulate \
    --m-input-file $stats \
    --o-visualization $stqzv

  !qiime feature-table summarize \
    --i-table $table \
    --o-visualization $taqzv \
    --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/Denoised_180-200/SeqRun_1_bac_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Denoised_180-200/SeqRun_1_bac_rep-seqs.qza[0m
[32mSaved SampleData[DADA2Stats] to: Data/Denoised_180-200/SeqRun_1_bac_denoising-stats.qza[0m
[32mSaved Visualization to: Data/Denoised_180-200/SeqRun_1_bac_rep-seqs.qzv[0m
[32mSaved Visualization to: Data/Denoised_180-200/SeqRun_1_bac_denoising-stats.qzv[0m
[32mSaved Visualization to: Data/Denoised_180-200/SeqRun_1_bac_table.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Denoised_180-200/SeqRun_2_bac_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Denoised_180-200/SeqRun_2_bac_rep-seqs.qza[0m
[32mSaved SampleData[DADA2Stats] to: Data/Denoised_180-200/SeqRun_2_bac_denoising-stats.qza[0m
[32mSaved Visualization to: Data/Denoised_180-200/SeqRun_2_bac_rep-seqs.qzv[0m
[32mSaved Visualization to: Data/Denoised_180-200/SeqRun_2_bac_denoising-stats.qzv[0m
[32mSaved Visualization to: Data/De

# Merge different runs

In [4]:
# Merge the per-run feature tables (every file matched by the shell glob)
# into a single table
!qiime feature-table merge \
  --i-tables Data/Denoised_180-200/*bac_table.qza \
  --o-merged-table Data/merged_table.qza

# Merge the per-run representative sequences in the same way
!qiime feature-table merge-seqs \
  --i-data Data/Denoised_180-200/*bac_rep-seqs.qza \
  --o-merged-data Data/merged_rep-seqs.qza

# Summary visualization of the merged table
!qiime feature-table summarize \
  --i-table Data/merged_table.qza \
  --o-visualization Data/merged_table.qzv \
  --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/merged_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/merged_rep-seqs.qza[0m
[32mSaved Visualization to: Data/merged_table.qzv[0m


In [5]:
table = 'Data/merged_table.qza'
clstseq = 'Data/merged_rep-seqs.qza'

# Filter features with a minimum frequency of 50 and a minimum of 4 samples.
# NOTE: the filtered table is written back to the path it was read from, so
# the unfiltered merged table is no longer available after this cell.
!qiime feature-table filter-features \
  --i-table $table \
  --p-min-frequency 50 \
  --p-min-samples 4 \
  --o-filtered-table $table

# Re-create the table summary (overwrites the one made in the merge cell)
!qiime feature-table summarize \
  --i-table $table \
  --o-visualization Data/merged_table.qzv \
  --m-sample-metadata-file metadata.tsv

# Filter the representative sequences against the filtered table; the result
# is also written back in place
!qiime feature-table filter-seqs \
  --i-data $clstseq \
  --i-table $table \
  --o-filtered-data $clstseq

[32mSaved FeatureTable[Frequency] to: Data/merged_table.qza[0m
[32mSaved Visualization to: merged_table.qzv[0m
[32mSaved FeatureData[Sequence] to: Data/merged_rep-seqs.qza[0m


# Taxonomy assignment

In [6]:
# Assign taxonomy to the merged representative sequences with the hybrid
# vsearch + sklearn classifier, using the SILVA 138.1 V1-V2 reference files
# stored in ../Classifier
!qiime feature-classifier classify-hybrid-vsearch-sklearn \
  --i-query Data/merged_rep-seqs.qza \
  --i-reference-reads ../Classifier/silva-138_1-ssu-nr99-seqs-V1-V2-uniq.qza \
  --i-reference-taxonomy ../Classifier/silva-138_1-ssu-nr99-tax-V1-V2-derep-uniq.qza \
  --i-classifier ../Classifier/V1-V2-ssu-nr99-classifier.qza \
  --p-threads 2 \
  --p-no-prefilter \
  --o-classification Data/V1-V2_taxonomy_vsearch-sklearn.qza

# Tabulated view of the resulting taxonomy
!qiime metadata tabulate \
  --m-input-file Data/V1-V2_taxonomy_vsearch-sklearn.qza \
  --o-visualization Data/V1-V2_taxonomy_vsearch-sklearn.qzv

[32mSaved FeatureData[Taxonomy] to: Data/V1-V2_taxonomy_vsearch-sklearn.qza[0m
[32mSaved Visualization to: Data/V1-V2_taxonomy_vsearch-sklearn.qzv[0m


In [1]:
import pandas as pd

# declare Qiime2 generated artifacts here
# (only `taxo` is used in this cell; `table` and `rep_seq` are not)
table    = 'Data/merged_table.qza' 
taxo     = 'Data/V1-V2_taxonomy_vsearch-sklearn.qza'
rep_seq  = 'Data/merged_rep-seqs.qza'

# export the taxonomy artifact into the current directory

!qiime tools export \
  --input-path $taxo \
  --output-path ./

[32mExported Data/V1-V2_taxonomy_vsearch-sklearn.qza as TSVTaxonomyDirectoryFormat to directory ./[0m


In [2]:
# Import taxonomy.tsv from the current directory as a FeatureData[Taxonomy]
# artifact named Data/mock_taxa_asv.qza
!qiime tools import \
  --type 'FeatureData[Taxonomy]' \
  --input-path taxonomy.tsv \
  --output-path Data/mock_taxa_asv.qza

[32mImported taxonomy.tsv as TSVTaxonomyDirectoryFormat to Data/mock_taxa_asv.qza[0m


In [4]:
# Taxa barplot of the merged table using the re-imported taxonomy; the
# visualization is written to the current directory
!qiime taxa barplot \
  --i-table Data/merged_table.qza \
  --i-taxonomy Data/mock_taxa_asv.qza \
  --m-metadata-file metadata.tsv \
  --o-visualization asv-barplot.qzv

[32mSaved Visualization to: asv-barplot.qzv[0m


# Combo: Combining ASV hashes with last available taxa 

In [None]:
# Install biopython
!pip install biopython

In [1]:
import pandas as pd
from Bio import SeqIO

# declare Qiime2 generated artifacts here
table    = 'Data/merged_table.qza' 
taxo     = 'Data/V1-V2_taxonomy_vsearch-sklearn.qza'
rep_seq  = 'Data/merged_rep-seqs.qza'

# export rep-seqs.qza, table.qza and taxonomy.qza
!mkdir Biom Taxa Rep-seqs #temp directories

!qiime tools export \
  --input-path $table \
  --output-path Biom/

!qiime tools export \
  --input-path $taxo \
  --output-path Taxa

!qiime tools export \
  --input-path $rep_seq \
  --output-path Rep-seqs/

# convert .biom to .tsv
!biom convert -i Biom/feature-table.biom -o Biom/feature-table.tsv --to-tsv 

# *****replacing hashes with combination of taxonomy and beginings of the hashes*****
# reading tables
taxa = pd.read_csv('Taxa/taxonomy.tsv', sep='\t')
biom = pd.read_csv('Biom/feature-table.tsv', sep='\t', skiprows=1)

#creating a new column with modified taxonomy
#I also shortened some annotations and deleted some symbols 
#that were crashing tree construction with modified files
tax_rep = {';__':'','[':'',']':'','.':'','/':'_',"'":'_',' ':'_','archaeon_enrichment':'a_e',
           'uncultured_rumen':'u_r','uncultured_archaeon':'u_a','uncultured_euryarchaeote':'u_eur',
           'uncultured_compost':'u_c','_archaeon':'_a','unidentified':'unid','unid_rumen':'unid_r',
           'uncultured_bacterium':'u_b','uncultured':'u','rumen_bacterium':'r_b','group':'gr',
            '':'','':'','':'','':'','':'',}
taxa['Taxon2'] = taxa['Taxon']
for key in tax_rep.keys():
  taxa['Taxon2'] = taxa['Taxon2'].str.replace(key, tax_rep[key])

#dealing with uncultured taxa to provide additional information
taxa['Combo'] =  taxa['Taxon2'].str.split("__").str[-1].str.split(";").str[-1]
for x in ['u_eur','u_r','unid_methanogen','a_e','u_a','u_c','unid_a','u','u_b','unid_r','r_b','methanogenic_a','u_methanogenic_a']:
  taxa.loc[taxa['Combo'].str[:]==x,'Combo']=taxa['Taxon2'].str.split("__").str[-2].str.split(';').str[0]+'_'+x
for n in range(3,6):
  for y in ['u_u_a','u_u_r','u_u_b','u_u','u_u_eur','u_u_methanogenic_a']:
    taxa.loc[taxa.Combo.str[:]==y,'Combo']=taxa.Taxon2.str.split("__").str[-n].str.split(';').str[0]+'_'+y.split('_',1)[1]
  
#add modified taxonomy information to feature hashes, separating them by '|'
biom['#OTU ID'] = taxa['Combo']+'|'+taxa['Feature ID']
taxa['Feature ID'] = biom['#OTU ID']
taxa = taxa[['Feature ID', 'Taxon', 'Confidence', 'Consensus', 'Method']]

### writing modified files
biom.to_csv('Biom/feature-table.tsv', sep='\t', index=False)
taxa.to_csv('Taxa/taxonomy.tsv', sep='\t', index=False)
fasta_hash  = r"Rep-seqs/dna-sequences.fasta"
fasta_combo = r"Rep-seqs/dna-sequences.fa"
hlist = biom['#OTU ID'].tolist()
with open(fasta_hash) as hashes, open(fasta_combo, 'w') as combo:
  for record in SeqIO.parse(fasta_hash, 'fasta'):
    for h in hlist:
      if str(record.id) in str(h):
        combo.write('>'+str(h)+'\n'+str(record.seq)+'\n')

#some cleaning and renaming
!rm $fasta_hash
!mv $fasta_combo $fasta_hash

[32mExported Data/merged_table.qza as BIOMV210DirFmt to directory Biom/[0m
[32mExported Data/V1-V2_taxonomy_vsearch-sklearn.qza as TSVTaxonomyDirectoryFormat to directory Taxa[0m
[32mExported Data/merged_rep-seqs.qza as DNASequencesDirectoryFormat to directory Rep-seqs/[0m


In [2]:
# Create new table, taxonomy and rep-seqs artifacts from the files with the
# modified feature IDs; the outputs are prefixed with 'combo_'.
# Relies on the Biom/, Taxa/ and Rep-seqs/ directories and on the
# `fasta_hash` variable from the previous cell.
!biom convert -i Biom/feature-table.tsv -o Biom/feature-table.biom --table-type="OTU table" --to-hdf5

!qiime tools import \
  --input-path Biom/feature-table.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format \
  --output-path Data/combo_table.qza

!qiime tools import \
  --type 'FeatureData[Taxonomy]' \
  --input-path Taxa/taxonomy.tsv \
  --output-path Data/combo_taxonomy.qza

!qiime tools import \
  --input-path $fasta_hash \
  --type 'FeatureData[Sequence]' \
  --output-path Data/combo_rep-seqs.qza

!rm -r Biom Taxa Rep-seqs #clean temp directories

# Summary visualization of the relabelled table
!qiime feature-table summarize \
  --i-table Data/combo_table.qza \
  --m-sample-metadata-file metadata.tsv \
  --o-visualization Data/combo_table.qzv

[32mImported Biom/feature-table.biom as BIOMV210Format to Data/combo_table.qza[0m
[32mImported Taxa/taxonomy.tsv as TSVTaxonomyDirectoryFormat to Data/combo_taxonomy.qza[0m
[32mImported Rep-seqs/dna-sequences.fasta as DNASequencesDirectoryFormat to Data/combo_rep-seqs.qza[0m
[32mSaved Visualization to: Data/combo_table.qzv[0m


# Filtering to remove low-abundance features

In [3]:
tabdir = 'Data/Divided_tables'

!mkdir $tabdir

!qiime taxa filter-table \
  --i-table Data/combo_table.qza \
  --i-taxonomy Data/combo_taxonomy.qza \
  --p-include p__ \
  --p-exclude mitochondria,chloroplast \
  --o-filtered-table $tabdir/full-table.qza

!qiime feature-table filter-samples \
  --i-table $tabdir/full-table.qza \
  --p-min-features 50 \
  --p-min-frequency 3000 \
  --o-filtered-table $tabdir/full-table.qza

!qiime feature-table summarize \
  --i-table $tabdir/full-table.qza \
  --o-visualization $tabdir/full-table.qzv \
  --m-sample-metadata-file metadata.tsv

!qiime feature-table filter-seqs \
  --i-data Data/combo_rep-seqs.qza \
  --i-table $tabdir/full-table.qza \
  --o-filtered-data Data/combo_rep-seqs.qza

mkdir: cannot create directory ‘Data/Divided_tables’: File exists
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/full-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/full-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/full-table.qzv[0m
[32mSaved FeatureData[Sequence] to: Data/combo_rep-seqs.qza[0m


# Generate a tree for phylogenetic diversity analysis

In [4]:
# Align the filtered representative sequences with MAFFT and build
# unrooted and rooted FastTree phylogenies from the masked alignment outputs
!qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences Data/combo_rep-seqs.qza \
  --p-n-threads 2 \
  --o-alignment Data/aligned-rep-seqs.qza \
  --o-masked-alignment Data/masked-aligned-rep-seqs.qza \
  --o-tree Data/unrooted-tree.qza \
  --o-rooted-tree Data/rooted-tree.qza

[32mSaved FeatureData[AlignedSequence] to: Data/aligned-rep-seqs.qza[0m
[32mSaved FeatureData[AlignedSequence] to: Data/masked-aligned-rep-seqs.qza[0m
[32mSaved Phylogeny[Unrooted] to: Data/unrooted-tree.qza[0m
[32mSaved Phylogeny[Rooted] to: Data/rooted-tree.qza[0m


# Taxa barplots

In [5]:
# Taxa barplot of the filtered table with the relabelled ('combo') taxonomy
!mkdir -p Results/Taxa_barplots
!qiime taxa barplot \
  --i-table Data/Divided_tables/full-table.qza \
  --i-taxonomy Data/combo_taxonomy.qza \
  --m-metadata-file metadata.tsv \
  --o-visualization Results/Taxa_barplots/full-taxabarplot.qzv

[32mSaved Visualization to: Results/Taxa_barplots/full-taxabarplot.qzv[0m


# Alpha and beta diversity analysis

### Alpha rarefaction plotting

In [6]:
# Alpha rarefaction curves up to a depth of 30000.
# Writes into Results/, which is created by the taxa barplot cell above.
!qiime diversity alpha-rarefaction \
  --i-table Data/Divided_tables/full-table.qza \
  --i-phylogeny Data/rooted-tree.qza \
  --p-max-depth 30000 \
  --m-metadata-file metadata.tsv \
  --o-visualization Results/Alpha_rarefaction.qzv

[32mSaved Visualization to: Results/Alpha_rarefaction.qzv[0m


### Core-metrics-phylogenetic: Core diversity metrics (phylogenetic and non-phylogenetic)

In [7]:
# Core diversity metrics on the filtered table; all outputs go to
# Results/Core-metrics.
# NOTE(review): the sampling depth of 5856 is presumably taken from the
# table summary / rarefaction plot - confirm and record the rationale.
!qiime diversity core-metrics-phylogenetic \
  --i-phylogeny Data/rooted-tree.qza \
  --i-table Data/Divided_tables/full-table.qza \
  --p-sampling-depth 5856 \
  --m-metadata-file metadata.tsv \
  --p-n-jobs-or-threads 'auto' \
  --output-dir Results/Core-metrics

[32mSaved FeatureTable[Frequency] to: Results/Core-metrics/rarefied_table.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/faith_pd_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/observed_features_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/shannon_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/evenness_vector.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/unweighted_unifrac_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/weighted_unifrac_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/jaccard_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/bray_curtis_distance_matrix.qza[0m
[32mSaved PCoAResults to: Results/Core-metrics/unweighted_unifrac_pcoa_results.qza[0m
[32mSaved PCoAResults to: Results/Core-metrics/weighted_unifrac_pcoa_results.qza[0m
[32mSaved PCoAResults to: Resu

### Principal Coordinate Analysis Biplot

In [8]:
table = 'Data/Divided_tables/full-table.qza'
reltab = 'Data/Relative_tables/full-relative_table.qza'
  
  # Converting feature table [Frequency] to [Relative frequency]
!mkdir Data/Relative_tables
!qiime feature-table relative-frequency \
  --i-table Data/Divided_tables/full-table.qza \
  --o-relative-frequency-table $reltab

for artifact in ['unweighted_unifrac', 'weighted_unifrac', 'jaccard', 'bray_curtis']:
  pcoa  = 'Results/Core-metrics/%s_pcoa_results.qza' % artifact
  bidir = 'Results/Biplots'
  biplot= bidir+'/%s_biplot.qza' % artifact
  bi_qzv= bidir+'/%s_biplot.qzv' % artifact

  !mkdir -p $bidir
    
  # pcoa-biplot: Principal Coordinate Analysis Biplot     
  !qiime diversity pcoa-biplot \
    --i-pcoa $pcoa \
    --i-features $reltab \
    --o-biplot $biplot

  !qiime emperor biplot \
    --i-biplot $biplot \
    --m-sample-metadata-file metadata.tsv \
    --p-ignore-missing-samples \
    --p-number-of-features 5 \
    --o-visualization $bi_qzv

mkdir: cannot create directory ‘Data/Relative_tables’: File exists
[32mSaved FeatureTable[RelativeFrequency] to: Data/Relative_tables/full-relative_table.qza[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/unweighted_unifrac_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/unweighted_unifrac_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/weighted_unifrac_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/weighted_unifrac_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/jaccard_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/jaccard_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/bray_curtis_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/bray_curtis_biplot.qzv[0m
