# Import demultiplexed filtered reads

In [1]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# sort reads by sequencing run
for ri in set(met.Run_index):
  tmp = met.loc[met.Run_index == ri]['SeqRun'].tolist()[0]
  !mkdir Data/Demux/$tmp
  !mv Data/Demux/$ri/* Data/Demux/$tmp
  !rm -r Data/Demux/$ri

mkdir: cannot create directory ‘Data/Demux/SeqRun_2’: File exists
mkdir: cannot create directory ‘Data/Demux/SeqRun_2’: File exists
mkdir: cannot create directory ‘Data/Demux/SeqRun_2’: File exists


In [2]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# import reads
# One `qiime tools import` call per distinct SeqRun value in the metadata:
# the directory Data/Demux/<SeqRun> is imported as paired-end reads with
# quality (Casava 1.8 single-lane-per-sample layout) and written to
# Data/Demux/<SeqRun>_demux.qza.
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  
  !qiime tools import \
    --type 'SampleData[PairedEndSequencesWithQuality]' \
    --input-path Data/Demux/$sr \
    --input-format CasavaOneEightSingleLanePerSampleDirFmt \
    --output-path $demux

[32mImported Data/Demux/SeqRun_2 as CasavaOneEightSingleLanePerSampleDirFmt to Data/Demux/SeqRun_2_demux.qza[0m


In [3]:
# Build a `qiime demux summarize` visualization (.qzv) next to every
# imported <SeqRun>_demux.qza artifact.
# NOTE: relies on `met` having been loaded by an earlier cell.
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  deqzv = 'Data/Demux/%s_demux.qzv'%sr
  
  !qiime demux summarize \
    --i-data $demux \
    --o-visualization $deqzv

[32mSaved Visualization to: Data/Demux/SeqRun_2_demux.qzv[0m


In [4]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
adapter = met.LinkerPrimerSequence.tolist()[0]

!mkdir Data/Cutadapt
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  cutad = 'Data/Cutadapt/%s_trim.qza'%sr
  cuqzv = 'Data/Cutadapt/%s_trim.qzv'%sr
  
  !qiime cutadapt trim-paired \
    --i-demultiplexed-sequences $demux \
    --o-trimmed-sequences $cutad \
    --p-cores 10 \
    --p-anywhere-f $adapter \
    --p-error-rate 0.2 \
    --p-match-adapter-wildcards \
    --p-discard-untrimmed \
    --p-match-read-wildcards

  # Visualization
  !qiime demux summarize \
      --i-data $cutad \
      --o-visualization $cuqzv

[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Cutadapt/SeqRun_2_trim.qza[0m
[32mSaved Visualization to: Data/Cutadapt/SeqRun_2_trim.qzv[0m


In [10]:
#import pandas as pd
#met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
#
#!mkdir Data/Joined
#for sr in set(met.SeqRun):
#  cutad = 'Data/Cutadapt/%s_trim.qza'%sr
#  joined = 'Data/Joined/%s_joined.qza'%sr
#  !qiime vsearch join-pairs \
#    --i-demultiplexed-seqs $cutad \
#    --p-threads 8 \
#    --p-maxdiffs 12 \
#    --o-joined-sequences $joined

[32mSaved SampleData[JoinedSequencesWithQuality] to: Data/Joined/SeqRun_2_joined.qza[0m


In [12]:
#import pandas as pd
#met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
#
#adapter = len(met.LinkerPrimerSequence.tolist()[0])
#!mkdir Data/Deblur
#for sr in set(met.SeqRun):
#  joined = 'Data/Joined/%s_joined.qza'%sr
#  table = 'Data/Deblur/%s_arc_table.qza'%sr
#  taqzv = 'Data/Deblur/%s_arc_table.qzv'%sr
#  repsq = 'Data/Deblur/%s_arc_rep-seqs.qza'%sr
#  reqzv = 'Data/Deblur/%s_arc_rep-seqs.qzv'%sr
#  stats = 'Data/Deblur/%s_arc_denoising-stats.qza'%sr
#  stqzv = 'Data/Deblur/%s_arc_denoising-stats.qzv'%sr
  
#  !qiime deblur denoise-16S \
#    --i-demultiplexed-seqs $joined \
#    --p-trim-length 300 \
#    --p-jobs-to-start 10 \
#    --o-table $table \
#    --o-representative-sequences $repsq \
#    --o-stats $stats#

#  !qiime feature-table tabulate-seqs \
#    --i-data $repsq \
#    --o-visualization $reqzv

#  !qiime feature-table summarize \
#    --i-table $table \
#    --o-visualization $taqzv \
#    --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/Deblur/SeqRun_2_bac_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Deblur/SeqRun_2_bac_rep-seqs.qza[0m
[32mSaved DeblurStats to: Data/Deblur/SeqRun_2_bac_denoising-stats.qza[0m
[32mSaved Visualization to: Data/Deblur/SeqRun_2_bac_rep-seqs.qzv[0m
[32mSaved Visualization to: Data/Deblur/SeqRun_2_bac_table.qzv[0m


# Dada2 denoising

In [9]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

adapter = len(met.LinkerPrimerSequence.tolist()[0])
!mkdir Data/Denoised
for sr in set(met.SeqRun):
  cutad = 'Data/Cutadapt/%s_trim.qza'%sr
  table = 'Data/Denoised/%s_arc_table.qza'%sr
  taqzv = 'Data/Denoised/%s_arc_table.qzv'%sr
  repsq = 'Data/Denoised/%s_arc_rep-seqs.qza'%sr
  reqzv = 'Data/Denoised/%s_arc_rep-seqs.qzv'%sr
  stats = 'Data/Denoised/%s_arc_denoising-stats.qza'%sr
  stqzv = 'Data/Denoised/%s_arc_denoising-stats.qzv'%sr
  
  !qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $cutad \
    --p-trunc-len-f 220 \
    --p-trunc-len-r 240 \
    --p-n-threads 10 \
    --o-table $table \
    --o-representative-sequences $repsq \
    --o-denoising-stats $stats

  !qiime feature-table tabulate-seqs \
    --i-data $repsq \
    --o-visualization $reqzv

  !qiime metadata tabulate \
    --m-input-file $stats \
    --o-visualization $stqzv

  !qiime feature-table summarize \
    --i-table $table \
    --o-visualization $taqzv \
    --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/Denoised/SeqRun_2_arc_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Denoised/SeqRun_2_arc_rep-seqs.qza[0m
[32mSaved SampleData[DADA2Stats] to: Data/Denoised/SeqRun_2_arc_denoising-stats.qza[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_2_arc_rep-seqs.qzv[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_2_arc_denoising-stats.qzv[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_2_arc_table.qzv[0m


In [10]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

adapter = len(met.LinkerPrimerSequence.tolist()[0])
!mkdir Data/Denoised
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  table = 'Data/Denoised/%s_arc_table.qza'%sr
  taqzv = 'Data/Denoised/%s_arc_table.qzv'%sr
  repsq = 'Data/Denoised/%s_arc_rep-seqs.qza'%sr
  reqzv = 'Data/Denoised/%s_arc_rep-seqs.qzv'%sr
  stats = 'Data/Denoised/%s_arc_denoising-stats.qza'%sr
  stqzv = 'Data/Denoised/%s_arc_denoising-stats.qzv'%sr
  
  !qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $demux \
    --p-trim-left-f $adapter \
    --p-trunc-len-f 185 \
    --p-trunc-len-r 230 \
    --p-n-threads 10 \
    --o-table $table \
    --o-representative-sequences $repsq \
    --o-denoising-stats $stats

  !qiime feature-table tabulate-seqs \
    --i-data $repsq \
    --o-visualization $reqzv

  !qiime metadata tabulate \
    --m-input-file $stats \
    --o-visualization $stqzv

  !qiime feature-table summarize \
    --i-table $table \
    --o-visualization $taqzv \
    --m-sample-metadata-file metadata.tsv

mkdir: cannot create directory ‘Data/Denoised’: File exists
[32mSaved FeatureTable[Frequency] to: Data/Denoised/SeqRun_2_arc_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Denoised/SeqRun_2_arc_rep-seqs.qza[0m
[32mSaved SampleData[DADA2Stats] to: Data/Denoised/SeqRun_2_arc_denoising-stats.qza[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_2_arc_rep-seqs.qzv[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_2_arc_denoising-stats.qzv[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_2_arc_table.qzv[0m


# Merge different runs

In [38]:
#!qiime feature-table merge \
#  --i-tables Data/Denoised/SeqRun_1_bac_table.qza Data/Denoised/SeqRun_2_bac_table.qza \
#  --o-merged-table Data/merged_table.qza

#!qiime feature-table merge-seqs \
#  --i-data Data/Denoised/SeqRun_1_bac_rep-seqs.qza Data/Denoised/SeqRun_2_bac_rep-seqs.qza \
#  --o-merged-data Data/merged_rep-seqs.qza

#!qiime feature-table summarize \
#  --i-table Data/merged_table.qza \
#  --o-visualization Data/merged_table.qzv \
#  --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/merged_table.qza[0m


# Training feature classifier 
### SILVA 138 reference sequences and taxonomy were obtained from the QIIME 2 website (prepared with RESCRIPt)

In [16]:
# Extract reference reads
#!qiime feature-classifier extract-reads \
#  --i-sequences Classifier/silva-138-99-seqs.qza \
#  --p-f-primer AGAGTTTGATCCTGGCTCAG \
#  --p-r-primer TGCTGCCTCCCGTAGGAGT \
#  --p-min-length 280 \
#  --p-max-length 500 \
#  --o-reads Classifier/ref-seqs.qza

Usage: [34mqiime feature-classifier extract-reads[0m [OPTIONS]

  Extract simulated amplicon reads from a reference database. Performs in-
  silico PCR to extract simulated amplicons from reference sequences that
  match the input primer sequences (within the mismatch threshold specified
  by `identity`). Both primer sequences must be in the 5' -> 3' orientation.
  Sequences that fail to match both primers will be excluded. Reads are
  extracted, trimmed, and filtered in the following order: 1. reads are
  extracted in specified orientation; 2. primers are removed; 3. reads
  longer than `max_length` are removed; 4. reads are trimmed with
  `trim_right`; 5. reads are truncated to `trunc_len`; 6. reads are trimmed
  with `trim_left`; 7. reads shorter than `min_length` are removed.

[1mInputs[0m:
  [34m[4m--i-sequences[0m ARTIFACT [32mFeatureData[Sequence][0m
                                                                    [35m[required][0m
[1mParameters[0m:
  [34m[4m--

In [1]:
# Train classifier
# Fits a naive Bayes classifier on Classifier/silva-138-99-seqs.qza together
# with its taxonomy, i.e. on the reference sequences as downloaded rather than
# on Classifier/ref-seqs.qza from the commented-out extract-reads cell.
# The result is written to Classifier/full_silva_138_classifier.qza.
!qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-reads Classifier/silva-138-99-seqs.qza \
  --i-reference-taxonomy Classifier/silva-138-99-tax.qza \
  --o-classifier Classifier/full_silva_138_classifier.qza

[32mSaved TaxonomicClassifier to: Classifier/V1-V2_silva_138_classifier.qza[0m


# Taxonomy assignment

In [6]:
# Classify the SeqRun_2 representative sequences with the hybrid
# vsearch + sklearn method, using the SILVA 138 reference reads/taxonomy and
# the classifier trained above (single thread, prefilter disabled).
!qiime feature-classifier classify-hybrid-vsearch-sklearn \
  --i-query Data/Denoised/SeqRun_2_arc_rep-seqs.qza \
  --i-reference-reads Classifier/silva-138-99-seqs.qza \
  --i-reference-taxonomy Classifier/silva-138-99-tax.qza \
  --i-classifier Classifier/full_silva_138_classifier.qza \
  --p-threads 1 \
  --p-no-prefilter \
  --o-classification Data/taxonomy_vsearch-sklearn.qza

# Tabular visualization of the resulting taxonomy artifact
!qiime metadata tabulate \
  --m-input-file Data/taxonomy_vsearch-sklearn.qza \
  --o-visualization Data/taxonomy_vsearch-sklearn.qzv

[32mSaved FeatureData[Taxonomy] to: Data/taxonomy_vsearch-sklearn.qza[0m
[32mSaved Visualization to: Data/taxonomy_vsearch-sklearn.qzv[0m


In [48]:
#!qiime feature-classifier classify-consensus-blast \
#  --i-query Data/merged_rep-seqs.qza \
#  --i-reference-reads Classifier/ref-seqs.qza \
#  --i-reference-taxonomy Classifier/silva-138-99-tax.qza \
#  --o-classification Data/taxonomy_blast.qza

[32mSaved FeatureData[Taxonomy] to: Data/taxonomy_blast.qza[0m


# Combo: Combining ASV hashes with last available taxa 

In [None]:
# Install biopython
!pip install biopython

In [14]:
import pandas as pd
from Bio import SeqIO

# declare Qiime2 generated artifacts here
table    = 'Data/Denoised/SeqRun_2_arc_table.qza' 
taxo     = 'Data/taxonomy_vsearch-sklearn.qza'
rep_seq  = 'Data/Denoised/SeqRun_2_arc_rep-seqs.qza'

# export rep-seqs.qza, table.qza and taxonomy.qza
!mkdir Biom Taxa Rep-seqs #temp directories

!qiime tools export \
  --input-path $table \
  --output-path Biom/

!qiime tools export \
  --input-path $taxo \
  --output-path Taxa

!qiime tools export \
  --input-path $rep_seq \
  --output-path Rep-seqs/

# convert .biom to .tsv
!biom convert -i Biom/feature-table.biom -o Biom/feature-table.tsv --to-tsv 

# *****replacing hashes with combination of taxonomy and beginings of the hashes*****
# reading tables
taxa = pd.read_csv('Taxa/taxonomy.tsv', sep='\t')
biom = pd.read_csv('Biom/feature-table.tsv', sep='\t', skiprows=1)

#creating a new column with modified taxonomy
#I also shortened some annotations and deleted some symbols 
#that were crashing tree construction with modified files
taxa['Taxon'] = taxa.Taxon.replace(';__','').str.replace('[','').str.replace(']','').str.replace('.','')\
.str.replace('/','_').str.replace("'",'').str.replace(' ','_').str.replace('archaeon_enrichment','arc_enrich')\
.str.replace('uncultured_rumen','unc_rum').str.replace('uncultured_archaeon','unc_arc').str.replace('_archaeon','_arc')\
.str.replace('uncultured_compost','unc_comp').str.replace('uncultured_euryarchaeote','unc_euryarc')\
.str.replace('uncultured','unc').str.replace('unidentified','unid')

#dealing with uncultured taxa to provide additional information
taxa['Combo'] =  taxa['Taxon'].str.split("__").str[-1].str.split(";").str[-1]
for x in ['unc_euryarc','unc_rum','unid_methanogen','arc_enrich','unc_arc','unc_comp','unc']:
  taxa.loc[taxa['Combo'].str[:]==x,'Combo']=taxa['Taxon'].str.split("__").str[-2].str.split(';').str[0]+'_'+x
  
for n in range(3,6):
  taxa.loc[taxa.Combo.str[:]=='unc_unc_arc','Combo']=taxa.Taxon.str.split("__").str[-n].str.split(';').str[0]+'_unc_arc'
  taxa.loc[taxa.Combo.str[:]=='unc_unc_rum','Combo']=taxa.Taxon.str.split("__").str[-n].str.split(';').str[0]+'_unc_rum'
  taxa.loc[taxa.Combo.str[:]=='unc_unc','Combo']=taxa.Taxon.str.split("__").str[-n].str.split(';').str[0]+'_unc'
  
#add modified taxonomy information to feature hashes, separating them by '|'
biom['#OTU ID'] = taxa['Combo']+'|'+taxa['Feature ID']
taxa['Feature ID'] = biom['#OTU ID']
taxa = taxa[['Feature ID', 'Taxon', 'Confidence', 'Consensus', 'Method']]

### writing modified files
biom.to_csv('Biom/feature-table.tsv', sep='\t', index=False)
taxa.to_csv('Taxa/taxonomy.tsv', sep='\t', index=False)
fasta_hash  = r"Rep-seqs/dna-sequences.fasta"
fasta_combo = r"Rep-seqs/dna-sequences.fa"
hlist = biom['#OTU ID'].tolist()
with open(fasta_hash) as hashes, open(fasta_combo, 'w') as combo:
  for record in SeqIO.parse(fasta_hash, 'fasta'):
    for h in hlist:
      if str(record.id) in h:
        combo.write('>'+h+'\n'+str(record.seq)+'\n')

#some cleaning and renaming
!rm $fasta_hash
!mv $fasta_combo $fasta_hash

mkdir: cannot create directory ‘Biom’: File exists
mkdir: cannot create directory ‘Taxa’: File exists
mkdir: cannot create directory ‘Rep-seqs’: File exists
[32mExported Data/Denoised/SeqRun_2_arc_table.qza as BIOMV210DirFmt to directory Biom/[0m
[32mExported Data/taxonomy_vsearch-sklearn.qza as TSVTaxonomyDirectoryFormat to directory Taxa[0m
[32mExported Data/Denoised/SeqRun_2_arc_rep-seqs.qza as DNASequencesDirectoryFormat to directory Rep-seqs/[0m


In [15]:
# Create new rep-seqs.qza, table.qza and taxonomy.qza with the modified
# feature IDs (the artifacts get a 'combo_' prefix in their names).
# NOTE: relies on the temp directories Biom/, Taxa/ and Rep-seqs/ and on the
# variable `fasta_hash`, all of which are produced by the previous cell.

# rewritten TSV feature table -> HDF5 .biom
!biom convert -i Biom/feature-table.tsv -o Biom/feature-table.biom --table-type="OTU table" --to-hdf5

!qiime tools import \
  --input-path Biom/feature-table.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format \
  --output-path Data/combo_table.qza

!qiime tools import \
  --type 'FeatureData[Taxonomy]' \
  --input-path Taxa/taxonomy.tsv \
  --output-path Data/combo_taxonomy.qza

!qiime tools import \
  --input-path $fasta_hash \
  --type 'FeatureData[Sequence]' \
  --output-path Data/combo_rep-seqs.qza

# After this line the temp directories are gone, so this cell cannot be
# re-run without re-running the previous one first.
!rm -r Biom Taxa Rep-seqs #clean temp directories

!qiime feature-table summarize \
  --i-table Data/combo_table.qza \
  --m-sample-metadata-file metadata.tsv \
  --o-visualization Data/combo_table.qzv

[32mImported Biom/feature-table.biom as BIOMV210Format to Data/combo_table.qza[0m
[32mImported Taxa/taxonomy.tsv as TSVTaxonomyDirectoryFormat to Data/combo_taxonomy.qza[0m
[32mImported Rep-seqs/dna-sequences.fasta as DNASequencesDirectoryFormat to Data/combo_rep-seqs.qza[0m
[32mSaved Visualization to: Data/combo_table.qzv[0m


# Generate a tree for phylogenetic diversity analysis

In [16]:
# Run the align-to-tree-mafft-fasttree pipeline on the renamed representative
# sequences; it writes the alignment, the masked alignment and both an
# unrooted and a rooted tree into Data/.
!qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences Data/combo_rep-seqs.qza \
  --p-n-threads 10 \
  --o-alignment Data/aligned-rep-seqs.qza \
  --o-masked-alignment Data/masked-aligned-rep-seqs.qza \
  --o-tree Data/unrooted-tree.qza \
  --o-rooted-tree Data/rooted-tree.qza

[32mSaved FeatureData[AlignedSequence] to: Data/aligned-rep-seqs.qza[0m
[32mSaved FeatureData[AlignedSequence] to: Data/masked-aligned-rep-seqs.qza[0m
[32mSaved Phylogeny[Unrooted] to: Data/unrooted-tree.qza[0m
[32mSaved Phylogeny[Rooted] to: Data/rooted-tree.qza[0m


# Filtering to remove low-abundance features

In [2]:
tabdir = 'Data/Divided_tables'

!mkdir $tabdir
!qiime feature-table filter-features \
  --i-table Data/combo_table.qza \
  --p-min-frequency 5 \
  --o-filtered-table $tabdir/full-table.qza

!qiime taxa filter-table \
  --i-table $tabdir/full-table.qza \
  --i-taxonomy Data/combo_taxonomy.qza \
  --p-exclude mitochondria,chloroplast,d__Bacteria \
  --p-include p__ \
  --o-filtered-table $tabdir/full-table.qza

!qiime feature-table filter-samples \
  --i-table $tabdir/full-table.qza \
  --p-min-features 5 \
  --p-min-frequency 10 \
  --o-filtered-table $tabdir/full-table.qza

!qiime feature-table summarize \
  --i-table $tabdir/full-table.qza \
  --o-visualization $tabdir/full-table.qzv \
  --m-sample-metadata-file metadata.tsv

mkdir: cannot create directory ‘Data/Divided_tables’: File exists
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/full-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/full-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/full-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/full-table.qzv[0m


# Separating table by day and body site for the analysis

In [20]:
def filt_smpl(tabin,col,tabout):
  """Filter the samples of a feature table with a metadata WHERE clause.

  Runs `qiime feature-table filter-samples` against metadata.tsv.

  tabin  -- path of the input feature table artifact
  col    -- text passed (double-quoted) to --p-where
  tabout -- path the filtered table is written to
  """
  !qiime feature-table filter-samples \
    --i-table $tabin \
    --m-metadata-file metadata.tsv \
    --p-where "$col" \
    --o-filtered-table $tabout

def filt_feat(tabin,minfrq,tabout):
  """Filter the features of a table by minimum frequency.

  Runs `qiime feature-table filter-features`.

  tabin  -- path of the input feature table artifact
  minfrq -- value passed to --p-min-frequency
  tabout -- path the filtered table is written to
  """
  !qiime feature-table filter-features \
    --i-table $tabin \
    --p-min-frequency $minfrq \
    --o-filtered-table $tabout
    
def tabqzv(tabin,tabout):
  """Write a `qiime feature-table summarize` visualization of a table.

  tabin  -- path of the feature table artifact to summarize
  tabout -- path of the .qzv visualization to create

  metadata.tsv is supplied as the sample metadata file.
  """
  !qiime feature-table summarize \
    --i-table $tabin \
    --o-visualization $tabout \
    --m-sample-metadata-file metadata.tsv

In [21]:
# Split the filtered full table into one table per sampling day.
tabdir = 'Data/Divided_tables'

for t in ('d0-RSP_RF','d7-E_FL','d13-E_FL_SAM_FR', 'd7_d13-E_FL'):
  # WHERE clause selecting the samples of this subset
  if t == 'd7_d13-E_FL':
    # combined table: both days, without the SAM and FR body sites
    day = "[Day] IN ('d7','d13') AND NOT [BS] IN ('SAM','FR')"
  else:
    # single-day table: the day is the part of the name before the first '-'
    day = "[Day]='%s'" % t.split('-')[0]

  tab = tabdir+'/%s-table.qza' % t
  tav = tabdir+'/%s-table.qzv' % t

  # select the samples, re-apply the minimum-frequency filter in place,
  # then summarize the result
  filt_smpl(tabdir+'/full-table.qza',day,tab)
  filt_feat(tab,5,tab)
  tabqzv(tab,tav)

[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d0-RSP_RF-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d0-RSP_RF-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/d0-RSP_RF-table.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d7-E_FL-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d7-E_FL-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/d7-E_FL-table.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d13-E_FL_SAM_FR-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d13-E_FL_SAM_FR-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/d13-E_FL_SAM_FR-table.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d7_d13-E_FL-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d7_d13-E_FL-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/d7_d13-E_FL-table.qzv[0m


### List of all tables produced

In [22]:
# Names of all per-subset feature tables; each one is stored as
# <tabdir>/<name>-table.qza.
tables = [
  'd0-RSP_RF',
  'd7-E_FL',
  'd13-E_FL_SAM_FR',
  'd7_d13-E_FL',
  'full',
]
# Directory that holds those tables
tabdir = 'Data/Divided_tables'

# Taxabarplots

### By samples

In [23]:
# One taxa barplot per feature table, with one bar per sample.
!mkdir -p Results/Taxa_barplots/By_samples
tabdir = 'Data/Divided_tables'
tables = ['d0-RSP_RF','d7-E_FL','d13-E_FL_SAM_FR','d7_d13-E_FL','full']
for tab in tables:
  # input table and output visualization for this subset
  table = tabdir+'/%s-table.qza' % tab
  out = 'Results/Taxa_barplots/By_samples/%s-taxabarplot.qzv' % tab
  
  !qiime taxa barplot \
    --i-table $table \
    --i-taxonomy Data/combo_taxonomy.qza \
    --m-metadata-file metadata.tsv \
    --o-visualization $out

[32mSaved Visualization to: Results/Taxa_barplots/By_samples/d0-RSP_RF-taxabarplot.qzv[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_samples/d7-E_FL-taxabarplot.qzv[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_samples/d13-E_FL_SAM_FR-taxabarplot.qzv[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_samples/d7_d13-E_FL-taxabarplot.qzv[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_samples/full-taxabarplot.qzv[0m


### By groups

In [24]:
import pandas as pd

tables = ['d0-RSP_RF','d7-E_FL','d13-E_FL_SAM_FR','d7_d13-E_FL','full']
tabdir = 'Data/Divided_tables'
outres = 'Results/Taxa_barplots/By_groups'
outdat = 'Data/Grouped_tables'
met = pd.read_csv('metadata.tsv',sep='\t',index_col=0)
trash = ['BarcodeSequence','LinkerPrimerSequence','Run_index','Sample Name','Smpl_no','BodySite',\
         'Date of Sampling','Description','Description 2','SeqRun','Src_rstcRun','Day_num','rstc_run']
groups = [col for col in met.columns.tolist() if col not in trash]

for tab in tables:
  tabin = tabdir+'/%s-table.qza' % tab
  
  !mkdir -p $outres $outdat
  for group in groups:
  #Collapse metadata
    meta = met.drop_duplicates(subset=group).copy()
    meta.index = meta[group]
    meta.index.name = '#SampleID'
    meta.to_csv(outdat+'/%s-%s-metadata.tsv'%(tab,group),sep='\t')
  #Variables    
    grouped = outdat + '/%s-%s-group_table.qza' % (tab,group)
    taxabar = outres + '/%s-%s-taxabarplot.qzv' % (tab,group)
    metadata= outdat + '/%s-%s-metadata.tsv'    % (tab,group)
    
    try:
    #Group tables    
      !qiime feature-table group \
        --i-table $tabin \
        --p-axis 'sample' \
        --m-metadata-file metadata.tsv \
        --m-metadata-column $group \
        --p-mode 'mean-ceiling' \
        --o-grouped-table $grouped
    #Taxabarplots    
      !qiime taxa barplot \
        --i-table $grouped \
        --i-taxonomy Data/combo_taxonomy.qza \
        --m-metadata-file $metadata \
        --o-visualization $taxabar
    except:
      print('Looks like there is no %s group in table %s' % (group, tabin))

[32mSaved FeatureTable[Frequency] to: Data/Grouped_tables/d0-RSP_RF-BS-group_table.qza[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_groups/d0-RSP_RF-BS-taxabarplot.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Grouped_tables/d0-RSP_RF-Day-group_table.qza[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_groups/d0-RSP_RF-Day-taxabarplot.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Grouped_tables/d0-RSP_RF-Day_hour-group_table.qza[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_groups/d0-RSP_RF-Day_hour-taxabarplot.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Grouped_tables/d0-RSP_RF-Treatment-group_table.qza[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_groups/d0-RSP_RF-Treatment-taxabarplot.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Grouped_tables/d0-RSP_RF-Source-group_table.qza[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_groups/d0-RSP_RF-Source-taxabarplot.qzv[0m
[32mSaved FeatureTable[Frequency]

# Alpha and beta diversity analysis

### Alpha rarefaction plotting

In [32]:
# Alpha rarefaction curves for the full table up to a depth of 30000, using
# the rooted tree.
# NOTE: the Results/ directory must already exist when this cell runs.
!qiime diversity alpha-rarefaction \
  --i-table Data/Divided_tables/full-table.qza \
  --i-phylogeny Data/rooted-tree.qza \
  --p-max-depth 30000 \
  --m-metadata-file metadata.tsv \
  --o-visualization Results/Alpha_rarefaction.qzv

[32mSaved Visualization to: Results/Alpha_rarefaction.qzv[0m


### Core-metrics-phylogenetic: Core diversity metrics (phylogenetic and non-phylogenetic)

In [36]:
# Core diversity metrics for the full table at a sampling depth of 4455;
# all outputs go directly into Results/Core-metrics.
# NOTE(review): the beta-comparison and Adonis cells read their distance
# matrices from Results/Core-metrics/Core_<table>/, a layout this cell does
# not produce - confirm where those per-table directories come from.
!qiime diversity core-metrics-phylogenetic \
  --i-phylogeny Data/rooted-tree.qza \
  --i-table Data/Divided_tables/full-table.qza \
  --p-sampling-depth 4455 \
  --m-metadata-file metadata.tsv \
  --p-n-jobs-or-threads 'auto' \
  --output-dir Results/Core-metrics

[32mSaved FeatureTable[Frequency] to: Results/Core-metrics/rarefied_table.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/faith_pd_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/observed_features_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/shannon_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/evenness_vector.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/unweighted_unifrac_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/weighted_unifrac_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/jaccard_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/bray_curtis_distance_matrix.qza[0m
[32mSaved PCoAResults to: Results/Core-metrics/unweighted_unifrac_pcoa_results.qza[0m
[32mSaved PCoAResults to: Results/Core-metrics/weighted_unifrac_pcoa_results.qza[0m
[32mSaved PCoAResults to: Resu

### Principal Coordinate Analysis Biplot

In [37]:
table = 'Data/Divided_tables/full-table.qza'
reltab = 'Data/Relative_tables/full-relative_table.qza'
  
  # Converting feature table [Frequency] to [Relative frequency]
!mkdir Data/Relative_tables
!qiime feature-table relative-frequency \
  --i-table Data/Divided_tables/full-table.qza \
  --o-relative-frequency-table $reltab

for artifact in ['unweighted_unifrac', 'weighted_unifrac', 'jaccard', 'bray_curtis']:
  pcoa  = 'Results/Core-metrics/%s_pcoa_results.qza' % artifact
  bidir = 'Results/Biplots/Biplots_full'
  biplot= bidir+'/%s_biplot.qza' % artifact
  bi_qzv= bidir+'/%s_biplot.qzv' % artifact

  !mkdir -p $bidir
    
  # pcoa-biplot: Principal Coordinate Analysis Biplot     
  !qiime diversity pcoa-biplot \
    --i-pcoa $pcoa \
    --i-features $reltab \
    --o-biplot $biplot

  !qiime emperor biplot \
    --i-biplot $biplot \
    --m-sample-metadata-file metadata.tsv \
    --p-ignore-missing-samples \
    --p-number-of-features 5 \
    --o-visualization $bi_qzv

mkdir: cannot create directory ‘Data/Relative_tables’: File exists
[32mSaved FeatureTable[RelativeFrequency] to: Data/Relative_tables/full-relative_table.qza[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/Biplots_full/unweighted_unifrac_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/Biplots_full/unweighted_unifrac_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/Biplots_full/weighted_unifrac_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/Biplots_full/weighted_unifrac_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/Biplots_full/jaccard_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/Biplots_full/jaccard_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/Biplots_full/bray_curtis_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/Biplots_full/bray_curtis_biplot.qzv[0m


In [None]:
### Filtering beta diversity distances

### Beta diversity comparisons

In [30]:
import pandas as pd

tabdir = 'Data/Divided_tables'
tables = ['d0-RSP_RF','d7-E_FL','d13-E_FL_SAM_FR','d7_d13-E_FL','full']

meta = pd.read_csv('metadata.tsv',sep='\t',index_col=0)
trash = ['BarcodeSequence','LinkerPrimerSequence','Run_index','Sample Name','Smpl_no',\
         'Date of Sampling','Description','Description 2','SeqRun','Src_rstcRun','Day_num']
groups = [col for col in meta.columns.tolist() if col not in trash]

for tab in tables:
  for beta in ['unweighted_unifrac', 'weighted_unifrac', 'jaccard', 'bray_curtis']:
    outdir  = 'Results/Beta_comparisons/Beta_compar_%s'% tab
    distance= 'Results/Core-metrics/Core_%s/%s_distance_matrix.qza' % (tab,beta)
    
    !mkdir -p $outdir
    
    for column in groups:
      bgs_qzv = outdir+'/%s_%s_beta-group-sign-pairwise.qzv' % (column, beta)
   
      !qiime diversity beta-group-significance \
        --i-distance-matrix $distance \
        --m-metadata-file metadata.tsv \
        --m-metadata-column $column \
        --p-pairwise \
        --o-visualization $bgs_qzv

[32mSaved Visualization to: Results/Beta_comparisons/Beta_compar_d0-RSP_RF/BodySite_unweighted_unifrac_beta-group-sign-pairwise.qzv[0m
[32mSaved Visualization to: Results/Beta_comparisons/Beta_compar_d0-RSP_RF/BS_unweighted_unifrac_beta-group-sign-pairwise.qzv[0m
[32mSaved Visualization to: Results/Beta_comparisons/Beta_compar_d0-RSP_RF/rstc_run_unweighted_unifrac_beta-group-sign-pairwise.qzv[0m
[31m[1mPlugin error from diversity:

  All values in the grouping vector are the same. This method cannot operate on a grouping vector with only a single group of objects (e.g., there are no 'between' distances because there is only a single group).

Debug info has been saved to /tmp/qiime2-q2cli-err-vh0e01dn.log[0m
[31m[1mPlugin error from diversity:

  All values in the grouping vector are the same. This method cannot operate on a grouping vector with only a single group of objects (e.g., there are no 'between' distances because there is only a single group).

Debug info has been s

### Adonis

In [31]:
import pandas as pd

tabdir = 'Data/Divided_tables'
tables = ['d7-E_FL','d13-E_FL_SAM_FR','d7_d13-E_FL','full']

meta = pd.read_csv('metadata.tsv',sep='\t',index_col=0)
trash = ['BarcodeSequence','LinkerPrimerSequence','Run_index','Sample Name','Smpl_no',\
         'Date of Sampling','Description','Description 2','SeqRun','Src_rstcRun','Day_num']
groups = [col for col in meta.columns.tolist() if col not in trash]
formula = 'BS*Treatment*Source'

for tab in tables:
  if tab == 'd13-E_FL_SAM_FR':
    formula = 'BS*Treatment*Source*Day_hour'
  if tab == 'd7_d13-E_FL':
    formula = 'BS*Treatment*Source*Day'
  for beta in ['unweighted_unifrac', 'weighted_unifrac', 'jaccard', 'bray_curtis']:
    outdir  = 'Results/Adonis/Adonis_%s'% tab
    distance= 'Results/Core-metrics/Core_%s/%s_distance_matrix.qza' % (tab,beta)
    
    !mkdir -p $outdir
    adonis = outdir+'/%s_adonis.qzv' % beta

    !qiime diversity adonis \
      --i-distance-matrix $distance \
      --m-metadata-file metadata.tsv \
      --p-formula "$formula" \
      --p-n-jobs 10 \
      --o-visualization $adonis  

[31m[1mPlugin error from diversity:

  Command '['run_adonis.R', '/tmp/tmpz27137f5/dm.tsv', '/tmp/tmpz27137f5/md.tsv', 'BS*Treatment*Source', '999', '10', '/tmp/qiime2-temp-m8ud9d0h/adonis.tsv']' returned non-zero exit status 1.

Debug info has been saved to /tmp/qiime2-q2cli-err-syli5gkw.log[0m
[31m[1mPlugin error from diversity:

  Command '['run_adonis.R', '/tmp/tmps0bn9jit/dm.tsv', '/tmp/tmps0bn9jit/md.tsv', 'BS*Treatment*Source', '999', '10', '/tmp/qiime2-temp-51a29kho/adonis.tsv']' returned non-zero exit status 1.

Debug info has been saved to /tmp/qiime2-q2cli-err-uuxc8ub4.log[0m
[31m[1mPlugin error from diversity:

  Command '['run_adonis.R', '/tmp/tmpc1dvfqo0/dm.tsv', '/tmp/tmpc1dvfqo0/md.tsv', 'BS*Treatment*Source', '999', '10', '/tmp/qiime2-temp-klu7gjh4/adonis.tsv']' returned non-zero exit status 1.

Debug info has been saved to /tmp/qiime2-q2cli-err-wx1my7nw.log[0m
[31m[1mPlugin error from diversity:

  Command '['run_adonis.R', '/tmp/tmpqhp36x3o/dm.tsv', '/tmp/

# ANCOM

In [87]:
!mkdir Results/ANCOM
def Ancom(column, ancom):
  """Run ANCOM on Results/ANCOM/table.qza for one metadata column.

  A pseudocount is added to the table first; the intermediate composition
  table (Results/ANCOM/ctable.qza) is deleted afterwards.

  column -- name of the metadata.tsv column passed to --m-metadata-column
  ancom  -- path of the .qzv visualization to create

  Expects Results/ANCOM/table.qza to exist before the call.
  """
  ctable = 'Results/ANCOM/ctable.qza'
  !qiime composition add-pseudocount \
    --i-table Results/ANCOM/table.qza \
    --o-composition-table $ctable
  !qiime composition ancom \
    --i-table $ctable \
    --m-metadata-file metadata.tsv \
    --m-metadata-column $column \
    --o-visualization $ancom
  !rm $ctable
    
def Ancol(lev, column, ancol):
  """Run ANCOM on Results/ANCOM/table.qza collapsed to a taxonomic level.

  The table is collapsed with Data/combo_taxonomy.qza, a pseudocount is added
  and ANCOM is run; both intermediate tables (ltable.qza, ctable.qza) are
  deleted afterwards.

  lev    -- value passed to --p-level of `qiime taxa collapse`
  column -- name of the metadata.tsv column passed to --m-metadata-column
  ancol  -- path of the .qzv visualization to create

  Expects Results/ANCOM/table.qza to exist before the call.
  """
  ctable = 'Results/ANCOM/ctable.qza'
  ltable = 'Results/ANCOM/ltable.qza'
  !qiime taxa collapse \
    --i-table Results/ANCOM/table.qza \
    --i-taxonomy Data/combo_taxonomy.qza \
    --p-level $lev \
    --o-collapsed-table $ltable
  !qiime composition add-pseudocount \
    --i-table $ltable \
    --o-composition-table $ctable
  !qiime composition ancom \
    --i-table $ctable \
    --m-metadata-file metadata.tsv \
    --m-metadata-column $column \
    --o-visualization $ancol
  !rm $ctable $ltable 
  
def filter_table(df):
  df.index.names = ['SampleID']
  df.to_csv('Results/ANCOM/table.tsv',sep='\t')
  !qiime feature-table filter-samples \
    --i-table Data/Divided_tables/full-table.qza \
    --m-metadata-file Results/ANCOM/table.tsv \
    --o-filtered-table Results/ANCOM/table.qza
  !rm Results/ANCOM/table.tsv


mkdir: cannot create directory ‘Results/ANCOM’: File exists


In [88]:
import pandas as pd
from os import path

met = pd.read_csv('metadata.tsv',sep='\t',index_col=0)

cols = ['BS_Trt','BS_Src','BS_Dh']
days = ['d7','d13']
bodysites = ['E','FL','SAM','FR']
for day in days:
  for bs in bodysites:
    if day=='d7' and bs in ['SAM','FR']:
      continue
    dftab = met.loc[(met.Day==day)&(met.BS==bs)].copy()
    filter_table(dftab)
    for col in cols:
      if col=='BS_Dh' and bs!='FR':
        continue
        
      ancom = 'Results/ANCOM/%s_%s_%s_ancom.qzv'%(day,bs,col)
      Ancom(col,ancom)
      for lev in [6,7]:
        ancol = 'Results/ANCOM/%s_%s_%s_ancom_%s.qzv'%(day,bs,col,lev)
        Ancol(lev,col,ancol)
    !rm Results/ANCOM/table.qza

[32mSaved FeatureTable[Frequency] to: Results/ANCOM/table.qza[0m
[32mSaved FeatureTable[Composition] to: Results/ANCOM/ctable.qza[0m
[32mSaved Visualization to: Results/ANCOM/d7_E_BS_Trt_ancom.qzv[0m
[32mSaved FeatureTable[Frequency] to: Results/ANCOM/ltable.qza[0m
[32mSaved FeatureTable[Composition] to: Results/ANCOM/ctable.qza[0m
[32mSaved Visualization to: Results/ANCOM/d7_E_BS_Trt_ancom_6.qzv[0m
[32mSaved FeatureTable[Frequency] to: Results/ANCOM/ltable.qza[0m
[32mSaved FeatureTable[Composition] to: Results/ANCOM/ctable.qza[0m
[32mSaved Visualization to: Results/ANCOM/d7_E_BS_Trt_ancom_7.qzv[0m
[32mSaved FeatureTable[Composition] to: Results/ANCOM/ctable.qza[0m
[32mSaved Visualization to: Results/ANCOM/d7_E_BS_Src_ancom.qzv[0m
[32mSaved FeatureTable[Frequency] to: Results/ANCOM/ltable.qza[0m
[32mSaved FeatureTable[Composition] to: Results/ANCOM/ctable.qza[0m
[32mSaved Visualization to: Results/ANCOM/d7_E_BS_Src_ancom_6.qzv[0m
[32mSaved FeatureTable[Freq

# Longitudinal analysis

### Pairwise-differences and distances

In [43]:
import pandas as pd

def df_matrix(beta,met):
  """Cut a distance matrix down to the samples listed in `met`.

  Rows of `beta` whose label is not in ``met.index`` are dropped, the
  columns are selected by ``met.index`` (a label missing from `beta`
  raises ``KeyError``), and both axes are returned in sorted order.
  `beta` itself is left untouched.
  """
  keep = beta.index.isin(met.index)
  subset = beta[keep][met.index].sort_index()
  return subset.reindex(columns=sorted(subset.columns))

def first_diff(metric,col,diff):
  !qiime longitudinal pairwise-differences \
    --m-metadata-file Results/First_differences/alpha.tsv \
    --p-metric $metric \
    --p-group-column $col \
    --p-state-column Day \
    --p-state-1 d7 \
    --p-state-2 d13 \
    --p-individual-id-column IID \
    --p-replicate-handling random \
    --o-visualization $diff
  
def first_dist(dista,col,distv):
  !qiime longitudinal pairwise-distances \
    --i-distance-matrix $dista \
    --m-metadata-file Results/First_differences/alpha.tsv \
    --p-group-column $col \
    --p-state-column Day \
    --p-state-1 d7 \
    --p-state-2 d13 \
    --p-individual-id-column IID \
    --p-replicate-handling random \
    --o-visualization $distv
  
def distance_matrix(matr,div,col,dista):
  matr.to_csv('Results/First_distances/matrix.tsv',sep='\t')
  matr = 'Results/First_distances/matrix.tsv'
      
  !qiime tools import \
    --input-path $matr \
    --output-path $dista \
    --type DistanceMatrix
  !rm $matr

def beta_div(qza):
  """Load the distance matrix stored in a QIIME 2 artifact as a DataFrame.

  The ``.qza`` file is a zip archive; the member
  ``<root>/data/distance-matrix.tsv`` is read straight from the archive, so
  nothing is extracted into the working directory and no console output of
  ``unzip`` has to be parsed.

  qza -- path to the distance-matrix artifact

  Returns a DataFrame whose index is the first column of the TSV.
  Raises FileNotFoundError when the archive holds no such member.
  """
  import zipfile  # local import keeps the cell's import list untouched

  with zipfile.ZipFile(qza) as archive:
    # Match only <root>/data/distance-matrix.tsv (exactly two separators) so
    # that same-named files nested deeper in the archive are ignored.
    matches = [name for name in archive.namelist()
               if name.count('/') == 2
               and name.endswith('/data/distance-matrix.tsv')]
    if not matches:
      raise FileNotFoundError(
        'no data/distance-matrix.tsv found in %s' % qza)
    with archive.open(matches[0]) as handle:
      return pd.read_csv(handle, sep='\t', index_col=0)

In [28]:
diversity = {'observed_features':'Observed ASVs',
             'shannon_entropy':'Shannon\'s entropy',
             'faith_pd':'Faith\'s PD',
             'pielou_evenness':'Pielou\'s evenness'}
cols = ['BS_Trt','BS_Src']

df = pd.read_csv('Results/Core-metrics/alpha.tsv',sep='\t',index_col='#SampleID')
alpha = df.loc[(df.Day!='d0')&(df.BS!='SAM')&(df.BS!='FR')].copy()
alpha['IID'] = ''
for group in set(alpha.BS_Day_Trt_Src):
  temp = alpha.loc[alpha.BS_Day_Trt_Src==group].copy()
  for i,index in enumerate(temp.index):
    alpha.loc[index,'IID'] = temp.loc[index,'BS_Trt_Src']+'_'+str(i)
      
!mkdir Results/First_differences
alpha.to_csv('Results/First_differences/alpha.tsv',sep='\t')
for div in diversity:
  for col in cols:
    diff = 'Results/First_differences/%s_%s_first-diff.qzv'%(div,col)
    first_diff(div,col,diff)

mkdir: cannot create directory ‘Results/First_differences’: File exists
[32mSaved Visualization to: Results/First_differences/observed_features_BS_Trt_first-diff.qzv[0m
[32mSaved Visualization to: Results/First_differences/observed_features_BS_Src_first-diff.qzv[0m
[32mSaved Visualization to: Results/First_differences/shannon_entropy_BS_Trt_first-diff.qzv[0m
[32mSaved Visualization to: Results/First_differences/shannon_entropy_BS_Src_first-diff.qzv[0m
[32mSaved Visualization to: Results/First_differences/faith_pd_BS_Trt_first-diff.qzv[0m
[32mSaved Visualization to: Results/First_differences/faith_pd_BS_Src_first-diff.qzv[0m
[32mSaved Visualization to: Results/First_differences/pielou_evenness_BS_Trt_first-diff.qzv[0m
[32mSaved Visualization to: Results/First_differences/pielou_evenness_BS_Src_first-diff.qzv[0m


In [44]:
diversity = {'unweighted_unifrac':'Unweighted UniFrac',
             'weighted_unifrac':'Weighted UniFrac',
             'jaccard':'Jaccard',
             'bray_curtis':'Bray-Curtis'}
cols = ['BS_Trt','BS_Src']
df = pd.read_csv('Results/Core-metrics/alpha.tsv',sep='\t',index_col='#SampleID')
met = df.loc[(df.Day!='d0')&(df.BS!='SAM')&(df.BS!='FR')].copy()
!mkdir Results/First_distances
for div in diversity:
  beta = beta_div('Results/Core-metrics/%s_distance_matrix.qza'%div)
  for col in cols:
    distv = 'Results/First_distances/%s_%s_first-dist.qzv'%(div,col)
    dista = 'Results/First_distances/%s_%s_distances.qza'%(div,col)
    matr = df_matrix(beta,met)
    distance_matrix(matr,div,col,dista)
    first_dist(dista,col,distv)

mkdir: cannot create directory ‘Results/First_distances’: File exists
[32mImported Results/First_distances/matrix.tsv as DistanceMatrixDirectoryFormat to Results/First_distances/unweighted_unifrac_BS_Trt_distances.qza[0m
[32mSaved Visualization to: Results/First_distances/unweighted_unifrac_BS_Trt_first-dist.qzv[0m
[32mImported Results/First_distances/matrix.tsv as DistanceMatrixDirectoryFormat to Results/First_distances/unweighted_unifrac_BS_Src_distances.qza[0m
[32mSaved Visualization to: Results/First_distances/unweighted_unifrac_BS_Src_first-dist.qzv[0m
[32mImported Results/First_distances/matrix.tsv as DistanceMatrixDirectoryFormat to Results/First_distances/weighted_unifrac_BS_Trt_distances.qza[0m
[32mSaved Visualization to: Results/First_distances/weighted_unifrac_BS_Trt_first-dist.qzv[0m
[32mImported Results/First_distances/matrix.tsv as DistanceMatrixDirectoryFormat to Results/First_distances/weighted_unifrac_BS_Src_distances.qza[0m
[32mSaved Visualization to: R