# Import demultiplexed filtered reads

In [1]:
import pandas as pd
import os
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# sort reads by sequencing run

for sr in set(met.SeqRun):
  fwd = 'Data/Demux/'+sr+'_fwd'
  !mkdir $fwd
  !cp Data/Demux/$sr/*_R1_* $fwd

In [2]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# Import reads: for every sequencing run, import the <run>_fwd directory as a
# QIIME 2 artifact and write a demux summary visualization next to it.
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux_fwd.qza'%sr  # imported reads (artifact)
  deqzv = 'Data/Demux/%s_demux_fwd.qzv'%sr  # summary of the imported reads
  fwd = 'Data/Demux/'+sr+'_fwd'             # directory filled by the previous cell
  
  !qiime tools import \
    --type 'SampleData[SequencesWithQuality]' \
    --input-path $fwd \
    --input-format CasavaOneEightSingleLanePerSampleDirFmt \
    --output-path $demux
  
  !qiime demux summarize \
    --i-data $demux \
    --o-visualization $deqzv

[32mImported Data/Demux/SeqRun_2_fwd as CasavaOneEightSingleLanePerSampleDirFmt to Data/Demux/SeqRun_2_demux_fwd.qza[0m
[32mSaved Visualization to: Data/Demux/SeqRun_2_demux_fwd.qzv[0m


In [3]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

cutadapt = 'Data/Cutadapt'
!mkdir $cutadapt
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux_fwd.qza'%sr
  cutad = cutadapt+'/%s_trim_fwd.qza'%sr
  cuqzv = cutadapt+'/%s_trim_fwd.qzv'%sr
  
  !qiime cutadapt trim-single \
    --i-demultiplexed-sequences $demux \
    --o-trimmed-sequences $cutad \
    --p-cores 10 \
    --p-front CAGYGCASCAGKCGMGAA \
    --p-adapter GGGGGGGGGGGGGGGGGGGG \
    --p-error-rate 0.2 \
    --p-match-adapter-wildcards \
    --p-discard-untrimmed \
    --p-match-read-wildcards

  # Visualization
  !qiime demux summarize \
      --i-data $cutad \
      --o-visualization $cuqzv

mkdir: cannot create directory ‘Data/Cutadapt’: File exists
[32mSaved SampleData[SequencesWithQuality] to: Data/Cutadapt/SeqRun_2_trim_fwd.qza[0m
[32mSaved Visualization to: Data/Cutadapt/SeqRun_2_trim_fwd.qzv[0m


# VSEARCH

In [2]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

!mkdir Data/Joined
for sr in set(met.SeqRun):
  cutad = 'Data/Cutadapt/%s_trim.qza'%sr
  joined = 'Data/Joined/%s_joined.qza'%sr
  joiqzv = 'Data/Joined/%s_joined.qzv'%sr
  
  !qiime vsearch join-pairs \
    --i-demultiplexed-seqs $cutad \
    --p-minovlen 0 \
    --p-threads 8 \
    --p-allowmergestagger \
    --o-joined-sequences $joined
  
  !qiime demux summarize \
    --i-data $joined \
    --o-visualization $joiqzv

[32mSaved SampleData[JoinedSequencesWithQuality] to: Data/Joined/SeqRun_2_joined.qza[0m
[32mSaved Visualization to: Data/Joined/SeqRun_2_joined.qzv[0m


In [3]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# Quality-filter the joined reads of every run (q-score with default settings)
# and summarize the filtered reads.
for sr in set(met.SeqRun):
  joined = 'Data/Joined/%s_joined.qza'%sr  # input: joined reads
  q_filt = 'Data/Joined/%s_q_filt.qza'%sr  # output: filtered reads
  q_stat = 'Data/Joined/%s_q_stat.qza'%sr  # output: per-sample filter statistics
  qvfilt = 'Data/Joined/%s_q_filt.qzv'%sr  # output: summary of filtered reads

  !qiime quality-filter q-score \
    --i-demux $joined \
    --o-filtered-sequences $q_filt \
    --o-filter-stats $q_stat
  !qiime demux summarize \
    --i-data $q_filt \
    --o-visualization $qvfilt

[32mSaved SampleData[JoinedSequencesWithQuality] to: Data/Joined/SeqRun_2_q_filt.qza[0m
[32mSaved QualityFilterStats to: Data/Joined/SeqRun_2_q_stat.qza[0m
[32mSaved Visualization to: Data/Joined/SeqRun_2_q_filt.qzv[0m


In [4]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# Dereplicate the quality-filtered reads of every run into a feature table plus
# the matching sequences, then summarize the table.
for sr in set(met.SeqRun):
  q_filt = 'Data/Joined/%s_q_filt.qza'%sr      # input: quality-filtered reads
  t_drpl = 'Data/Joined/%s_q_drpl_tab.qza'%sr  # output: dereplicated table
  tvdrpl = 'Data/Joined/%s_q_drpl_tab.qzv'%sr  # output: table summary
  q_drpl = 'Data/Joined/%s_q_drpl_seq.qza'%sr  # output: dereplicated sequences

  !qiime vsearch dereplicate-sequences \
    --i-sequences $q_filt \
    --o-dereplicated-table $t_drpl \
    --o-dereplicated-sequences $q_drpl
  !qiime feature-table summarize \
    --i-table $t_drpl \
    --o-visualization $tvdrpl \
    --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/Joined/SeqRun_2_q_drpl_tab.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Joined/SeqRun_2_q_drpl_seq.qza[0m
[32mSaved Visualization to: Data/Joined/SeqRun_2_q_drpl_tab.qzv[0m


In [5]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# Cluster the dereplicated features of every run de novo at 97 % identity
# (--p-perc-identity 0.97) and summarize the clustered table.
for sr in set(met.SeqRun):
  t_drpl = 'Data/Joined/%s_q_drpl_tab.qza'%sr  # input: dereplicated table
  q_drpl = 'Data/Joined/%s_q_drpl_seq.qza'%sr  # input: dereplicated sequences
  t_clst = 'Data/Joined/%s_q_clst_tab.qza'%sr  # output: clustered table
  tvclst = 'Data/Joined/%s_q_clst_tab.qzv'%sr  # output: table summary
  q_clst = 'Data/Joined/%s_q_clst_seq.qza'%sr  # output: clustered sequences

  !qiime vsearch cluster-features-de-novo \
    --i-sequences $q_drpl \
    --i-table $t_drpl \
    --p-perc-identity 0.97 \
    --p-threads 10 \
    --o-clustered-table $t_clst \
    --o-clustered-sequences $q_clst
  !qiime feature-table summarize \
    --i-table $t_clst \
    --o-visualization $tvclst \
    --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/Joined/SeqRun_2_q_clst_tab.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Joined/SeqRun_2_q_clst_seq.qza[0m
[32mSaved Visualization to: Data/Joined/SeqRun_2_q_clst_tab.qzv[0m


In [6]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# De novo chimera detection on the clustered features of every run.
for sr in set(met.SeqRun):
  t_clst = 'Data/Joined/%s_q_clst_tab.qza'%sr  # input: clustered table
  q_clst = 'Data/Joined/%s_q_clst_seq.qza'%sr  # input: clustered sequences
  chimer = 'Data/Joined/%s_chimeras.qza'  %sr  # output: chimeric sequences
  nonchs = 'Data/Joined/%s_nonchimera.qza'%sr  # output: non-chimeric sequences
  # NOTE(review): file name is spelled "stata" (not "stats"); nothing visible in
  # this notebook reads it back, so it is left as is.
  stats  = 'Data/Joined/%s_stata.qza'     %sr

  !qiime vsearch uchime-denovo \
    --i-sequences $q_clst \
    --i-table $t_clst \
    --o-chimeras $chimer \
    --o-nonchimeras $nonchs \
    --o-stats $stats

[32mSaved FeatureData[Sequence] to: Data/Joined/SeqRun_2_chimeras.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Joined/SeqRun_2_nonchimera.qza[0m
[32mSaved UchimeStats to: Data/Joined/SeqRun_2_stata.qza[0m


In [13]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# Restrict the clustered table of every run to the features listed in the
# non-chimeric sequences artifact (passed as feature metadata), then summarize.
for sr in set(met.SeqRun):
  t_clst = 'Data/Joined/%s_q_clst_tab.qza'%sr  # input: clustered table
  table  = 'Data/Joined/%s_table.qza'     %sr  # output: chimera-free table
  tablv  = 'Data/Joined/%s_table.qzv'     %sr  # output: table summary
  nonchs = 'Data/Joined/%s_nonchimera.qza'%sr  # input: non-chimeric sequences

  !qiime feature-table filter-features \
    --i-table $t_clst \
    --m-metadata-file $nonchs \
    --o-filtered-table $table
  !qiime feature-table summarize \
    --i-table $table \
    --o-visualization $tablv \
    --m-sample-metadata-file metadata.tsv  

[32mSaved FeatureTable[Frequency] to: Data/Joined/SeqRun_2_table.qza[0m
[32mSaved Visualization to: Data/Joined/SeqRun_2_table.qzv[0m


# Dada2 denoising

In [4]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

outdir = 'Data/Denoised_fwd'
!mkdir $outdir
for sr in set(met.SeqRun):
  cutad = 'Data/Cutadapt/%s_trim.qza'%sr
  table = outdir+'/%s_arc_table.qza'%sr
  taqzv = outdir+'/%s_arc_table.qzv'%sr
  repsq = outdir+'/%s_arc_rep-seqs.qza'%sr
  reqzv = outdir+'/%s_arc_rep-seqs.qzv'%sr
  stats = outdir+'/%s_arc_denoising-stats.qza'%sr
  stqzv = outdir+'/%s_arc_denoising-stats.qzv'%sr
  
  !qiime dada2 denoise-single \
    --i-demultiplexed-seqs $cutad \
    --p-trunc-len 0 \
    --p-max-ee 200 \
    --p-n-threads 6 \
    --o-table $table \
    --o-representative-sequences $repsq \
    --o-denoising-stats $stats

  !qiime feature-table tabulate-seqs \
    --i-data $repsq \
    --o-visualization $reqzv

  !qiime metadata tabulate \
    --m-input-file $stats \
    --o-visualization $stqzv

  !qiime feature-table summarize \
    --i-table $table \
    --o-visualization $taqzv \
    --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/Denoised_fwd/SeqRun_2_arc_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Denoised_fwd/SeqRun_2_arc_rep-seqs.qza[0m
[32mSaved SampleData[DADA2Stats] to: Data/Denoised_fwd/SeqRun_2_arc_denoising-stats.qza[0m
[32mSaved Visualization to: Data/Denoised_fwd/SeqRun_2_arc_rep-seqs.qzv[0m
[32mSaved Visualization to: Data/Denoised_fwd/SeqRun_2_arc_denoising-stats.qzv[0m
[32mSaved Visualization to: Data/Denoised_fwd/SeqRun_2_arc_table.qzv[0m


# Merge different runs

In [38]:
# Merge the per-run feature tables and representative sequences, then summarize
# the merged table.
# NOTE(review): the inputs are globbed from Data/Denoised_180-200/*bac_*, while
# the DADA2 cell in this notebook writes Data/Denoised_fwd/*_arc_* — confirm
# that these are the intended inputs.
!qiime feature-table merge \
  --i-tables Data/Denoised_180-200/*bac_table.qza \
  --o-merged-table Data/merged_table.qza

!qiime feature-table merge-seqs \
  --i-data Data/Denoised_180-200/*bac_rep-seqs.qza \
  --o-merged-data Data/merged_rep-seqs.qza

!qiime feature-table summarize \
  --i-table Data/merged_table.qza \
  --o-visualization Data/merged_table.qzv \
  --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/merged_table.qza[0m


In [22]:
# vsearch branch: keep features with a total frequency of at least 50 that occur
# in at least 4 samples, summarize the table and subset the sequences to match.
# Input paths are hard-coded to SeqRun_2.
table = 'Data/Joined/SeqRun_2_table.qza'
clstseq = 'Data/Joined/SeqRun_2_q_clst_seq.qza'

!qiime feature-table filter-features \
  --i-table $table \
  --p-min-frequency 50 \
  --p-min-samples 4 \
  --o-filtered-table Data/vsearch-table.qza

!qiime feature-table summarize \
  --i-table Data/vsearch-table.qza \
  --o-visualization Data/vsearch-table.qzv \
  --m-sample-metadata-file metadata.tsv

!qiime feature-table filter-seqs \
  --i-data $clstseq \
  --i-table Data/vsearch-table.qza \
  --o-filtered-data Data/vsearch-rep-seqs.qza

[32mSaved FeatureTable[Frequency] to: Data/vsearch-table.qza[0m
[32mSaved Visualization to: Data/vsearch-table.qzv[0m
[32mSaved FeatureData[Sequence] to: Data/vsearch-rep-seqs.qza[0m


In [5]:
# dada2 branch: keep features with a total frequency of at least 5 that occur in
# at least 2 samples, summarize the table and subset the sequences to match.
# Input paths are hard-coded to SeqRun_2.
table = 'Data/Denoised_fwd/SeqRun_2_arc_table.qza'
clstseq = 'Data/Denoised_fwd/SeqRun_2_arc_rep-seqs.qza'

!qiime feature-table filter-features \
  --i-table $table \
  --p-min-frequency 5 \
  --p-min-samples 2 \
  --o-filtered-table Data/dada2-table.qza

!qiime feature-table summarize \
  --i-table Data/dada2-table.qza \
  --o-visualization Data/dada2-table.qzv \
  --m-sample-metadata-file metadata.tsv

!qiime feature-table filter-seqs \
  --i-data $clstseq \
  --i-table Data/dada2-table.qza \
  --o-filtered-data Data/dada2-rep-seqs.qza

[32mSaved FeatureTable[Frequency] to: Data/dada2-table.qza[0m
[32mSaved Visualization to: Data/dada2-table.qzv[0m
[32mSaved FeatureData[Sequence] to: Data/dada2-rep-seqs.qza[0m


# Taxonomy assignment

In [6]:
# dada2 branch: assign taxonomy to the filtered representative sequences with
# the hybrid vsearch + sklearn classifier, then tabulate the result.
# Reference reads, reference taxonomy and the classifier are read from ../Classifier.
!qiime feature-classifier classify-hybrid-vsearch-sklearn \
  --i-query Data/dada2-rep-seqs.qza \
  --i-reference-reads ../Classifier/silva-138_1-ssu-nr99-seqs-its-uniq.qza \
  --i-reference-taxonomy ../Classifier/silva-138_1-ssu-nr99-tax-its-derep-uniq.qza \
  --i-classifier ../Classifier/ITS-ssu-nr99-classifier.qza \
  --p-threads 4 \
  --p-no-prefilter \
  --o-classification Data/its_dada2_taxonomy_vsearch-sklearn.qza

!qiime metadata tabulate \
  --m-input-file Data/its_dada2_taxonomy_vsearch-sklearn.qza \
  --o-visualization Data/its_dada2_taxonomy_vsearch-sklearn.qzv

[32mSaved FeatureData[Taxonomy] to: Data/its_dada2_taxonomy_vsearch-sklearn.qza[0m
[32mSaved Visualization to: Data/its_dada2_taxonomy_vsearch-sklearn.qzv[0m


# Combo: Combining ASV hashes with last available taxa 

In [3]:
# Install biopython
!pip install biopython

Collecting biopython
  Downloading biopython-1.78-cp36-cp36m-manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 1.9 MB/s eta 0:00:01
Installing collected packages: biopython
Successfully installed biopython-1.78


In [7]:
import pandas as pd
from Bio import SeqIO

# declare Qiime2 generated artifacts here
table    = 'Data/dada2-table.qza' 
taxo     = 'Data/its_dada2_taxonomy_vsearch-sklearn.qza'
rep_seq  = 'Data/dada2-rep-seqs.qza'

# export rep-seqs.qza, table.qza and taxonomy.qza
!mkdir Biom Taxa Rep-seqs #temp directories

!qiime tools export \
  --input-path $table \
  --output-path Biom/

!qiime tools export \
  --input-path $taxo \
  --output-path Taxa

!qiime tools export \
  --input-path $rep_seq \
  --output-path Rep-seqs/

# convert .biom to .tsv
!biom convert -i Biom/feature-table.biom -o Biom/feature-table.tsv --to-tsv 

# *****replacing hashes with combination of taxonomy and beginings of the hashes*****
# reading tables
taxa = pd.read_csv('Taxa/taxonomy.tsv', sep='\t')
biom = pd.read_csv('Biom/feature-table.tsv', sep='\t', skiprows=1)

#creating a new column with modified taxonomy
#I also shortened some annotations and deleted some symbols 
#that were crashing tree construction with modified files
taxa['Taxon'] = taxa.Taxon.replace(';__','').str.replace('[','').str.replace(']','').str.replace('.','')\
.str.replace('/','_').str.replace("'",'').str.replace(' ','_').str.replace('archaeon_enrichment','arc_enrich')\
.str.replace('uncultured_rumen','unc_rum').str.replace('uncultured_archaeon','unc_arc').str.replace('_archaeon','_arc')\
.str.replace('uncultured_compost','unc_comp').str.replace('uncultured_euryarchaeote','unc_euryarc')\
.str.replace('uncultured','unc').str.replace('unidentified','unid')

#dealing with uncultured taxa to provide additional information
taxa['Combo'] =  taxa['Taxon'].str.split("__").str[-1].str.split(";").str[-1]
for x in ['unc_euryarc','unc_rum','unid_methanogen','arc_enrich','unc_arc','unc_comp','unc']:
  taxa.loc[taxa['Combo'].str[:]==x,'Combo']=taxa['Taxon'].str.split("__").str[-2].str.split(';').str[0]+'_'+x
  
for n in range(3,6):
  taxa.loc[taxa.Combo.str[:]=='unc_unc_arc','Combo']=taxa.Taxon.str.split("__").str[-n].str.split(';').str[0]+'_unc_arc'
  taxa.loc[taxa.Combo.str[:]=='unc_unc_rum','Combo']=taxa.Taxon.str.split("__").str[-n].str.split(';').str[0]+'_unc_rum'
  taxa.loc[taxa.Combo.str[:]=='unc_unc','Combo']=taxa.Taxon.str.split("__").str[-n].str.split(';').str[0]+'_unc'
  
#add modified taxonomy information to feature hashes, separating them by '|'
biom['#OTU ID'] = taxa['Combo']+'|'+taxa['Feature ID']
taxa['Feature ID'] = biom['#OTU ID']
taxa = taxa[['Feature ID', 'Taxon', 'Confidence', 'Consensus', 'Method']]

### writing modified files
biom.to_csv('Biom/feature-table.tsv', sep='\t', index=False)
taxa.to_csv('Taxa/taxonomy.tsv', sep='\t', index=False)
fasta_hash  = r"Rep-seqs/dna-sequences.fasta"
fasta_combo = r"Rep-seqs/dna-sequences.fa"
hlist = biom['#OTU ID'].tolist()
with open(fasta_hash) as hashes, open(fasta_combo, 'w') as combo:
  for record in SeqIO.parse(fasta_hash, 'fasta'):
    for h in hlist:
      if str(record.id) in str(h):
        combo.write('>'+str(h)+'\n'+str(record.seq)+'\n')

#some cleaning and renaming
!rm $fasta_hash
!mv $fasta_combo $fasta_hash

[32mExported Data/dada2-table.qza as BIOMV210DirFmt to directory Biom/[0m
[32mExported Data/its_dada2_taxonomy_vsearch-sklearn.qza as TSVTaxonomyDirectoryFormat to directory Taxa[0m
[32mExported Data/dada2-rep-seqs.qza as DNASequencesDirectoryFormat to directory Rep-seqs/[0m


In [8]:
# Create new rep-seqs.qza, table.qza and taxonomy.qza with the modified feature
# IDs (written with a 'combo_' prefix in the file name).
# Relies on the Biom/, Taxa/ and Rep-seqs/ directories and on the `fasta_hash`
# variable left behind by the previous cell.
!biom convert -i Biom/feature-table.tsv -o Biom/feature-table.biom --table-type="OTU table" --to-hdf5

!qiime tools import \
  --input-path Biom/feature-table.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format \
  --output-path Data/combo_table.qza

!qiime tools import \
  --type 'FeatureData[Taxonomy]' \
  --input-path Taxa/taxonomy.tsv \
  --output-path Data/combo_taxonomy.qza

!qiime tools import \
  --input-path $fasta_hash \
  --type 'FeatureData[Sequence]' \
  --output-path Data/combo_rep-seqs.qza

!rm -r Biom Taxa Rep-seqs #clean temp directories

!qiime feature-table summarize \
  --i-table Data/combo_table.qza \
  --m-sample-metadata-file metadata.tsv \
  --o-visualization Data/combo_table.qzv

[32mImported Biom/feature-table.biom as BIOMV210Format to Data/combo_table.qza[0m
[32mImported Taxa/taxonomy.tsv as TSVTaxonomyDirectoryFormat to Data/combo_taxonomy.qza[0m
[32mImported Rep-seqs/dna-sequences.fasta as DNASequencesDirectoryFormat to Data/combo_rep-seqs.qza[0m
[32mSaved Visualization to: Data/combo_table.qzv[0m


# Filtering to remove low-abundance features

In [9]:
tabdir = 'Data/Divided_tables'

!mkdir $tabdir
!qiime taxa filter-table \
  --i-table Data/combo_table.qza \
  --i-taxonomy Data/combo_taxonomy.qza \
  --p-exclude mitochondria,chloroplast,d__Bacteria,d__Eukaryota \
  --p-include p__ \
  --o-filtered-table $tabdir/full-table.qza

#!qiime feature-table filter-samples \
#  --i-table $tabdir/full-table.qza \
#  --p-min-features 25 \
#  --p-min-frequency 10000 \
#  --o-filtered-table $tabdir/full-table.qza

!qiime feature-table summarize \
  --i-table $tabdir/full-table.qza \
  --o-visualization $tabdir/full-table.qzv \
  --m-sample-metadata-file metadata.tsv

!qiime feature-table filter-seqs \
  --i-data Data/combo_rep-seqs.qza \
  --i-table $tabdir/full-table.qza \
  --o-filtered-data Data/combo_rep-seqs.qza

mkdir: cannot create directory ‘Data/Divided_tables’: File exists
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/full-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/full-table.qzv[0m
[32mSaved FeatureData[Sequence] to: Data/combo_rep-seqs.qza[0m


# Generate a tree for phylogenetic diversity analysis

In [None]:
# Align the filtered representative sequences and build the trees; writes the
# alignment, the masked alignment, the unrooted tree and the rooted tree.
!qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences Data/combo_rep-seqs.qza \
  --p-n-threads 4 \
  --o-alignment Data/aligned-rep-seqs.qza \
  --o-masked-alignment Data/masked-aligned-rep-seqs.qza \
  --p-parttree \
  --o-tree Data/unrooted-tree.qza \
  --o-rooted-tree Data/rooted-tree.qza

# Separating table by day and body site for the analysis

In [2]:
def filt_smpl(tabin,col,tabout):
  """Filter the samples of a feature table with a metadata query.

  tabin:  path of the input feature table artifact.
  col:    query passed to --p-where (evaluated against metadata.tsv); despite
          the name it is a whole WHERE clause, not a column name.
  tabout: path the filtered table is written to.
  """
  !qiime feature-table filter-samples \
    --i-table $tabin \
    --m-metadata-file metadata.tsv \
    --p-where "$col" \
    --o-filtered-table $tabout

def filt_feat(tabin,minfrq,tabout):
  """Drop features below a minimum frequency from a feature table.

  tabin:  path of the input feature table artifact.
  minfrq: value passed to --p-min-frequency.
  tabout: path the filtered table is written to.
  """
  !qiime feature-table filter-features \
    --i-table $tabin \
    --p-min-frequency $minfrq \
    --o-filtered-table $tabout
    
def tabqzv(tabin,tabout):
  """Write a feature-table summary visualization.

  tabin:  path of the feature table artifact to summarize.
  tabout: path of the .qzv visualization; metadata.tsv is used as sample metadata.
  """
  !qiime feature-table summarize \
    --i-table $tabin \
    --o-visualization $tabout \
    --m-sample-metadata-file metadata.tsv

In [3]:
# Split the full table into one table per day (plus one combined d7/d13 table),
# drop features with a frequency below 5 from each, and summarize them.
tabdir = 'Data/Divided_tables'

for t in ['d0-RSP_RF','d7-E_FL','d13-E_FL_SAM_FR', 'd7_d13-E_FL']:
  if t == 'd7_d13-E_FL':
    # combined table: both days, without the SAM and FR body sites
    day = "[Day] IN ('d7','d13') AND NOT [BS] IN ('SAM','FR')"
  else:
    # the part of the name before '-' is the value of the Day column
    day = "[Day]='%s'" % t.split('-')[0]
  tab = tabdir+'/%s-table.qza' % t
  tav = tabdir+'/%s-table.qzv' % t

  filt_smpl(tabdir+'/full-table.qza',day,tab)
  # the feature filter overwrites the freshly written per-day table
  filt_feat(tab,5,tab)
  tabqzv(tab,tav)

[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d0-RSP_RF-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d0-RSP_RF-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/d0-RSP_RF-table.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d7-E_FL-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d7-E_FL-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/d7-E_FL-table.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d13-E_FL_SAM_FR-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d13-E_FL_SAM_FR-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/d13-E_FL_SAM_FR-table.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d7_d13-E_FL-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/d7_d13-E_FL-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/d7_d13-E_FL-table.qzv[0m


### List of all tables produced

In [4]:
# Directory holding the tables, and the name prefixes of all tables in it
# (files are named <prefix>-table.qza).
tabdir = 'Data/Divided_tables'
tables = ['d0-RSP_RF','d7-E_FL','d13-E_FL_SAM_FR','d7_d13-E_FL','full']

# Taxa barplots

### By samples

In [5]:
# One taxa barplot per table, with one bar per sample.
!mkdir -p Results/Taxa_barplots/By_samples
tabdir = 'Data/Divided_tables'
tables = ['d0-RSP_RF','d7-E_FL','d13-E_FL_SAM_FR','d7_d13-E_FL','full']
for tab in tables:
  table = tabdir+'/%s-table.qza' % tab                               # input table
  out = 'Results/Taxa_barplots/By_samples/%s-taxabarplot.qzv' % tab  # output plot
  
  !qiime taxa barplot \
    --i-table $table \
    --i-taxonomy Data/combo_taxonomy.qza \
    --m-metadata-file metadata.tsv \
    --o-visualization $out

[32mSaved Visualization to: Results/Taxa_barplots/By_samples/d0-RSP_RF-taxabarplot.qzv[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_samples/d7-E_FL-taxabarplot.qzv[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_samples/d13-E_FL_SAM_FR-taxabarplot.qzv[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_samples/d7_d13-E_FL-taxabarplot.qzv[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_samples/full-taxabarplot.qzv[0m


### By groups

In [6]:
import pandas as pd

tables = ['d0-RSP_RF','d7-E_FL','d13-E_FL_SAM_FR','d7_d13-E_FL','full']
tabdir = 'Data/Divided_tables'
outres = 'Results/Taxa_barplots/By_groups'
outdat = 'Data/Grouped_tables'
met = pd.read_csv('metadata.tsv',sep='\t',index_col=0)

groups = ['BS_Trt','BS_Src','BS_Dh','BS_Dh_Src','BS_Dh_Trt','BS_Day_Trt','BS_Day_Src']

for tab in tables:
  tabin = tabdir+'/%s-table.qza' % tab
  
  !mkdir -p $outres $outdat
  for group in groups:
  #Collapse metadata
    meta = met.drop_duplicates(subset=group).copy()
    meta.index = meta[group]
    meta.index.name = '#SampleID'
    meta.to_csv(outdat+'/%s-%s-metadata.tsv'%(tab,group),sep='\t')
  #Variables    
    grouped = outdat + '/%s-%s-group_table.qza' % (tab,group)
    taxabar = outres + '/%s-%s-taxabarplot.qzv' % (tab,group)
    metadata= outdat + '/%s-%s-metadata.tsv'    % (tab,group)
    
    try:
    #Group tables    
      !qiime feature-table group \
        --i-table $tabin \
        --p-axis 'sample' \
        --m-metadata-file metadata.tsv \
        --m-metadata-column $group \
        --p-mode 'mean-ceiling' \
        --o-grouped-table $grouped
    #Taxabarplots    
      !qiime taxa barplot \
        --i-table $grouped \
        --i-taxonomy Data/combo_taxonomy.qza \
        --m-metadata-file $metadata \
        --o-visualization $taxabar
    except:
      print('Looks like there is no %s group in table %s' % (group, tabin))

[32mSaved FeatureTable[Frequency] to: Data/Grouped_tables/d0-RSP_RF-BS_Trt-group_table.qza[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_groups/d0-RSP_RF-BS_Trt-taxabarplot.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Grouped_tables/d0-RSP_RF-BS_Src-group_table.qza[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_groups/d0-RSP_RF-BS_Src-taxabarplot.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Grouped_tables/d0-RSP_RF-BS_Dh-group_table.qza[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_groups/d0-RSP_RF-BS_Dh-taxabarplot.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Grouped_tables/d0-RSP_RF-BS_Dh_Src-group_table.qza[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_groups/d0-RSP_RF-BS_Dh_Src-taxabarplot.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Grouped_tables/d0-RSP_RF-BS_Dh_Trt-group_table.qza[0m
[32mSaved Visualization to: Results/Taxa_barplots/By_groups/d0-RSP_RF-BS_Dh_Trt-taxabarplot.qzv[0m
[32mSaved FeatureTa

# Alpha and beta diversity analysis

### Alpha rarefaction plotting

In [11]:
# Alpha rarefaction curves for the full table, up to a depth of 50000.
!qiime diversity alpha-rarefaction \
  --i-table Data/Divided_tables/full-table.qza \
  --i-phylogeny Data/rooted-tree.qza \
  --p-max-depth 50000 \
  --m-metadata-file metadata.tsv \
  --o-visualization Results/Alpha_rarefaction.qzv

[32mSaved Visualization to: Results/Alpha_rarefaction.qzv[0m


### Core-metrics-phylogenetic: Core diversity metrics (phylogenetic and non-phylogenetic)

In [12]:
# Core diversity metrics on the full table, rarefied to 12223 reads per sample;
# every output artifact is written into Results/Core-metrics.
# NOTE(review): QIIME 2 presumably refuses to write into an existing
# --output-dir, so Results/Core-metrics must not exist beforehand — confirm.
!qiime diversity core-metrics-phylogenetic \
  --i-phylogeny Data/rooted-tree.qza \
  --i-table Data/Divided_tables/full-table.qza \
  --p-sampling-depth 12223 \
  --m-metadata-file metadata.tsv \
  --p-n-jobs-or-threads 'auto' \
  --output-dir Results/Core-metrics

[32mSaved FeatureTable[Frequency] to: Results/Core-metrics/rarefied_table.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/faith_pd_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/observed_features_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/shannon_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/evenness_vector.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/unweighted_unifrac_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/weighted_unifrac_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/jaccard_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/bray_curtis_distance_matrix.qza[0m
[32mSaved PCoAResults to: Results/Core-metrics/unweighted_unifrac_pcoa_results.qza[0m
[32mSaved PCoAResults to: Results/Core-metrics/weighted_unifrac_pcoa_results.qza[0m
[32mSaved PCoAResults to: Resu

### Principal Coordinate Analysis Biplot

In [13]:
table = 'Data/Divided_tables/full-table.qza'
reltab = 'Data/Relative_tables/full-relative_table.qza'
  
  # Converting feature table [Frequency] to [Relative frequency]
!mkdir Data/Relative_tables
!qiime feature-table relative-frequency \
  --i-table Data/Divided_tables/full-table.qza \
  --o-relative-frequency-table $reltab

for artifact in ['unweighted_unifrac', 'weighted_unifrac', 'jaccard', 'bray_curtis']:
  pcoa  = 'Results/Core-metrics/%s_pcoa_results.qza' % artifact
  bidir = 'Results/Biplots/Biplots_full'
  biplot= bidir+'/%s_biplot.qza' % artifact
  bi_qzv= bidir+'/%s_biplot.qzv' % artifact

  !mkdir -p $bidir
    
  # pcoa-biplot: Principal Coordinate Analysis Biplot     
  !qiime diversity pcoa-biplot \
    --i-pcoa $pcoa \
    --i-features $reltab \
    --o-biplot $biplot

  !qiime emperor biplot \
    --i-biplot $biplot \
    --m-sample-metadata-file metadata.tsv \
    --p-ignore-missing-samples \
    --p-number-of-features 5 \
    --o-visualization $bi_qzv

[32mSaved FeatureTable[RelativeFrequency] to: Data/Relative_tables/full-relative_table.qza[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/Biplots_full/unweighted_unifrac_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/Biplots_full/unweighted_unifrac_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/Biplots_full/weighted_unifrac_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/Biplots_full/weighted_unifrac_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/Biplots_full/jaccard_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/Biplots_full/jaccard_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/Biplots_full/bray_curtis_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/Biplots_full/bray_curtis_biplot.qzv[0m


# ANCOM

In [7]:
!mkdir Results/ANCOM
def Ancom(column, ancom):
  """Run ANCOM at feature level on Results/ANCOM/table.qza.

  column: metadata.tsv column passed to --m-metadata-column.
  ancom:  path of the .qzv visualization to write.

  Expects Results/ANCOM/table.qza to exist already (filter_table in this cell
  writes it). The intermediate composition table is removed afterwards.
  """
  ctable = 'Results/ANCOM/ctable.qza'
  !qiime composition add-pseudocount \
    --i-table Results/ANCOM/table.qza \
    --o-composition-table $ctable
  !qiime composition ancom \
    --i-table $ctable \
    --m-metadata-file metadata.tsv \
    --m-metadata-column $column \
    --o-visualization $ancom
  !rm $ctable
    
def Ancol(lev, column, ancol):
  """Run ANCOM on Results/ANCOM/table.qza collapsed to a taxonomic level.

  lev:    level passed to `qiime taxa collapse --p-level`.
  column: metadata.tsv column passed to --m-metadata-column.
  ancol:  path of the .qzv visualization to write.

  Expects Results/ANCOM/table.qza to exist already (filter_table in this cell
  writes it). Both intermediate tables are removed afterwards.
  """
  ctable = 'Results/ANCOM/ctable.qza'
  ltable = 'Results/ANCOM/ltable.qza'
  !qiime taxa collapse \
    --i-table Results/ANCOM/table.qza \
    --i-taxonomy Data/combo_taxonomy.qza \
    --p-level $lev \
    --o-collapsed-table $ltable
  !qiime composition add-pseudocount \
    --i-table $ltable \
    --o-composition-table $ctable
  !qiime composition ancom \
    --i-table $ctable \
    --m-metadata-file metadata.tsv \
    --m-metadata-column $column \
    --o-visualization $ancol
  !rm $ctable $ltable 
  
def filter_table(df):
  df.index.names = ['SampleID']
  df.to_csv('Results/ANCOM/table.tsv',sep='\t')
  !qiime feature-table filter-samples \
    --i-table Data/Divided_tables/full-table.qza \
    --m-metadata-file Results/ANCOM/table.tsv \
    --o-filtered-table Results/ANCOM/table.qza
  !rm Results/ANCOM/table.tsv


In [8]:
import pandas as pd
from os import path

met = pd.read_csv('metadata.tsv',sep='\t',index_col=0)

cols = ['BS_Trt','BS_Src','BS_Dh']
days = ['d7','d13']
bodysites = ['E','FL','SAM','FR']
for day in days:
  for bs in bodysites:
    if day=='d7' and bs in ['SAM','FR']:
      continue
    dftab = met.loc[(met.Day==day)&(met.BS==bs)].copy()
    filter_table(dftab)
    for col in cols:
      if col=='BS_Dh' and bs!='FR':
        continue
        
      ancom = 'Results/ANCOM/%s_%s_%s_ancom.qzv'%(day,bs,col)
      Ancom(col,ancom)
      for lev in [6,7]:
        ancol = 'Results/ANCOM/%s_%s_%s_ancom_%s.qzv'%(day,bs,col,lev)
        Ancol(lev,col,ancol)
    !rm Results/ANCOM/table.qza

[32mSaved FeatureTable[Frequency] to: Results/ANCOM/table.qza[0m
[32mSaved FeatureTable[Composition] to: Results/ANCOM/ctable.qza[0m
[32mSaved Visualization to: Results/ANCOM/d7_E_BS_Trt_ancom.qzv[0m
[32mSaved FeatureTable[Frequency] to: Results/ANCOM/ltable.qza[0m
[32mSaved FeatureTable[Composition] to: Results/ANCOM/ctable.qza[0m
[32mSaved Visualization to: Results/ANCOM/d7_E_BS_Trt_ancom_6.qzv[0m
[32mSaved FeatureTable[Frequency] to: Results/ANCOM/ltable.qza[0m
[32mSaved FeatureTable[Composition] to: Results/ANCOM/ctable.qza[0m
[32mSaved Visualization to: Results/ANCOM/d7_E_BS_Trt_ancom_7.qzv[0m
[32mSaved FeatureTable[Composition] to: Results/ANCOM/ctable.qza[0m
[32mSaved Visualization to: Results/ANCOM/d7_E_BS_Src_ancom.qzv[0m
[32mSaved FeatureTable[Frequency] to: Results/ANCOM/ltable.qza[0m
[32mSaved FeatureTable[Composition] to: Results/ANCOM/ctable.qza[0m
[32mSaved Visualization to: Results/ANCOM/d7_E_BS_Src_ancom_6.qzv[0m
[32mSaved FeatureTable[Freq

# Longitudinal analysis

### Pairwise-differences and distances

In [9]:
import pandas as pd

def df_matrix(beta,met):
  """Return the square sub-matrix of `beta` for the samples listed in `met`.

  beta: distance matrix as a DataFrame (sample IDs as index and columns).
  met:  metadata DataFrame whose index holds the sample IDs to keep.

  Rows and columns are both restricted to the IDs present in `met.index`, in
  `beta.index` and in `beta.columns`, and are returned in sorted order. IDs that
  are in the metadata but absent from the matrix are ignored on both axes
  instead of raising KeyError on the column selection.
  """
  shared = sorted(set(met.index) & set(beta.index) & set(beta.columns))
  return beta.loc[shared, shared].copy()

def first_diff(metric,col,diff):
  """Run `qiime longitudinal pairwise-differences` between Day d7 and d13.

  metric: column of Results/First_differences/alpha.tsv passed to --p-metric.
  col:    metadata column passed to --p-group-column.
  diff:   path of the .qzv visualization to write.

  Samples are paired through the IID column; replicates are handled with the
  'random' strategy.
  """
  !qiime longitudinal pairwise-differences \
    --m-metadata-file Results/First_differences/alpha.tsv \
    --p-metric $metric \
    --p-group-column $col \
    --p-state-column Day \
    --p-state-1 d7 \
    --p-state-2 d13 \
    --p-individual-id-column IID \
    --p-replicate-handling random \
    --o-visualization $diff
  
def first_dist(dista,col,distv):
  """Run `qiime longitudinal pairwise-distances` between Day d7 and d13.

  dista: path of the DistanceMatrix artifact to read.
  col:   metadata column passed to --p-group-column.
  distv: path of the .qzv visualization to write.

  Uses Results/First_differences/alpha.tsv as metadata; samples are paired
  through the IID column and replicates are handled with the 'random' strategy.
  """
  !qiime longitudinal pairwise-distances \
    --i-distance-matrix $dista \
    --m-metadata-file Results/First_differences/alpha.tsv \
    --p-group-column $col \
    --p-state-column Day \
    --p-state-1 d7 \
    --p-state-2 d13 \
    --p-individual-id-column IID \
    --p-replicate-handling random \
    --o-visualization $distv
  
def distance_matrix(matr,div,col,dista):
  matr.to_csv('Results/First_distances/matrix.tsv',sep='\t')
  matr = 'Results/First_distances/matrix.tsv'
      
  !qiime tools import \
    --input-path $matr \
    --output-path $dista \
    --type DistanceMatrix
  !rm $matr

def beta_div(qza):
  """Load the distance matrix stored inside a QIIME 2 artifact as a DataFrame.

  qza: path of an artifact (a zip archive) containing
       <top-level directory>/data/distance-matrix.tsv.

  Returns the matrix with its first column as index.
  Raises FileNotFoundError when the archive has no such member.

  The member is read straight from the archive, so nothing is extracted into
  the working directory and no `unzip` console output has to be parsed.
  """
  import io
  import zipfile

  with zipfile.ZipFile(qza) as archive:
    for member in archive.namelist():
      parts = member.split('/')
      # exactly <top-level directory>/data/distance-matrix.tsv
      if len(parts) == 3 and parts[1:] == ['data', 'distance-matrix.tsv']:
        with archive.open(member) as raw:
          text = io.TextIOWrapper(raw, encoding='utf-8')
          return pd.read_csv(text, sep='\t', index_col=0)
  raise FileNotFoundError('no data/distance-matrix.tsv member in %s' % qza)

In [10]:
diversity = {'observed_features':'Observed ASVs',
             'shannon_entropy':'Shannon\'s entropy',
             'faith_pd':'Faith\'s PD',
             'pielou_evenness':'Pielou\'s evenness'}
cols = ['BS_Trt','BS_Src']

df = pd.read_csv('Results/Core-metrics/alpha.tsv',sep='\t',index_col='#SampleID')
alpha = df.loc[(df.Day!='d0')&(df.BS!='SAM')&(df.BS!='FR')].copy()
alpha['IID'] = ''
for group in set(alpha.BS_Day_Trt_Src):
  temp = alpha.loc[alpha.BS_Day_Trt_Src==group].copy()
  for i,index in enumerate(temp.index):
    alpha.loc[index,'IID'] = temp.loc[index,'BS_Trt_Src']+'_'+str(i)
      
!mkdir Results/First_differences
alpha.to_csv('Results/First_differences/alpha.tsv',sep='\t')
for div in diversity:
  for col in cols:
    diff = 'Results/First_differences/%s_%s_first-diff.qzv'%(div,col)
    first_diff(div,col,diff)

FileNotFoundError: [Errno 2] No such file or directory: 'Results/Core-metrics/alpha.tsv'

In [None]:
diversity = {'unweighted_unifrac':'Unweighted UniFrac',
             'weighted_unifrac':'Weighted UniFrac',
             'jaccard':'Jaccard',
             'bray_curtis':'Bray-Curtis'}
cols = ['BS_Trt','BS_Src']
df = pd.read_csv('Results/Core-metrics/alpha.tsv',sep='\t',index_col='#SampleID')
met = df.loc[(df.Day!='d0')&(df.BS!='SAM')&(df.BS!='FR')].copy()
!mkdir Results/First_distances
for div in diversity:
  beta = beta_div('Results/Core-metrics/%s_distance_matrix.qza'%div)
  for col in cols:
    distv = 'Results/First_distances/%s_%s_first-dist.qzv'%(div,col)
    dista = 'Results/First_distances/%s_%s_distances.qza'%(div,col)
    matr = df_matrix(beta,met)
    distance_matrix(matr,div,col,dista)
    first_dist(dista,col,distv)