# Import demultiplexed filtered reads

In [None]:
import pandas as pd
import os
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# sort reads by sequencing run
for ri in set(met.Run_index):
  tmp = 'Data/Demux/'+met.loc[met.Run_index == ri]['SeqRun'].tolist()[0]
  !mkdir $tmp
  !mv Data/Demux/$ri/* $tmp
  !rm -r Data/Demux/$ri
  for fq in os.listdir(tmp):
    if '.fastq.' in fq:
      continue
    fastq = fq.replace('.fq.','.fastq.')
    !mv $tmp/$fq $tmp/$fastq
  

In [6]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# import reads
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  deqzv = 'Data/Demux/%s_demux.qzv'%sr
  
  !qiime tools import \
    --type 'SampleData[PairedEndSequencesWithQuality]' \
    --input-path Data/Demux/$sr \
    --input-format CasavaOneEightSingleLanePerSampleDirFmt \
    --output-path $demux
  
  !qiime demux summarize \
    --i-data $demux \
    --o-visualization $deqzv

[32mImported Data/Demux/SeqRun_2 as CasavaOneEightSingleLanePerSampleDirFmt to Data/Demux/SeqRun_2_demux.qza[0m
[32mSaved Visualization to: Data/Demux/SeqRun_2_demux.qzv[0m


In [1]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

cutadapt = 'Data/Cutadapt'
!mkdir $cutadapt
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  cutad = cutadapt+'/%s_trim.qza'%sr
  cuqzv = cutadapt+'/%s_trim.qzv'%sr
  
  !qiime cutadapt trim-paired \
    --i-demultiplexed-sequences $demux \
    --o-trimmed-sequences $cutad \
    --p-cores 10 \
    --p-front-f GYGCASCAGKCGMGAA \
    --p-adapter-f GGGGGGGGGGGGGGGGGGGG \
    --p-front-r GGACTACVSGGGTATCTAAT \
    --p-adapter-r GGGGGGGGGGGGGGGGGGGG \
    --p-error-rate 0.2 \
    --p-match-adapter-wildcards \
    --p-discard-untrimmed \
    --p-match-read-wildcards

  # Visualization
  !qiime demux summarize \
      --i-data $cutad \
      --o-visualization $cuqzv

[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Cutadapt/SeqRun_2_trim.qza[0m
[32mSaved Visualization to: Data/Cutadapt/SeqRun_2_trim.qzv[0m


# Dada2 denoising

In [2]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

outdir = 'Data/Denoised_Dada2'
!mkdir $outdir
for sr in set(met.SeqRun):
  cutad = 'Data/Cutadapt/%s_trim.qza'%sr
  table = outdir+'/%s_arc_table.qza'%sr
  taqzv = outdir+'/%s_arc_table.qzv'%sr
  repsq = outdir+'/%s_arc_rep-seqs.qza'%sr
  reqzv = outdir+'/%s_arc_rep-seqs.qzv'%sr
  stats = outdir+'/%s_arc_denoising-stats.qza'%sr
  stqzv = outdir+'/%s_arc_denoising-stats.qzv'%sr
  
  !qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $cutad \
    --p-trunc-len-f 0\
    --p-trunc-len-r 0 \
    --p-n-threads 0 \
    --o-table $table \
    --p-min-overlap 4 \
    --p-max-ee-r 20 \
    --p-max-ee-f 20 \
    --o-representative-sequences $repsq \
    --o-denoising-stats $stats
  
  !qiime feature-table tabulate-seqs \
    --i-data $repsq \
    --o-visualization $reqzv

  !qiime metadata tabulate \
    --m-input-file $stats \
    --o-visualization $stqzv

  !qiime feature-table summarize \
    --i-table $table \
    --o-visualization $taqzv \
    --m-sample-metadata-file metadata.tsv

mkdir: cannot create directory ‘Data/Denoised_Dada2’: File exists
[32mSaved FeatureTable[Frequency] to: Data/Denoised_Dada2/SeqRun_2_arc_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Denoised_Dada2/SeqRun_2_arc_rep-seqs.qza[0m
[32mSaved SampleData[DADA2Stats] to: Data/Denoised_Dada2/SeqRun_2_arc_denoising-stats.qza[0m
[32mSaved Visualization to: Data/Denoised_Dada2/SeqRun_2_arc_rep-seqs.qzv[0m
[32mSaved Visualization to: Data/Denoised_Dada2/SeqRun_2_arc_denoising-stats.qzv[0m
[32mSaved Visualization to: Data/Denoised_Dada2/SeqRun_2_arc_table.qzv[0m


# Merge different runs

In [4]:
# Merge the per-run DADA2 feature tables into Data/merged_table.qza.
# The shell glob passes every file in Data/Denoised_Dada2 whose name ends in
# "arc_table.qza" to --i-tables.
!qiime feature-table merge \
  --i-tables Data/Denoised_Dada2/*arc_table.qza \
  --o-merged-table Data/merged_table.qza

# Merge the per-run representative sequences the same way.
!qiime feature-table merge-seqs \
  --i-data Data/Denoised_Dada2/*arc_rep-seqs.qza \
  --o-merged-data Data/merged_rep-seqs.qza

# Summarize the merged table together with the sample metadata.
!qiime feature-table summarize \
  --i-table Data/merged_table.qza \
  --o-visualization Data/merged_table.qzv \
  --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/merged_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/merged_rep-seqs.qza[0m
[32mSaved Visualization to: Data/merged_table.qzv[0m


In [5]:
# Filter the merged DADA2 table and representative sequences.
# NOTE: both filter commands write their output to the same path they read
# from, so the unfiltered merged artifacts are overwritten by this cell.
table = 'Data/merged_table.qza'
reseq = 'Data/merged_rep-seqs.qza'

# Keep features with a total frequency of at least 10 that are present in at
# least 2 samples.
!qiime feature-table filter-features \
  --i-table $table \
  --p-min-frequency 10 \
  --p-min-samples 2 \
  --o-filtered-table $table

# Regenerate the summary so that it reflects the filtered table.
!qiime feature-table summarize \
  --i-table $table \
  --o-visualization Data/merged_table.qzv \
  --m-sample-metadata-file metadata.tsv

# Keep only the sequences whose features are still in the filtered table.
!qiime feature-table filter-seqs \
  --i-data $reseq \
  --i-table $table \
  --o-filtered-data $reseq

[32mSaved FeatureTable[Frequency] to: Data/merged_table.qza[0m
[32mSaved Visualization to: Data/merged_table.qzv[0m
[32mSaved FeatureData[Sequence] to: Data/merged_rep-seqs.qza[0m


# Taxonomy assignment

In [6]:
# Assign taxonomy to the merged DADA2 representative sequences with the hybrid
# vsearch + sklearn classifier, using reference reads, reference taxonomy and a
# pre-trained classifier stored outside this project (relative paths below).
# NOTE(review): the --i-classifier argument is a shell glob; it presumably has
# to match exactly one file -- confirm.
!qiime feature-classifier classify-hybrid-vsearch-sklearn \
  --i-query Data/merged_rep-seqs.qza \
  --i-reference-reads ../../../Porto/Classifiers/silva138_1-99-seqs-Arch349F-806R.qza \
  --i-reference-taxonomy ../../../Porto/Classifiers/silva138_1-99-taxa-Arch349F-806R.qza \
  --i-classifier ../../../Porto/Classifiers/Archaea_*-99-classifier.qza \
  --p-threads 4 \
  --p-no-prefilter \
  --o-classification Data/arc_dada2_taxonomy_vsearch-sklearn.qza

# Tabulate the classification so that it can be inspected as a visualization.
!qiime metadata tabulate \
  --m-input-file Data/arc_dada2_taxonomy_vsearch-sklearn.qza \
  --o-visualization Data/arc_dada2_taxonomy_vsearch-sklearn.qzv

[32mSaved FeatureData[Taxonomy] to: Data/arc_dada2_taxonomy_vsearch-sklearn.qza[0m
[32mSaved Visualization to: Data/arc_dada2_taxonomy_vsearch-sklearn.qzv[0m


# Combo: combining ASV hashes with the last available taxon name

In [3]:
# Install biopython
!pip install biopython

Collecting biopython
  Downloading biopython-1.78-cp36-cp36m-manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 1.9 MB/s eta 0:00:01
Installing collected packages: biopython
Successfully installed biopython-1.78


In [7]:
import pandas as pd
from Bio import SeqIO

# declare Qiime2 generated and output artifacts here
# *in: artifacts read by this cell; *ex: "combo" artifacts written by it;
# tabev: summary visualization of the combo table
tabin = 'Data/merged_table.qza'     
taxin = 'Data/arc_dada2_taxonomy_vsearch-sklearn.qza'
repin = 'Data/merged_rep-seqs.qza'
tabex = 'Data/combo_table.qza'
taxex = 'Data/combo_taxonomy.qza'
repex = 'Data/combo_rep-seqs.qza'
tabev = 'Data/combo_table.qzv'

# export rep-seqs.qza, table.qza and taxonomy.qza
# Biom, Taxa and Rep-seqs are scratch folders in the working directory; they
# are removed again further down in this cell.
!mkdir Biom Taxa Rep-seqs #temp directories

!qiime tools export \
  --input-path $tabin \
  --output-path Biom/

!qiime tools export \
  --input-path $taxin \
  --output-path Taxa

!qiime tools export \
  --input-path $repin \
  --output-path Rep-seqs/

# convert .biom to .tsv
!biom convert -i Biom/feature-table.biom -o Biom/feature-table.tsv --to-tsv 

# ***** replacing hashes with a combination of taxonomy and the hashes *****
# Read the exported tables; the first column (feature hash) becomes the index.
# The first line of the exported feature-table.tsv is skipped.
taxa = pd.read_csv('Taxa/taxonomy.tsv', sep='\t',index_col=0)
biom = pd.read_csv('Biom/feature-table.tsv', sep='\t', skiprows=1,index_col=0)

# Clean-up rules for the taxonomy strings, applied in insertion order. They
# shorten some annotations and delete symbols that were crashing tree
# construction with the modified files.
# Keys longer than one character are regular expressions (e.g. the escaped
# r'\(Marine_B\)', and the '.' inside 'UCG.002' matches any character);
# single-character keys are plain text.
tax_rep = {';__':'','[':'',']':'','.':'','/':'_',"'":'_',' ':'_','archaeon_enrichment':'a_e','YAB2003':'uncl(YAB2003)',
  'uncultured_rumen':'uncl','uncultured_archaeon':'uncl','uncultured_euryarchaeote':'uncl','DNF00809':'uncl(DNF00809)',
  'uncultured_compost':'uncl','_archaeon':'uncl','unidentified':'unid','unid_rumen':'unid','RF39':'uncl(RF39)',
  'uncultured_bacterium':'uncl','uncultured':'uncl','rumen_bacterium':'uncl','p-251-o5':'uncl(p-251-o5)','group':'gr',
  '_gr':'','F082':'uncl(F082)','WPS-2':'uncl(WPS-2)',r'\(Marine_B\)':'','Family_XIII_AD3011':'uncl(Family_XIII_AD3011)',
  'WCHB1.41':'uncl(WCHB1.41)','UCG.002':'uncl(UCG.002)','NK4A214':'uncl(NK4A214)','X0319.6G20':'uncl(X0319.6G20)',
  'NC2004':'uncl(NC2004)','SAR324_clade':'uncl(SAR324-clade)','UCG-006':'uncl(UCG-006)','SN8':'uncl(SN8)','RF16':'uncl(RF16)',
  'FD2005':'uncl(FD2005)','vadinHA49':'uncl(vadinHA49)','Clostridium_sensu_stricto_1':'Clostridium(sensu_stricto_1)',
  'UCG-014':'uncl(UCG-014)','UCG-010':'uncl(UCG-010)','UCG-004':'uncl(UCG-004)','Ga6A1':'uncl(Ga6A1)',
  'vadinBE97':'uncl(vadinBE97)','vadinBB60':'uncl(vadinBB60)','XBB1006':'uncl(XBB1006)','FE2018':'uncl(FE2018)',
  'UCG-005':'uncl(UCG-005)','UCG-007':'uncl(UCG-007)','UCG-009':'uncl(UCG-009)','R-7':'uncl(R-7)','UCG-003':'uncl(UCG-003)',
  'RC9_gut':'uncl(RC9_gut)','probable_genus_10':'uncl(gen10)','0319-6G20':'uncl(0319-6G20)','AC2044':'uncl(AC2044)',
  'FCS020':'uncl(FCS020)','Family_XIII_UCG-001':'uncl(Family_XIII_UCG001)','UCG-001':'uncl(UCG-001)','NK3A20':'uncl(NK3A20)',
  'Lineage_IV':'uncl(Lineage_IV)'}
taxa['Taxon2'] = taxa['Taxon']
for key, repl in tax_rep.items():
  # Older pandas treated one-character patterns literally even with
  # regex=True; newer releases compile them, so '[' would raise and '.' would
  # delete every character. Requesting a literal match for one-character keys
  # gives the same result on both.
  taxa['Taxon2'] = taxa['Taxon2'].str.replace(key, repl, regex=len(key) > 1)

# Rank names of the cleaned taxonomy string, split at the "__" rank prefixes
levels = taxa['Taxon2'].str.split("__")

# Combo starts as the name of the last (deepest) rank of the cleaned string
taxa['Combo'] = levels.str[-1].str.split(";").str[-1]

# dealing with uncultured taxa to provide additional information:
# an uninformative last name gets the name of the rank above it as a prefix
parent = levels.str[-2].str.split(';').str[0]
# NOTE(review): 'uncl(Family_XIII_UCG-001)' below can never match, because
# tax_rep produces 'uncl(Family_XIII_UCG001)' (no hyphen) -- confirm which
# spelling is intended.
for x in ['uncl','unid_methanogen','unid','methanogenic_uncl','uncl_methanogenic_uncl','uncl(UCG-004)','uncl(X0319.6G20)',
          'uncl(DNF00809)','uncl(UCG-006)','uncl(NC2004)','uncl(RF39)','uncl(SN8)','uncl(FD2005)',
          'uncl(vadinHA49)','uncl(vadinBE97)','uncl(vadinBB60)','uncl(XBB1006)','uncl(FE2018)','uncl(UCG-005)','uncl(UCG-007)',
          'uncl(UCG-009)','uncl(R-7)','uncl(UCG-003)','uncl(RF16)','uncl(Ga6A1)','uncl(RC9_gut)','uncl(gen10)','uncl(0319-6G20)',
          'uncl(FCS020)','uncl(Family_XIII_UCG-001)','uncl(Lineage_IV)',]:
  taxa.loc[taxa['Combo']==x,'Combo']=parent+'_'+x

# If the prefix was uninformative as well, climb further up (3rd, 4th and 5th
# rank from the end) and keep only the part of the label after the first '_'.
for n in range(3,6):
  ancestor = levels.str[-n].str.split(';').str[0]
  for y in ['uncl_uncl','unid_uncl','uncl_unid','methanogenic_uncl','uncl(RF39)_uncl(RF39)','uncl(vadinHA49)_uncl(vadinHA49)',
            'uncl(0319-6G20)_uncl(0319-6G20)','uncl(vadinBE97)_uncl(vadinBE97)','unid_unid',]:
    taxa.loc[taxa['Combo']==y,'Combo']=ancestor+'_'+y.split('_',1)[1]

# Label every feature as "<Combo>|<hash>" and make that label the row index of
# both the feature table ('#OTU ID') and the taxonomy table ('Feature ID').
combo_ids = taxa['Combo']+'|'+taxa.index
biom['New ID'] = combo_ids
biom = biom.set_index('New ID')
biom.index.name = '#OTU ID'
taxa['New ID'] = combo_ids
taxa = taxa.set_index('New ID')
taxa.index.name = 'Feature ID'
# keep only the original taxonomy columns (drops the helper columns)
taxa = taxa[['Taxon', 'Confidence', 'Consensus', 'Method']]

### writing modified files
biom.to_csv('Biom/feature-table.tsv', sep='\t',)
taxa.to_csv('Taxa/taxonomy.tsv', sep='\t',)
fasta_hash  = r"Rep-seqs/dna-sequences.fasta"
fasta_combo = r"Rep-seqs/dna-sequences.fa"
hlist = biom.index.tolist()
with open(fasta_hash) as hashes, open(fasta_combo, 'w') as combo:
  for record in SeqIO.parse(fasta_hash, 'fasta'):
    for h in hlist:
      if str(record.id) in str(h):
        combo.write('>'+str(h)+'\n'+str(record.seq)+'\n')

#some cleaning and renaming
!rm $fasta_hash
!mv $fasta_combo $fasta_hash

#creating new rep-seqs.qza, table.qza and taxonomy.qza with modified hashes (added 'combo_' in the name)
!biom convert -i Biom/feature-table.tsv -o Biom/feature-table.biom --table-type="OTU table" --to-hdf5

!qiime tools import \
  --input-path Biom/feature-table.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format \
  --output-path $tabex

!qiime tools import \
  --type 'FeatureData[Taxonomy]' \
  --input-path Taxa/taxonomy.tsv \
  --output-path $taxex

!qiime tools import \
  --input-path $fasta_hash \
  --type 'FeatureData[Sequence]' \
  --output-path $repex

!rm -r Biom Taxa Rep-seqs #clean temp directories

!qiime feature-table summarize \
  --i-table $tabex \
  --o-visualization $tabev
!mv $fasta_combo $fasta_hash

[32mExported Data/merged_table.qza as BIOMV210DirFmt to directory Biom/[0m
[32mExported Data/arc_dada2_taxonomy_vsearch-sklearn.qza as TSVTaxonomyDirectoryFormat to directory Taxa[0m
[32mExported Data/merged_rep-seqs.qza as DNASequencesDirectoryFormat to directory Rep-seqs/[0m
[32mImported Biom/feature-table.biom as BIOMV210Format to Data/combo_table.qza[0m
[32mImported Taxa/taxonomy.tsv as TSVTaxonomyDirectoryFormat to Data/combo_taxonomy.qza[0m
[32mImported Rep-seqs/dna-sequences.fasta as DNASequencesDirectoryFormat to Data/combo_rep-seqs.qza[0m
[32mSaved Visualization to: Data/combo_table.qzv[0m
mv: cannot stat 'Rep-seqs/dna-sequences.fa': No such file or directory


# Filtering to remove off-target (bacterial, eukaryotic, organellar) features and features without a phylum-level annotation

In [8]:
tabdir = 'Data/Divided_tables'

!mkdir $tabdir
!qiime taxa filter-table \
  --i-table Data/combo_table.qza \
  --i-taxonomy Data/combo_taxonomy.qza \
  --p-exclude mitochondria,chloroplast,d__Bacteria,d__Eukaryota \
  --p-include p__ \
  --o-filtered-table $tabdir/full-table.qza

#!qiime feature-table filter-samples \
#  --i-table $tabdir/full-table.qza \
#  --p-min-features 25 \
#  --p-min-frequency 10000 \
#  --o-filtered-table $tabdir/full-table.qza

!qiime feature-table summarize \
  --i-table $tabdir/full-table.qza \
  --o-visualization $tabdir/full-table.qzv \
  --m-sample-metadata-file metadata.tsv

!qiime feature-table filter-seqs \
  --i-data Data/combo_rep-seqs.qza \
  --i-table $tabdir/full-table.qza \
  --o-filtered-data Data/combo_rep-seqs.qza

mkdir: cannot create directory ‘Data/Divided_tables’: File exists
[32mSaved FeatureTable[Frequency] to: Data/Divided_tables/full-table.qza[0m
[32mSaved Visualization to: Data/Divided_tables/full-table.qzv[0m
[32mSaved FeatureData[Sequence] to: Data/combo_rep-seqs.qza[0m


# Generate a tree for phylogenetic diversity analysis

In [9]:
# Build the phylogeny from the filtered combo representative sequences.
# One pipeline call writes four artifacts: the alignment, the masked alignment,
# the unrooted tree and the rooted tree (the rooted tree is the one passed to
# the diversity commands below).
!qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences Data/combo_rep-seqs.qza \
  --p-n-threads 4 \
  --o-alignment Data/aligned-rep-seqs.qza \
  --o-masked-alignment Data/masked-aligned-rep-seqs.qza \
  --o-tree Data/unrooted-tree.qza \
  --o-rooted-tree Data/rooted-tree.qza

[32mSaved FeatureData[AlignedSequence] to: Data/aligned-rep-seqs.qza[0m
[32mSaved FeatureData[AlignedSequence] to: Data/masked-aligned-rep-seqs.qza[0m
[32mSaved Phylogeny[Unrooted] to: Data/unrooted-tree.qza[0m
[32mSaved Phylogeny[Rooted] to: Data/rooted-tree.qza[0m


# Taxa bar plots

In [10]:
# Taxonomic bar plot of the filtered table, annotated with the sample metadata.
# mkdir -p also creates the parent Results folder if it does not exist yet.
!mkdir -p Results/Taxa_barplots
!qiime taxa barplot \
  --i-table Data/Divided_tables/full-table.qza \
  --i-taxonomy Data/combo_taxonomy.qza \
  --m-metadata-file metadata.tsv \
  --o-visualization Results/Taxa_barplots/full-taxabarplot.qzv

[32mSaved Visualization to: Results/Taxa_barplots/full-taxabarplot.qzv[0m


# Alpha and beta diversity analysis

### Alpha rarefaction plotting

In [11]:
# Alpha rarefaction curves for the filtered table up to a depth of 20000,
# using the rooted tree. The visualization is written into Results, which this
# command does not create itself.
!qiime diversity alpha-rarefaction \
  --i-table Data/Divided_tables/full-table.qza \
  --i-phylogeny Data/rooted-tree.qza \
  --p-max-depth 20000 \
  --m-metadata-file metadata.tsv \
  --o-visualization Results/Alpha_rarefaction.qzv

[32mSaved Visualization to: Results/Alpha_rarefaction.qzv[0m


### Core-metrics-phylogenetic: Core diversity metrics (phylogenetic and non-phylogenetic)

In [12]:
# Core diversity metrics at a sampling depth of 3580; all outputs are written
# to Results/Core-metrics.
# NOTE(review): the sampling depth was presumably chosen from the table summary
# or rarefaction plot -- confirm. QIIME 2 is also expected to refuse an existing
# --output-dir, so the folder may need to be removed before a re-run -- confirm.
!qiime diversity core-metrics-phylogenetic \
  --i-phylogeny Data/rooted-tree.qza \
  --i-table Data/Divided_tables/full-table.qza \
  --p-sampling-depth 3580 \
  --m-metadata-file metadata.tsv \
  --p-n-jobs-or-threads 'auto' \
  --output-dir Results/Core-metrics

[32mSaved FeatureTable[Frequency] to: Results/Core-metrics/rarefied_table.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/faith_pd_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/observed_features_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/shannon_vector.qza[0m
[32mSaved SampleData[AlphaDiversity] to: Results/Core-metrics/evenness_vector.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/unweighted_unifrac_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/weighted_unifrac_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/jaccard_distance_matrix.qza[0m
[32mSaved DistanceMatrix to: Results/Core-metrics/bray_curtis_distance_matrix.qza[0m
[32mSaved PCoAResults to: Results/Core-metrics/unweighted_unifrac_pcoa_results.qza[0m
[32mSaved PCoAResults to: Results/Core-metrics/weighted_unifrac_pcoa_results.qza[0m
[32mSaved PCoAResults to: Resu

### Principal Coordinate Analysis Biplot

In [13]:
table = 'Data/Divided_tables/full-table.qza'
reltab = 'Data/Relative_tables/full-relative_table.qza'
  
  # Converting feature table [Frequency] to [Relative frequency]
!mkdir Data/Relative_tables
!qiime feature-table relative-frequency \
  --i-table $table \
  --o-relative-frequency-table $reltab

for artifact in ['unweighted_unifrac', 'weighted_unifrac', 'jaccard', 'bray_curtis']:
  pcoa  = 'Results/Core-metrics/%s_pcoa_results.qza' % artifact
  bidir = 'Results/Biplots'
  biplot= bidir+'/%s_biplot.qza' % artifact
  bi_qzv= bidir+'/%s_biplot.qzv' % artifact

  !mkdir -p $bidir
    
  # pcoa-biplot: Principal Coordinate Analysis Biplot     
  !qiime diversity pcoa-biplot \
    --i-pcoa $pcoa \
    --i-features $reltab \
    --o-biplot $biplot

  !qiime emperor biplot \
    --i-biplot $biplot \
    --m-sample-metadata-file metadata.tsv \
    --p-ignore-missing-samples \
    --p-number-of-features 5 \
    --o-visualization $bi_qzv

mkdir: cannot create directory ‘Data/Relative_tables’: File exists
[32mSaved FeatureTable[RelativeFrequency] to: Data/Relative_tables/full-relative_table.qza[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/unweighted_unifrac_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/unweighted_unifrac_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/weighted_unifrac_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/weighted_unifrac_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/jaccard_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/jaccard_biplot.qzv[0m
[32mSaved PCoAResults % Properties('biplot') to: Results/Biplots/bray_curtis_biplot.qza[0m
[32mSaved Visualization to: Results/Biplots/bray_curtis_biplot.qzv[0m
