# Import demultiplexed filtered reads

In [11]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# sort reads by sequencing run
for ri in set(met.Run_index):
  tmp = met.loc[met.Run_index == ri]['SeqRun'].tolist()[0]
  !mkdir Data/Demux/$tmp
  !mv Data/Demux/$ri/* Data/Demux/$tmp
  !rm -r Data/Demux/$ri

mkdir: cannot create directory ‘Data/Demux/SeqRun_2’: File exists
mkdir: cannot create directory ‘Data/Demux/SeqRun_2’: File exists
mkdir: cannot create directory ‘Data/Demux/SeqRun_2’: File exists


In [12]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# import reads
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  
  !qiime tools import \
    --type 'SampleData[PairedEndSequencesWithQuality]' \
    --input-path Data/Demux/$sr \
    --input-format CasavaOneEightSingleLanePerSampleDirFmt \
    --output-path $demux

[32mImported Data/Demux/SeqRun_1 as CasavaOneEightSingleLanePerSampleDirFmt to Data/Demux/SeqRun_1_demux.qza[0m
[32mImported Data/Demux/SeqRun_2 as CasavaOneEightSingleLanePerSampleDirFmt to Data/Demux/SeqRun_2_demux.qza[0m


In [27]:
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  deqzv = 'Data/Demux/%s_demux.qzv'%sr
  
  !qiime demux summarize \
    --i-data $demux \
    --o-visualization $deqzv

[32mSaved Visualization to: Data/Demux/SeqRun_1_demux.qzv[0m
[32mSaved Visualization to: Data/Demux/SeqRun_2_demux.qzv[0m


# Dada2 denoising

In [36]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

adapter = len(met.LinkerPrimerSequence.tolist()[0])
!mkdir Data/Denoised
for sr in set(met.SeqRun):
  demux = 'Data/Demux/%s_demux.qza'%sr
  table = 'Data/Denoised/%s_bac_table.qza'%sr
  taqzv = 'Data/Denoised/%s_bac_table.qzv'%sr
  repsq = 'Data/Denoised/%s_bac_rep-seqs.qza'%sr
  reqzv = 'Data/Denoised/%s_bac_rep-seqs.qzv'%sr
  stats = 'Data/Denoised/%s_bac_denoising-stats.qza'%sr
  stqzv = 'Data/Denoised/%s_bac_denoising-stats.qzv'%sr
  
  !qiime dada2 denoise-paired \
    --i-demultiplexed-seqs $demux \
    --p-trim-left-f $adapter \
    --p-trunc-len-f 225 \
    --p-trunc-len-r 185 \
    --p-n-threads 10 \
    --o-table $table \
    --o-representative-sequences $repsq \
    --o-denoising-stats $stats

  !qiime feature-table tabulate-seqs \
    --i-data $repsq \
    --o-visualization $reqzv

  !qiime metadata tabulate \
    --m-input-file $stats \
    --o-visualization $stqzv

  !qiime feature-table summarize \
    --i-table $table \
    --o-visualization $taqzv \
    --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/Denoised/SeqRun_1_bac_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Denoised/SeqRun_1_bac_rep-seqs.qza[0m
[32mSaved SampleData[DADA2Stats] to: Data/Denoised/SeqRun_1_bac_denoising-stats.qza[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_1_bac_rep-seqs.qzv[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_1_bac_denoising-stats.qzv[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_1_bac_table.qzv[0m
[32mSaved FeatureTable[Frequency] to: Data/Denoised/SeqRun_2_bac_table.qza[0m
[32mSaved FeatureData[Sequence] to: Data/Denoised/SeqRun_2_bac_rep-seqs.qza[0m
[32mSaved SampleData[DADA2Stats] to: Data/Denoised/SeqRun_2_bac_denoising-stats.qza[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_2_bac_rep-seqs.qzv[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_2_bac_denoising-stats.qzv[0m
[32mSaved Visualization to: Data/Denoised/SeqRun_2_bac_table.qzv[0m


# Merge different runs

In [38]:
# Combine the per-run DADA2 feature tables into a single table.
# The two run names (SeqRun_1, SeqRun_2) are hard-coded: an additional
# sequencing run has to be added to both merge commands by hand.
!qiime feature-table merge \
  --i-tables Data/Denoised/SeqRun_1_bac_table.qza Data/Denoised/SeqRun_2_bac_table.qza \
  --o-merged-table Data/merged_table.qza

# Combine the per-run representative sequences the same way.
!qiime feature-table merge-seqs \
  --i-data Data/Denoised/SeqRun_1_bac_rep-seqs.qza Data/Denoised/SeqRun_2_bac_rep-seqs.qza \
  --o-merged-data Data/merged_rep-seqs.qza

# Summary visualization of the merged table, annotated with the sample metadata.
!qiime feature-table summarize \
  --i-table Data/merged_table.qza \
  --o-visualization Data/merged_table.qzv \
  --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/merged_table.qza[0m


# Training feature classifier 
### SILVA 138 reference sequences and taxonomy were obtained from the QIIME 2 website; they were produced with RESCRIPt

In [43]:
# Extract reference reads: run extract-reads on the SILVA 138 (99%) sequences
# with the forward/reverse primer pair below, restricted to lengths 280-500,
# and save the result as Classifier/ref-seqs.qza.
# NOTE(review): presumably the V1-V2 primer pair (the classifier trained from
# these reads is named V1-V2) - confirm against the wet-lab protocol.
!qiime feature-classifier extract-reads \
  --i-sequences Classifier/silva-138-99-seqs.qza \
  --p-f-primer AGAGTTTGATCCTGGCTCAG \
  --p-r-primer TGCTGCCTCCCGTAGGAGT \
  --p-min-length 280 \
  --p-max-length 500 \
  --o-reads Classifier/ref-seqs.qza

[32mSaved FeatureData[Sequence] to: Classifier/ref-seqs.qza[0m


In [46]:
# Train classifier: fit a naive Bayes classifier on the extracted reference
# reads (Classifier/ref-seqs.qza) and the SILVA 138 taxonomy.
!qiime feature-classifier fit-classifier-naive-bayes \
  --i-reference-reads Classifier/ref-seqs.qza \
  --i-reference-taxonomy Classifier/silva-138-99-tax.qza \
  --o-classifier Classifier/V1-V2_silva_138_classifier.qza

[32mSaved TaxonomicClassifier to: Classifier/V1-V2_silva_138_classifier.qza[0m


# Taxonomy assignment

In [125]:
# Assign taxonomy to the merged representative sequences with the hybrid
# vsearch + sklearn method, using the extracted reference reads, the SILVA
# taxonomy and the classifier trained above. The prefilter is switched off.
# This output is the taxonomy the later "Combo" renaming step reads.
!qiime feature-classifier classify-hybrid-vsearch-sklearn \
  --i-query Data/merged_rep-seqs.qza \
  --i-reference-reads Classifier/ref-seqs.qza \
  --i-reference-taxonomy Classifier/silva-138-99-tax.qza \
  --i-classifier Classifier/V1-V2_silva_138_classifier.qza \
  --p-threads 10 \
  --p-no-prefilter \
  --o-classification Data/taxonomy_vsearch-sklearn.qza

# Tabular visualization of the assigned taxonomy.
!qiime metadata tabulate \
  --m-input-file Data/taxonomy_vsearch-sklearn.qza \
  --o-visualization Data/taxonomy_vsearch-sklearn.qzv

[32mSaved FeatureData[Taxonomy] to: Data/taxonomy_vsearch-sklearn.qza[0m
[32mSaved Visualization to: Data/taxonomy_vsearch-sklearn.qzv[0m


In [48]:
# Alternative taxonomy assignment of the same query sequences with BLAST
# consensus against the extracted reference reads. The result is written to
# Data/taxonomy_blast.qza; the Combo cell below declares the vsearch-sklearn
# taxonomy instead.
!qiime feature-classifier classify-consensus-blast \
  --i-query Data/merged_rep-seqs.qza \
  --i-reference-reads Classifier/ref-seqs.qza \
  --i-reference-taxonomy Classifier/silva-138-99-tax.qza \
  --o-classification Data/taxonomy_blast.qza

[32mSaved FeatureData[Taxonomy] to: Data/taxonomy_blast.qza[0m


# Combo: Combining ASV hashes with last available taxa 

In [None]:
# Install biopython
!pip install biopython

In [139]:
import pandas as pd
from Bio import SeqIO

# declare Qiime2 generated artifacts here
table    = 'Data/merged_table.qza' 
taxo     = 'Data/taxonomy_vsearch-sklearn.qza'
rep_seq  = 'Data/merged_rep-seqs.qza'

# export rep-seqs.qza, table.qza and taxonomy.qza
!mkdir Biom Taxa Rep-seqs #temp directories

!qiime tools export \
  --input-path $table \
  --output-path Biom/

!qiime tools export \
  --input-path $taxo \
  --output-path Taxa

!qiime tools export \
  --input-path $rep_seq \
  --output-path Rep-seqs/

# convert .biom to .tsv
!biom convert -i Biom/feature-table.biom -o Biom/feature-table.tsv --to-tsv 

# ***** replacing hashes with '<last available taxon>|<hash>' labels *****
# reading tables (the first line of the exported feature table is a comment)
taxa = pd.read_csv('Taxa/taxonomy.tsv', sep='\t')
biom = pd.read_csv('Biom/feature-table.tsv', sep='\t', skiprows=1)

# Clean the taxonomy strings: shorten some annotations and delete symbols
# that were crashing tree construction with the modified files.
# Every pattern is a literal substring (regex=False), applied in this order;
# the long 'uncultured_*' forms must be replaced before plain 'uncultured'.
literal_swaps = [
  (';__', ''), ('[', ''), (']', ''), ('.', ''), ('/', '_'), ("'", ''), (' ', '_'),
  ('uncultured_bacterium', 'unc_bac'), ('uncultured_rumen', 'unc_rum'),
  ('unidentified_rumen', 'unid_rum'), ('rumen_bacterium', 'rum_bac'),
  ('uncultured', 'unc'),
]
cleaned = taxa['Taxon']
for old, new in literal_swaps:
  cleaned = cleaned.str.replace(old, new, regex=False)
taxa['Taxon'] = cleaned

# Combo starts as the deepest annotated name: the text after the last '__'
rank_parts = taxa['Taxon'].str.split('__')
taxa['Combo'] = rank_parts.str[-1].str.split(';').str[-1]

# An uninformative deepest name gets the name of the rank above it as prefix
parent = rank_parts.str[-2].str.split(';').str[0]
for tag in ('unc_bac', 'unc_rum', 'unid_rum', 'rum_bac', 'unc'):
  taxa.loc[taxa['Combo'] == tag, 'Combo'] = parent + '_' + tag

# dealing with uncultured taxa to provide additional information: while the
# prefix itself is still 'unc', climb one rank further up (at most 3 times)
for n in range(3, 6):
  ancestor = rank_parts.str[-n].str.split(';').str[0]
  for tag in ('unc_bac', 'unc_rum', 'unc'):
    taxa.loc[taxa['Combo'] == 'unc_' + tag, 'Combo'] = ancestor + '_' + tag

# add modified taxonomy information to feature hashes, separating them by '|'.
# Labels are looked up by Feature ID, so the result does not depend on the
# taxonomy file and the feature table listing the features in the same order.
unlabelled = ~biom['#OTU ID'].isin(taxa['Feature ID'])
if unlabelled.any():
  raise ValueError('%d feature(s) in the table have no taxonomy row' % unlabelled.sum())
combo_label = taxa['Combo'] + '|' + taxa['Feature ID']
label_by_hash = dict(zip(taxa['Feature ID'], combo_label))
biom['#OTU ID'] = biom['#OTU ID'].map(label_by_hash)
taxa['Feature ID'] = combo_label
taxa = taxa[['Feature ID', 'Taxon', 'Confidence', 'Consensus', 'Method']]

### writing modified files
biom.to_csv('Biom/feature-table.tsv', sep='\t', index=False)
taxa.to_csv('Taxa/taxonomy.tsv', sep='\t', index=False)
fasta_hash  = r"Rep-seqs/dna-sequences.fasta"
fasta_combo = r"Rep-seqs/dna-sequences.fa"
hlist = biom['#OTU ID'].tolist()
with open(fasta_hash) as hashes, open(fasta_combo, 'w') as combo:
  for record in SeqIO.parse(fasta_hash, 'fasta'):
    for h in hlist:
      if str(record.id) in h:
        combo.write('>'+h+'\n'+str(record.seq)+'\n')

#some cleaning and renaming
!rm $fasta_hash
!mv $fasta_combo $fasta_hash

[32mExported Data/merged_table.qza as BIOMV210DirFmt to directory Biom/[0m
[32mExported Data/taxonomy_vsearch-sklearn.qza as TSVTaxonomyDirectoryFormat to directory Taxa[0m
[32mExported Data/merged_rep-seqs.qza as DNASequencesDirectoryFormat to directory Rep-seqs/[0m


In [140]:
# Create new table, taxonomy and rep-seqs artifacts from the files with the
# modified feature IDs; the outputs carry a 'combo_' prefix in their names.
# Convert the edited TSV back to BIOM (HDF5) first.
!biom convert -i Biom/feature-table.tsv -o Biom/feature-table.biom --table-type="OTU table" --to-hdf5

!qiime tools import \
  --input-path Biom/feature-table.biom \
  --type 'FeatureTable[Frequency]' \
  --input-format BIOMV210Format \
  --output-path Data/combo_table.qza

!qiime tools import \
  --type 'FeatureData[Taxonomy]' \
  --input-path Taxa/taxonomy.tsv \
  --output-path Data/combo_taxonomy.qza

# $fasta_hash is the Python variable set in the previous cell
# (Rep-seqs/dna-sequences.fasta); that cell must have run in this kernel.
!qiime tools import \
  --input-path $fasta_hash \
  --type 'FeatureData[Sequence]' \
  --output-path Data/combo_rep-seqs.qza

# The temporary export directories are no longer needed once all three
# artifacts have been imported.
!rm -r Biom Taxa Rep-seqs #clean temp directories

# Summary visualization of the relabelled table.
!qiime feature-table summarize \
  --i-table Data/combo_table.qza \
  --m-sample-metadata-file metadata.tsv \
  --o-visualization Data/combo_table.qzv

[32mImported Biom/feature-table.biom as BIOMV210Format to Data/combo_table.qza[0m
[32mImported Taxa/taxonomy.tsv as TSVTaxonomyDirectoryFormat to Data/combo_taxonomy.qza[0m
[32mImported Rep-seqs/dna-sequences.fasta as DNASequencesDirectoryFormat to Data/combo_rep-seqs.qza[0m
[32mSaved Visualization to: Data/combo_table.qzv[0m


# Generate a tree for phylogenetic diversity analysis

In [141]:
# Run the align-to-tree-mafft-fasttree pipeline on the relabelled
# representative sequences. It saves the alignment, the masked alignment,
# an unrooted tree and a rooted tree under Data/.
!qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences Data/combo_rep-seqs.qza \
  --p-n-threads 10 \
  --o-alignment Data/aligned-rep-seqs.qza \
  --o-masked-alignment Data/masked-aligned-rep-seqs.qza \
  --o-tree Data/unrooted-tree.qza \
  --o-rooted-tree Data/rooted-tree.qza

[32mSaved FeatureData[AlignedSequence] to: Data/aligned-rep-seqs.qza[0m
[32mSaved FeatureData[AlignedSequence] to: Data/masked-aligned-rep-seqs.qza[0m
[32mSaved Phylogeny[Unrooted] to: Data/unrooted-tree.qza[0m
[32mSaved Phylogeny[Rooted] to: Data/rooted-tree.qza[0m


# Filtering to remove low-abundance features

In [142]:
# The four filtering steps below are chained through one file: every step
# after the first reads Data/filtered-table.qza and overwrites it, so they
# have to run in this order, starting again from Data/combo_table.qza.

# 1) Feature filter with a minimum total frequency of 10.
!qiime feature-table filter-features \
    --i-table Data/combo_table.qza \
    --p-min-frequency 10 \
    --o-filtered-table Data/filtered-table.qza

# 2) Taxonomy filter: include features whose taxonomy contains 'p__'.
!qiime taxa filter-table \
    --i-table Data/filtered-table.qza \
    --i-taxonomy Data/combo_taxonomy.qza \
    --p-include p__ \
    --o-filtered-table Data/filtered-table.qza

# 3) Taxonomy filter: exclude mitochondria and chloroplast annotations.
!qiime taxa filter-table \
    --i-table Data/filtered-table.qza \
    --i-taxonomy Data/combo_taxonomy.qza \
    --p-exclude mitochondria,chloroplast \
    --o-filtered-table Data/filtered-table.qza

# 4) Sample filter with a minimum of 25 features and a minimum frequency of 3000.
!qiime feature-table filter-samples \
    --i-table Data/filtered-table.qza \
    --p-min-features 25 \
    --p-min-frequency 3000 \
    --o-filtered-table Data/filtered-table.qza

# Summary visualization of the final filtered table.
!qiime feature-table summarize \
    --i-table Data/filtered-table.qza \
    --o-visualization Data/filtered-table.qzv \
    --m-sample-metadata-file metadata.tsv

[32mSaved FeatureTable[Frequency] to: Data/filtered-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/filtered-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/filtered-table.qza[0m
[32mSaved FeatureTable[Frequency] to: Data/filtered-table.qza[0m
[32mSaved Visualization to: Data/filtered-table.qzv[0m
