In [1]:
#install biopython into conda env
!conda install -c conda-forge biopython -y

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [1]:
import os
import shutil
import tarfile
import urllib.request

from Bio import SeqIO
from qiime2 import Artifact
from qiime2.plugins.feature_classifier.methods import extract_reads
from qiime2.plugins.feature_table.methods import merge_seqs, merge_taxa

# Project root is the parent of the current working directory at the time this
# cell runs; all reference files live under <root>/output/taxonomy_references.
working_dir = os.path.abspath(os.pardir)
refs_dir = f'{working_dir}/output/taxonomy_references'

In [2]:
def download_file(url, local_filepath):
    """Download a file from a remote url and save to a local filepath

    The payload is streamed to '<local_filepath>.part' and only renamed to
    local_filepath once the transfer has completed. A failed or interrupted
    download therefore never leaves a truncated file at local_filepath that
    a later "does the file already exist?" check would mistake for a
    complete download.

    url - the web address of the file you want to download as a string
    local_filepath - the local filepath to which the file will be saved

    Any exception raised while opening the url or copying the data is
    re-raised after the partial file has been removed.
    """

    print(f"Downloading file: {url}")
    partial_filepath = f"{local_filepath}.part"
    try:
        with urllib.request.urlopen(url) as response, open(partial_filepath, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        # Rename only after both handles are closed and the copy succeeded.
        os.replace(partial_filepath, local_filepath)
    except BaseException:
        # BaseException so that a kernel interrupt (KeyboardInterrupt) also
        # cleans up the incomplete file before propagating.
        if os.path.exists(partial_filepath):
            os.remove(partial_filepath)
        raise

    print(f"Saved to local filepath: {local_filepath}")

In [5]:
# Create the reference directory (and any missing parents, e.g. 'output/').
# exist_ok avoids both the separate existence check and a failure on re-run.
os.makedirs(refs_dir, exist_ok=True)

# (filename inside refs_dir, source url) for every reference file this
# notebook needs. Files already present on disk are not downloaded again.
# NOTE(review): the PhytoRef file is served over plain http and none of the
# downloads are checksum-verified.
reference_downloads = [
    ('Metaxa2_2.2.1.tar.gz',
     'https://microbiology.se/sw/Metaxa2_2.2.1.tar.gz'),
    ('silva-138-99-seqs-515-806.qza',
     'https://data.qiime2.org/2021.2/common/silva-138-99-seqs-515-806.qza'),
    ('silva-138-99-tax-515-806.qza',
     'https://data.qiime2.org/2021.2/common/silva-138-99-tax-515-806.qza'),
    ('PhytoRef_with_taxonomy.fasta',
     'http://phytoref.sb-roscoff.fr/static/downloads/PhytoRef_with_taxonomy.fasta'),
]
for reference_filename, reference_url in reference_downloads:
    reference_path = refs_dir + '/' + reference_filename
    if not os.path.isfile(reference_path):
        download_file(reference_url, reference_path)

In [6]:
# Unpack the Metaxa2 release into refs_dir. The context manager closes the
# archive even if extraction fails part-way.
# NOTE(review): extractall() trusts the member paths stored in the downloaded
# archive; on Python versions that support it, consider passing an extraction
# filter (e.g. filter='data').
with tarfile.open(refs_dir + '/Metaxa2_2.2.1.tar.gz', 'r:gz') as tar:
    tar.extractall(refs_dir)

In [7]:
os.chdir(refs_dir + '/Metaxa2_2.2.1/metaxa2_db/SSU')
!blastdbcmd -entry all -db blast -out metaxa2.fasta
shutil.copyfile(refs_dir + '/Metaxa2_2.2.1/metaxa2_db/SSU/metaxa2.fasta', 
                refs_dir + '/metaxa2.fasta')
os.chdir(working_dir + '/procedure')

In [8]:
# Build the organelle reference set: a FASTA of sequences plus a matching
# two-column taxonomy table ('Feature ID', 'Taxon'). Each feature ID embeds the
# record's position in its source file, so the same ID keys both outputs.
mito_lineage = 'd__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__Mitochondria; g__Mitochondria; s__'
chloro_lineage = 'd__Bacteria; p__Cyanobacteria; c__Cyanobacteriia; o__Chloroplast; f__Chloroplast; g__Chloroplast; s__'

with open(refs_dir + '/organelle_taxonomy.tsv', 'w') as taxonomy_out, \
        open(refs_dir + '/organelle_sequences.fasta', 'w') as fasta_out:
    taxonomy_out.write('Feature ID\tTaxon\n')

    # Metaxa2: keep only records whose description mentions mitochondria.
    for idx, record in enumerate(SeqIO.parse(refs_dir + '/metaxa2.fasta', 'fasta')):
        description = str(record.description)
        if 'mitochondria' not in description and 'Mitochondria' not in description:
            continue
        feature_id = 'metaxa2_mitochondria_' + str(idx)
        fasta_out.write('>' + feature_id + '\n')
        fasta_out.write(str(record.seq) + '\n')
        # Text after the last ';' of the description is used as the species label.
        species_label = description.rsplit(';', 1)[-1]
        taxonomy_out.write(feature_id + '\t' + mito_lineage + species_label + '\n')

    # PhytoRef: keep every record except those containing a run of ten X's.
    for idx, record in enumerate(SeqIO.parse(refs_dir + '/PhytoRef_with_taxonomy.fasta', 'fasta')):
        if 'XXXXXXXXXX' in record.seq:
            continue
        feature_id = 'phytoref_chloroplast_' + str(idx)
        fasta_out.write('>' + feature_id + '\n')
        fasta_out.write(str(record.seq) + '\n')
        # Text after the last '|' of the description is used as the species label.
        species_label = str(record.description).rsplit('|', 1)[-1]
        taxonomy_out.write(feature_id + '\t' + chloro_lineage + species_label + '\n')

In [9]:
#import, select V4 region, merge, save
organelle_seqs = Artifact.import_data('FeatureData[Sequence]',
                                      refs_dir + '/organelle_sequences.fasta')
v4_organelle_seqs, = extract_reads(organelle_seqs, 'GTGYCAGCMGCCGCGGTAA',
                                   'GGACTACNVGGGTWTCTAAT', n_jobs = 24,
                                   read_orientation = 'forward')
extended_seqs, = merge_seqs([v4_organelle_seqs,
                            Artifact.load(working_dir + '/output/taxonomy_references/silva-138-99-seqs-515-806.qza')])
extended_seqs.save(refs_dir + '/extended_sequences.qza')

'/mnt/c/Users/Dylan/Documents/zaneveld/smp/output/taxonomy_references/extended_sequences.qza'

In [10]:
# Import the organelle taxonomy table, merge it with the SILVA taxonomy and
# save the combined artifact next to the extended sequences.
organelle_taxonomy = Artifact.import_data(
    'FeatureData[Taxonomy]',
    refs_dir + '/organelle_taxonomy.tsv',
)
silva_taxonomy = Artifact.load(refs_dir + '/silva-138-99-tax-515-806.qza')
(extended_taxonomy,) = merge_taxa([organelle_taxonomy, silva_taxonomy])
extended_taxonomy.save(refs_dir + '/extended_taxonomy.qza')

'/mnt/c/Users/Dylan/Documents/zaneveld/smp/output/taxonomy_references/extended_taxonomy.qza'