In [6]:
#install biopython into conda env
!conda install -c conda-forge biopython -y

^C

CondaError: KeyboardInterrupt



In [6]:
from Bio import SeqIO
import glob
import os
from qiime2 import Artifact
from qiime2.plugins.feature_classifier.methods import extract_reads
from qiime2.plugins.feature_table.methods import merge, merge_seqs, merge_taxa
import shutil
import tarfile
import tempfile
import urllib.request
from zipfile import ZipFile

In [2]:
def download_file(url, local_filepath):
    """Download the resource at `url` and store it at `local_filepath`.

    The payload is streamed into a sibling ``<local_filepath>.part`` file and
    only renamed onto `local_filepath` once the transfer has finished. An
    interrupted or failed download therefore never leaves a truncated file at
    the final path, which matters because the notebook skips downloads whose
    target already exists (``os.path.isfile`` guards).

    Parameters
    ----------
    url : str
        any URL understood by ``urllib.request.urlopen``
    local_filepath : str
        destination path; an existing file is replaced

    Raises
    ------
    Whatever ``urllib.request.urlopen`` or the file operations raise; the
    partial file is removed before the exception propagates."""
    partial_filepath = str(local_filepath) + '.part'
    try:
        with urllib.request.urlopen(url) as response, \
                open(partial_filepath, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        # atomic on the same filesystem, so readers never see a half-written file
        os.replace(partial_filepath, local_filepath)
    except BaseException:
        # BaseException so that a KeyboardInterrupt also cleans up the partial file
        if os.path.exists(partial_filepath):
            os.remove(partial_filepath)
        raise

In [3]:
def download_qiita_files(qiita_study_id, qiita_artifact_id):
    """Download and extract the metadata from the study as well as the biom
    file and fasta file for each artifact listed.

    Files are written below ``working_dir + '/input'`` (``working_dir`` is a
    notebook-level variable), which is created if it does not exist yet:

    * ``metadata.zip`` / ``metadata.txt`` for the sample information
    * single artifact: ``qiita_artifact.zip``, ``qiita.biom``, ``qiita.fa``
    * list of artifacts: ``qiita_artifact_<id>.zip``, ``qiita_<id>.biom``,
      ``qiita_<id>.fa`` for every id in the list

    Parameters
    ----------
    qiita_study_id : int or str
        the qiita id of the study
    qiita_artifact_id : int, str, or list of ints or strs
        the artifact id of the deblur reference hit table artifact. One
        artifact per 16S prep in the study, which will be merged

    Raises
    ------
    IndexError
        if a downloaded artifact archive contains no ``*.biom`` or ``*.fa``
        file under ``BIOM/<artifact id>/``"""
    input_dir = working_dir + '/input'
    os.makedirs(input_dir, exist_ok=True)

    # Normalise to (artifact id, file-name suffix) pairs so that the single
    # and list cases share one code path while keeping their file names.
    if isinstance(qiita_artifact_id, list):
        artifacts = [(artifact_id, '_' + str(artifact_id))
                     for artifact_id in qiita_artifact_id]
    else:
        artifacts = [(qiita_artifact_id, '')]

    #download metadata
    metadata_zip_path = input_dir + '/metadata.zip'
    download_file('https://qiita.ucsd.edu/public_download/?data=sample_information&study_id=' +
                  str(qiita_study_id), metadata_zip_path)
    #download artifacts
    for artifact_id, suffix in artifacts:
        download_file('https://qiita.ucsd.edu/public_artifact_download/?artifact_id=' +
                      str(artifact_id),
                      input_dir + '/qiita_artifact' + suffix + '.zip')
    #unzip files
    with tempfile.TemporaryDirectory() as temp_dir:
        for artifact_id, suffix in artifacts:
            with ZipFile(input_dir + '/qiita_artifact' + suffix + '.zip') as artifact_zip:
                artifact_zip.extractall(temp_dir)
            # each archive keeps its files under BIOM/<artifact id>/
            artifact_dir = temp_dir + '/BIOM/' + str(artifact_id)
            biom_path = glob.glob(artifact_dir + '/*.biom')[0]
            shutil.copyfile(biom_path, input_dir + '/qiita' + suffix + '.biom')
            fasta_path = glob.glob(artifact_dir + '/*.fa')[0]
            shutil.copyfile(fasta_path, input_dir + '/qiita' + suffix + '.fa')
        with ZipFile(metadata_zip_path) as metadata_zip:
            # the first archive entry is expected to be templates/<file name>
            metadata_name = metadata_zip.namelist()[0].split('/')[1]
            metadata_zip.extractall(temp_dir)
        shutil.copyfile(temp_dir + '/templates/' + metadata_name,
                        input_dir + '/metadata.txt')

In [4]:
# Qiita study to process and its artifact ids (passed to download_qiita_files)
qiita_study_id = 11166
qiita_artifact_id = [
    82961, 56596, 56553, 56600,
    82881, 56579, 56595, 57972,
]

# Project root is the parent of the current working directory; all input and
# output paths in this notebook are built from it.
working_dir = os.path.abspath(os.pardir)
refs_dir = '/'.join((working_dir, 'output', 'taxonomy_references'))

In [11]:
# Create the reference directory, including a missing parent 'output'
# directory (os.mkdir would fail if the parent does not exist yet).
os.makedirs(refs_dir, exist_ok=True)

# (local file name, source URL) for every reference resource used below
reference_downloads = [
    ('silva_sequences.qza',
     'https://data.qiime2.org/2021.2/common/silva-138-99-seqs-515-806.qza'),
    ('silva_taxonomy.qza',
     'https://data.qiime2.org/2021.2/common/silva-138-99-tax-515-806.qza'),
    ('gg_13_8_otus.tar.gz',
     'ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz'),
    ('Metaxa2_2.2.1.tar.gz',
     'https://microbiology.se/sw/Metaxa2_2.2.1.tar.gz'),
    ('PhytoRef_with_taxonomy.fasta',
     'http://phytoref.sb-roscoff.fr/static/downloads/PhytoRef_with_taxonomy.fasta'),
]

# Only fetch what is not already on disk so the cell is cheap to re-run
for reference_name, reference_url in reference_downloads:
    reference_path = refs_dir + '/' + reference_name
    if not os.path.isfile(reference_path):
        download_file(reference_url, reference_path)

In [12]:
# Unpack the two gzipped reference tarballs into the reference directory,
# next to the archives themselves.
for archive_name in ('gg_13_8_otus.tar.gz', 'Metaxa2_2.2.1.tar.gz'):
    with tarfile.open(refs_dir + '/' + archive_name, 'r:gz') as archive:
        archive.extractall(refs_dir)

In [13]:
os.chdir(refs_dir + '/Metaxa2_2.2.1/metaxa2_db/SSU')
!blastdbcmd -entry all -db blast -out metaxa2.fasta
shutil.copyfile(refs_dir + '/Metaxa2_2.2.1/metaxa2_db/SSU/metaxa2.fasta', 
                refs_dir + '/metaxa2.fasta')
os.chdir(working_dir + '/procedure')

In [14]:
# Collect mitochondrial (Metaxa2) and chloroplast (PhytoRef) sequences into one
# fasta file and write a matching taxonomy table in both SILVA-style and
# Greengenes-style lineage strings. Feature ids are built from the position of
# the record in its source file.
silva_mito_lineage = 'd__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__Mitochondria; g__Mitochondria; s__'
gg_mito_lineage = 'k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria; g__Mitochondria; s__'
silva_chloro_lineage = 'd__Bacteria; p__Cyanobacteria; c__Cyanobacteriia; o__Chloroplast; f__Chloroplast; g__Chloroplast; s__'
gg_chloro_lineage = 'k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Chloroplast; f__Chloroplast; g__Chloroplast; s__'

with open(refs_dir + '/silva_organelle_taxonomy.tsv', 'w') as silva_out, \
        open(refs_dir + '/gg_organelle_taxonomy.tsv', 'w') as gg_out, \
        open(refs_dir + '/organelle_sequences.fasta', 'w') as fasta_out:
    for taxonomy_out in (silva_out, gg_out):
        taxonomy_out.write('Feature ID\tTaxon\n')

    # Metaxa2: keep only records whose description mentions mitochondria
    for index, record in enumerate(SeqIO.parse(refs_dir + '/metaxa2.fasta', 'fasta')):
        description = record.description
        if 'mitochondria' not in description and 'Mitochondria' not in description:
            continue
        feature_id = 'metaxa2_mitochondria_' + str(index)
        # last ';'-separated field of the description is used as the species label
        species_label = str(description).split(';')[-1]
        fasta_out.write('>' + feature_id + '\n')
        fasta_out.write(str(record.seq + '\n'))
        silva_out.write(feature_id + '\t' + silva_mito_lineage + species_label + '\n')
        gg_out.write(feature_id + '\t' + gg_mito_lineage + species_label + '\n')

    # PhytoRef: keep everything except sequences containing a run of ten X
    for index, record in enumerate(SeqIO.parse(refs_dir + '/PhytoRef_with_taxonomy.fasta', 'fasta')):
        if 'XXXXXXXXXX' in record.seq:   #ditch the weird sequence
            continue
        feature_id = 'phytoref_chloroplast_' + str(index)
        # last '|'-separated field of the description is used as the species label
        species_label = str(record.description).split('|')[-1]
        fasta_out.write('>' + feature_id + '\n')
        fasta_out.write(str(record.seq + '\n'))
        silva_out.write(feature_id + '\t' + silva_chloro_lineage + species_label + '\n')
        gg_out.write(feature_id + '\t' + gg_chloro_lineage + species_label + '\n')

In [15]:
#import, select V4 region, merge, save
# Primer pair passed to extract_reads for both reference sets
forward_primer = 'GTGYCAGCMGCCGCGGTAA'
reverse_primer = 'GGACTACNVGGGTWTCTAAT'

organelle_seqs = Artifact.import_data('FeatureData[Sequence]',
                                      refs_dir + '/organelle_sequences.fasta')
v4_organelle_seqs, = extract_reads(organelle_seqs, forward_primer,
                                   reverse_primer, n_jobs = 24,
                                   read_orientation = 'forward')

# SILVA: the downloaded reference is already the 515-806 extract, so only the
# organelle sequences need trimming before the merge.
silva_extended_seqs, = merge_seqs([v4_organelle_seqs,
                                   Artifact.load(refs_dir +
                                                 '/silva_sequences.qza')])
silva_extended_seqs.save(refs_dir + '/silva_extended_sequences.qza')

# Greengenes: the 99% OTU rep set is imported from fasta and trimmed here.
gg_seqs = Artifact.import_data('FeatureData[Sequence]', refs_dir +
                               '/gg_13_8_otus/rep_set/99_otus.fasta')
v4_gg_seqs, = extract_reads(gg_seqs, forward_primer,
                            reverse_primer, n_jobs = 24,
                            read_orientation = 'forward')
v4_gg_seqs.save(refs_dir + '/gg_sequences.qza')

# Merge the V4-extracted artifacts, mirroring the SILVA branch above. The
# previous version merged the untrimmed organelle_seqs and gg_seqs, so the
# extended Greengenes reference was full-length while gg_sequences.qza and
# both SILVA references were V4-only.
gg_extended_seqs, = merge_seqs([v4_organelle_seqs, v4_gg_seqs])
gg_extended_seqs.save(refs_dir + '/gg_extended_sequences.qza')

'/mnt/c/Users/Dylan/Documents/zaneveld/upload/output/taxonomy_references/gg_extended_sequences.qza'

In [16]:
# Build the taxonomy artifacts matching the extended sequence references:
# organelle taxonomy merged with the stock SILVA / Greengenes taxonomy.
taxonomy_type = 'FeatureData[Taxonomy]'

# SILVA
silva_organelle_taxonomy = Artifact.import_data(
    taxonomy_type, refs_dir + '/silva_organelle_taxonomy.tsv')
silva_reference_taxonomy = Artifact.load(refs_dir + '/silva_taxonomy.qza')
silva_extended_taxonomy, = merge_taxa([silva_organelle_taxonomy,
                                       silva_reference_taxonomy])
silva_extended_taxonomy.save(refs_dir + '/silva_extended_taxonomy.qza')

# Greengenes: the shipped taxonomy file is imported with the headerless
# format; the organelle table written by this notebook has a header line.
gg_taxonomy = Artifact.import_data(
    taxonomy_type,
    refs_dir + '/gg_13_8_otus/taxonomy/99_otu_taxonomy.txt',
    'HeaderlessTSVTaxonomyFormat')
gg_taxonomy.save(refs_dir + '/gg_taxonomy.qza')
gg_organelle_taxonomy = Artifact.import_data(
    taxonomy_type, refs_dir + '/gg_organelle_taxonomy.tsv')
gg_extended_taxonomy, = merge_taxa([gg_organelle_taxonomy, gg_taxonomy])
gg_extended_taxonomy.save(refs_dir + '/gg_extended_taxonomy.qza')

'/mnt/c/Users/Dylan/Documents/zaneveld/upload/output/taxonomy_references/gg_extended_taxonomy.qza'

In [5]:
# Fetch the study metadata plus the biom/fasta pair of every configured artifact
download_qiita_files(qiita_study_id=qiita_study_id,
                     qiita_artifact_id=qiita_artifact_id)

In [11]:
# Import the downloaded biom table(s) and fasta file(s) as QIIME 2 artifacts.
# With a list of artifact ids the per-artifact tables and sequences are merged
# into a single table / sequence set first.
input_prefix = working_dir + '/input/qiita'
if type(qiita_artifact_id) == list:
    fts = [Artifact.import_data('FeatureTable[Frequency]',
                                input_prefix + '_' + str(artifact_id) + '.biom',
                                'BIOMV210Format')
           for artifact_id in qiita_artifact_id]
    sequences = [Artifact.import_data('FeatureData[Sequence]',
                                      input_prefix + '_' + str(artifact_id) + '.fa')
                 for artifact_id in qiita_artifact_id]
    # 'average' is the overlap method handed to feature-table merge
    ft, = merge(fts, 'average')
    seqs, = merge_seqs(sequences)
else:
    ft = Artifact.import_data('FeatureTable[Frequency]',
                              input_prefix + '.biom', 'BIOMV210Format')
    seqs = Artifact.import_data('FeatureData[Sequence]',
                                input_prefix + '.fa')
ft.save(working_dir + '/input/feature_table.qza')
seqs.save(working_dir + '/input/sequences.qza')

'/mnt/c/Users/Dylan/Documents/zaneveld/upload/output/sequences.qza'

In [12]:
# Rewrite the first column label of the downloaded metadata to '#SampleID'.
metadata_path = working_dir + '/input/metadata.txt'
with open(metadata_path) as md_file:
    lines = md_file.readlines()
if not lines:
    # an empty download would otherwise surface as a bare IndexError below
    raise ValueError('metadata file is empty: ' + metadata_path)
#sample_id is not a valid q2 metadata index label and must be changed
# Split off the line ending first: with a single-column header the label
# itself would otherwise carry the newline and lose it when replaced, gluing
# the header onto the first data row.
header_text = lines[0].rstrip('\r\n')
line_ending = lines[0][len(header_text):]
headers = header_text.split('\t', 1)
if not headers[0] == '#SampleID':
    headers[0] = '#SampleID'
    lines[0] = '\t'.join(headers) + line_ending
    with open(metadata_path, 'w') as md_file:
        md_file.writelines(lines)