In [2]:
from Bio import SeqIO
import glob
import os
from qiime2 import Artifact
from qiime2.plugins.feature_classifier.methods import extract_reads
from qiime2.plugins.feature_table.methods import merge, merge_seqs, merge_taxa
import shutil
import tarfile
import tempfile
import urllib.request
from zipfile import ZipFile

In [None]:
# Qiita study whose sample information is downloaded
qiita_study_id = 11166
# Artifact ids to download; a plain int/str is also accepted by the cells
# below, a list yields one biom/fasta pair per id
qiita_artifact_id = [
    82961, 56596, 56553, 56600,
    82881, 56579, 56595, 57972,
]
# All input/output paths are built relative to the parent of the current
# working directory
working_dir = os.path.abspath('../')
refs_dir = '/'.join((working_dir, 'taxonomy_references'))

In [3]:
def download_qiita_files(qiita_study_id, qiita_artifact_id):
    """Download and extract the metadata from the study as well as the biom
    file and fasta file for each artifact listed.

    Everything is written to ``working_dir + '/input'``: the sample
    information as ``metadata.txt`` and, per artifact, one ``*.biom`` and one
    ``*.fa`` file taken from the ``BIOM/<artifact id>`` folder of the
    artifact archive. A single (non-list) artifact id produces
    ``qiita.biom`` / ``qiita.fa``; a list produces ``qiita_<id>.biom`` /
    ``qiita_<id>.fa`` for every id. The downloaded zip archives are left in
    the same directory.

    Parameters
    ----------
    qiita_study_id : int or str
        the qiita id of the study
    qiita_artifact_id : int, str, or list of ints or strs
        the artifact id of the deblur reference hit table artifact. One
        artifact per 16S prep in the study, which will be merged

    Raises
    ------
    FileNotFoundError
        if an artifact archive contains no biom or fasta file for its id, or
        the metadata archive contains no file under ``templates/``
    """
    # NOTE: relies on `working_dir` and `download_file`, which are defined
    # outside this function (module/notebook level).
    input_dir = working_dir + '/input'
    # every download and copy below writes straight into this directory
    os.makedirs(input_dir, exist_ok=True)

    # Pair each artifact id with the suffix used in its file names so the
    # single-id and list cases share one code path.
    if isinstance(qiita_artifact_id, list):
        artifacts = [(artifact_id, '_' + str(artifact_id))
                     for artifact_id in qiita_artifact_id]
    else:
        artifacts = [(qiita_artifact_id, '')]

    #download metadata
    download_file('https://qiita.ucsd.edu/public_download/?data=sample_information&study_id=' +
                  str(qiita_study_id), input_dir + '/metadata.zip')
    #download artifacts
    for artifact_id, suffix in artifacts:
        download_file('https://qiita.ucsd.edu/public_artifact_download/?artifact_id=' +
                      str(artifact_id),
                      input_dir + '/qiita_artifact' + suffix + '.zip')
    #unzip files
    with tempfile.TemporaryDirectory() as temp_dir:
        for artifact_id, suffix in artifacts:
            archive_path = input_dir + '/qiita_artifact' + suffix + '.zip'
            with ZipFile(archive_path) as artifact_zip:
                artifact_zip.extractall(temp_dir)
            artifact_dir = temp_dir + '/BIOM/' + str(artifact_id)
            for extension in ('.biom', '.fa'):
                # sorted so the pick is deterministic if several files match
                matches = sorted(glob.glob(artifact_dir + '/*' + extension))
                if not matches:
                    raise FileNotFoundError(
                        'no *' + extension + ' file under BIOM/' +
                        str(artifact_id) + ' in ' + archive_path)
                shutil.copyfile(matches[0],
                                input_dir + '/qiita' + suffix + extension)
        metadata_archive = input_dir + '/metadata.zip'
        with ZipFile(metadata_archive) as metadata_zip:
            # skip directory entries; only real files under templates/ count
            templates = [name for name in metadata_zip.namelist()
                         if name.startswith('templates/')
                         and not name.endswith('/')]
            if not templates:
                raise FileNotFoundError(
                    'no file under templates/ in ' + metadata_archive)
            metadata_path = metadata_zip.extract(templates[0], temp_dir)
        shutil.copyfile(metadata_path, input_dir + '/metadata.txt')

In [None]:
# Fetch the sample information and every configured artifact into
# working_dir + '/input'
download_qiita_files(qiita_study_id=qiita_study_id,
                     qiita_artifact_id=qiita_artifact_id)

In [None]:
# Import the downloaded biom/fasta files as QIIME 2 artifacts and save them.
# A list of artifact ids is imported one id at a time and then merged into a
# single table / sequence set; a single id is imported directly.
if type(qiita_artifact_id) is list:
    fts = []
    sequences = []
    for artifact_id in qiita_artifact_id:
        fts.append(
            Artifact.import_data(
                'FeatureTable[Frequency]',
                working_dir + '/input/qiita_' + str(artifact_id) + '.biom',
                'BIOMV210Format',
            )
        )
        sequences.append(
            Artifact.import_data(
                'FeatureData[Sequence]',
                working_dir + '/input/qiita_' + str(artifact_id) + '.fa',
            )
        )
    # both calls return a one-element results object, unpacked here
    (ft,) = merge(fts, 'average')
    (seqs,) = merge_seqs(sequences)
else:
    ft = Artifact.import_data(
        'FeatureTable[Frequency]',
        working_dir + '/input/qiita.biom',
        'BIOMV210Format',
    )
    seqs = Artifact.import_data(
        'FeatureData[Sequence]',
        working_dir + '/input/qiita.fa',
    )
ft.save(working_dir + '/input/feature_table.qza')
seqs.save(working_dir + '/input/sequences.qza')

In [None]:
# Rewrite the first column header of the downloaded metadata to '#SampleID'.
with open(working_dir + '/input/metadata.txt') as md_file:
    lines = md_file.readlines()
if not lines:
    raise ValueError(working_dir + '/input/metadata.txt is empty')
#sample_id is not a valid q2 metadata index label and must be changed
# Split the line ending off first: with a one-column header the newline would
# otherwise stay glued to the label and be lost when the label is replaced,
# fusing the header with the first data row.
header_text = lines[0].rstrip('\r\n')
header_ending = lines[0][len(header_text):]
headers = header_text.split('\t', 1)
if headers[0] != '#SampleID':
    headers[0] = '#SampleID'
    lines[0] = '\t'.join(headers) + header_ending
    # only rewrite the file when the header actually changed
    with open(working_dir + '/input/metadata.txt', 'w') as md_file:
        md_file.writelines(lines)

In [None]:
# Import a feature table and its sequences from deblur_all.biom /
# deblur_all.fa in the input directory and save them as QIIME 2 artifacts.
# NOTE(review): this writes the same feature_table.qza / sequences.qza as the
# Qiita import cell and no visible cell creates the deblur_all.* files --
# presumably an alternative entry point; confirm before running both.
ft = Artifact.import_data(
    'FeatureTable[Frequency]',
    working_dir + '/input/deblur_all.biom',
    'BIOMV210Format',
)
seqs = Artifact.import_data(
    'FeatureData[Sequence]',
    working_dir + '/input/deblur_all.fa',
)
ft.save(working_dir + '/input/feature_table.qza')
seqs.save(working_dir + '/input/sequences.qza')