In [29]:
import glob
import gzip
import os
import urllib.request
import shutil
import tempfile
from qiime2 import Artifact
from qiime2 import Metadata
from qiime2.plugins.cutadapt.methods import demux_single
from qiime2.plugins.dada2.methods import denoise_single
from qiime2.plugins.deblur.methods import denoise_16S
from zipfile import ZipFile

In [3]:
#project root is the parent of the directory the notebook is run from;
#taxonomy reference files are kept in a subdirectory of it
working_dir = os.path.abspath('../')
refs_dir = working_dir + '/taxonomy_references'

#Qiita identifiers for the run being processed. The artifact must be the
#'demultiplexed' artifact directly downstream of split libraries FASTQ
#(not one of the trimmed artifacts).
qiita_prep_id = 6501
qiita_artifact_id = 70146

In [4]:
def download_file(url, local_filepath):
    """Stream the resource at ``url`` into the file ``local_filepath``.

    Parameters
    ----------
    url : str
        Anything ``urllib.request.urlopen`` accepts.
    local_filepath : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file is overwritten. Missing parent directories are created.

    Raises
    ------
    urllib.error.URLError
        If the request fails (raised by ``urlopen``); in that case nothing is
        created or overwritten on disk.
    """
    with urllib.request.urlopen(url) as response:
        # Only touch the filesystem once the request has succeeded, so a
        # failed download leaves no empty directories or truncated files.
        parent_dir = os.path.dirname(local_filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(local_filepath, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)

In [5]:
def download_qiita_files(qiita_prep_id, qiita_artifact_id, output_dir=None):
    """Download the prep-information and artifact zip archives from Qiita.

    Two archives are fetched from the public Qiita download endpoints and
    written to ``output_dir`` as ``metadata.zip`` and ``qiita_artifact.zip``.
    Nothing is extracted here.

    Parameters
    ----------
    qiita_prep_id : int or str
        the prep id of the run of interest. This ID is unique to each
        run within a study.
    qiita_artifact_id : int or str
        the id of the single artifact to download. The value is inserted
        into the URL with ``str()``, so a list of ids is not supported.
    output_dir : str, optional
        directory that receives the two zip files. Defaults to
        ``working_dir + '/input'``."""
    if output_dir is None:
        #default is the location used when no directory is given
        output_dir = working_dir + '/input'

    #download prep information
    download_file('https://qiita.ucsd.edu/public_download/?data=prep_information&prep_id=' +
                  str(qiita_prep_id), output_dir + '/metadata.zip')
    #download artifact
    download_file('https://qiita.ucsd.edu/public_artifact_download/?artifact_id=' +
                  str(qiita_artifact_id),
                  output_dir + '/qiita_artifact.zip')

In [6]:
#fetch the prep-information and artifact archives for the configured run
download_qiita_files(qiita_prep_id=qiita_prep_id,
                     qiita_artifact_id=qiita_artifact_id)

In [7]:
#pull the two files needed downstream out of the downloaded archives

#the reads sit under Demultiplexed/<artifact id>/ inside the artifact archive
with ZipFile(working_dir + '/input/qiita_artifact.zip') as artifact_zip:
    with artifact_zip.open('Demultiplexed/' + str(qiita_artifact_id) + '/seqs.fastq.gz') as zipped:
        with open(working_dir + '/input/seqs.fastq.gz', 'wb') as file:
            shutil.copyfileobj(zipped, file)

#the first entry listed in the metadata archive is copied out as barcodes.txt
with ZipFile(working_dir + '/input/metadata.zip') as metadata_zip:
    path = metadata_zip.namelist()[0]
    with metadata_zip.open(path) as zipped:
        with open(working_dir + '/input/barcodes.txt', 'wb') as file:
            shutil.copyfileobj(zipped, file)

In [8]:
#build the QIIME 2 barcode file: a copy of barcodes.txt whose first line is
#replaced by the fixed header below
barcode_header = '#SampleID\tbarcode\tcenter_name\texperiment_design_description\tinstrument_model\tlibrary_construction_protocol\tpcr_primers\tplatform\tprimer\tqiita_prep_id\tsequencing_meth\ttarget_gene\ttarget_subfragment\n'

with open(working_dir + '/input/barcodes.txt') as file:
    lines = file.readlines()

#overwrite the original header row in place; data rows are left untouched
lines[0] = barcode_header

with open(working_dir + '/input/qiime2/barcode_file.tsv', 'w') as metadata:
    metadata.writelines(lines)

In [11]:
#load the rewritten barcode file as QIIME 2 metadata and pick out the
#'barcode' column used for demultiplexing
barcode_file_fp = working_dir + '/input/qiime2/barcode_file.tsv'
metadata = Metadata.load(barcode_file_fp)
barcodes = metadata.get_column('barcode')

In [12]:
#import the multiplexed reads (barcodes still inside the sequences)
multiplexed_fastq_fp = working_dir + '/input/seqs.fastq.gz'
fastq = Artifact.import_data('MultiplexedSingleEndBarcodeInSequence', multiplexed_fastq_fp)

#demultiplex with cutadapt; error_rate=0 appears as --error-rate 0 in the logged command
demux_results = demux_single(seqs=fastq, barcodes=barcodes, error_rate=0)
demultiplexed_per_sample_sequences, untrimmed_sequences = demux_results

Running external command line application. This may print messages to stdout and/or stderr.
The command being run is below. This command cannot be manually re-run as it will depend on temporary files that no longer exist.

Command: cutadapt --front file:/tmp/tmpkdgxn8j3 --error-rate 0 --minimum-length 1 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-7mz2xtrz/{name}.1.fastq.gz --untrimmed-output /tmp/q2-MultiplexedSingleEndBarcodeInSequenceDirFmt-_ivmnm0x/forward.fastq.gz /tmp/qiime2-archive-nf9twiec/0722ff0e-8a20-4306-9d45-ad593030e6d0/data/forward.fastq.gz



In [13]:
#persist the imported multiplexed reads and the per-sample demultiplexed reads
multiplexed_qza_fp = working_dir + '/input/qiime2/multiplexed-seqs.qza'
demuxed_qza_fp = working_dir + '/input/qiime2/demuxed.qza'
fastq.save(multiplexed_qza_fp)
demultiplexed_per_sample_sequences.save(demuxed_qza_fp)

'/mnt/c/Users/dsone/Documents/zaneveld/organelle_project/input/qiime2/demuxed.qza'

In [14]:
#DADA2 on the demultiplexed reads with trunc_len=100
#(n_threads=0 is passed through unchanged; presumably "use all cores" - confirm in q2-dada2 docs)
dada2_results = denoise_single(demultiplexed_per_sample_sequences, trunc_len=100, n_threads=0)
table, rep_seqs, stats = dada2_results

#only the feature table and representative sequences are written out
table.save(working_dir + '/output/dada2_table.qza')
rep_seqs.save(working_dir + '/output/dada2_rep_seqs.qza')

Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: run_dada_single.R /tmp/qiime2-archive-nq03ccp3/cb670c56-4df2-4cb0-bd9f-3a3acd0be245/data /tmp/tmpa1xcooej/output.tsv.biom /tmp/tmpa1xcooej/track.tsv /tmp/tmpa1xcooej 100 0 2.0 2 Inf independent consensus 1.0 0 1000000 NULL 16



In [16]:
#Deblur on the same demultiplexed reads with trim_length=100 and jobs_to_start=4
#NOTE: table/rep_seqs/stats are rebound here, replacing the DADA2 results held in memory
deblur_results = denoise_16S(demultiplexed_per_sample_sequences, trim_length=100, jobs_to_start=4)
table, rep_seqs, stats = deblur_results

#only the feature table and representative sequences are written out
table.save(working_dir + '/output/deblur_table.qza')
rep_seqs.save(working_dir + '/output/deblur_rep_seqs.qza')

'/mnt/c/Users/dsone/Documents/zaneveld/organelle_project/output/deblur_rep_seqs.qza'

In [18]:
#unoise
#the usearch steps below read plain files, so reload the saved demultiplexed
#artifact and export its contents into the usearch input directory
demuxed_qza_fp = working_dir + '/input/qiime2/demuxed.qza'
usearch_input_dir = working_dir + '/input/usearch/'
qza = Artifact.load(demuxed_qza_fp)
qza.export_data(usearch_input_dir)

In [27]:
#concatenate every per-sample fastq.gz into one file; gzip members written
#back to back are read as a single stream by gzip.open in the next step
#sorted() because glob returns matches in arbitrary order, which would make
#the read order differ between runs
fastqs = sorted(glob.glob(working_dir + '/input/usearch/*001.fastq.gz'))
with open(working_dir + '/input/usearch/seqs.fastq.gz', 'wb') as concatenated:
    for fp in fastqs:
        #the handle is deliberately not named `fastq`: that name holds the
        #imported multiplexed Artifact used by the save cell
        with open(fp, 'rb') as per_sample_gz:
            shutil.copyfileobj(per_sample_gz, concatenated)

In [30]:
#decompress the concatenated reads to a plain FASTQ file for usearch
with gzip.open(working_dir + '/input/usearch/seqs.fastq.gz', 'rb') as gz:
    #the output handle is not named `fastq`, so the imported Artifact bound
    #to that name is not replaced by a closed file object
    with open(working_dir + '/input/usearch/seqs.fastq', 'wb') as fastq_out:
        shutil.copyfileobj(gz, fastq_out)

In [33]:
#truncate reads to 100 bases (-trunclen 100) and write them to seqs_100.fastq;
#the logged output counts reads reported as "too short" for this length
#paths are relative to the directory the notebook is run from
!../procedure/usearch -fastx_truncate ../input/usearch/seqs.fastq -trunclen 100 -fastqout ../input/usearch/seqs_100.fastq

usearch v11.0.667_i86linux32, 4.0Gb RAM (12.3Gb total), 12 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: personal use only

00:02 37Mb    100.0% Processing, 185748 (31.8%) too short


In [34]:
#quality-filter the truncated reads with -fastq_maxee_rate 0.005 and write the
#reads that pass as FASTA (presumably an expected-errors-per-base cutoff - confirm in the usearch manual)
!../procedure/usearch -fastq_filter ../input/usearch/seqs_100.fastq -fastq_maxee_rate 0.005 -fastaout ../input/usearch/filtered_reads.fasta

usearch v11.0.667_i86linux32, 4.0Gb RAM (12.3Gb total), 12 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: personal use only

00:00 4.2Mb  FASTQ base 33 for file ../input/usearch/seqs_100.fastq
00:00 38Mb   CPU has 12 cores, defaulting to 10 threads
00:02 90Mb    100.0% Filtering, 96.4% passed
    397501  Reads (397.5k)                  
    383179  Filtered reads (383.2k, 96.4%)


In [35]:
#collapse the filtered reads into unique sequences (the log reports seqs vs. uniques and their sizes);
#-sizeout / -relabel Uniq presumably add size annotations and Uniq-prefixed labels - confirm in the usearch manual
!../procedure/usearch -fastx_uniques ../input/usearch/filtered_reads.fasta -fastaout ../input/usearch/uniques.fasta -sizeout -relabel Uniq

usearch v11.0.667_i86linux32, 4.0Gb RAM (12.3Gb total), 12 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: personal use only

00:00 108Mb   100.0% Reading ../input/usearch/filtered_reads.fasta
00:00 75Mb   CPU has 12 cores, defaulting to 10 threads           
00:00 234Mb   100.0% DF
00:00 238Mb  383179 seqs, 28744 uniques, 18704 singletons (65.1%)
00:00 238Mb  Min size 1, median 1, max 28226, avg 13.33
00:00 172Mb   100.0% Writing ../input/usearch/uniques.fasta


In [36]:
#sort the unique sequences by size and write them to sorted_uniques.fasta,
#which is the input to the unoise3 step
!../procedure/usearch -sortbysize ../input/usearch/uniques.fasta -fastaout ../input/usearch/sorted_uniques.fasta 

usearch v11.0.667_i86linux32, 4.0Gb RAM (12.3Gb total), 12 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: personal use only

00:00 44Mb    100.0% Reading ../input/usearch/uniques.fasta
00:00 11Mb   Getting sizes                                 
00:00 11Mb   Sorting 28744 sequences
00:00 11Mb    100.0% Writing output


In [37]:
#run unoise3 on the size-sorted uniques: ZOTU sequences go to ../output/unoise_zotus.fasta
#and the tabbed report to ../output/unoise3.txt (the log reports amplicon and chimera counts)
!../procedure/usearch -unoise3 ../input/usearch/sorted_uniques.fasta -zotus ../output/unoise_zotus.fasta -tabbedout ../output/unoise3.txt

usearch v11.0.667_i86linux32, 4.0Gb RAM (12.3Gb total), 12 cores
(C) Copyright 2013-18 Robert C. Edgar, all rights reserved.
https://drive5.com/usearch

License: personal use only

00:00 45Mb    100.0% Reading ../input/usearch/sorted_uniques.fasta
00:00 12Mb      0.0% 0 amplicons, 0 bad (size >= 28226)           

00:00 19Mb    100.0% 556 amplicons, 52943 bad (size >= 8)
00:00 26Mb    100.0% 549 good, 8 chimeras                
00:00 26Mb    100.0% Writing zotus       
