In [20]:
from Bio import SeqIO
from qiime2 import Artifact

In [None]:
#files that have to be downloaded manually before running this notebook:
#all.seqs.fa
#all.biom
#GCMP_EMP_map_r28_no_empty_samples.txt
#gg_13_8_otus.tar.gz (from ftp://greengenes.microbio.me/greengenes_release/gg_13_5/gg_13_8_otus.tar.gz)
#Silva_132_release.zip (from https://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_132_release.zip)

#the silva and greengenes archives also have to be extracted manually; the cells below read the extracted folders from the input folder under working_dir

In [11]:
#extract mitochondria sequences from Metaxa2 and create a separate taxonomy file in the style of greengenes
#
#every record of the Metaxa2 fasta whose header contains 'mitochondria' or 'Mitochondria' is
#written to the combined otu fasta as '>metaxa2_<i>' (i = position of the record in the Metaxa2
#file) and gets one matching line in the combined taxonomy file
#
#both output files are opened with "w": this cell creates them, so re-running it must start
#from empty files instead of appending a second copy of the same metaxa2_<i> ids.
#the cells that copy the greengenes otus/taxonomy append to these files and have to be
#re-run after this one
working_dir = '/home/dylan/Documents/june_reset'
metaxa_path = working_dir + '/input/metaxa2.fasta'
greengenes_path = working_dir + '/input/gg_13_8_otus/rep_set/99_otus.fasta'
with open((working_dir + '/input/m2+gg_otus.fasta'), "w") as otu_file:
    with open((working_dir + '/input/m2+gg_taxonomy.txt'), "w") as taxonomy_file:
        for i, entry in enumerate(SeqIO.parse(metaxa_path, "fasta")):
            if 'mitochondria' in entry.description or 'Mitochondria' in entry.description:
                otu_file.write(">metaxa2_" + str(i) + "\n")
                #convert the sequence to str first, then add the newline (same form as the greengenes copy cell)
                otu_file.write(str(entry.seq) + "\n")
                taxonomy_file.write("metaxa2_" + str(i) + "\tk__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__mitochondria; g__; s__\n")

In [12]:
#append every greengenes reference sequence to the end of the combined otu fasta
#each record is written as a '>' header line (the full description) followed by one sequence line
with open(working_dir + '/input/m2+gg_otus.fasta', "a") as otu_file:
    otu_file.writelines(
        ">%s\n%s\n" % (record.description, record.seq)
        for record in SeqIO.parse(greengenes_path, "fasta")
    )

In [15]:
#append the greengenes taxonomy, line for line and unchanged, to the end of the combined taxonomy file
with open(working_dir + '/input/m2+gg_taxonomy.txt', "a") as taxonomy_file, \
        open(working_dir + '/input/gg_13_8_otus/taxonomy/99_otu_taxonomy.txt') as greengenes_taxonomy_file:
    #iterating the source file yields its lines, so writelines copies them as they are
    taxonomy_file.writelines(greengenes_taxonomy_file)

In [17]:
#extract mitochondria sequences from Metaxa2 and create a separate taxonomy file in the style of SILVA
#
#every record of the Metaxa2 fasta whose header contains 'mitochondria' or 'Mitochondria' is
#written to the combined otu fasta as '>metaxa2_<i>' (i = position of the record in the Metaxa2
#file) and gets one matching line in the combined taxonomy file
#
#both output files are opened with "w": this cell creates them, so re-running it must start
#from empty files instead of appending a second copy of the same metaxa2_<i> ids.
#the cells that copy the silva otus/taxonomy append to these files and have to be
#re-run after this one
silva_path = working_dir + '/input/Silva_132_release/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna'
with open((working_dir + '/input/m2+silva_otus.fasta'), "w") as otu_file:
    with open((working_dir + '/input/m2+silva_taxonomy.txt'), "w") as taxonomy_file:
        for i, entry in enumerate(SeqIO.parse(metaxa_path, "fasta")):
            if 'mitochondria' in entry.description or 'Mitochondria' in entry.description:
                otu_file.write(">metaxa2_" + str(i) + "\n")
                #convert the sequence to str first, then add the newline (same form as the silva copy cell)
                otu_file.write(str(entry.seq) + "\n")
                taxonomy_file.write("metaxa2_" + str(i) + "\tD_0__Bacteria;D_1__Proteobacteria;D_2__Alphaproteobacteria;D_3__Rickettsiales;D_4__Mitochondria;D_5__uncultured bacterium;D_6__uncultured bacterium\n")

In [18]:
#append every silva reference sequence to the end of the combined otu fasta
#each record is written as a '>' header line (the full description) followed by one sequence line
with open(working_dir + '/input/m2+silva_otus.fasta', "a") as otu_file:
    otu_file.writelines(
        ">%s\n%s\n" % (record.description, record.seq)
        for record in SeqIO.parse(silva_path, "fasta")
    )

In [19]:
#append the silva 7-level taxonomy, line for line and unchanged, to the end of the combined taxonomy file
with open(working_dir + '/input/m2+silva_taxonomy.txt', "a") as taxonomy_file, \
        open(working_dir + '/input/Silva_132_release/SILVA_132_QIIME_release/taxonomy/16S_only/99/taxonomy_7_levels.txt') as silva_taxonomy_file:
    #iterating the source file yields its lines, so writelines copies them as they are
    taxonomy_file.writelines(silva_taxonomy_file)

In [24]:
#import the unmodified greengenes reference files as QIIME 2 artifacts

#otu sequences (99_otus.fasta) -> FeatureData[Sequence]
gg_otu_path = working_dir + '/input/gg_13_8_otus/rep_set/99_otus.fasta'
gg_otus = Artifact.import_data('FeatureData[Sequence]', gg_otu_path)

#taxonomy (99_otu_taxonomy.txt) -> FeatureData[Taxonomy], read as HeaderlessTSVTaxonomyFormat
gg_taxonomy_path = working_dir + '/input/gg_13_8_otus/taxonomy/99_otu_taxonomy.txt'
gg_taxonomy = Artifact.import_data(
    'FeatureData[Taxonomy]',
    gg_taxonomy_path,
    view_type='HeaderlessTSVTaxonomyFormat'
)

<artifact: FeatureData[Taxonomy] uuid: 57e63fc0-5bbd-4dc0-814d-5c497b803252>

In [23]:
#import the combined greengenes + metaxa2 files as QIIME 2 artifacts

#combined otu fasta -> FeatureData[Sequence]
gg_m2_otu_path = working_dir + '/input/m2+gg_otus.fasta'
gg_m2_otus = Artifact.import_data('FeatureData[Sequence]', gg_m2_otu_path)

#combined taxonomy file -> FeatureData[Taxonomy], read as HeaderlessTSVTaxonomyFormat
gg_m2_taxonomy_path = working_dir + '/input/m2+gg_taxonomy.txt'
gg_m2_taxonomy = Artifact.import_data(
    'FeatureData[Taxonomy]',
    gg_m2_taxonomy_path,
    view_type='HeaderlessTSVTaxonomyFormat'
)

<artifact: FeatureData[Taxonomy] uuid: 04ec4b1a-47a1-44db-99d4-8d1dea7b4657>

In [25]:
#import the unmodified SILVA reference files as QIIME 2 artifacts

#otu sequences (silva_132_99_16S.fna) -> FeatureData[Sequence]
silva_otu_path = working_dir + '/input/Silva_132_release/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna'
silva_otus = Artifact.import_data('FeatureData[Sequence]', silva_otu_path)

#taxonomy (taxonomy_7_levels.txt) -> FeatureData[Taxonomy], read as HeaderlessTSVTaxonomyFormat
silva_taxonomy_path = working_dir + '/input/Silva_132_release/SILVA_132_QIIME_release/taxonomy/16S_only/99/taxonomy_7_levels.txt'
silva_taxonomy = Artifact.import_data(
    'FeatureData[Taxonomy]',
    silva_taxonomy_path,
    view_type='HeaderlessTSVTaxonomyFormat'
)

<artifact: FeatureData[Taxonomy] uuid: 1f3b97c4-b63a-4b85-b456-736ef2228f33>

In [26]:
#import the combined SILVA + metaxa2 files as QIIME 2 artifacts

#combined otu fasta -> FeatureData[Sequence]
silva_m2_otu_path = working_dir + '/input/m2+silva_otus.fasta'
silva_m2_otus = Artifact.import_data('FeatureData[Sequence]', silva_m2_otu_path)

#combined taxonomy file -> FeatureData[Taxonomy], read as HeaderlessTSVTaxonomyFormat
silva_m2_taxonomy_path = working_dir + '/input/m2+silva_taxonomy.txt'
silva_m2_taxonomy = Artifact.import_data(
    'FeatureData[Taxonomy]',
    silva_m2_taxonomy_path,
    view_type='HeaderlessTSVTaxonomyFormat'
)

In [27]:
#save all eight artifacts into the input folder as .qza files
#(artifact, path below working_dir) pairs, listed in the order they are written
qza_exports = [
    (gg_otus, '/input/greengenes_otus.qza'),
    (gg_taxonomy, '/input/greengenes_taxonomy.qza'),
    (gg_m2_otus, '/input/greengenes_metaxa2_otus.qza'),
    (gg_m2_taxonomy, '/input/greengenes_metaxa2_taxonomy.qza'),
    (silva_otus, '/input/silva_otus.qza'),
    (silva_taxonomy, '/input/silva_taxonomy.qza'),
    (silva_m2_otus, '/input/silva_metaxa2_otus.qza'),
    (silva_m2_taxonomy, '/input/silva_metaxa2_taxonomy.qza'),
]
saved_paths = [artifact.save(working_dir + relative_path) for artifact, relative_path in qza_exports]
#keep the result of the last save as the final expression so the cell still displays it
saved_paths[-1]

'/home/dylan/Documents/june_reset/input/silva_m2_taxonomy.qza'