In [1]:
#create a new feature table with all GCMP samples with appropriate compartment label
#annotate all GCMP sequences
#taxa barplot of annotations
#calculate % mitochondria, % unassigned for each sample
#add % mitochondria and % unassigned to metadata file

In [1]:
import csv
from qiime2 import Artifact
from qiime2.plugins.feature_table.methods import filter_samples
from qiime2.plugins.feature_classifier.methods import classify_consensus_vsearch
from qiime2.metadata import Metadata
from qiime2.plugins.taxa.visualizers import barplot
import pandas as pd
import tempfile
from qiime2 import Visualization

In [17]:
# Root folder of the analysis; every input/ and output/ path used in this
# notebook is built by appending to this string.
working_dir = '/home/dylan/Documents/june_reset'

# Names of the reference taxonomies that are compared. Each name selects the
# files '<name>_reference_taxonomy.qza' and 'GCMP_<name>_tbp.qzv' under output/.
references = [
    'greengenes',
    'silva',
    'greengenes_metaxa2',
    'silva_metaxa2',
]

In [4]:
# Import the BIOM v2.1.0 file holding all samples as a QIIME 2
# FeatureTable[Frequency] artifact.
biom_path = '{}/input/all.biom'.format(working_dir)
GCMP_ft = Artifact.import_data(
    'FeatureTable[Frequency]',
    biom_path,
    'BIOMV210Format',
)

In [18]:
# Load the sample metadata (mapping file) that drives the sample filter below.
metadata_path = '{}/input/GCMP_EMP_map_r28_no_empty_samples.txt'.format(working_dir)
metadata = Metadata.load(metadata_path)

In [6]:
# Keep only the samples whose tissue_compartment is 'M', 'T' or 'S'; the
# filtered table is then stored as input/GCMP_ft.qza for the later cells.
compartment_filter = "tissue_compartment='M' OR tissue_compartment='T' OR tissue_compartment='S'"
filter_results = filter_samples(GCMP_ft, metadata=metadata, where=compartment_filter)
GCMP_filtered, = filter_results
save_path = '{}/input/GCMP_ft.qza'.format(working_dir)
GCMP_filtered.save(save_path)

'/home/dylan/Documents/june_reset/input/GCMP_ft.qza'

In [4]:
# Reload the sample metadata and reduce it to its index only: every other
# column has to go, otherwise it interferes with the mathematical operations
# done later (presumably the row sums over the exported bar plot tables).
metadata_path = '{}/input/GCMP_EMP_map_r28_no_empty_samples.txt'.format(working_dir)
sample_index = Metadata.load(metadata_path).to_dataframe().index
metadata_df = pd.DataFrame(index=sample_index)
metadata = Metadata(metadata_df)

In [5]:
#create an enormous tbp
# One taxa bar plot (.qzv) is written per reference taxonomy.
# The feature table is identical for every reference, so it is loaded once
# here instead of once per loop iteration.
ft_path = working_dir + '/input/GCMP_ft.qza'
ft = Artifact.load(ft_path)
for reference in references:
    # Taxonomy assignments for this reference, read from output/.
    taxonomy_path = working_dir + '/output/' + reference + '_reference_taxonomy.qza'
    taxonomy = Artifact.load(taxonomy_path)
    # `metadata` is whatever Metadata object is currently bound in the
    # notebook; the later per-sample sums rely on it being index-only.
    tbp, = barplot(ft, taxonomy, metadata)
    save_path = working_dir + '/output/GCMP_' + reference + '_tbp.qzv'
    tbp.save(save_path)

In [12]:
#calculate per-sample % mitochondria and % unassigned
# For every reference taxonomy the saved bar plot is exported to a temporary
# folder and two of its tables are read:
#   level-1.csv -> share of each sample's counts in the 'Unassigned' column
#   level-5.csv -> share of each sample's counts in the mitochondria column
# Results are collected in parallel lists (one entry per sample and
# reference) that the following cells zip into long-format tables.

# (substring of the reference name, mitochondria column in level-5.csv);
# checked in this order, first match wins.
mitochondria_columns = (
    ('greengenes', 'k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rickettsiales;f__mitochondria'),
    ('silva', 'D_0__Bacteria;D_1__Proteobacteria;D_2__Alphaproteobacteria;D_3__Rickettsiales;D_4__Mitochondria'),
)

proportion_unassigned = []
references1 = []
compartments1 = []  # never filled in this cell; kept so the name stays defined
samples1 = []
proportion_mitochondria = []
references5 = []
compartments5 = []  # never filled in this cell; kept so the name stays defined
samples5 = []
for reference in references:
    # Resolve the mitochondria column before touching any list: a reference
    # that matches neither database would otherwise extend the sample and
    # reference lists but not the proportions, and zip() would then silently
    # pair values with the wrong samples.
    for database, column in mitochondria_columns:
        if database in reference:
            mitochondria_column = column
            break
    else:
        raise ValueError(
            "no mitochondria column known for reference '{}'".format(reference))

    with tempfile.TemporaryDirectory() as temp_dir:
        tbp_path = working_dir + '/output/GCMP_' + reference + '_tbp.qzv'
        tbp = Visualization.load(tbp_path)
        tbp.export_data(temp_dir)

        df1 = pd.read_csv(temp_dir + '/level-1.csv')
        # numeric_only=True keeps the string 'index' column (sample IDs) out
        # of the row sum. Older pandas dropped it implicitly; with the
        # pandas 2.0 default (numeric_only=False) it would be included.
        df1['total'] = df1.sum(axis=1, numeric_only=True)
        df1['reference'] = reference
        proportion_unassigned.extend((df1['Unassigned'] / df1['total']).tolist())
        references1.extend(df1['reference'].tolist())
        samples1.extend(df1['index'].tolist())

        df5 = pd.read_csv(temp_dir + '/level-5.csv')
        df5['total'] = df5.sum(axis=1, numeric_only=True)
        df5['reference'] = reference
        # A missing column raises KeyError on purpose: it points at a
        # taxonomy string that does not match the expected format.
        proportion_mitochondria.extend((df5[mitochondria_column] / df5['total']).tolist())
        references5.extend(df5['reference'].tolist())
        samples5.extend(df5['index'].tolist())

In [13]:
# Long-format level-1 table: one row per (sample, reference) pair holding the
# proportion of unassigned counts; written to output/GCMP_lvl1.csv.
lvl1_columns = ['sample_id', 'proportion unassigned', 'reference taxonomy']
lvl1 = list(zip(samples1, proportion_unassigned, references1))
lvl1_df = pd.DataFrame(lvl1, columns=lvl1_columns)
lvl1_csv_path = working_dir + '/output/GCMP_lvl1.csv'
lvl1_df.to_csv(lvl1_csv_path)

In [14]:
# Long-format level-5 table: one row per (sample, reference) pair holding the
# proportion of mitochondrial counts; written to output/GCMP_lvl5.csv.
lvl5_columns = ['sample_id', 'proportion mitochondria', 'reference taxonomy']
lvl5 = list(zip(samples5, proportion_mitochondria, references5))
lvl5_df = pd.DataFrame(lvl5, columns=lvl5_columns)
lvl5_csv_path = working_dir + '/output/GCMP_lvl5.csv'
lvl5_df.to_csv(lvl5_csv_path)

In [2]:
# Wide-format table of per-sample proportions, indexed by sample ID.
# The path is built from working_dir like every other path in this notebook
# instead of repeating the absolute directory.
# NOTE(review): no cell in this notebook writes
# sample_ids_with_proportions.tsv; it appears to be produced outside the
# notebook (presumably by pivoting GCMP_lvl1.csv / GCMP_lvl5.csv) - confirm.
proportions_path = working_dir + '/sample_ids_with_proportions.tsv'
df = pd.read_csv(proportions_path, sep='\t', index_col='sample_id')
df

Unnamed: 0_level_0,greengenes_proportion_unassigned,greengenes_proportion_mitochondria,greengenes_metaxa2_proportion_unassigned,greengenes_metaxa2_proportion_mitochondria,silva_proportion_unassigned,silva_proportion_mitochondria,silva_metaxa2_proportion_unassigned,silva_metaxa2_proportion_mitochondria
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10895.E4.11.Fun.coro.1.20150309.S,0.013146,0.0,0.008947,0.004199,0.012880,0.000266,0.008947,0.004199
10895.E4.10.Poc.verr.1.20150305.S,0.596072,0.0,0.002671,0.593401,0.593401,0.000000,0.000000,0.593401
10895.E3.3.Por.cyli.1.20150123.S,0.661209,0.0,0.001253,0.659956,0.661033,0.000020,0.001097,0.659956
10895.E9.17.Lep.phry.1.20150818.T,0.887847,0.0,0.156235,0.731612,0.887847,0.000000,0.156235,0.731612
10895.E9.6.Acr.sp.1.20150824.T,0.047571,0.0,0.000407,0.047164,0.047571,0.000000,0.000407,0.047164
...,...,...,...,...,...,...,...,...
10895.E9.9.Pse.taya.1.20150824.M,0.057817,0.0,0.001516,0.056301,0.057817,0.000000,0.001516,0.056301
10895.E11.2.Tub.sp.4.20150501.M,0.054653,0.0,0.000000,0.054653,0.054653,0.000000,0.000000,0.054653
10895.E5.16.Mon.cave.3.20150306.S,0.029424,0.0,0.000000,0.029424,0.029424,0.000000,0.000000,0.029424
10895.E1.3.Por.loba.1.20140724.S,0.818427,0.0,0.001944,0.816483,0.818018,0.000410,0.001944,0.816483


In [8]:
# Flip the table: sample IDs become the columns, the proportion measures
# become the rows.
a = df.T
a

sample_id,10895.E4.11.Fun.coro.1.20150309.S,10895.E4.10.Poc.verr.1.20150305.S,10895.E3.3.Por.cyli.1.20150123.S,10895.E9.17.Lep.phry.1.20150818.T,10895.E9.6.Acr.sp.1.20150824.T,10895.E11.10.Poc.dami.4.20150502.M,10895.E5.21.Dip.laby.2.20150307.T,10895.E3.6.Acr.hyac.1.20150118.S,10895.E11.3.Por.loba.4.20150502.M,10895.E1.15.Dip.heli.1.20140815.M,...,10895.E4.17.Pla.daed.1.20150311.T,10895.E9.5.Gal.fasc.1.20150821.S,10895.E4.3.Por.lute.1.20150309.M,10895.E5.21.Sco.sp.1.20150306.M,10895.E5.9.Sid.side.3.20150307.T,10895.E9.9.Pse.taya.1.20150824.M,10895.E11.2.Tub.sp.4.20150501.M,10895.E5.16.Mon.cave.3.20150306.S,10895.E1.3.Por.loba.1.20140724.S,10895.E4.19.Lob.hemp.1.20150310.T
greengenes_proportion_unassigned,0.013146,0.596072,0.661209,0.887847,0.047571,0.228988,0.318047,0.038194,0.558824,0.931076,...,0.964217,0.005779,0.881974,0.549223,0.061709,0.057817,0.054653,0.029424,0.818427,0.935206
greengenes_proportion_mitochondria,0.0,0.0,0.0,0.0,0.0,0.0,0.001129,0.0,0.0,3.8e-05,...,0.0,0.000134,0.000526,0.002648,0.001878,0.0,0.0,0.0,0.0,0.0
greengenes_metaxa2_proportion_unassigned,0.008947,0.002671,0.001253,0.156235,0.000407,0.002573,0.003711,0.0,0.004902,0.029822,...,0.000202,0.003101,0.012928,0.01603,0.018292,0.001516,0.0,0.0,0.001944,0.000711
greengenes_metaxa2_proportion_mitochondria,0.004199,0.593401,0.659956,0.731612,0.047164,0.226415,0.315465,0.038194,0.553922,0.901292,...,0.964015,0.002811,0.869572,0.53584,0.045294,0.056301,0.054653,0.029424,0.816483,0.934495
silva_proportion_unassigned,0.01288,0.593401,0.661033,0.887847,0.047571,0.226415,0.312094,0.038194,0.558824,0.931076,...,0.964217,0.005779,0.881773,0.549945,0.025396,0.057817,0.054653,0.029424,0.818018,0.935206
silva_proportion_mitochondria,0.000266,0.0,2e-05,0.0,0.0,0.0,0.006992,0.0,0.0,3.8e-05,...,0.0,0.000134,0.000459,0.001926,0.03759,0.0,0.0,0.0,0.00041,0.0
silva_metaxa2_proportion_unassigned,0.008947,0.0,0.001097,0.156235,0.000407,0.0,0.003675,0.0,0.004902,0.029822,...,0.000202,0.003101,0.012727,0.016753,0.016472,0.001516,0.0,0.0,0.001944,0.000711
silva_metaxa2_proportion_mitochondria,0.004199,0.593401,0.659956,0.731612,0.047164,0.226415,0.315411,0.038194,0.553922,0.901292,...,0.964015,0.002811,0.869505,0.535118,0.046553,0.056301,0.054653,0.029424,0.816483,0.934495


In [9]:
# Map every sample ID (a column of `a`) to the list of its values, in the
# row order of `a`.
sample_proportions = {
    sample_id: values.tolist()
    for sample_id, values in a.items()
}
sample_proportions

{'10895.E4.11.Fun.coro.1.20150309.S': [0.013146238616633001,
  0.0,
  0.008947237872506,
  0.004199000744127,
  0.012880479075866001,
  0.000265759540768,
  0.008947237872506,
  0.004199000744127],
 '10895.E4.10.Poc.verr.1.20150305.S': [0.596072270227808,
  0.0,
  0.00267085624509,
  0.593401413982718,
  0.593401413982718,
  0.0,
  0.0,
  0.593401413982718],
 '10895.E3.3.Por.cyli.1.20150123.S': [0.661209366596447,
  0.0,
  0.001253124343803,
  0.6599562422526429,
  0.6610332518238039,
  2.03209353049156e-05,
  0.001097330506465,
  0.6599562422526429],
 '10895.E9.17.Lep.phry.1.20150818.T': [0.887846972960038,
  0.0,
  0.15623504426896398,
  0.731611928691074,
  0.887846972960038,
  0.0,
  0.15623504426896398,
  0.731611928691074],
 '10895.E9.6.Acr.sp.1.20150824.T': [0.04757064443992701,
  0.0,
  0.000406586704615,
  0.047164057735312,
  0.04757064443992701,
  0.0,
  0.000406586704615,
  0.047164057735312],
 '10895.E11.10.Poc.dami.4.20150502.M': [0.22898799313893697,
  0.0,
  0.002572898

In [37]:
def write_metadata_with_proportions(source_path, destination_path, proportions,
                                    proportion_columns, missing_values):
    """Copy a tab-separated metadata file, appending the proportion columns.

    Parameters
    ----------
    source_path : str
        Tab-separated metadata file whose header row starts with '#SampleID'.
    destination_path : str
        File to write. It is overwritten, so re-running gives the same file.
    proportions : dict
        Maps a sample ID to the list of values to append to that sample's row.
    proportion_columns : list of str
        Column names appended to the header row.
    missing_values : list
        Values appended to a sample row whose ID is not a key of
        `proportions`, so every sample row ends up with the same width.
    """
    # newline='' is what the csv module asks for on both reader and writer files.
    with open(source_path, newline='') as source, \
            open(destination_path, 'w', newline='') as destination:
        reader = csv.reader(source, delimiter='\t')
        # The rows were parsed by csv.reader, so they are written back with
        # the matching writer rather than a raw '\t'.join.
        writer = csv.writer(destination, delimiter='\t', lineterminator='\n')
        for row in reader:
            if not row:
                # blank line: nothing to index with row[0], nothing to copy
                continue
            if row[0] == '#SampleID':
                row.extend(proportion_columns)
            elif not row[0].startswith('#'):
                # Exact ID lookup; a substring test could attach the values
                # of a different sample whose ID is contained in this one.
                row.extend(proportions.get(row[0], missing_values))
            # any other '#' row is copied through unchanged
            writer.writerow(row)


# Same order as the values stored per sample in sample_proportions.
proportion_columns = [
    'greengenes_proportion_unassigned',
    'greengenes_proportion_mitochondria',
    'greengenes_metaxa2_proportion_unassigned',
    'greengenes_metaxa2_proportion_mitochondria',
    'silva_proportion_unassigned',
    'silva_proportion_mitochondria',
    'silva_metaxa2_proportion_unassigned',
    'silva_metaxa2_proportion_mitochondria',
]
# Placeholder row for samples that have no computed proportions.
missing = ['Missing: Not collected'] * len(proportion_columns)
updated_metadata_file_path = working_dir + '/output/GCMP_metadata_with_proportions.tsv'
# The file handles live inside the function, so the notebook-level name
# `metadata` is no longer rebound to an open file by this cell.
write_metadata_with_proportions(metadata_path, updated_metadata_file_path,
                                sample_proportions, proportion_columns, missing)

In [9]:
#add per-sample % mitochondria and % unassigned to metadata
# NOTE(review): this cell held an unfinished draft loop over `references` and
# `metadata_csv`. It could not be parsed (unterminated '#SampleID" string
# literal, inconsistent indentation) and none of its statements did any work,
# so it is reduced to this note to let the notebook run from top to bottom.
# The metadata file with the proportion columns is written by the cell that
# creates GCMP_metadata_with_proportions.tsv.

tissue_compartment


In [None]:
# NOTE(review): unfinished scratch cell. It contained only a dangling `pd.`
# expression, which is a SyntaxError when executed.