In [2]:
#create a new feature table with all GCMP samples with appropriate comparment label
#annotate all GCMP sequences
#taxa barplot of annotations
#calculate % mitochondria, % unassigned for each sample
#add % mitochondria and % unassigned to metadata file

In [3]:
import csv
from qiime2 import Artifact
from qiime2.plugins.feature_table.methods import filter_samples
from qiime2.plugins.feature_classifier.methods import classify_consensus_vsearch
from qiime2.metadata import Metadata
from qiime2.plugins.taxa.visualizers import barplot
import pandas as pd
import tempfile
from qiime2 import Visualization

In [4]:
working_dir = '/mnt/c/Users/Dylan/Documents/zaneveld/GCMP_Global_Disease-master/analysis/organelle_removal'
references = ['greengenes', 'silva', 'greengenes_metaxa2', 'silva_metaxa2']

In [5]:
biom_path = working_dir + '/input/all.biom'
GCMP_ft = Artifact.import_data('FeatureTable[Frequency]', biom_path,
                               'BIOMV210Format')

In [6]:
metadata_path = working_dir + '/input/GCMP_EMP_map_r28_no_empty_samples.txt'
metadata = Metadata.load(metadata_path)

In [7]:
GCMP_filtered, = filter_samples(GCMP_ft, metadata = metadata,
                                   where = "tissue_compartment='M' OR tissue_compartment='T' OR tissue_compartment='S'")
save_path = working_dir + '/input/GCMP_ft.qza'
GCMP_filtered.save(save_path)

'/mnt/c/Users/Dylan/Documents/zaneveld/GCMP_Global_Disease-master/analysis/organelle_removal/input/GCMP_ft.qza'

In [8]:
#need to strip the metadata of every column but index otherwise it will interfere with mathematical operations
metadata_path = working_dir + '/input/GCMP_EMP_map_r28_no_empty_samples.txt'
metadata = Metadata.load(metadata_path)
metadata_df = metadata.to_dataframe()
metadata_df = pd.DataFrame(index=metadata_df.index)
metadata = Metadata(metadata_df)

In [9]:
#create an enormous tbp
for reference in references:
    taxonomy_path = working_dir + '/output/' + reference + '_reference_taxonomy.qza'
    taxonomy = Artifact.load(taxonomy_path)
    ft_path = working_dir + '/input/GCMP_ft.qza'
    ft = Artifact.load(ft_path)
    tbp, = barplot(ft, taxonomy, metadata)
    save_path = working_dir + '/output/GCMP_' + reference + '_tbp.qzv'
    tbp.save(save_path)

In [10]:
#calculate per-sample % mitochondria and % unassigned
proportion_unassigned = []
references1 = []
compartments1 = []
samples1 = []
proportion_mitochondria = []
references5 = []
compartments5 = []
samples5 = []
for reference in references:
    with tempfile.TemporaryDirectory() as temp_dir:
        tbp_path = working_dir + '/output/GCMP_' + reference + '_tbp.qzv'
        tbp = Visualization.load(tbp_path)
        tbp.export_data(temp_dir)
        df1 = pd.read_csv(temp_dir + '/level-1.csv')
        df1['total'] = df1.sum(axis = 1)
        df1['reference'] = reference
        proportion_unassigned.extend(list(df1['Unassigned'] / df1['total']))
        references1.extend(list(df1['reference']))
        samples1.extend(list(df1['index']))
        df5 = pd.read_csv(temp_dir + '/level-5.csv')
        df5['total'] = df5.sum(axis = 1)
        df5['reference'] = reference
        if 'greengenes' in reference:
            proportion_mitochondria.extend(list(df5['k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rickettsiales;f__mitochondria'] / df5['total']))
        elif 'silva' in reference:
            proportion_mitochondria.extend(list(df5['D_0__Bacteria;D_1__Proteobacteria;D_2__Alphaproteobacteria;D_3__Rickettsiales;D_4__Mitochondria'] / df5['total']))
        references5.extend(list(df5['reference']))
        samples5.extend(list(df5['index']))

In [11]:
lvl1 = list(zip(samples1, proportion_unassigned, references1))
lvl1_df = pd.DataFrame(lvl1, columns = ['sample_id', 'proportion unassigned', 'reference taxonomy'])
lvl1_df.to_csv(working_dir + '/output/GCMP_lvl1.csv')

In [12]:
lvl5 = list(zip(samples5, proportion_mitochondria, references5))
lvl5_df = pd.DataFrame(lvl5, columns = ['sample_id', 'proportion mitochondria', 'reference taxonomy'])
lvl5_df.to_csv(working_dir + '/output/GCMP_lvl5.csv')

In [13]:
df = pd.read_csv('/home/dylan/Documents/june_reset/sample_ids_with_proportions.tsv', sep = '\t', index_col = 'sample_id')
df

FileNotFoundError: [Errno 2] File b'/home/dylan/Documents/june_reset/sample_ids_with_proportions.tsv' does not exist: b'/home/dylan/Documents/june_reset/sample_ids_with_proportions.tsv'

In [None]:
a = df.transpose()
a

In [None]:
sample_proportions = a.to_dict('list')
sample_proportions

In [None]:
missing = ['Missing: Not collected']*8
updated_metadata_file_path = working_dir + '/output/GCMP_metadata_with_proportions.tsv'
with open(updated_metadata_file_path, 'a') as outfile:
    with open(metadata_path) as metadata:
        metadata_csv = csv.reader(metadata, delimiter="\t")
        for line in metadata_csv:
            if line[0] == '#SampleID':
                line.extend(['greengenes_proportion_unassigned', 'greengenes_proportion_mitochondria', 'greengenes_metaxa2_proportion_unassigned', 'greengenes_metaxa2_proportion_mitochondria', 'silva_proportion_unassigned', 'silva_proportion_mitochondria', 'silva_metaxa2_proportion_unassigned', 'silva_metaxa2_proportion_mitochondria'])
            else:
                for sampleID in sample_proportions:
                    if sampleID in line[0]:
                        line.extend(sample_proportions[sampleID])
                        break
            newline = '\t'.join(str(v) for v in line) + '\n'
            outfile.write(newline)

In [None]:
#add per-sample % mitochondria and % unassigned to metadata

    for reference in references:
        
            for row in metadata_csv:
                if row[0] != '#SampleID":
                break
        break

In [None]:
pd.