In [1]:
%matplotlib inline

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from biom import load_table
from biom.util import biom_open

# use seaborn's white background with grid lines for every figure in this notebook
sns.set_style(style='whitegrid')

def load_mf(fn, index='#SampleID'):
    """Read a tab-separated mapping file into a DataFrame of strings.

    Every cell is read as ``str`` and default NA parsing is disabled, so
    empty cells come back as ``''`` and tokens such as ``NA`` are kept
    verbatim.

    Parameters
    ----------
    fn : str or file-like
        Path (or buffer) of the tab-separated metadata file.
    index : str, optional
        Name of the column that becomes the row index.

    Returns
    -------
    pd.DataFrame
        The metadata table indexed by ``index``.
    """
    table = pd.read_csv(fn, sep='\t', dtype=str,
                        keep_default_na=False, na_values=[])
    return table.set_index(index)

In [2]:
# read the original mapping file (all values as strings)
mf = load_mf('original-metadata.txt')

# keep a copy of the untouched identifiers, then normalize them:
# dashes become dots and everything is lowercased
index = mf.index.copy()
normalized_ids = [sample_id.replace('-', '.').lower() for sample_id in index]
mf.index = pd.Index(normalized_ids, name='#SampleID')

We don't explicitly have any information for sample d0, and since our sequences were already demultiplexed we'll copy an existing barcode and primer.

In [3]:
# Add the row for sample d0. DataFrame.append was removed in pandas 2.0, so
# the new row is built as a one-row frame and concatenated instead. The
# values are positional: there must be exactly one per column of mf. The
# index name is carried over so the combined index keeps its name.
d0_row = pd.DataFrame(
    [['AGCACGAGCCTA', 'YATGCTGCCTCCCGTAGGAGT', 'Sample before spike',
      '', '', '', '', '', '', '', '', '', '', 'Wednesday', '0', 'Chris']],
    columns=mf.columns,
    index=pd.Index(['d0'], name=mf.index.name))
mf = pd.concat([mf, d0_row])

Comply with the INSDC requirements for missing data

In [4]:
# every empty cell gets the INSDC placeholder for missing values
mf = mf.replace('', 'Missing: Not collected')

The `host_subject_id` category is really the `Description` category, so rename that column.

In [5]:
# old column name -> new column name; names absent from mf are ignored
renamed_columns = {
    'Description': 'host_subject_id',
    'BarcodeSequence': 'barcode',
    'LinkerPrimerSequence': 'primer',
    'Volatile Fatty Acids': 'Volatile_Fatty_Acids',
}
mf = mf.rename(columns=renamed_columns)

# the description starts out as a copy of the subject identifier
mf['description'] = mf['host_subject_id'].copy()

We cannot have slashes in the column names, so we need to replace `TSS/VSS` with something else.

In [6]:
# swap the slash for an underscore in the TSS/VSS column name
mf = mf.rename(columns={'TSS/VSS': 'TSS_VSS'})

Latitude and longitude of the biodynamics lab at UCSD.

In [7]:
# same coordinates for every sample, stored as strings like the rest of the
# metadata (the column name was previously misspelled as 'latitutde')
mf['latitude'] = '32.875245'
mf['longitude'] = '-117.240135'

Columns and values extracted from a different study (type "waste water" in Qiita's search box).

In [8]:
# columns that hold the same value for every sample
constant_sample_columns = [
    ('dna_extracted', 'TRUE'),
    ('physical_specimen_remaining', 'TRUE'),
    ('sample_type', 'waste water'),
    ('country', 'GAZ:United States of America'),
    ('env_biome', 'ENVO:urban biome'),
]
for column, value in constant_sample_columns:
    mf[column] = value

Set a decent description for what d0.spike is:

In [9]:
# DataFrame.set_value was removed in pandas 1.0; label-based assignment
# with .loc sets the same single cells.
mf.loc['d0.spike', 'description'] = (
    'Water sample before it was placed in any of the digesters '
    'and after it had been spiked with a known concentration of'
    ' Acinetobacter baylyi and Acinetobacter calcoaceticus')

mf.loc['d0', 'description'] = (
    'Water sample before it was placed in any of the digesters')

This is a bit messy: we are only going to keep samples that were *resequenced*; if a sample was not resequenced, then we use its normal (original) file.

In [10]:
# Collect the whitespace-separated file names listed in the text file.
# (The previous loop reused `f` for both the file handle and each name.)
with open('forward-read-files.txt', 'r') as handle:
    names = set(handle.read().split())

In [11]:
def run_prefix(row, names_set=None):
    """Return the run prefix (file name without ``.fastq.gz``) for a sample.

    Parameters
    ----------
    row : pd.Series
        A metadata row; only ``row.name`` (the sample identifier) is used.
    names_set : iterable of str, optional
        Candidate file names. ``None`` is treated as an empty collection.

    Returns
    -------
    str
        The matching file name with ``.fastq.gz`` removed.

    Raises
    ------
    IndexError
        If no file name in ``names_set`` matches the sample.
    """
    # these samples have a naming scheme that breaks the rules below,
    # i.e. spike is written as Spike
    if 'spike' in row.name:
        return 'D0-Spike-Re_S9_L001_R1_001'
    if row.name == 'd0':
        return 'D0_S15_L001_R1_001'

    name = row.name.replace('.', '-').upper()

    # require a separator right after the name so that e.g. D1-A does not
    # also pick up the files of D1-AB
    prefixes = (name + '_', name + '-')
    matches = [fn for fn in (names_set or ()) if fn.startswith(prefixes)]

    if not matches:
        raise IndexError('no file name starting with %r or %r was found for '
                         'sample %r' % (prefixes[0], prefixes[1], row.name))

    # the lexicographically smallest match should be the resequenced file,
    # or alternatively the only existing file (because there would only be
    # one match)
    return min(matches).replace('.fastq.gz', '')

In [12]:
# Row-wise lookup of each sample's run prefix. The `reduce` argument was
# removed from DataFrame.apply in pandas 1.0; unknown keyword arguments are
# forwarded to the applied function, so it must not be passed here.
mf['run_prefix'] = mf.apply(run_prefix, axis=1, names_set=names)

We need to add a few extra prep information columns

In [13]:
# preparation columns that hold the same value for every sample
constant_prep_columns = [
    ('center_name', 'UCSDMI'),
    ('platform', 'Illumina'),
    ('instrument_model', 'Illumina MiSeq'),
    ('experiment_design_description',
     'Longitudinal study of four anaerobic digesters'),
    ('library_construction_protocol', '16S'),
]
for column, value in constant_prep_columns:
    mf[column] = value

# Splitting the columns into sample and preparation specific

In [14]:
# columns that belong in the preparation file
prep = {
    'barcode',
    'primer',
    'run_prefix',
    'center_name',
    'platform',
    'instrument_model',
    'experiment_design_description',
    'library_construction_protocol',
}

# every other column of mf belongs in the sample file
sample = set(mf.columns).difference(prep)

In [15]:
# the index label is written as the header of the first column by to_csv
mf.index = mf.index.rename('sample_name')

In [16]:
# `prep` and `sample` are sets, whose iteration order for strings can change
# between interpreter sessions (hash randomization). Sorting the names makes
# the column order of the written files reproducible.
prep_file = mf[sorted(prep)]
sample_file = mf[sorted(sample)]

In [17]:
# create the output directory if needed, without relying on IPython's
# automagic or on a POSIX `mkdir` being available
os.makedirs('qiita-files', exist_ok=True)

In [18]:
# write the preparation information as a tab-separated file
prep_file.to_csv(path_or_buf='qiita-files/prep-information.tsv', sep='\t')

In [19]:
# write the sample information as a tab-separated file
sample_file.to_csv(path_or_buf='qiita-files/sample-information.tsv', sep='\t')

# Description of some metadata variables

- COD: chemical oxygen demand.

- PhosphorousR: reactive or inorganic phosphorus.

- PhosphorusT: total phosphorus (including organic forms).

- AmoniaN: Ammonia Nitrogen.

- NitrateN: Nitrate Nitrogen.

- NitrogenT: Total nitrogen, includes organic forms.

- VFA: volatile fatty acids.

- TSS: total suspended solids.

- VSS: volatile suspended solids.

# Creating Manifest file for QIIME2

See documentation [here](https://docs.qiime2.org/2017.8/tutorials/importing/#id7), we assume that the offset is 33.

In [32]:
# prefix added to every sample identifier
# NOTE(review): presumably the Qiita study id - confirm
study_prefix = '11282.'
# directory that holds the forward-read fastq files
raw_data_dir = '/home/yovazquezbaeza/research/wastewater/RawData/'

# one row per sample: full path of its forward-read file and the read direction
manifest = pd.DataFrame({
    'absolute-filepath': raw_data_dir + prep_file['run_prefix'] + '.fastq.gz',
    'direction': 'forward',
})

manifest.index = (study_prefix + manifest.index).rename('sample-id')
manifest.to_csv('MANIFEST')