# Create a metadata file

In [24]:
import pandas as pd

ind = pd.read_csv('RSTC_2019_run_index_barcode - All.tsv', sep='\t') # runs and indexes
bac = pd.read_csv('RSTC_2019_run_index_barcode - Bac.tsv', sep='\t') # bacterial samples
bac.rename(columns={'Sample No.': 'Smpl_no'}, inplace=True)
arc = pd.read_csv('RSTC_2019_run_index_barcode - Arc.tsv', sep='\t') # archaea samples
arc.rename(columns={'Sample No.': 'Smpl_no'}, inplace=True)
met = pd.DataFrame()                                                 # metadata

for ri in set(ind.Run_index): #add bacteria and archeae samples to metadata
  tmp_ind = ind.loc[(ind.Run_index==ri)&(ind.Domain=='bac')].copy()  # bacteria
  tmp = pd.merge(tmp_ind, bac, how='inner', on='Smpl_no')
  met = pd.concat([met, tmp], ignore_index=True)
  tmp_ind = ind.loc[(ind.Run_index==ri)&(ind.Domain=='arc')].copy()  # archaea
  tmp = pd.merge(tmp_ind, arc, how='inner', on='Smpl_no')
  met = pd.concat([met, tmp], ignore_index=True)

met.rename(columns={'Mothur name':'#SampleID'}, inplace=True)        # rename sample id
met.set_index('#SampleID', inplace=True)

# add sequencing run information as separate column
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
met['SeqRun'] = ''
for ri in set(met.Run_index):
  for index, row in met.loc[met.Run_index==ri].copy().iterrows():
    met.loc[index,'SeqRun'] = 'SeqRun_' + ri[1]
    
met.to_csv('metadata.tsv', sep='\t')                                 # save a metadata file

# Prepare reads for import

In [25]:
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
for ri in set(met.Run_index):
  path = 'raw_data/LH_diet_2_%s' % (ri)
  !rm $path/MD5.txt                          # remove unecessary files
  !mv $path/*_1.fq.gz $path/forward.fastq.gz # rename forward read for import
  !mv $path/*_2.fq.gz $path/reverse.fastq.gz # rename reverse read for import

rm: cannot remove 'raw_data/LH_diet_2_r2_i6/MD5.txt': No such file or directory
mv: cannot stat 'raw_data/LH_diet_2_r2_i6/*_1.fq.gz': No such file or directory
mv: cannot stat 'raw_data/LH_diet_2_r2_i6/*_2.fq.gz': No such file or directory
rm: cannot remove 'raw_data/LH_diet_2_r2_i9/MD5.txt': No such file or directory
mv: cannot stat 'raw_data/LH_diet_2_r2_i9/*_1.fq.gz': No such file or directory
mv: cannot stat 'raw_data/LH_diet_2_r2_i9/*_2.fq.gz': No such file or directory
rm: cannot remove 'raw_data/LH_diet_2_r2_i5/MD5.txt': No such file or directory
mv: cannot stat 'raw_data/LH_diet_2_r2_i5/*_1.fq.gz': No such file or directory
mv: cannot stat 'raw_data/LH_diet_2_r2_i5/*_2.fq.gz': No such file or directory
rm: cannot remove 'raw_data/LH_diet_2_r2_i3/MD5.txt': No such file or directory
mv: cannot stat 'raw_data/LH_diet_2_r2_i3/*_1.fq.gz': No such file or directory
mv: cannot stat 'raw_data/LH_diet_2_r2_i3/*_2.fq.gz': No such file or directory
rm: cannot remove 'raw_data/LH_diet_2_r2

# Import raw reads

In [26]:
from qiime2 import Artifact
!mkdir -p Data/Imported_reads
for ri in set(met.Run_index):
  path = 'raw_data/LH_diet_2_%s' % ri
  artf = Artifact.import_data('MultiplexedPairedEndBarcodeInSequence', path)
  artf.save('Data/Imported_reads/%s_multiplexed.qza' % ri)

# Demultiplex

In [29]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
!mkdir -p Data/Demuliplexed
for ri in set(met.Run_index):
  metadata = met.loc[met.Run_index == ri]              # select by run and index
  metadata.to_csv('metadata_%s.tsv'%ri, sep='\t')      # save a temp metadata file
  temp_met = 'metadata_%s.tsv'%ri                      # path to temp metadata
  path  = 'Data/Imported_reads/%s_multiplexed.qza'%ri  # path to the multiplexed reads
  demux = 'Data/Demuliplexed/%s-demux.qza'%ri          # demulptiplexed reads
  untrm = 'Data/Demuliplexed/%s-untrm.qza'%ri          # untrimmed reads
  
  !qiime cutadapt demux-paired \
    --i-seqs $path \
    --m-forward-barcodes-file $temp_met \
    --m-forward-barcodes-column BarcodeSequence \
    --o-per-sample-sequences $demux \
    --o-untrimmed-sequences $untrm
  !rm $temp_met

[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Demuliplexed/r2_i6-demux.qza[0m
[32mSaved MultiplexedPairedEndBarcodeInSequence to: Data/Demuliplexed/r2_i6-untrm.qza[0m
[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Demuliplexed/r2_i9-demux.qza[0m
[32mSaved MultiplexedPairedEndBarcodeInSequence to: Data/Demuliplexed/r2_i9-untrm.qza[0m
[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Demuliplexed/r2_i5-demux.qza[0m
[32mSaved MultiplexedPairedEndBarcodeInSequence to: Data/Demuliplexed/r2_i5-untrm.qza[0m
[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Demuliplexed/r2_i3-demux.qza[0m
[32mSaved MultiplexedPairedEndBarcodeInSequence to: Data/Demuliplexed/r2_i3-untrm.qza[0m
[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Demuliplexed/r2_i1-demux.qza[0m
[32mSaved MultiplexedPairedEndBarcodeInSequence to: Data/Demuliplexed/r2_i1-untrm.qza[0m
[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Demulipl

# Separate bacterial and archaea samples 

In [65]:
import pandas as pd
import os

met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
for ri in set(met.Run_index):
  demux = 'Data/Demuliplexed/%s-demux.qza'%ri
  a = !unzip $demux
  digest = a[1].split('/')[0].replace('  inflating: ','')
  
  for f in os.listdir(digest+'/data'):
    if f[0] == 'B':
      !mkdir -p rstc_2019_bac/Data/Demux/$ri
      !mv $digest/data/$f rstc_2019_bac/Data/Demux/$ri
    if f[0] == 'A':
      !mkdir -p rstc_2019_arc/Data/Demux/$ri
      !mv $digest/data/$f rstc_2019_arc/Data/Demux/$ri
      
  !rm -r $digest
!rm Data # remove processed intermediate files to save some space

# Additional metadata manipulations

In [75]:
import pandas as pd

met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
met.drop(['Domain','Nr reads_0','Subsample'], axis=1, inplace=True)

met.to_csv('metadata.tsv', sep='\t')

In [7]:
import pandas as pd
  
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
met['BS'] = met['BodySite'].map(lambda a: ''.join([x[0].upper() for x in a.split(' ')])) # BodySite abbr.
for ind in met.loc[met.BS=='L'].index:
  met.loc[ind,'BS'] = 'SAM'

dates = sorted(list(set(met['Date of Sampling'].tolist())),key=lambda d:tuple(map(int, d.split('/'))))
tpdict = dict((j,i) for i,j in enumerate(dates))

met['rstc_run'] = ''
met['Day'] = ''
met['Day_hour'] = ''
met['Treatment'] = ''
met['Source'] = ''
for smpl in set(met['Sample Name']):
  for index, row in met.loc[met['Sample Name']==smpl].copy().iterrows(): 
    for r in ['Run1','Run2']:                      # separate rstc run
      if r in smpl:
        met.loc[index,'rstc_run'] = 'rstc_' + r
    for d in ['d0','d7','d13']:                    # separate day
      if d in smpl:
        met.loc[index,'Day'] = d               
        if '48h' in smpl:                          # separate day and hours
          met.loc[index,'Day_hour'] = d+'h48'
        elif '24h' in smpl:
          met.loc[index,'Day_hour'] = d+'h24'
        else:
          met.loc[index,'Day_hour'] = 'not_appl'
    for t in range(1,6):                           # separate treatment
      if 'Trt' not in smpl:
        met.loc[index,'Treatment'] = 'not_appl'
      if 'Trt'+str(t) in smpl:
        met.loc[index,'Treatment'] = 'Trt'+str(t)        
    for s in ['HP1','HP2','C1','C2','C3','mixed']: # source (heat pump, cow, pooled)
      if s in smpl:
        if s == 'mixed':
          met.loc[index,'Source'] = 'mixC'
        else:
          met.loc[index,'Source'] = s
met['Day_num'] = met.Day.str[1:].astype(int)

# adding groups
import itertools as it

met['Src_rstcRun'] = met.Source + '_' + met.rstc_run
cols = {'BS':'BS','Day':'Day','Day_hour':'Dh','Treatment':'Trt','Source':'Src'}
groups = []
for i,c in enumerate(cols):
  if i != 0:
    groups += list(it.combinations(cols,i+1))
groups = [t for t in groups if len([x for x in t if 'Day' in x]) < 2 and 'BS' in t]

for g in groups:
  col = '_'.join([cols[c] for c in g])
  met[col] = ''
  for index,row in met.copy().iterrows():
    met.loc[index,col] = '_'.join([met.at[index,c] for c in g])


met.to_csv('metadata.tsv', sep='\t')

In [8]:
import pandas as pd

met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
domains = {'A':'arc','B':'bac'}
for d in domains:
  met_d = met.loc[met.index.str[0] == d].copy()
  met_d.to_csv('rstc_2019_%s/metadata.tsv'%domains[d], sep='\t')  # separate and save a metadata files