# Create a metadata file

In [3]:
import pandas as pd

# Build metadata.tsv: one row per sequenced sample, combining the run/index
# sheet with the bacterial and archaeal sample sheets.
ind = pd.read_csv('RSTC_2019_run_index_barcode - All.tsv', sep='\t') # runs and indexes
bac = pd.read_csv('RSTC_2019_run_index_barcode - Bac.tsv', sep='\t') # bacterial samples
bac = bac.rename(columns={'Sample No.': 'Smpl_no'})
arc = pd.read_csv('RSTC_2019_run_index_barcode - Arc.tsv', sep='\t') # archaea samples
arc = arc.rename(columns={'Sample No.': 'Smpl_no'})

# Join the rows of every run/index with the sample sheet of their domain.
# Run indexes are visited in order of first appearance instead of through an
# unordered set, so the row order of the saved file is the same on every run.
# The pieces are collected in a list and concatenated once.
parts = []
for ri in ind.Run_index.unique():
  for domain, samples in (('bac', bac), ('arc', arc)):
    run_rows = ind.loc[(ind.Run_index == ri) & (ind.Domain == domain)]
    parts.append(pd.merge(run_rows, samples, how='inner', on='Smpl_no'))
met = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()

# 'Mothur name' becomes the sample id column and the index of the table
met = met.rename(columns={'Mothur name': '#SampleID'}).set_index('#SampleID')

# add sequencing run information as separate column:
# the second character of Run_index is appended to 'SeqRun_'
# NOTE(review): a single character is taken, so a two-digit run number would
# be truncated - confirm run numbers stay below 10.
met['SeqRun'] = 'SeqRun_' + met.Run_index.str[1]

met.to_csv('metadata.tsv', sep='\t')                                 # save a metadata file

# Prepare reads for import

In [None]:
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
for ri in set(met.Run_index):
  path = 'raw_data/LH_diet_2_%s' % (ri)
  !rm $path/MD5.txt                          # remove unecessary files
  !mv $path/*_1.fq.gz $path/forward.fastq.gz # rename forward read for import
  !mv $path/*_2.fq.gz $path/reverse.fastq.gz # rename reverse read for import

# Import raw reads

In [2]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

from qiime2 import Artifact
!mkdir -p Data/Imported_reads
for ri in set(met.Run_index):
  path = 'raw_data/LH_diet_2_%s' % ri
  artf = Artifact.import_data('MultiplexedPairedEndBarcodeInSequence', path)
  artf.save('Data/Imported_reads/%s_multiplexed.qza' % ri)

# Demultiplex

In [13]:
import pandas as pd
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
!mkdir -p Data/Demuliplexed
for ri in set(met.Run_index):
  if ri != 'r2_i4': continue
  metadata = met.loc[met.Run_index == ri]              # select by run and index
  metadata.to_csv('metadata_%s.tsv'%ri, sep='\t')      # save a temp metadata file
  temp_met = 'metadata_%s.tsv'%ri                      # path to temp metadata
  path  = 'Data/Imported_reads/%s_multiplexed.qza'%ri  # path to the multiplexed reads
  demux = 'Data/Demuliplexed/%s-demux.qza'%ri          # demulptiplexed reads
  untrm = 'Data/Demuliplexed/%s-untrm.qza'%ri          # untrimmed reads
  
  !qiime cutadapt demux-paired \
    --i-seqs $path \
    --m-forward-barcodes-file $temp_met \
    --m-forward-barcodes-column BarcodeSequence \
    --o-per-sample-sequences $demux \
    --o-untrimmed-sequences $untrm \
    --p-error-rate 0.1 \
    --p-mixed-orientation
  !rm $temp_met

[32mSaved SampleData[PairedEndSequencesWithQuality] to: Data/Demuliplexed/r2_i4-demux.qza[0m
[32mSaved MultiplexedPairedEndBarcodeInSequence to: Data/Demuliplexed/r2_i4-untrm.qza[0m


In [27]:
import pandas as pd
from Bio import SeqIO
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
!mkdir -p Data/Demuliplexed
for ri in set(met.Run_index):
  if ri != 'r2_i4': continue
  subdir = 'Data/Demuliplexed/cutadapt_'+ri
  !mkdir $subdir
  meta = met.loc[met.Run_index == ri].copy()
  f = 'raw_data/LH_diet_2_%s/forward.fastq.gz'%ri
  r = 'raw_data/LH_diet_2_%s/reverse.fastq.gz'%ri
  fasta = 'Data/fasta.fa'
  with open(fasta,'w') as fa:
    for ind in meta.index:
      bc = meta.loc[ind,'BarcodeSequence']
      fa.write('>'+ind+'_'+bc+'\n'+bc+'\n')
  fout = subdir + '/{name}_L001_R1_001.fastq.gz'
  rout = subdir + '/{name}_L001_R2_001.fastq.gz'
  !cutadapt -e 1 -a file:$fasta -o $fout -p $rout $f $r
  !rm $fasta

mkdir: cannot create directory ‘Data/Demuliplexed/cutadapt_r2_i4’: File exists
This is cutadapt 3.2 with Python 3.6.13
Command line parameters: -e 1 -a file:Data/fasta.fa -o Data/Demuliplexed/cutadapt_r2_i4/{name}_L001_R1_001.fastq.gz -p Data/Demuliplexed/cutadapt_r2_i4/{name}_L001_R2_001.fastq.gz raw_data/LH_diet_2_r2_i4/forward.fastq.gz raw_data/LH_diet_2_r2_i4/reverse.fastq.gz
Processing reads on 1 core in paired-end mode ...
[8<----------] 00:07:12     1,768,732 reads  @    244.4 µs/read;   0.25 M reads/minute
Finished in 432.38 s (244 us/read; 0.25 M reads/minute).

=== Summary ===

Total read pairs processed:          1,768,732
  Read 1 with adapter:               1,768,729 (100.0%)
  Read 2 with adapter:                       0 (0.0%)
Pairs written (passing filters):     1,768,732 (100.0%)

Total basepairs processed:   884,366,000 bp
  Read 1:   442,183,000 bp
  Read 2:   442,183,000 bp
Total written (filtered):    588,767,088 bp (66.6%)
  Read 1:   146,584,088 bp
  Read 2:   44

In [1]:
conda install -y -c bioconda sabre

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/meco/anaconda3/envs/qiime2-2021.2

  added / updated specs:
    - sabre


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2021.5.30          |   py36h06a4308_0         139 KB
    sabre-1.000                |       h5bf99c6_2          18 KB  bioconda
    ------------------------------------------------------------
                                           Total:         157 KB

The following NEW packages will be INSTALLED:

  sabre              bioconda/linux-64::sabre-1.000-h5bf99c6_2

The following packages will be SUPERSEDED by a higher-priority channel:

  certifi            conda-forge::certifi-2021.5.30-py36h5~ --> pkgs/main::certifi-2021.5.30-py36h06a4308_0



Downloading and Extracting Packages
sabre-1.000          | 18 KB     | ##########

In [24]:
import pandas as pd

met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID', \
                  usecols=['#SampleID','BarcodeSequence','Run_index'])
for ri in set(met.Run_index):
  if ri != 'r2_i4': continue
  subdir = 'Data/Demux_'+ri
  !mkdir -p $subdir
  meta = met.loc[met.Run_index == ri].copy()
  meta['r1'] = meta.index +'_'+ meta.BarcodeSequence +'_L001_R1_001.fq'
  meta['r2'] = meta.index +'_'+ meta.BarcodeSequence +'_L001_R2_001.fq'
  meta.set_index('BarcodeSequence', inplace=True)
  meta.drop('Run_index',axis=1,inplace=True)
  meta.to_csv('Data/bc_%s.txt'%ri, sep='\t',header=False)
  bc = 'Data/bc_%s.txt'%ri
  f = 'raw_data/LH_diet_2_%s/forward.fastq.gz'%ri
  r = 'raw_data/LH_diet_2_%s/reverse.fastq.gz'%ri
  !sabre pe -m 0 -f $f -r $r -b $bc -u no_bc_match_R1.fq -w no_bc_match_R2.fq
  !rm $bc
  !mv *.fq $subdir
  !gzip $subdir/*.fq


Total FastQ records: 3537464 (1768732 pairs)

FastQ records for barcode TCCGAG: 72792 (36396 pairs)
FastQ records for barcode TCATAA: 67914 (33957 pairs)
FastQ records for barcode TATCTC: 50570 (25285 pairs)
FastQ records for barcode TACTTG: 71280 (35640 pairs)
FastQ records for barcode TACGGT: 73836 (36918 pairs)
FastQ records for barcode GTTCCG: 92262 (46131 pairs)
FastQ records for barcode CCAACG: 99412 (49706 pairs)
FastQ records for barcode CAGGCC: 89064 (44532 pairs)
FastQ records for barcode CAATTC: 107932 (53966 pairs)
FastQ records for barcode CAAGAG: 53646 (26823 pairs)
FastQ records for barcode ATATTG: 67726 (33863 pairs)
FastQ records for barcode ATAGAC: 106478 (53239 pairs)
FastQ records for barcode ATAACT: 108588 (54294 pairs)
FastQ records for barcode AGTTGG: 116088 (58044 pairs)
FastQ records for barcode AACTGC: 103570 (51785 pairs)
FastQ records for barcode AACGCA: 100600 (50300 pairs)
FastQ records for barcode GGTTAT: 79944 (39972 pairs)
FastQ records for barcode GCT

In [9]:
conda install -y -c bioconda gbsx

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/meco-guest/anaconda3/envs/qiime2-2021.2

  added / updated specs:
    - gbsx


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    gbsx-1.3                   |       hdfd78af_1          89 KB  bioconda
    ------------------------------------------------------------
                                           Total:          89 KB

The following NEW packages will be INSTALLED:

  gbsx               bioconda/noarch::gbsx-1.3-hdfd78af_1



Downloading and Extracting Packages
gbsx-1.3             | 89 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.


In [23]:
import pandas as pd

met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID', \
                  usecols=['#SampleID','BarcodeSequence','Run_index'])
for ri in set(met.Run_index):
  if ri != 'r2_i4': continue
  subdir = 'Data/Demux_'+ri
  !mkdir -p $subdir
  meta = met.loc[met.Run_index == ri].copy()
  #meta['r1'] = meta.index +'_'+ meta.BarcodeSequence +'_L001_R1_001.fq'
  #meta['r2'] = meta.index +'_'+ meta.BarcodeSequence +'_L001_R2_001.fq'
  #meta.set_index('BarcodeSequence', inplace=True)
  meta.drop('Run_index',axis=1,inplace=True)
  meta['Enzyme'] = 'NAN'
  meta.to_csv('Data/bc_%s.txt'%ri, sep='\t',header=False)
  bc = 'Data/bc_%s.txt'%ri
  f = 'raw_data/LH_diet_2_%s/forward.fastq.gz'%ri
  r = 'raw_data/LH_diet_2_%s/reverse.fastq.gz'%ri
  !gbsx --Demultiplexer -f1 $f -f2 $r -i $bc -o $subdir -mb 1 -s 20 -t 6 -gzip true -rad true 
  !rm $bc


Start the demultiplexing.
USE DUAL BARCODING: false
100000 reads demultiplexed
200000 reads demultiplexed
300000 reads demultiplexed
400000 reads demultiplexed
500000 reads demultiplexed
600000 reads demultiplexed
700000 reads demultiplexed
800000 reads demultiplexed
900000 reads demultiplexed
1000000 reads demultiplexed
1100000 reads demultiplexed
1200000 reads demultiplexed
1300000 reads demultiplexed
1400000 reads demultiplexed
1500000 reads demultiplexed
1600000 reads demultiplexed
1700000 reads demultiplexed
1768732 reads demultiplexed
Demultiplexing ended.


In [2]:
import pandas as pd
import os

met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
for ri in set(met.Run_index):
  #if ri != 'r2_i4': continue
  subdir = 'Data/Demux_'+ri
  for f in os.listdir(subdir):
    if f[0] == 'B':
      !mkdir -p rstc_2019_bac/Data/Demux/$ri
      !mv $subdir/$f rstc_2019_bac/Data/Demux/$ri
    if f[0] == 'A':
      !mkdir -p rstc_2019_arc/Data/Demux/$ri
      !mv $subdir/$f rstc_2019_arc/Data/Demux/$ri

In [9]:
# Trial: run cutadapt on the r2_i4 read pair with CAACTT as 3' adapter (-a),
# writing out.1.fastq / out.2.fastq. --discard-untrimmed drops reads in which
# no adapter was found; --pair-filter=any applies that filter when either
# read of a pair meets it.
# NOTE(review): the recorded output lists `-A CAACTT` as well, which this
# command does not pass - the output may come from an earlier version of the cell.
!cutadapt -a CAACTT -o out.1.fastq -p out.2.fastq \
raw_data/LH_diet_2_r2_i4/forward.fastq.gz raw_data/LH_diet_2_r2_i4/reverse.fastq.gz \
--pair-filter=any --discard-untrimmed

This is cutadapt 3.1 with Python 3.6.12
Command line parameters: -a CAACTT -A CAACTT -o out.1.fastq -p out.2.fastq raw_data/LH_diet_2_r2_i4/forward.fastq.gz raw_data/LH_diet_2_r2_i4/reverse.fastq.gz --pair-filter=any --discard-untrimmed
Processing reads on 1 core in paired-end mode ...
[---------->8] 00:00:23     1,768,732 reads  @     13.5 µs/read;   4.46 M reads/minute
Finished in 23.80 s (13 us/read; 4.46 M reads/minute).

=== Summary ===

Total read pairs processed:          1,768,732
  Read 1 with adapter:                 125,689 (7.1%)
  Read 2 with adapter:                  18,970 (1.1%)
Pairs written (passing filters):           838 (0.0%)

Total basepairs processed:   884,366,000 bp
  Read 1:   442,183,000 bp
  Read 2:   442,183,000 bp
Total written (filtered):        200,261 bp (0.0%)
  Read 1:        45,879 bp
  Read 2:       154,382 bp

=== First read: Adapter 1 ===

Sequence: CAACTT; Type: regular 3'; Length: 6; Trimmed: 125689 times

No. of allowed errors:
1-6 bp: 0

Base

In [12]:
from Bio import SeqIO
from Bio.Seq import Seq
import gzip
# Count the forward reads of r2_i4 whose first six bases are the barcode CCAACG.
# The `with` block closes the gzip handle once the whole file has been read.
with gzip.open("raw_data/LH_diet_2_r2_i4/forward.fastq.gz","rt") as handle:
  count = sum(1 for record in SeqIO.parse(handle, "fastq")
              if 'CCAACG' in record.seq[:6])
count

49706

# Separate bacterial and archaeal samples

In [4]:
import pandas as pd
import os

met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
for ri in set(met.Run_index):
  demux = 'Data/Demuliplexed/%s-demux.qza'%ri
  a = !unzip $demux
  digest = a[1].split('/')[0].replace('  inflating: ','')
  
  for f in os.listdir(digest+'/data'):
    if f[0] == 'B':
      !mkdir -p rstc_2019_bac/Data/Demux/$ri
      !mv $digest/data/$f rstc_2019_bac/Data/Demux/$ri
    if f[0] == 'A':
      !mkdir -p rstc_2019_arc/Data/Demux/$ri
      !mv $digest/data/$f rstc_2019_arc/Data/Demux/$ri
      
  !rm -r $digest
!rm Data # remove processed intermediate files to save some space

# Additional metadata manipulations

In [4]:
import pandas as pd

# Remove columns that are not needed downstream and save the table in place.
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
# errors='ignore': metadata.tsv is overwritten below, so on a second run the
# columns are already gone; without it the cell would stop with a KeyError.
met = met.drop(['Domain','Nr reads_0','Subsample'], axis=1, errors='ignore')

met.to_csv('metadata.tsv', sep='\t')

In [5]:
import pandas as pd
  
# Derive grouping columns from 'BodySite' and 'Sample Name' and store them in
# metadata.tsv. Every sample name is parsed once, and the new columns are
# assigned as whole columns rather than cell by cell.
import itertools as it

met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')

# BodySite abbr.: upper-cased first letter of every word. split() without an
# argument tolerates repeated or surrounding blanks (split(' ') yields empty
# words there and x[0] fails).
met['BS'] = met['BodySite'].map(lambda a: ''.join([x[0].upper() for x in a.split()]))
met.loc[met.BS == 'L', 'BS'] = 'SAM'   # the abbreviation 'L' is stored as 'SAM'

# Sampling dates ordered by their '/'-separated numeric parts, and each date's
# rank in that order. Neither is used further in this cell.
dates = sorted(list(set(met['Date of Sampling'].tolist())),key=lambda d:tuple(map(int, d.split('/'))))
tpdict = dict((j,i) for i,j in enumerate(dates))


def _parse_sample_name(smpl):
  """Return the labels encoded in one 'Sample Name' as a dict.

  Tokens are matched as plain substrings. When several candidates of one
  kind occur in the name, the candidate listed last wins. A field without
  any match stays ''.
  """
  fields = {'rstc_run': '', 'Day': '', 'Day_hour': '', 'Treatment': '', 'Source': ''}
  if not isinstance(smpl, str):                  # missing name: nothing to parse
    return fields

  for r in ['Run1','Run2']:                      # separate rstc run
    if r in smpl:
      fields['rstc_run'] = 'rstc_' + r

  if '48h' in smpl:                              # hour suffix, 48h checked first
    hour = 'h48'
  elif '24h' in smpl:
    hour = 'h24'
  else:
    hour = None
  for d in ['d0','d7','d13']:                    # separate day (and day + hours)
    if d in smpl:
      fields['Day'] = d
      fields['Day_hour'] = d + hour if hour else 'not_appl'

  if 'Trt' not in smpl:                          # separate treatment
    fields['Treatment'] = 'not_appl'
  else:
    for t in range(1,6):
      if 'Trt'+str(t) in smpl:
        fields['Treatment'] = 'Trt'+str(t)

  for s in ['HP1','HP2','C1','C2','C3','mixed']: # source (heat pump, cow, pooled)
    if s in smpl:
      fields['Source'] = 'mixC' if s == 'mixed' else s
  return fields


parsed = [_parse_sample_name(smpl) for smpl in met['Sample Name']]
for field in ['rstc_run','Day','Day_hour','Treatment','Source']:
  met[field] = [p[field] for p in parsed]        # positional, one value per row
# numeric part of 'd<N>'; raises if a sample name carried no day token
met['Day_num'] = met.Day.str[1:].astype(int)

# adding groups
met['Src_rstcRun'] = met.Source + '_' + met.rstc_run
cols = {'BS':'BS','Day':'Day','Day_hour':'Dh','Treatment':'Trt','Source':'Src'}
# every combination of two or more columns that contains BS and at most one
# of the two day columns
groups = [t
          for size in range(2, len(cols)+1)
          for t in it.combinations(cols, size)
          if len([x for x in t if 'Day' in x]) < 2 and 'BS' in t]

for g in groups:
  col = '_'.join([cols[c] for c in g])           # e.g. ('BS','Day') -> 'BS_Day'
  met[col] = ['_'.join(vals) for vals in zip(*[met[c] for c in g])]


met.to_csv('metadata.tsv', sep='\t')

In [6]:
import pandas as pd

# Write one metadata file per domain into its project folder; a sample belongs
# to a domain by the first character of its sample id.
met = pd.read_csv('metadata.tsv', sep='\t', index_col='#SampleID')
domains = {'A':'arc','B':'bac'}
for prefix, tag in domains.items():
  subset = met.loc[met.index.str[0] == prefix].copy()
  out_path = 'rstc_2019_%s/metadata.tsv'%tag
  subset.to_csv(out_path, sep='\t')  # separate and save a metadata files