In [1]:
import glob
import os
import yaml
import pandas as pd


In [2]:
def open_sample_sheet(sample_sheet_fp, lanes=False):
    """Load the primary data table of an IGM sample sheet as a pandas DataFrame.

    Parameters
    ----------
    sample_sheet_fp : str
        Path of the Excel sample sheet to read.
    lanes : collection or False, optional
        When truthy, only rows whose 'Lane' value is contained in it are
        returned. The default (False) returns every row.

    Returns
    -------
    pandas.DataFrame
        The table read from the sheet, optionally restricted to `lanes`.
    """
    # The data table is located by skipping 18 rows and reading with header=1.
    table = pd.read_excel(sample_sheet_fp, skiprows=18, header=1)

    # No lane filter requested: hand back the full table.
    if not lanes:
        return table

    in_requested_lanes = table['Lane'].isin(lanes)
    return table.loc[in_requested_lanes]

In [3]:
def get_read(sample, seq_dir, read):
    """Return the path of the single FASTQ file for one read of a sample.

    Files are looked up in `seq_dir` with the glob pattern
    ``{sample}_*_{read}_*.fastq.gz``.

    Parameters
    ----------
    sample : str
        Sample name that prefixes the file name.
    seq_dir : str
        Directory containing the ``.fastq.gz`` files.
    read : str
        Read label embedded in the file name (the notebook passes 'R1'/'R2').

    Returns
    -------
    str
        Path of the one matching file.

    Raises
    ------
    ValueError
        If no file matches the pattern, or if more than one file matches.
    """
    pattern = os.path.join(seq_dir, "{0}_*_{1}_*.fastq.gz".format(sample, read))
    # Keep the matches in their own name rather than rebinding the `read`
    # parameter, so the argument stays available for error reporting.
    matches = glob.glob(pattern)

    if len(matches) == 1:
        return matches[0]

    # Zero matches and several matches are different problems; report each
    # accurately instead of labelling both "too many".
    if not matches:
        raise ValueError('No reads found matching: {}'.format(pattern))
    raise ValueError('Too many reads found: {}'.format(matches))

In [4]:
# Inputs for this run: the sequencing output directory, the sample sheet
# describing it, and the lanes whose rows should be kept.
seq_dir = '/sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX'
sample_sheet_fp = './2016_08_02_Knight_Sample_Sheet_NexteraXT_Katz_EMP5001-4_KF_Ext._Test_HiSeq-Song.xls'
lanes = [5, 6, 7, 8]

# Load the sample sheet, restricted to the lanes listed above.
sample_sheet = open_sample_sheet(sample_sheet_fp, lanes=lanes)

# Bare last expression so the notebook renders the table.
sample_sheet

Unnamed: 0,Lane,Sample_ID,Sample_Name,Sample_Plate,Sample_Well,I7_Index_ID,index,Sample_Project,Description
0,5,Song51_24740,,,,,,,
1,5,Song51_24065,,,,,,,
2,5,Song75_24535,,,,,,,
3,5,Song52_24048,,,,,,,
4,5,Song53_24102,,,,,,,
5,5,Song51_24059,,,,,,,
6,5,Song53_24073,,,,,,,
7,5,Song52_25153,,,,,,,
8,5,Song52_25079,,,,,,,
9,5,Song75_ace,,,,,,,


In [5]:
# Shared setting passed as `default_flow_style` to every yaml.dump call in this
# notebook; False makes PyYAML emit block style (one key per line).
default_flow_style = False

Make samples dictionary
------

In [8]:
# Trimming settings attached to every sample entry.
phred = 'phred33'
adaptor = '/home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa'

# One entry per sample-sheet row, keyed by Sample_ID. The forward/reverse
# paths are resolved from seq_dir via get_read ('R1' / 'R2').
samples_pe = {
    'samples_pe': {
        sample_id: {
            'forward': get_read(sample_id, seq_dir, 'R1'),
            'reverse': get_read(sample_id, seq_dir, 'R2'),
            'adaptor': adaptor,
            'phred': phred,
        }
        # Look the Sample_ID up once per row label, then reuse it above.
        for sample_id in (sample_sheet.loc[row_label, 'Sample_ID']
                          for row_label in sample_sheet.index)
    }
}
samples_pe

{'samples_pe': {'Song51_24059': {'adaptor': '/home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa',
   'forward': '/sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX/Song51_24059_S241_L005_R1_001.fastq.gz',
   'phred': 'phred33',
   'reverse': '/sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX/Song51_24059_S241_L005_R2_001.fastq.gz'},
  'Song51_24065': {'adaptor': '/home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa',
   'forward': '/sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX/Song51_24065_S207_L005_R1_001.fastq.gz',
   'phred': 'phred33',
   'reverse': '/sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX/Song51_24065_S207_L005_R2_001.fastq.gz'},
  'Song51_24081': {'adaptor': '/home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa',
   'forward': '/sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX/Song51_24081_S378_L006_R1_001.fastq.gz',
   'phred': 'phred33',
   'reverse'

In [9]:
# Each top-level config section is dumped to YAML on its own and the pieces
# are concatenated in this order to form the full config text.
config_str = ''.join(
    yaml.dump(section, default_flow_style=default_flow_style)
    for section in (
        {'TMP_DIR_ROOT': '/localscratch'},
        {'RUN': 'test2'},
        samples_pe,
        {'software': {
            'trimmomatic': 'java -jar /home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/trimmomatic-0.36.jar',
            'gzip': 'pigz',
        }},
        {'trimmomatic_params': 'LEADING:20 TRAILING:20 AVGQUAL:30 MINLEN:32 TOPHRED33'},
    )
)

print(config_str)

TMP_DIR_ROOT: /localscratch
RUN: test2
samples_pe:
  Song51_24059:
    adaptor: /home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa
    forward: /sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX/Song51_24059_S241_L005_R1_001.fastq.gz
    phred: phred33
    reverse: /sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX/Song51_24059_S241_L005_R2_001.fastq.gz
  Song51_24065:
    adaptor: /home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa
    forward: /sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX/Song51_24065_S207_L005_R1_001.fastq.gz
    phred: phred33
    reverse: /sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX/Song51_24065_S207_L005_R2_001.fastq.gz
  Song51_24081:
    adaptor: /home/jgsanders/git_sw/git_bin/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa
    forward: /sequencing/ucsd/complete_runs/160805_K00180_0231_AHCTHWBBXX/Song51_24081_S378_L006_R1_001.fastq.gz
    phred: phred33
    reverse: /sequenc

In [10]:
# Persist the assembled YAML text to config_song.yaml in the working directory.
with open('config_song.yaml', 'w') as config_file:
    config_file.write(config_str)