In [2]:
import glob
import os
import yaml
import pandas as pd

In [3]:
def open_sample_sheet(sample_sheet_fp, lanes=False):
    """Load the primary data table of an IGM sample sheet as a pandas DataFrame.

    Parameters
    ----------
    sample_sheet_fp : str or file-like
        Excel sample sheet; handed directly to ``pandas.read_excel``.
    lanes : list-like or False, optional
        Values to keep, matched against the ``Lane`` column. Any falsy
        value (the default) disables lane filtering.

    Returns
    -------
    pandas.DataFrame
        The table read from the sheet, restricted to ``lanes`` if given.
    """
    table = pd.read_excel(sample_sheet_fp, skiprows=18, header=1)

    # A falsy `lanes` means "keep every row".
    if not lanes:
        return table

    in_requested_lanes = table['Lane'].isin(lanes)
    return table.loc[in_requested_lanes]

In [4]:
def open_sequencing_manifest(manifest_fp, lanes=False, sheetname='Sample Information'):
    """Load the primary data table of an IGM sequencing manifest as a pandas DataFrame.

    Parameters
    ----------
    manifest_fp : str or file-like
        Excel manifest; handed directly to ``pandas.read_excel``.
    lanes : list-like or False, optional
        Values to keep, matched against the ``lane`` column. Any falsy
        value (the default) disables lane filtering.
    sheetname : str, optional
        Name of the worksheet holding the sample table.

    Returns
    -------
    pandas.DataFrame
        The table read from the sheet, restricted to ``lanes`` if given.
    """
    # pandas spells this keyword `sheet_name`; the older `sheetname` spelling
    # was deprecated and later removed, so forward our argument under the
    # supported name while keeping this function's own parameter unchanged.
    manifest = pd.read_excel(manifest_fp, sheet_name=sheetname, skiprows=20, header=1)

    # The default used to be the *string* 'False', which is truthy and made a
    # call without `lanes` fall into `isin('False')` and fail. With a real
    # falsy default, "no lanes given" now means "keep every row".
    if not lanes:
        return manifest

    return manifest.loc[manifest['lane'].isin(lanes)]

In [5]:
def get_read(sample, seq_dir, read):
    """Return the single read file in `seq_dir` for a sample name and read label.

    Files are looked up with the glob pattern
    ``<seq_dir>/<sample>_*_<read>_*.fastq.gz``.

    Parameters
    ----------
    sample : str
        Sample prefix that starts the file name.
    seq_dir : str
        Directory to search.
    read : str
        Read label embedded in the file name (e.g. ``'R1'``).

    Returns
    -------
    str
        Path of the one matching file.

    Raises
    ------
    ValueError
        If no file, or more than one file, matches the pattern.
    """
    read_str = os.path.join(seq_dir, "{0}_*_{1}_*.fastq.gz".format(sample, read))
    reads = glob.glob(read_str)

    if len(reads) == 1:
        return reads[0]

    if len(reads) > 1:
        raise ValueError('Too many reads found for {0} in {1}:\n'
                         'read_str: {2}'.format(sample, seq_dir, reads))

    # Nothing matched: report the pattern that was searched. The previous
    # message formatted the directory a second time here, which hid the
    # pattern from the user.
    raise ValueError('Too few reads found for {0} in {1}:\n'
                     'read_str: {2}\n'.format(sample, seq_dir, read_str))

In [6]:
def make_sample_dict_pe(sample_sheet, seq_dir,
                        forward='R1',
                        reverse='R2',
                        adaptor='$CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa',
                        phred='phred33',
                        sample_header='Sample_Prefix',
                        sample_name='Sample'):
    """Build the ``samples_pe`` mapping for paired-end samples.

    For every row of `sample_sheet`, the value in the `sample_header` column
    is used to locate the forward and reverse read files in `seq_dir` (via
    ``get_read``), and the value in the `sample_name` column becomes the key
    of that sample's entry.

    Parameters
    ----------
    sample_sheet : pandas.DataFrame
        Table with at least the `sample_header` and `sample_name` columns.
    seq_dir : str
        Directory searched for read files.
    forward, reverse : str
        Read labels passed to ``get_read`` for the two mates.
    adaptor, phred : str
        Stored unchanged in every sample entry.
    sample_header, sample_name : str
        Column names described above.

    Returns
    -------
    dict
        ``{'samples_pe': {name: {'forward', 'reverse', 'adaptor', 'phred'}}}``.
    """
    per_sample = {}
    for row in sample_sheet.index:
        prefix = sample_sheet.loc[row, sample_header]
        label = sample_sheet.loc[row, sample_name]
        per_sample[label] = {
            'forward': get_read(prefix, seq_dir, forward),
            'reverse': get_read(prefix, seq_dir, reverse),
            'adaptor': adaptor,
            'phred': phred,
        }

    return {'samples_pe': per_sample}

In [7]:
def make_sample_dict_se(sample_sheet, seq_dir,
                        forward='R1',
                        adaptor='$CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa',
                        phred='phred33',
                        sample_header='Sample_Prefix',
                        sample_name='Sample'):
    """Build the ``samples_se`` mapping for single-end samples.

    For every row of `sample_sheet`, the value in the `sample_header` column
    is used to locate the read file in `seq_dir` (via ``get_read``), and the
    value in the `sample_name` column becomes the key of that sample's entry.

    Parameters
    ----------
    sample_sheet : pandas.DataFrame
        Table with at least the `sample_header` and `sample_name` columns.
    seq_dir : str
        Directory searched for read files.
    forward : str
        Read label passed to ``get_read``.
    adaptor, phred : str
        Stored unchanged in every sample entry.
        NOTE(review): the default adaptor is the ``TruSeq3-PE-2.fa`` file even
        though this function builds single-end entries -- confirm intended.
    sample_header, sample_name : str
        Column names described above.

    Returns
    -------
    dict
        ``{'samples_se': {name: {'forward', 'adaptor', 'phred'}}}``.
    """
    per_sample = {}
    for row in sample_sheet.index:
        prefix = sample_sheet.loc[row, sample_header]
        label = sample_sheet.loc[row, sample_name]
        per_sample[label] = {
            'forward': get_read(prefix, seq_dir, forward),
            'adaptor': adaptor,
            'phred': phred,
        }

    return {'samples_se': per_sample}

In [8]:
def format_yaml_pe(RUN, samples_pe,
                   TMP_DIR_ROOT='/localscratch',
                   trimmomatic_path='$CONDA_ENV_PATH/share/trimmomatic-*',
                   adaptor='$CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa',
                   kneaddata_db='$CONDA_ENV_PATH/share/kd_dbs/demo',
                   gzip_path='gzip',
                   knead_env='source activate kneaddata',
                   params={'trimmomatic_params': 'LEADING:20 TRAILING:20 AVGQUAL:30 MINLEN:32 TOPHRED33'},
                   default_flow_style=False):
    """Render the paired-end run configuration as YAML text.

    Each section is dumped with ``yaml.dump`` on its own and the pieces are
    concatenated in this order: ``TMP_DIR_ROOT``, ``RUN``, ``KNEAD_ENV``,
    ``kneaddata_db``, the `samples_pe` mapping, ``software`` and finally
    `params`.

    Parameters
    ----------
    RUN : str
        Value written under the ``RUN`` key.
    samples_pe : dict
        Mapping dumped as-is (e.g. the result of ``make_sample_dict_pe``).
    TMP_DIR_ROOT, kneaddata_db, knead_env : str
        Values written under ``TMP_DIR_ROOT``, ``kneaddata_db`` and
        ``KNEAD_ENV`` respectively.
    trimmomatic_path, gzip_path : str
        Values written under ``software: trimmomatic`` and ``software: gzip``.
    adaptor : str
        Accepted for signature compatibility; not used by this function.
    params : dict
        Extra mapping dumped last. It is only read, never modified.
    default_flow_style : bool
        Forwarded to every ``yaml.dump`` call.

    Returns
    -------
    str
        The assembled YAML document.
    """
    sections = [
        {'TMP_DIR_ROOT': TMP_DIR_ROOT},
        {'RUN': RUN},
        {'KNEAD_ENV': knead_env},
        {'kneaddata_db': kneaddata_db},
        samples_pe,
        {'software': {'trimmomatic': trimmomatic_path,
                      'gzip': gzip_path}},
        params,
    ]

    # Dump section by section so the top-level keys keep the order above.
    return ''.join(yaml.dump(section, default_flow_style=default_flow_style)
                   for section in sections)

In [9]:
def format_yaml_se(RUN, samples_se,
                   TMP_DIR_ROOT='/localscratch',
                   trimmomatic_path='$CONDA_ENV_PATH/share/trimmomatic-*',
                   adaptor='$CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa',
                   kneaddata_db='$CONDA_ENV_PATH/share/kd_dbs/demo',
                   gzip_path='gzip',
                   knead_env='source activate kneaddata',
                   params={'trimmomatic_params': 'LEADING:20 TRAILING:20 AVGQUAL:30 MINLEN:32 TOPHRED33'},
                   default_flow_style=False):
    """Render the single-end run configuration as YAML text.

    Each section is dumped with ``yaml.dump`` on its own and the pieces are
    concatenated in this order: ``TMP_DIR_ROOT``, ``RUN``, ``KNEAD_ENV``,
    ``kneaddata_db``, the `samples_se` mapping, ``software`` and finally
    `params`.

    Parameters
    ----------
    RUN : str
        Value written under the ``RUN`` key.
    samples_se : dict
        Mapping dumped as-is (e.g. the result of ``make_sample_dict_se``).
    TMP_DIR_ROOT, kneaddata_db, knead_env : str
        Values written under ``TMP_DIR_ROOT``, ``kneaddata_db`` and
        ``KNEAD_ENV`` respectively.
    trimmomatic_path, gzip_path : str
        Values written under ``software: trimmomatic`` and ``software: gzip``.
    adaptor : str
        Accepted for signature compatibility; not used by this function.
    params : dict
        Extra mapping dumped last. It is only read, never modified.
    default_flow_style : bool
        Forwarded to every ``yaml.dump`` call.

    Returns
    -------
    str
        The assembled YAML document.
    """
    sections = [
        {'TMP_DIR_ROOT': TMP_DIR_ROOT},
        {'RUN': RUN},
        {'KNEAD_ENV': knead_env},
        {'kneaddata_db': kneaddata_db},
        samples_se,
        {'software': {'trimmomatic': trimmomatic_path,
                      'gzip': gzip_path}},
        params,
    ]

    # Dump section by section so the top-level keys keep the order above.
    return ''.join(yaml.dump(section, default_flow_style=default_flow_style)
                   for section in sections)

# Read in the sample data 


In [12]:
# This is the Excel document provided by IGM
sequencing_manifest_fp = './example/reads/example_sample_manifest.xlsx'

# If you only want samples from some lanes, provide a list of those lane numbers here
lanes = [1]

# This is the path to the folder on Barnacle where the raw reads are located
seq_dir = './example/reads/Run1'

# Read in the sample manifest. The lane filter returns a selection of the
# full table, so take an explicit copy before adding columns below; assigning
# into the selection directly can trigger pandas' SettingWithCopyWarning.
sample_sheet = open_sequencing_manifest(sequencing_manifest_fp, lanes=lanes).copy()


# These columns can be modified with regexes to fix sample names or add common sample prefixes if necessary.
# 'Sample_Prefix' is used to find the read files; 'Sample' becomes the sample's key in the config.
sample_sheet['Sample_Prefix'] = sample_sheet['Sample Name']
sample_sheet['Sample'] = sample_sheet['Sample Name']


sample_sheet

Unnamed: 0,Sample Name,Sample Code,Library Size (bp),Library Prep Method,Index 1 (Name),Index 1 (Sequence),Index 2 (Name),Index 2 (Sequence),Conc. (nM),Volume (μl),Quantification Method,lane,Sample_Prefix,Sample
0,sample1,1,400-600,KapaHyperPlus,D001,ATCTAGCCGGCC,,,,,Pico,1,sample1,sample1
1,sample2,2,400-600,KapaHyperPlus,D013,AAGCGTACGTCC,,,,,Pico,1,sample2,sample2


Make samples dictionary
------

This links the sample name to the original location of the forward (and reverse) reads. 

To use single-ended reads instead, call `make_sample_dict_se()`.

In [13]:
# Look up each sample's forward and reverse read files under `seq_dir`.
samples_pe = make_sample_dict_pe(
    sample_sheet,
    seq_dir,
    forward='R1',
    reverse='R2',
    adaptor='$CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa',
    sample_header='Sample_Prefix',
    sample_name='Sample',
)

Format YAML file
================

Make the actual config file.

Set the `RUN` variable to a descriptive name for this run; it is written into the config and used in the config file name.

You may also want to pass optional parameters to modify execution. The optional parameters and their defaults are:

```
TMP_DIR_ROOT = '/localscratch',
trimmomatic_path = '$CONDA_ENV_PATH/share/trimmomatic-*',
adaptor = '$CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa',
kneaddata_db = '$CONDA_ENV_PATH/share/kd_dbs/demo',
gzip_path = 'gzip',
knead_env = 'source activate kneaddata',
params = {'trimmomatic_params': 'LEADING:20 TRAILING:20 AVGQUAL:30 MINLEN:32 TOPHRED33'}
```

In [15]:
RUN = 'example_run'
config_str = format_yaml_pe(RUN, samples_pe)

# The config file is written to the current working directory and named after the run.
config_fp = 'config_{0}.yaml'.format(RUN)
with open(config_fp, 'w') as config_file:
    config_file.write(config_str)

In [16]:
# Print the YAML text that was written to the config file.
print(config_str)

TMP_DIR_ROOT: /localscratch
RUN: example_run
KNEAD_ENV: source activate kneaddata
kneaddata_db: $CONDA_ENV_PATH/share/kd_dbs/demo
samples_pe:
  sample1:
    adaptor: $CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa
    forward: ./example/reads/Run1/sample1_S312_R1_L001.fastq.gz
    phred: phred33
    reverse: ./example/reads/Run1/sample1_S312_R2_L001.fastq.gz
  sample2:
    adaptor: $CONDA_ENV_PATH/share/trimmomatic-*/adapters/TruSeq3-PE-2.fa
    forward: ./example/reads/Run1/sample2_S521_R1_L001.fastq.gz
    phred: phred33
    reverse: ./example/reads/Run1/sample2_S521_R2_L001.fastq.gz
software:
  gzip: gzip
  trimmomatic: $CONDA_ENV_PATH/share/trimmomatic-*
trimmomatic_params: LEADING:20 TRAILING:20 AVGQUAL:30 MINLEN:32 TOPHRED33

