# Main input simulator 

This script simulates the inputs consumed later by the read generator.
- Inputs: 
    1. BED target file specifies locations of SNV of interest (hg19) 
    2. BED target region size 
    4. Fraction of tumor DNA in total (0.0 - 1.0) 
    5. Number of reads
    6. Standard deviation of log(coverage): coverage in hybrid-capture sequencing is approximately log-normal
    7. Read lengths


- Output in the format:

    #HAPLOTYPE_FRACTION:HAP1;HAP2;HAP3;HAP4	0.5;0.3;0.1;0.1
    
    #CHROM	POSITION	REF	ALT	HAP1	HAP2	HAP3	HAP4	COV
    
    chr1	100000000	A	C,CCC,-	0	1	2	3	1000


    
    
Notes: 
1. Allele frequencies include the wild-type allele.
2. Each location can have a different coverage (due to assay bias)


In [80]:
import os
import subprocess
import logging 
import sys 
import numpy as np 

logging.basicConfig(level=logging.DEBUG)

def load_hap_fraction(hap_frac_str):
    """
    Parse a '#HAPLOTYPE_FRACTION:<labels>\\t<values>' line into a dict.

    Labels and values are ';'-separated lists found after the first ':';
    the two lists are separated by a tab. Returns a dict mapping each label
    to its value converted to float.
    """
    payload = hap_frac_str.strip().split(":")[1]
    fields = payload.split('\t')
    names = fields[0].split(';')
    numbers = fields[1].split(';')
    # Index into `numbers` by position so a short value list still raises
    # IndexError instead of being silently truncated.
    return {name: float(numbers[pos]) for pos, name in enumerate(names)}

def header_dict(header_list):
    """
    Map every column name in header_list to its position in the list.

    When a name occurs more than once, the last position wins.
    """
    return {name: position for position, name in enumerate(header_list)}

def main_input_split(main_input_file, out_dir, noise=0.0):
    """
    Split the main input file into one hap_gen input file per haplotype.

    - Takes main input file, output directory, noise level (optional)
    - Writes ``<out_dir>/<HAP>_input.txt`` for every haplotype named on the
      ``#HAPLOTYPE_FRACTION`` line; an existing file of that name is replaced
      (the file is written even when the input has no data rows)
    - Each output row is tab-separated: the first three columns of the input
      row, the allele carried by that haplotype (index 0 = REF, 1.. = the
      comma-separated ALT entries), and the haplotype's coverage
    - Coverage is COV * fraction * N(1.0, noise), so on average the per-hap
      read coverage follows the HAPLOTYPE_FRACTION metric

    Returns:
        (fracs, hap_input_files): the haplotype -> fraction dict and the list
        of output file paths, in the same haplotype order.

    Raises:
        ValueError: if the file has no ``#HAPLOTYPE_FRACTION`` line, or a
        data row appears before the ``#CHROM`` header line.
    """
    # create output dir
    os.makedirs(out_dir, exist_ok=True)

    # Read the file once and close it promptly; it is scanned twice below
    # because the fraction line may appear anywhere in the file.
    with open(main_input_file, 'r') as handle:
        input_lines = handle.readlines()

    # read the fractions of haplotypes (the last such line wins)
    fracs = None
    for raw in input_lines:
        if raw.startswith("#HAPLOTYPE_FRACTION"):
            fracs = load_hap_fraction(raw)
    if fracs is None:
        raise ValueError(
            "No #HAPLOTYPE_FRACTION line found in {}".format(main_input_file))

    # check to see if hap_input.txt files already exist, if so, remove them
    hap_input_files = []
    for hap in fracs:
        f = out_dir + '/' + hap + '_input.txt'
        if os.path.exists(f):
            logging.info("Haplotype input file {} already exist. It will be replaced.".format(f))
            os.remove(f)
        hap_input_files.append(f)

    # read the REF + ALT alleles and assign them to each HAPLOTYPE
    rows_by_hap = {hap: [] for hap in fracs}
    header_idx = None
    for raw in input_lines:
        if raw.startswith('#CHROM'):
            header = raw.strip().replace("#", '').split('\t')
            header_idx = header_dict(header)
            continue
        # Skip other comment lines and blank lines (the documented format
        # separates records with empty lines).
        if raw.startswith("#") or not raw.strip():
            continue
        if header_idx is None:
            raise ValueError(
                "Data row found before the #CHROM header line in {}".format(
                    main_input_file))

        tmp = raw.strip().split('\t')
        total_cov = int(tmp[header_idx['COV']])
        ref = tmp[header_idx["REF"]]
        alt = tmp[header_idx["ALT"]]
        alleles = [ref] + alt.split(',')
        for hap in fracs:
            allele = alleles[int(tmp[header_idx[hap]])]
            # Apply the multiplicative noise to the numeric coverage before
            # converting to text (a str cannot be multiplied by a float).
            hap_cov = total_cov * fracs[hap] * np.random.normal(loc=1.0, scale=noise)
            # Append the allele as ONE column; extending a list with a str
            # would split multi-base alleles such as "CCC" into characters.
            fields = tmp[:3] + [allele, str(hap_cov)]
            rows_by_hap[hap].append('\t'.join(fields) + '\n')

    # Write each haplotype file in one go instead of reopening it per row.
    for hap, path in zip(fracs, hap_input_files):
        with open(path, 'w') as fout:
            fout.writelines(rows_by_hap[hap])

    return fracs, hap_input_files

In [82]:
# Example run: split the sample main input into per-haplotype input files.
mainInput = "examples/main_input.txt"
out_dir = './examples'
# Uncomment to dump the raw main input for inspection:
#for line in open(mainInput, 'r'):
#    print(line)

# noise is left at its default (0.0) for this call
main_input_split(mainInput, out_dir)

({'HAP1': 0.5, 'HAP2': 0.3, 'HAP3': 0.1, 'HAP4': 0.1},
 ['./examples/HAP1_input.txt',
  './examples/HAP2_input.txt',
  './examples/HAP3_input.txt',
  './examples/HAP4_input.txt'])

In [42]:
# Quick check of load_hap_fraction on a sample #HAPLOTYPE_FRACTION header line
# (labels and values are separated by a tab).
string = "#HAPLOTYPE_FRACTION:HAP1;HAP2;HAP3;HAP4	0.5;0.3;0.1;0.1"
load_hap_fraction(string)

{'HAP1': 0.5, 'HAP2': 0.3, 'HAP3': 0.1, 'HAP4': 0.1}

In [85]:
import numpy as np
# Check of the noise factor's default case: with scale=0.0 the draw has no
# spread, so the result equals loc (1.0).
np.random.normal(loc=1.0, scale=0.0, size=None)

1.0