# Haplotype generator

This tool generates haplotype sequences by applying the specified variants to reference sequences.

Input:
   - A BED file with the columns: Chromosome   Start   End   Ref   Alt   reg_chrom   reg_start   reg_end
   - A matching reference genome in FASTA format (hg19, hg38 etc.) 
   - Note: 
       - Only one Alt allele is allowed per variant; to model multiple alleles, create a separate input file for each.
       - '-' or '*' means deletion

Output: 
   - FASTA file containing modifications specified in the input. Each sequence is extracted using reg_chrom reg_start reg_end

In [132]:
import subprocess
import os 

def load_ref(fastafile, target_chrom=None):
    """
    Load the reference genome from a single FASTA file.

    Args:
        fastafile: Path to the FASTA file.
        target_chrom: If given, only the sequence with this name is loaded.

    Returns:
        dict mapping each sequence name (the first whitespace-delimited token
        of its header line) to the sequence with all whitespace removed.
    """
    # Read inside a context manager so the file handle is closed promptly.
    with open(fastafile) as handle:
        records = handle.read().split(">")[1:]

    genome = {}
    for record in records:
        # Separate the header line from the body first: splitting the whole
        # record on whitespace would treat a header description
        # (">chr1 some description") as part of the sequence.
        header, _, body = record.partition("\n")
        header_fields = header.split()
        if not header_fields:
            continue  # record without a name; nothing to key it by
        chr_name = header_fields[0]
        if target_chrom is not None and chr_name != target_chrom:
            continue
        # body.split() drops newlines and any stray whitespace in the sequence.
        genome[chr_name] = "".join(body.split())
    return genome

def get_roi_seqs(bedFile, refFasta):
    """
    Extract the reference sequence of every region in a BED file.

    1. Run `bedtools getfasta` to write the regions of `bedFile`, taken from
       `refFasta`, into a temporary FASTA file next to the BED file.
    2. Load that FASTA file with load_ref and delete it.

    Args:
        bedFile: Path to the region-of-interest BED file.
        refFasta: Path to the reference genome FASTA file.

    Returns:
        dict mapping each FASTA record name written by bedtools to its
        sequence (elsewhere in this file the keys are used in the form
        'chr1:115248070-115248190').

    Raises:
        subprocess.CalledProcessError: if bedtools exits with a non-zero status.
    """
    tmpFAFile = bedFile + '.tmpfasta'
    try:
        # check=True surfaces a bedtools failure here instead of a confusing
        # error later when the output file is missing or incomplete.
        subprocess.run(
            ['bedtools', 'getfasta', '-fi', refFasta, '-bed', bedFile, '-fo', tmpFAFile],
            check=True,
        )
        refSeqs = load_ref(tmpFAFile)
    finally:
        # Always clean up the temporary file, even when a step above failed.
        if os.path.exists(tmpFAFile):
            os.remove(tmpFAFile)

    return refSeqs

def convert_roi_names_to_coordinates(listOfROINames):
    """
    Parse ROI names of the form 'CHROM:START-END' into coordinate lists.

    Returns a list with one [CHROM, START, END] entry per name, where the
    dash-separated numbers after the first ':' are converted to int.
    """
    def _parse(name):
        fields = name.split(":")
        return [fields[0], *map(int, fields[1].split('-'))]

    return [_parse(name) for name in listOfROINames]

def assign_to_roi(loc, listOfROIs, one_based=False):
    """
    Find the region of interest (ROI) that contains a location and express
    the location relative to that ROI.

    INPUT:
        - loc: [CHROMOSOME, POS, ...]; POS is 0-based unless one_based is True
        - listOfROIs: [[CHROMOSOME, START, END], ...]; START is 0-based and
          inclusive, END is exclusive
        - one_based: set to True when POS is 1-based
    OUTPUT:
        - (roiName, newPos) for the first ROI containing the location, where
          roiName is 'CHROMOSOME:START-END' and newPos is the 0-based offset
          of the location from the ROI start
        - None when no ROI contains the location

    The input `loc` is never modified.
    """
    chrom = loc[0]
    # Convert to 0-based exactly once and on a local value. Doing this inside
    # the loop subtracted 1 for every same-chromosome ROI that was visited and
    # also changed the caller's list.
    pos = loc[1] - 1 if one_based else loc[1]

    for roi in listOfROIs:
        if roi[0] == chrom and roi[1] <= pos < roi[2]:
            roiName = roi[0] + ':' + str(roi[1]) + '-' + str(roi[2])
            return roiName, pos - roi[1]
    return None

def load_hap_gen_input(hap_gen_bed):
    """
    Read the haplotype generator input file (tab-separated text).

    Lines with three or fewer tab-separated fields are skipped, as are
    comment/header lines starting with '#'. The second field of every kept
    line is converted to int; all other fields stay strings.

    Returns a list with one list of fields per kept line.

    TODO: add function to assess input file type
    """
    hap_gen_input = []
    # The context manager closes the file; iterating a bare open() leaked it.
    with open(hap_gen_bed, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith('#'):
                # A header line would otherwise fail the int() conversion.
                continue
            fields = raw_line.strip().split('\t')
            if len(fields) > 3:
                fields[1] = int(fields[1])
                hap_gen_input.append(fields)
    return hap_gen_input

def convert_hapGenInput_to_new_coords(hapGenInput, listOfROIs, one_based=False):
    """
    Convert the hapGenInput to new coordinates based on the reference sequences.

    Each input row starts with [CHROMOSOME, POS]. Rows whose location falls
    inside a region of interest are rewritten as
    [ROI_NAME, POS_WITHIN_ROI, <remaining fields>]; rows outside every ROI are
    dropped.

    If the locations of variants are 1-based, use one_based = True.

    Returns a new list; the rows of hapGenInput are not modified.
    """
    newHapGenInput = []
    for line in hapGenInput:
        # Hand over a copy of the location so the lookup can never alter the
        # caller's row.
        hit = assign_to_roi(line[:2], listOfROIs, one_based=one_based)
        # Test for None explicitly: catching TypeError from a failed tuple
        # unpack would also hide genuine type errors raised by the lookup
        # (for example a position that is still a string).
        if hit is None:
            continue  # the variant is not inside any ROI
        roi, newPos = hit
        newHapGenInput.append([roi, newPos] + line[2:])

    return newHapGenInput

def insert_variants(ref_seq, seq_name, haplotype_calls, one_based=False):
    """
    Apply variant calls to a reference sequence.

    Args:
        ref_seq: The reference sequence (string) to modify.
        seq_name: Name of ref_seq; calls whose first field differs are ignored.
        haplotype_calls: Iterable of calls of the form
            [seq_name, pos, ref, alt, ...]. `pos` is relative to ref_seq and
            may be an int or a numeric string. Any '-' or '*' characters are
            removed from `alt`, so an alt of '-' or '*' deletes `ref`.
        one_based: Set to True when `pos` is 1-based; otherwise it is 0-based.

    Returns:
        (new_seq, old_coords, new_coords) where
        - new_seq is the modified sequence,
        - old_coords holds the 0-based start of each applied variant on
          ref_seq,
        - new_coords holds the start of each applied variant on new_seq.
        Both coordinate lists are in ascending order of position.

    Calls are applied in ascending position order regardless of input order.
    NOTE: overlapping calls are not detected, and `ref` is not compared with
    the bases actually present in ref_seq.
    """
    offset = 1 if one_based else 0
    # Keep only the calls for this sequence, with starts normalised to 0-based.
    calls = [
        (int(call[1]) - offset, call[2], call[3])
        for call in haplotype_calls
        if call[0] == seq_name
    ]
    # The single left-to-right pass below is only correct for ascending
    # positions; unsorted input used to duplicate or drop reference bases.
    # list.sort is stable, so calls sharing a start keep their input order.
    calls.sort(key=lambda call: call[0])

    pieces = []         # fragments of the new sequence, joined once at the end
    new_len = 0         # running length of the sequence built so far
    new_coords = []     # variant starts on the altered sequence
    old_coords = []     # variant starts on the reference sequence
    prev_var_coord = 0  # reference coordinate just past the previous variant
    for counter, (loc_s, ref, raw_alt) in enumerate(calls, start=1):
        if counter % 10000 == 0:
            print(counter)  # progress indicator for large call sets

        alt = raw_alt.replace("-", '').replace("*", '')  # '-' / '*' mean deletion
        # Unchanged reference between the previous variant and this one.
        gap = ref_seq[prev_var_coord:loc_s]
        pieces.append(gap)
        new_len += len(gap)

        new_coords.append(new_len)
        old_coords.append(loc_s)

        pieces.append(alt)
        new_len += len(alt)
        prev_var_coord = loc_s + len(ref)  # skip the replaced reference bases

    # Remainder of the reference after the last variant.
    pieces.append(ref_seq[prev_var_coord:])

    return ''.join(pieces), old_coords, new_coords

def generate_haplotype_seqs(hapGenInput, refSeqs, one_based=False, tag=''):
    """
    Build one haplotype sequence per reference sequence.

    INPUT: hapGenInput (variant calls), refSeqs (dict of name -> sequence)
    OUTPUT: dict mapping '<name>_<tag>' to the sequence returned by
            insert_variants for that reference sequence
    """
    result = {}
    for name, reference in refSeqs.items():
        # Only the altered sequence is kept; the coordinate lists are unused.
        altered, _old_coords, _new_coords = insert_variants(
            reference, name, hapGenInput, one_based=one_based
        )
        result[name + '_' + tag] = altered
    return result

def main():
    """
    Placeholder entry point for creating haplotypes based on the requirements.

    It currently performs no work and returns None.
    """
    return None



In [133]:
bedFile = "/home/vngo/Work/liquid_biopsy_pipeline/mnv_calling_module/target_beds/somaticCNV_from_SomaticGermline_panel.bed"
refFasta = "/home/vngo/genomes/hg19_ambry.fa"

# Extract the reference sequence of every region in the BED file.
refSeqs = get_roi_seqs(bedFile, refFasta)
roiNames = ['chr1:115248070-115248190', 'chr1:115251154-115251274']
# reduce the refSeqs to smaller set for testing
tmp = {}
for roi in roiNames:
    tmp[roi] = refSeqs[roi]
refSeqs = tmp

# The helper is defined as convert_roi_names_to_coordinates ("names", plural);
# the previous spelling raised NameError.
listOfROIs = convert_roi_names_to_coordinates(roiNames)

#for roi in roiNames:
#    print(roi, refSeqs[roi])

# Load the variants and re-express their positions relative to the ROIs.
hap_gen_bed = "hap_gen_input.bed"
hapGenInput = load_hap_gen_input(hap_gen_bed)
newHapGenInput = convert_hapGenInput_to_new_coords(hapGenInput, listOfROIs, one_based=False)
newHapGenInput


# Positions in newHapGenInput are already 0-based offsets within each ROI.
haploSeqs = generate_haplotype_seqs(newHapGenInput, refSeqs, one_based=False, tag='hap1')
haploSeqs 

{'chr1:115248070-115248190_hap1': 'TATGACAGCTGGCCAGGCAVuTTAATTTGGGGAAAGAGAAGGATTTTGAGGTAAACTAGAATTCTTTTCAGCAATCACAAACATAGCCTAAAAACCACGTTTTATACCACACTGAAGTTA',
 'chr1:115251154-115251274_hap1': 'GTvuvuvuvuACATCACCACACATGGCAATCCCATACAACCCTGAGTCCCATCATCACTGCTGTTGAGTTTTTTCATTCGGTACTGGCGTATTTCTCTTACCAGTGTGTAAAAAGCATCTTCAACAC'}

In [126]:
ART="/mnt/vngo/art_bin_MountRainier/art_illumina"

# Run ART without arguments and drain its stdout until end-of-file.
# The context manager closes the pipe and waits for the process on exit, so no
# unreaped child process or open file descriptor is left behind.
# NOTE(review): the recorded output of this cell shows lines prefixed with
# "test:", so a print of each line was presumably here once - confirm whether
# the output should be echoed.
with subprocess.Popen([ART], stdout=subprocess.PIPE) as proc:
    for line in proc.stdout:
        pass


test: b''
test: b'             ART_Illumina (2008-2016)'
test: b'          Q Version 2.5.8 (June 7, 2016)'
test: b'     Contact: Weichun Huang <whduke@gmail.com>'
test: b'    -------------------------------------------'
test: b''
test: b'===== USAGE ====='
test: b''
test: b'art_illumina [options] -ss <sequencing_system> -sam -i <seq_ref_file> -l <read_length> -f <fold_coverage> -o <outfile_prefix>'
test: b'art_illumina [options] -ss <sequencing_system> -sam -i <seq_ref_file> -l <read_length> -c <num_reads_per_sequence> -o <outfile_prefix>'
test: b'art_illumina [options] -ss <sequencing_system> -sam -i <seq_ref_file> -l <read_length> -f <fold_coverage> -m <mean_fragsize> -s <std_fragsize> -o <outfile_prefix>'
test: b'art_illumina [options] -ss <sequencing_system> -sam -i <seq_ref_file> -l <read_length> -c <num_reads_per_sequence> -m <mean_fragsize> -s <std_fragsize> -o <outfile_prefix>'
test: b''
test: b'===== PARAMETERS ====='
test: b''
test: b'  -1   --qprof1   the first-read quality 