In [1]:
import pandas as pd
import numpy as np
import re
import time
import logging
import vaex

In [2]:
# Route every message (DEBUG and above) to a timestamped log file so that
# each run of the notebook writes its own log.
logging.basicConfig(
    filename='log_' + time.strftime("%Y%m%d-%H%M%S") + '.txt',
    level=logging.DEBUG,
)
logging.info('Starting Analysis')
logging.info('Read alignment and query inputs')


In [3]:
# Load the two tab-separated, header-less input tables as vaex dataframes.
# Alignment table, read in chunks.
sequence_alignment = vaex.read_csv(
    '../data/small_test/sample_alignment.txt',
    sep='\t',
    header=None,
    chunk_size=1000000,
    convert=True,
)
# Query table, read in (smaller) chunks.
query_sequence = vaex.read_csv(
    '../data/small_test/sample_query.txt',
    sep='\t',
    header=None,
    chunk_size=10000,
    convert=True,
)


In [4]:

# Display the installed vaex package versions so the run can be reproduced.
vaex.__version__

{'vaex': '4.9.2',
 'vaex-core': '4.9.2',
 'vaex-viz': '0.5.2',
 'vaex-hdf5': '0.12.2',
 'vaex-server': '0.8.1',
 'vaex-astro': '0.9.1',
 'vaex-jupyter': '0.8.0',
 'vaex-ml': '0.17.0'}

In [5]:
# Preview the alignment table. Columns as used by get_output:
# '0' = transcript id, '1' = chromosome, '2' = reference start position,
# '3' = CIGAR string.
sequence_alignment

#,0,1,2,3
0,TR1,CHR1,3,8M7D6M2I2M11D7M
1,TR2,CHR2,10,20M


In [6]:
# Preview the query table. Columns as passed to get_output:
# '0' = transcript id, '1' = query position within that transcript.
query_sequence

#,0,1
0,TR1,4
1,TR2,0
2,TR1,13
3,TR2,10


In [7]:
def process_query_to_ref_position(cigar_string, ref_start_position, query_index):
    '''Map a 0-based query index to its reference coordinate using a CIGAR string.

    The CIGAR operations are walked in order while two cursors track the
    current reference and query coordinates. No per-base arrays are built, so
    the cost depends on the number of operations, not on the alignment length.

    Operator handling:
        M, =, X  advance both cursors; a query index inside the run maps to
                 the reference base at the same offset.
        D, N     advance the reference cursor only (no query base exists).
        I, S     advance the query cursor only; a query index inside the run
                 is reported at the current reference cursor.
        H, P     advance neither cursor.

    Args:
        cigar_string: CIGAR string, e.g. '8M7D6M2I2M11D7M'.
        ref_start_position: reference coordinate at which the alignment starts.
        query_index: 0-based index into the query sequence.

    Returns:
        int: the reference coordinate for ``query_index``, or -1 when the
        index is not covered by the alignment (negative, past the end, or the
        CIGAR string contains no operations).

    Raises:
        ValueError: if the CIGAR string contains an operator outside
            ``MIDNSHP=X``. The whole string is validated before any lookup.
        TypeError: propagated from ``re.findall`` when ``cigar_string`` is
            not a string.
    '''
    consumes_both = ('M', '=', 'X')
    consumes_ref_only = ('D', 'N')
    consumes_query_only = ('I', 'S')
    consumes_none = ('H', 'P')
    valid_operators = (
        consumes_both + consumes_ref_only + consumes_query_only + consumes_none
    )

    # `\w` alone cannot match '=', so it is added explicitly; otherwise
    # sequence-match runs such as '5=' would be silently skipped.
    operations = re.findall(r'(\d+)([\w=])', cigar_string)
    logging.debug(f'{cigar_string}\t{operations}')

    # Validate every operator up front so a malformed CIGAR is always
    # reported, even when the query index is found in an earlier operation.
    for _, operator in operations:
        if operator not in valid_operators:
            raise ValueError(
                f"Not a valid CIGAR operator {operator}: {cigar_string}")

    ref_cursor = ref_start_position
    query_cursor = 0
    for number, operator in operations:
        length = int(number)
        in_this_run = query_cursor <= query_index < query_cursor + length

        if operator in consumes_both:
            if in_this_run:
                return int(ref_cursor + (query_index - query_cursor))
            ref_cursor += length
            query_cursor += length
        elif operator in consumes_ref_only:
            # A deletion/skip has no query bases, so it can never contain the
            # query index; the next query base aligns after the gap.
            ref_cursor += length
        elif operator in consumes_query_only:
            if in_this_run:
                # Inserted/clipped bases have no reference base of their own.
                return int(ref_cursor)
            query_cursor += length
        # H and P consume neither sequence: nothing to advance.

    logging.debug(f'Not a valid query position {query_index}')
    return -1

In [8]:
def get_output(tx_id, query_position):
    '''Look up one query row in the alignment table and map it to the reference.

    Reads the notebook-level ``sequence_alignment`` dataframe, selects the
    rows whose column '0' equals ``tx_id`` and uses the first match.

    Args:
        tx_id: transcript id to look up in column '0' of the alignment table.
        query_position: 0-based position within the transcript.

    Returns:
        tuple: ``(chrom, ref_position)``. ``chrom`` comes from column '1' of
        the first matching alignment and ``ref_position`` from
        ``process_query_to_ref_position``. ``('', -1)`` is returned when no
        alignment exists for ``tx_id``.
    '''
    not_found = ('', -1)
    try:
        records = sequence_alignment[sequence_alignment['0'] == tx_id]
        match_count = len(records)
        if match_count == 0:
            logging.debug(f'There is no transcript id in alignment {tx_id}')
            return not_found
        if match_count > 1:
            # Only the first alignment is used; the rest are ignored.
            logging.debug(f'There are multiple alignments for {tx_id}')

        # Materialise the matching rows once instead of once per column.
        first_record = records.to_records()[0]
        ref_position = process_query_to_ref_position(
            first_record['3'], first_record['2'], query_position)
        return (first_record['1'], ref_position)
    except IndexError:
        # Kept as a safety net in case the filtered frame yields no records.
        logging.info(f'There are no transcripts in alignment for {tx_id}')
        return not_found

In [9]:
# Call get_output for every query row (transcript id, query position) and
# evaluate the resulting expression into concrete values.
res = query_sequence.apply(
    get_output,
    arguments=[query_sequence['0'], query_sequence['1']],
    vectorize=False,
).evaluate()

In [10]:
# Wrap the evaluated results in a pandas DataFrame and convert that to a vaex
# dataframe so it can be joined onto query_sequence.
res_vx = vaex.from_pandas(pd.DataFrame(res))

In [11]:
# Attach the computed result columns to the query rows. No `on=` key is
# passed; lprefix='left' is applied to the left-hand columns, which show up
# as left0/left1 in the displayed result.
# NOTE(review): query_sequence is rebound to a frame derived from itself, so
# re-running this cell joins onto the already-joined frame - confirm intended.
query_sequence = query_sequence.join(res_vx,lprefix='left')

In [12]:
# Display the joined result: left0/left1 are the original query columns,
# '0'/'1' are the chromosome and reference position returned by get_output.
query_sequence

#,left0,left1,0,1
0,TR1,4,CHR1,7
1,TR2,0,CHR2,10
2,TR1,13,CHR1,23
3,TR2,10,CHR2,20


In [13]:
# Write the joined table as tab-separated text, without index or header row.
query_sequence.export_csv(
    'test.csv',
    sep='\t',
    header=False,
    index=False,
)