# STRAT - Short Tandem Repeat Analysis tool

## 1. Detect flanks and high-fidelity reads

### 1.1 Imports

In [1]:
import gzip
import regex
from os import listdir
from os.path import isfile, join

### 1.2 Arguments

In [2]:
# Flanking sequences expected on either side of the repeat region, written
# in forward orientation.
prefix = 'AGAAAGAAATGGTCTGTGATCCCCC'
suffix = 'CATTCCCGGCTACAAGGACCCTTCG'
# Repeat unit; not referenced by the cells in this section.
motif = 'CAG'
# Fuzzy-matching suffix appended to each flank pattern (syntax of the
# third-party `regex` module: at most 5 substitutions).
tolerance = '{s<=5}'

# Exactly one input/output pair below should be active at a time.

# pcr2persons guppy
# input_path = '../../../projects/ONT/data/pcr2persons/fastq/guppy/'
# output_path = '../../../projects/ONT/data/pcr2persons/output/guppy/'

# pcr2persons dorado
# input_path = '../../../projects/ONT/data/pcr2persons/fastq/dorado/'
# output_path = '../../../projects/ONT/data/pcr2persons/output/dorado/'

# jovan guppy
input_path = '../../../projects/ONT/data/jovan/fastq/guppy/'
output_path = '../../../projects/ONT/data/jovan/output/guppy/'

# jovan dorado
# input_path = '../../../projects/ONT/data/jovan/fastq/dorado/'
# output_path = '../../../projects/ONT/data/jovan/output/dorado/'

# Collect every regular file in input_path whose name contains 'fastq',
# as a sorted list of joined paths.
fastq_paths = []
for entry in listdir(input_path):
    if 'fastq' not in entry:
        continue
    candidate = join(input_path, entry)
    if isfile(candidate):
        fastq_paths.append(candidate)
fastq_paths.sort()
fastq_paths

['../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_0.fastq.gz',
 '../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_1.fastq.gz',
 '../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_10.fastq.gz',
 '../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_100.fastq.gz',
 '../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_101.fastq.gz',
 '../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_102.fastq.gz',
 '../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_103.fastq.gz',
 '../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_104.fastq.gz',
 '../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_105.fastq.gz',
 '../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_106.fastq.gz',
 '../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_107.fast

### 1.3 Constants

In [3]:
# Read orientations that are tried, in this order.
DIRECTIONS = ['fwd', 'rev']

# Watson-Crick complement of each unambiguous DNA base.
COMPLEMENT = dict(zip('ACGT', 'TGCA'))

### 1.4 Functions

In [4]:
def rev_comp(seq, comps):
    """Return the reverse complement of ``seq``.

    ``comps`` maps each symbol to its complement; symbols missing from the
    mapping are copied through unchanged.
    """
    complemented = []
    for base in reversed(seq):
        complemented.append(comps.get(base, base))
    return ''.join(complemented)


def chop(string, start, end):
    """Split ``string`` into three slices at ``start`` and ``end``.

    Returns ``(string[:start], string[start:end], string[end:])``.
    """
    return string[:start], string[start:end], string[end:]


class ReadProcessor:
    """Classify reads by locating the two flanks around the repeat region.

    For each orientation in ``DIRECTIONS`` a prefix pattern and a suffix
    pattern are kept.  In the ``'rev'`` orientation the roles are swapped
    and reverse-complemented: the reverse complement of ``suffix`` acts as
    the prefix and the reverse complement of ``prefix`` acts as the suffix.
    """

    def __init__(
        self,
        prefix,
        suffix,
        tolerance
    ):
        """Build the per-orientation flank patterns.

        Args:
            prefix: flank expected before the repeat (forward orientation).
            suffix: flank expected after the repeat (forward orientation).
            tolerance: text appended after the capture group of every
                pattern, e.g. ``'{s<=5}'`` (fuzzy-matching syntax of the
                ``regex`` module).
        """
        self.prefix = {
            'fwd': f'({prefix})' + tolerance,
            'rev': f'({rev_comp(suffix, COMPLEMENT)})' + tolerance
        }
        self.suffix = {
            'fwd': f'({suffix})' + tolerance,
            'rev': f'({rev_comp(prefix, COMPLEMENT)})' + tolerance
        }

    def process_read(self, id, seq, opt, qual):
        """Classify one FASTQ record as ``'hifi'`` or ``'lofi'``.

        An orientation qualifies when its prefix pattern and its suffix
        pattern each match exactly once in ``seq``.  The read is 'hifi'
        when one orientation qualifies with the suffix starting after the
        prefix ends, and no later orientation qualifies as well.

        Args:
            id: record identifier; copied into the result unchanged.
            seq: read sequence.
            opt: the record's '+' line; accepted but not used here.
            qual: quality string, sliced at the same offsets as ``seq``.

        Returns:
            For a 'hifi' read::

                ['hifi', direction, id,
                 prefix_flank, ins, suffix_flank,
                 prefix_flank_q, ins_q, suffix_flank_q]

            where ``prefix_flank`` is everything up to and including the
            prefix match, ``ins`` is the text between the two flank
            matches and ``suffix_flank`` is everything from the start of
            the suffix match on.  All slices are taken from the read as
            given (nothing is reverse-complemented).

            Otherwise ``['lofi', id, seq, qual]``.
        """
        hits = 0
        for k in DIRECTIONS:
            prefix_hits = list(regex.finditer(self.prefix[k], seq))
            suffix_hits = list(regex.finditer(self.suffix[k], seq))
            if len(prefix_hits) != 1 or len(suffix_hits) != 1:
                continue

            if hits == 1:
                # A second orientation also has both flanks: ambiguous.
                hits = 2
                break

            # Take the offsets straight from the match objects instead of
            # searching the read again for the matched text.
            start = prefix_hits[0].end(1)
            end = suffix_hits[0].start(1)
            if end > start:
                direction = k
                hits = 1
                prefix_flank, ins, suffix_flank = chop(seq, start, end)
                prefix_flank_q, ins_q, suffix_flank_q = chop(qual, start, end)

        if hits != 1:
            return ['lofi', id, seq, qual]

        return [
            'hifi',
            direction,
            id,
            prefix_flank, ins, suffix_flank,
            prefix_flank_q, ins_q, suffix_flank_q
        ]


def process_fastq(fastq_path, output_path, read_processor):
    """Run every record of one FASTQ file through ``read_processor``.

    The file is read as consecutive four-line records (ID, sequence, '+'
    line, quality).  Each record is passed to
    ``read_processor.process_read``; results whose first element is
    ``'hifi'`` are written, tab-separated and without that first element,
    to ``<output_path><file name>.hifi.tsv``.  All other results are
    dropped.

    Args:
        fastq_path: path of the input file; opened with ``gzip`` when the
            file name ends in ``.gz``, as plain text otherwise.
        output_path: string prepended verbatim to the output file name, so
            a directory must include its trailing separator.
        read_processor: object exposing
            ``process_read(id, seq, opt, qual)``.

    Raises:
        RuntimeError: if a record's first line does not start with '@' or
            its third line does not start with '+'.  The message is also
            printed before raising.
    """
    from os.path import basename

    fastq_name = basename(fastq_path)
    opener = gzip.open if fastq_name.endswith('.gz') else open
    hifi_output_path = f'{output_path}{fastq_name}.hifi.tsv'

    with opener(fastq_path, 'rt') as f, open(hifi_output_path, 'wt') as o:
        for i, line in enumerate(f):
            line = line.strip()
            field = i % 4
            if field == 0:
                if not line.startswith('@'):
                    message = f'Error in {fastq_path} line {i} - not an ID line'
                    print(message)
                    # A bare `raise` here would only report "No active
                    # exception to reraise"; carry the real diagnostic.
                    raise RuntimeError(message)
                # Keep only the first space-separated token of the header.
                read_id = line.split(' ')[0]
            elif field == 1:
                seq = line
            elif field == 2:
                if not line.startswith('+'):
                    message = f'Error in {fastq_path} line {i} - not a + line'
                    print(message)
                    raise RuntimeError(message)
                opt = line
            else:
                qual = line
                res = read_processor.process_read(read_id, seq, opt, qual)
                if res[0] == 'hifi':
                    o.write('\t'.join(res[1:]) + '\n')

### 1.5 Main

In [5]:
# One processor is shared by all input files; each file is announced on
# stdout before it is processed.
read_processor = ReadProcessor(prefix=prefix, suffix=suffix, tolerance=tolerance)
for fastq_path in fastq_paths:
    print(fastq_path)
    process_fastq(
        fastq_path=fastq_path,
        output_path=output_path,
        read_processor=read_processor,
    )

../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_0.fastq.gz
../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_1.fastq.gz
../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_10.fastq.gz
../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_100.fastq.gz
../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_101.fastq.gz
../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_102.fastq.gz
../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_103.fastq.gz
../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_104.fastq.gz
../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_105.fastq.gz
../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_106.fastq.gz
../../../projects/ONT/data/jovan/fastq/guppy/APX527_pass_77711599_a755ab1c_107.fastq.gz
../../../projects/ONT/data/jovan/fast