# STRAT - Short Tandem Repeat Analysis tool

## 1. Detect flanks and high-fidelity reads

### 1.1 Imports

In [1]:
from datetime import datetime
import gzip
from multiprocessing.dummy import Pool
import regex
from os import listdir
from os import sched_getaffinity
from os.path import isfile, join

### 1.2 Arguments

In [2]:
# Flanking sequences expected on either side of the repeat, and the repeat motif.
prefix = 'AGAAAGAAATGGTCTGTGATCCCCC'
suffix = 'CATTCCCGGCTACAAGGACCCTTCG'
motif = 'CAG'
# Suffix appended to each flank pattern (fuzzy-matching syntax of the `regex` module).
tolerance = '{e<=5}'

# pcr2persons guppy
input_path = '/opt/data/pcr2persons/fastq/guppy/'
output_path = '/opt/data/pcr2persons/output/guppy/'

# pcr2persons dorado
# input_path = '/opt/data/pcr2persons/fastq/dorado/'
# output_path = '/opt/data/pcr2persons/output/dorado/'

# jovan guppy
# input_path = '/opt/data/jovan/fastq/guppy/'
# output_path = '/opt/data/jovan/output/guppy/'

# jovan dorado
# input_path = '/opt/data/jovan/fastq/dorado/'
# output_path = '/opt/data/jovan/output/dorado/'

# dm108 guppy
# input_path = '/opt/data/dm108/fastq/guppy/'
# output_path = '/opt/data/dm108/output/guppy/'

# Every regular file in input_path whose name contains 'fastq', sorted
# lexicographically by full path (so '..._10' sorts before '..._2').
fastq_paths = sorted(
    path
    for path in (
        join(input_path, name)
        for name in listdir(input_path)
        if 'fastq' in name
    )
    if isfile(path)
)
# fastq_paths = fastq_paths[:1]
# Notebook display: number of input files plus the first and last path.
[len(fastq_paths), fastq_paths[0], fastq_paths[-1]]

[68,
 '/opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_0.fastq.gz',
 '/opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_9.fastq.gz']

### 1.3 Constants

In [3]:
# Read orientations that are searched, in this order.
DIRECTIONS = ['fwd', 'rev']

# Watson-Crick base pairing: A<->T and C<->G.
COMPLEMENT = dict(zip('ACGT', 'TGCA'))

### 1.4 Functions

In [4]:
def rev_comp(seq, comps):
    """Return the reverse complement of *seq*.

    *comps* maps each symbol to its complement; symbols that are not keys
    of *comps* are copied through unchanged.
    """
    complemented = [comps.get(base, base) for base in reversed(seq)]
    return ''.join(complemented)


def chop(string, start, end, flank_len=10):
    """Split *string* into the insert and the flanks directly around it.

    Args:
        string: sequence (or any sliceable) to cut.
        start: index of the first character of the insert.
        end: index one past the last character of the insert.
        flank_len: maximum number of characters kept on each side.

    Returns:
        A tuple ``(prefix_flank, ins, suffix_flank)`` where ``ins`` is
        ``string[start:end]`` and the flanks are the up-to-``flank_len``
        characters immediately before ``start`` and after ``end``. Flanks
        are shorter when the insert lies near either end of *string*.
    """
    # Clamp at 0: a negative slice start would wrap around to the end of
    # the string and yield an empty or wrong prefix flank.
    flank_start = max(start - flank_len, 0)
    prefix_flank = string[flank_start:start]
    suffix_flank = string[end:end + flank_len]
    ins = string[start:end]
    return prefix_flank, ins, suffix_flank


class ReadProcessor:
    """Locate the insert between a prefix and a suffix flank in a read.

    For each orientation in DIRECTIONS a pair of pattern strings is kept:
    'fwd' uses the flanks as given, 'rev' uses the reverse complement of
    the suffix as prefix pattern and of the prefix as suffix pattern.
    """

    def __init__(
        self,
        prefix,
        suffix,
        tolerance
    ):
        """Build the per-orientation pattern strings.

        *tolerance* is appended verbatim after each parenthesised flank
        (e.g. '{e<=5}', the `regex` module's fuzzy-matching syntax).
        """
        def as_pattern(flank):
            return f'({flank})' + tolerance

        prefix_rc = rev_comp(prefix, COMPLEMENT)
        suffix_rc = rev_comp(suffix, COMPLEMENT)

        self.prefix = {'fwd': as_pattern(prefix), 'rev': as_pattern(suffix_rc)}
        self.suffix = {'fwd': as_pattern(suffix), 'rev': as_pattern(prefix_rc)}

    def process_read(self, id, seq, opt, qual):
        """Classify one FASTQ record as on- or off-target.

        A read is on-target when exactly one orientation has a single
        prefix match and a single suffix match with the suffix located
        after the end of the prefix.

        Returns:
            ['ontarget', direction, id, prefix_flank, ins, suffix_flank,
            prefix_flank_q, ins_q, suffix_flank_q] for an on-target read,
            otherwise ['offtarget', id, seq, qual]. *opt* is not used.
        """
        offtarget = ['offtarget', id, seq, qual]
        ontarget = None

        for orientation in DIRECTIONS:
            prefix_hits = regex.findall(self.prefix[orientation], seq)
            suffix_hits = regex.findall(self.suffix[orientation], seq)
            if len(prefix_hits) != 1 or len(suffix_hits) != 1:
                continue

            # A second orientation with a unique flank pair makes the read
            # ambiguous, so it is rejected outright.
            if ontarget is not None:
                return offtarget

            prefix_text = prefix_hits[0]
            suffix_text = suffix_hits[0]
            ins_start = seq.index(prefix_text) + len(prefix_text)
            ins_end = seq.index(suffix_text)
            if ins_end <= ins_start:
                continue

            # The quality string is cut at the same coordinates as the sequence.
            seq_parts = chop(seq, ins_start, ins_end)
            qual_parts = chop(qual, ins_start, ins_end)
            ontarget = ['ontarget', orientation, id, *seq_parts, *qual_parts]

        if ontarget is None:
            return offtarget
        return ontarget


def process_fastq(fastq_path, output_path, read_processor):
    """Run *read_processor* over every record of one FASTQ file.

    The file is read as plain text, or through gzip when its name ends
    with '.gz'. Each 4-line record is passed to
    ``read_processor.process_read(id, seq, opt, qual)``; results whose
    first element is 'ontarget' are written, tab-separated and without
    that first element, to ``<output_path>/<fastq name>.ontarget.tsv``.
    All other results are discarded.

    Args:
        fastq_path: path of the input FASTQ file ('/'-separated).
        output_path: existing directory receiving the TSV file; a trailing
            separator is optional.
        read_processor: object exposing ``process_read``.

    Raises:
        RuntimeError: if a record's first line does not start with '@' or
            its third line does not start with '+'.
    """
    print(f'{datetime.now()} - {fastq_path}')
    fastq_name = fastq_path.split('/')[-1]
    # join() gives the same path as plain concatenation when output_path
    # ends with a separator, and a correct one when it does not.
    ontarget_output_path = join(output_path, f'{fastq_name}.ontarget.tsv')
    opener = gzip.open if fastq_name.endswith('.gz') else open

    with opener(fastq_path, 'rt') as f, open(ontarget_output_path, 'wt') as o:
        for i, line in enumerate(f):
            line = line.strip()
            field = i % 4
            if field == 0:
                if not line.startswith('@'):
                    message = f'Error in {fastq_path} line {i} - not an ID line'
                    print(message)
                    # A bare `raise` here has no active exception and would
                    # only report "No active exception to reraise".
                    raise RuntimeError(message)
                # Keep only the identifier, dropping anything after the first space.
                read_id = line.split(' ')[0]
            elif field == 1:
                seq = line
            elif field == 2:
                if not line.startswith('+'):
                    message = f'Error in {fastq_path} line {i} - not a + line'
                    print(message)
                    raise RuntimeError(message)
                opt = line
            else:
                qual = line
                res = read_processor.process_read(read_id, seq, opt, qual)
                if res[0] == 'ontarget':
                    o.write('\t'.join(res[1:]) + '\n')

### 1.5 Main

In [5]:
# Notebook display: the set of CPU ids the calling process (pid 0) may run on.
sched_getaffinity(0)

{0, 1, 2, 3, 4, 5, 6, 7}

In [6]:
# stop

In [7]:
print(f'{datetime.now()} - Strat start')

# One argument tuple per input file; each file gets its own ReadProcessor.
inputs = [
    (fastq_path, output_path, ReadProcessor(prefix, suffix, tolerance))
    for fastq_path in fastq_paths
]

# multiprocessing.dummy.Pool is a thread pool; two files are handled at a time.
with Pool(2) as pool:
    pool.starmap(process_fastq, inputs)

print(f'{datetime.now()} - Strat end')

2024-01-27 18:30:33.274182 - Strat start
2024-01-27 18:30:33.289917 - /opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_0.fastq.gz
2024-01-27 18:30:33.290087 - /opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_17.fastq.gz
2024-01-27 18:34:16.164783 - /opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_18.fastq.gz
2024-01-27 18:34:22.225852 - /opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_1.fastq.gz
2024-01-27 18:37:49.371716 - /opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_19.fastq.gz
2024-01-27 18:38:04.124312 - /opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_10.fastq.gz
2024-01-27 18:41:24.819586 - /opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_2.fastq.gz
2024-01-27 18:41:48.312182 - /opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_11.fastq.gz
2024-01-27 18:45:04.083059 - /opt/data/pcr2persons/fastq/guppy/AQD087_pass_fae3d762_8343086c_20.fastq.gz
2024-01-27 18:45:

In [8]:
# print(f'{datetime.now()} - Strat start')
# read_processor = ReadProcessor(prefix, suffix, tolerance)
# for fastq_path in fastq_paths:
#     print(f'{datetime.now()} - {fastq_path}')
#     process_fastq(fastq_path, output_path, read_processor)
# print(f'{datetime.now()} - Strat end')