# STRAT - Short Tandem Repeat Analysis Tool

## 3. Collect statistics on on-target reads - gaps

### 3.1 Imports

In [1]:
from csv import QUOTE_NONE
import numpy as np
import pandas as pd
from string2string.alignment import NeedlemanWunsch

### 3.2 Arguments

In [2]:
# Repeat motif as written on the forward strand. The Constants cell derives
# its reverse complement for 'rev' reads, and align() tiles it to build the
# reference sequence that inserts are aligned against.
motif = 'CAG'

# pcr2persons guppy
# input_path = '/opt/data/pcr2persons/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/pcr2persons/output/guppy/'

# pcr2persons dorado
# input_path = '/opt/data/pcr2persons/output/dorado/dorado.ontarget.tsv'
# output_path = '/opt/data/pcr2persons/output/dorado/'

# jovan guppy
# input_path = '/opt/data/jovan/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/jovan/output/guppy/'

# jovan dorado
# input_path = '/opt/data/jovan/output/dorado/dorado.ontarget.tsv'
# output_path = '/opt/data/jovan/output/dorado/'

# dm108 guppy
# input_path = '/opt/data/dm108/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/dm108/output/guppy/'

# bc3_1 guppy
# input_path = '/opt/data/bc3_1/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/bc3_1/output/guppy/'

# bc3_1 dorado
# input_path = '/opt/data/bc3_1/output/dorado/dorado.ontarget.tsv'
# output_path = '/opt/data/bc3_1/output/dorado/'

# bc3_2 guppy
# input_path = '/opt/data/bc3_2/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/bc3_2/output/guppy/'

# bc3_2 dorado
# input_path = '/opt/data/bc3_2/output/dorado/dorado.ontarget.tsv'
# output_path = '/opt/data/bc3_2/output/dorado/'

# bc3_3 guppy
# input_path = '/opt/data/bc3_3/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/bc3_3/output/guppy/'

# bc3_3 dorado
# input_path: header-less, tab-separated file of on-target reads read by load().
# output_path: used as a plain string prefix for the rendered image
# (f'{output_path}test.png'), so it must keep its trailing slash.
input_path = '/opt/data/bc3_3/output/dorado/dorado.ontarget.tsv'
output_path = '/opt/data/bc3_3/output/dorado/'

### 3.3 Constants

In [3]:
def rev_comp(seq, comps):
    """Return the reverse complement of ``seq``.

    ``seq`` is walked from its last symbol to its first and every symbol is
    replaced through the ``comps`` mapping; symbols that have no entry in
    ``comps`` are copied through unchanged.
    """
    complemented = []
    for symbol in reversed(seq):
        complemented.append(comps.get(symbol, symbol))
    return ''.join(complemented)


# Orientation labels; they match the keys of MOTIFS below.
DIRECTIONS = ['fwd', 'rev']

# Base-pairing table handed to rev_comp(). Symbols that are not listed here
# are passed through unchanged by rev_comp().
COMPLEMENT = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A'
}

# Motif to look for in a read of each orientation: the configured motif for
# 'fwd' reads and its reverse complement for 'rev' reads.
MOTIFS = {
    'fwd': motif,
    'rev': rev_comp(motif, COMPLEMENT)
}

# Column names assigned, in this order, to the header-less input table by load().
COLUMNS = [
    'direction',
    'id',
    'prefix_flank',
    'ins',
    'suffix_flank',
    'prefix_flank_q',
    'ins_q',
    'suffix_flank_q',
]

# Paired by position in lengths(): the string length of each column in
# COLUMNS_SEQ_EXT is stored under the matching name in COLUMNS_LEN_EXT.
COLUMNS_SEQ_EXT = ['ins_ext']
COLUMNS_LEN_EXT = ['len_ins_ext']

### 3.4 Functions

In [4]:
def load(input_path, columns):
    """Read the header-less, tab-separated read table into a DataFrame.

    Parameters
    ----------
    input_path : str or path-like
        Location of the TSV file.
    columns : list of str
        Names assigned to the columns, in file order. Their number must match
        the number of columns in the file, otherwise pandas raises ValueError.

    Returns
    -------
    pandas.DataFrame
        Every field as a Python ``str``.
    """
    df = pd.read_csv(
        input_path,
        sep='\t',
        header=None,
        dtype=str,
        # Quality strings may contain quote characters; treat them as data.
        quoting=QUOTE_NONE,
        # Without this, pandas turns empty fields and fields that happen to
        # spell 'NA', 'null', 'nan', ... into float NaN even with dtype=str,
        # which breaks the string slicing done on these columns later on.
        na_filter=False,
    )
    df.columns = columns

    return df


def extend_ins(row):
    """Grow the insert into its flanks where a doubled motif straddles the edge.

    The motif for the row's ``direction`` is looked up in ``MOTIFS`` and
    doubled. On each side, the flank bases adjacent to the insert are joined
    with the insert's outermost bases; if the result spells the doubled motif,
    the flank part is added to the insert. Each side is extended at most once
    and the first matching split wins.

    Parameters
    ----------
    row : mapping
        Must provide 'prefix_flank', 'ins', 'suffix_flank' and 'direction'.

    Returns
    -------
    str
        The (possibly extended) insert sequence.
    """
    left_flank = row['prefix_flank']
    insert = row['ins']
    right_flank = row['suffix_flank']
    doubled = MOTIFS[row['direction']] * 2
    span = len(doubled)

    # Prefix side: try the longest flank tail first (span-1 bases down to 1).
    for taken in range(span - 1, 0, -1):
        flank_tail = left_flank[-taken:]
        if flank_tail + insert[:span - taken] == doubled:
            insert = flank_tail + insert
            break

    # Suffix side: try the shortest insert tail first (1 base up to span-1),
    # i.e. the longest flank head first.
    for kept in range(1, span):
        flank_head = right_flank[:span - kept]
        if insert[-kept:] + flank_head == doubled:
            insert = insert + flank_head
            break

    return insert


def lengths(df, columns_seq, columns_len, unit_len=3):
    """Add sequence-length columns, in bases and in repeat units, to ``df``.

    For every pair ``(seq, length)`` taken position-wise from ``columns_seq``
    and ``columns_len`` two columns are written:

    * ``length``          - string length of ``df[seq]``;
    * ``length + '_adj'`` - that length divided by ``unit_len`` and rounded to
      the nearest integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place and also returned.
    columns_seq : list of str
        Names of the string columns to measure.
    columns_len : list of str
        Names of the length columns to create, paired with ``columns_seq``.
    unit_len : int, default 3
        Number of bases in one repeat unit. The default keeps the previous
        behaviour for three-base motifs; pass ``len(motif)`` for other motifs.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for seq_col, len_col in zip(columns_seq, columns_len):
        df[len_col] = df[seq_col].str.len()
        df[len_col + '_adj'] = (df[len_col] / unit_len).round().astype(int)
    return df


def orient_inserts(row):
    """Return the extended insert expressed on the forward strand.

    Rows whose ``direction`` is 'rev' get their 'ins_ext' sequence
    reverse-complemented with ``COMPLEMENT``; every other row is returned
    as it is.
    """
    insert = row['ins_ext']
    if row['direction'] != 'rev':
        return insert
    return rev_comp(insert, COMPLEMENT)

### 3.5 Main

In [5]:
df = load(input_path, COLUMNS)

In [6]:
df.head()

Unnamed: 0,direction,id,prefix_flank,ins,suffix_flank,prefix_flank_q,ins_q,suffix_flank_q
0,rev,@034c78ea-5942-4dce-a311-acffb990b1a4,GCCGGAGATG,CTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTG,GGGGGATCAC,7;=8925222,"69:AEDESISQGC?@@ABESMB=.,.-,*++''),)","*+><3,+-25"
1,fwd,@08467e00-9fe4-4552-94da-56cc69e7f0cf,GTGATCCCCC,AGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC,AGCAGCATTC,",9:89;;99A","BFFDSSKDHCBGEIJ@@@CDSNJPGGS@@@,,,",)))((((.+)
2,rev,@1be5af7a-ed27-435f-b3a5-b75e2b45a8ac,GCCGGGAATG,CTGCTGCTGCTGCTGCTGCTGCTGCTGCTG,TCTGGGGGGA,2233C@C976,"32130,...211188788<HEEEBCEJGEA",*)*+-.AD<;
3,rev,@28b04cd7-a912-4da3-abc8-7c3868f91c08,GCCGGGAATG,CTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCT,GCTGGGGGGA,"7688>:9/,,","*((,89?@BBDCGFKJLHSHC<99:A@?>=?CA@B",BEJ---F>3/
4,rev,@38c0df9b-3daa-4a54-ac0a-35e9e17e11ed,GCCGGGAATG,CTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTG,CTGGGGGATC,"%%$%%,.334",9;<BHBA<31/0?D<;::4438:774467686320/,..-+.>85+*


In [7]:
df['ins_ext'] = df.apply(extend_ins, axis=1)
sum(df['ins'] != df['ins_ext'])

8930

In [8]:
df = lengths(df, COLUMNS_SEQ_EXT, COLUMNS_LEN_EXT)

In [9]:
df['ins_oriented'] = df.apply(orient_inserts, axis=1)

In [10]:
df[['ins_oriented', 'len_ins_ext']]

Unnamed: 0,ins_oriented,len_ins_ext
0,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,36
1,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,39
2,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,30
3,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,39
4,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,39
...,...,...
10024,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,39
10025,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,39
10026,AGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCA...,510
10027,ACAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,40


In [11]:
# Draw a fixed-size random subsample for every insert length that has enough reads.
SAMPLE_SIZE = 10  # reads drawn per insert length; smaller groups are skipped
SAMPLE_SEED = 0   # fixed so that Restart & Run All reproduces the same subsample

countss = set(df['len_ins_ext'])

df_subs = []
# Ascending order gives the concatenated frame a stable layout across runs.
for c in sorted(countss):
    cond = df['len_ins_ext'] == c
    same_length = df[cond]
    if len(same_length) >= SAMPLE_SIZE:
        df_subs.append(same_length.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED))

df_sub = pd.concat(df_subs)
df_sub

Unnamed: 0,direction,id,prefix_flank,ins,suffix_flank,prefix_flank_q,ins_q,suffix_flank_q,ins_ext,len_ins_ext,len_ins_ext_adj,ins_oriented
6157,rev,@18f5a5fd-9c32-4a4f-9c26-cfd0be29e3bf,GCCGGGAATG,CTGCTGCTGC,TGCTGGGGGG,::AACBECAA,@AABB=998>,BADE448MG>,CTGCTGCTGCTGCTG,15,5,CAGCAGCAGCAGCAG
8502,fwd,@3ccc787a-8568-4446-8ce4-e74da7cfde92,GTGATCCCCC,AGCAGCAGCAGCAG,CATTCCCAGC,88423/078=,;<<:;<333)((((,(''''()%%%,CAGCAGCAGCAGCAG,15,5,CAGCAGCAGCAGCAG
4060,fwd,@02cb03c0-d0e0-4503-8527-19e9afc52d3e,GTGATCCCCC,CAGCAGCAGCAGC,AGCATTCCCG,GD?>@<=?@G,GACDDFCCDF(((,((((**+78+,CAGCAGCAGCAGCAG,15,5,CAGCAGCAGCAGCAG
8104,rev,@111c7497-68a5-4109-921e-7914508fbab1,ACCGGGAATG,CTGCTGCTGCT,GCTGGGGGGA,&''+88<;02,'&&&;;9:0/-,-16457KSF@,CTGCTGCTGCTGCTG,15,5,CAGCAGCAGCAGCAG
7298,rev,@ab178a92-6651-472f-8e4a-4aee0c016a39,GCCGGGAATG,CTGCTGCTGCT,GCTGGGGGGA,::;@EFIIFD,DCBCBBIFFIE,SC@510AB<7,CTGCTGCTGCTGCTG,15,5,CAGCAGCAGCAGCAG
...,...,...,...,...,...,...,...,...,...,...,...,...
9106,fwd,@86aee3e6-1a4f-499a-b262-18ed0870ae2b,ATGATCCCCC,AGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCA...,CGGCGTTCCC,"$%+.0,)&'.",/<@?ABAAA=>=:9:8::*))))*///+++))&''8ABBDGISSJG...,+*)+.32...,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,504,168,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...
8701,fwd,@555e3ff3-7f2e-4988-b669-61af5b96b137,GTGAATCCCC,AGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCA...,GCGGCATTCC,/1.-+)**+/,-.-**45DCEFSHKIKMSLOSIJIJIKHHSDBCCCC@?@---*)**...,.-())'))(*,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,504,168,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...
2245,fwd,@15722e2c-7be1-4958-a57d-fef70ef9a166,GTGATCCCCA,GCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG...,CGTCCGGCTA,-/44===89B,CDMSJ=;<<SKBEDAIKFIKGDCDFCCFEIFSJISGKSSGMSMRAA...,*)***+*''0,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,504,168,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...
5492,fwd,@ba75d0a5-4d0a-4b08-886b-db8a53250e4f,GTGATCCCCC,AGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCA...,CAGCATTCCC,-.11:6558=,9946CB?<=>@D>SKFHJBAB?DMEJGOSGFLIGJJABB<<==?AA...,;66665424:,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,504,168,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...


In [12]:
# Count how many sampled reads share each (oriented insert, length) pair,
# keep inserts of at most 300 bases, and order them by length then sequence.
group_keys = ['ins_oriented', 'len_ins_ext']
reads_per_insert = df_sub.groupby(group_keys)['id'].count()
dfg = reads_per_insert.reset_index()
dfg.columns = group_keys + ['count']
cond = dfg['len_ins_ext'] <= 300
dfg = dfg.loc[cond].sort_values(['len_ins_ext', 'ins_oriented'])
dfg

Unnamed: 0,ins_oriented,len_ins_ext,count
37,CAGCAGCAGCAGCAG,15,10
4,ACAGACAGCAGACAGCAGCAGCACACA,27,1
16,AGGCAGCAGCAGCAGCAGCAGCAGCAG,27,1
59,CAGCAGCAGCAGCAGCAGCAGCAGCAG,27,8
21,CAGCAGACAGCAGACAGCAGCAGCAGCAG,29,1
...,...,...,...
607,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGGAGCAGCAGCAG,42,1
627,CAGCAGCAGCAGCAGCAGCAGGAGCAGCAGCAGCAGCAGCAG,42,1
98,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,60,10
100,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,63,9


In [13]:
# Scoring scheme for the global aligner that align() calls through NW.
match_weight = 10  # score added for two identical symbols
mismatch_weight = -8  # score added for two different symbols
gap_weight = -9  # score added for a symbol aligned against a gap
# NOTE(review): gap_char is an empty string, while align() compares aligned
# cells against ' '. Presumably the library pads an empty gap cell to a single
# space - confirm against the string2string implementation.
NW = NeedlemanWunsch(
    match_weight=match_weight,
    mismatch_weight=mismatch_weight,
    gap_weight=gap_weight,
    gap_char=''  # character to use for a gap
)

In [14]:
def align(row):
    """Align an oriented insert to a pure-motif reference and project it onto it.

    The reference is the notebook-level ``motif`` repeated often enough to
    cover ``row['len_ins_ext']`` bases. The insert in ``row['ins_oriented']``
    is aligned to it with the notebook-level ``NW`` aligner. The result keeps
    the insert's symbol at every alignment column where the reference cell is
    not ' '; columns where the reference cell is ' ' are dropped.

    Parameters
    ----------
    row : mapping
        Must provide 'len_ins_ext' (number of bases) and 'ins_oriented'.

    Returns
    -------
    str
        The insert's symbols at the retained alignment columns.
    """
    # Number of whole motif copies needed to cover the insert. Derived from the
    # configured motif instead of a literal 3 so other motif lengths work too.
    n_units = int(np.ceil(row['len_ins_ext'] / len(motif)))
    reference = n_units * motif
    read = row['ins_oriented']
    aligned_reference, aligned_read = NW.get_alignment(reference, read, return_score_matrix=False)
    # The aligner returns each alignment as cells joined by ' | '.
    reference_cells = aligned_reference.split(' | ')
    read_cells = aligned_read.split(' | ')
    return ''.join(
        read_cell
        for reference_cell, read_cell in zip(reference_cells, read_cells)
        if reference_cell != ' '
    )

In [15]:
%%time
dfg['ins_aligned'] = dfg.apply(align, axis=1)

CPU times: user 343 ms, sys: 0 ns, total: 343 ms
Wall time: 342 ms


In [16]:
dfg['len_ins_aligned'] = dfg['ins_aligned'].str.len()

In [17]:
cond = dfg['ins_aligned'] == dfg['ins_oriented']
dfg[cond]

Unnamed: 0,ins_oriented,len_ins_ext,count,ins_aligned,len_ins_aligned
37,CAGCAGCAGCAGCAG,15,10,CAGCAGCAGCAGCAG,15
59,CAGCAGCAGCAGCAGCAGCAGCAGCAG,27,8,CAGCAGCAGCAGCAGCAGCAGCAGCAG,27
65,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,30,10,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,30
17,CAACAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,33,1,CAACAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,33
42,CAGCAGCAGCAGCAGCAACAGCAGCAGCAGCAG,33,1,CAGCAGCAGCAGCAGCAACAGCAGCAGCAGCAG,33
69,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,33,8,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,33
76,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,36,9,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,36
595,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGGAG,36,1,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGGAG,36
82,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,39,10,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,39
84,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAA,42,1,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAA,42


In [18]:
dfg = dfg.sort_values(['len_ins_aligned', 'ins_aligned'])
dfg

Unnamed: 0,ins_oriented,len_ins_ext,count,ins_aligned,len_ins_aligned
37,CAGCAGCAGCAGCAG,15,10,CAGCAGCAGCAGCAG,15
16,AGGCAGCAGCAGCAGCAGCAGCAGCAG,27,1,AGCAGCAGCAGCAGCAGCAGCAGCAG,27
4,ACAGACAGCAGACAGCAGCAGCACACA,27,1,CAGCAGCAGCAGCAGCAGCA CA CA,27
59,CAGCAGCAGCAGCAGCAGCAGCAGCAG,27,8,CAGCAGCAGCAGCAGCAGCAGCAGCAG,27
21,CAGCAGACAGCAGACAGCAGCAGCAGCAG,29,1,CAGCAG A CAGCAGCAGCAGCAGCAGCAG,30
...,...,...,...,...,...
636,CAGCAGCAGCAGCAGCAGGACAGCAGCAACAGCAGCAGCAG,41,1,CAGCAGCAGCAGCAGCAGGA CAGCAGCAACAGCAGCAGCAG,42
648,CAGCAGCAGCAGGAGACGACAGCAGCGGCGGCAGCAGCGGC,41,1,CAGCAGCAGCAGGAG A C GCAGCAGCGGCGGCAGCAGCGG,42
98,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,60,10,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,60
100,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,63,9,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,63


In [19]:
# Expand the grouped table back to one record per read: every aligned insert
# is repeated `count` times.
results = [
    {'ins_aligned': aligned, 'len_ins_aligned': aligned_len}
    for aligned, aligned_len, copies in zip(
        dfg['ins_aligned'], dfg['len_ins_aligned'], dfg['count']
    )
    for _ in range(copies)
]
len(results)

180

In [20]:
dfa = pd.DataFrame(results)
dfa

Unnamed: 0,ins_aligned,len_ins_aligned
0,CAGCAGCAGCAGCAG,15
1,CAGCAGCAGCAGCAG,15
2,CAGCAGCAGCAGCAG,15
3,CAGCAGCAGCAGCAG,15
4,CAGCAGCAGCAGCAG,15
...,...,...
175,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,63
176,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,63
177,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,63
178,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,63


In [21]:
# stop

In [22]:
# One record per alignment position: how often each symbol occurs there, how
# many inserts end exactly at that position, and how many inserts reach it.
results = []
longest = dfa['ins_aligned'].str.len().max()
for position in range(longest):
    reaches = dfa['len_ins_aligned'] >= position + 1
    ends_here = dfa['len_ins_aligned'] == position + 1
    # Symbol counts go in first so the record's key order is unchanged.
    tally = dict(dfa.loc[reaches, 'ins_aligned'].str[position].value_counts())
    tally['insert_count'] = int(ends_here.sum())
    tally['coverage_count'] = int(reaches.sum())
    results.append(tally)

In [23]:
result_df = pd.DataFrame(results).fillna(0).astype(int)

In [24]:
result_df

Unnamed: 0,C,Unnamed: 2,G,T,insert_count,coverage_count,A
0,167,9,3,1,0,180,0
1,0,1,1,0,0,180,178
2,1,12,166,0,0,180,1
3,180,0,0,0,0,180,0
4,0,0,0,0,0,180,180
...,...,...,...,...,...,...,...
58,0,0,0,0,0,20,20
59,0,0,20,0,10,20,0
60,10,0,0,0,0,10,0
61,0,0,0,0,0,10,10


In [25]:
cond = result_df['coverage_count'] > 10
result_df[cond]

Unnamed: 0,C,Unnamed: 2,G,T,insert_count,coverage_count,A
0,167,9,3,1,0,180,0
1,0,1,1,0,0,180,178
2,1,12,166,0,0,180,1
3,180,0,0,0,0,180,0
4,0,0,0,0,0,180,180
5,0,1,179,0,0,180,0
6,174,6,0,0,0,180,0
7,0,0,0,0,0,180,180
8,0,5,175,0,0,180,0
9,179,1,0,0,0,180,0


In [26]:
from PIL import Image, ImageDraw

In [27]:
# Fill colour for each symbol column that draw() stacks: the four bases plus
# ' ' (presumably an alignment gap - confirm against align()).
# draw() paints a symbol black instead when it is the 'CAG' base expected at
# that position, so these colours only show up for deviations.
colors = {
    'A': 'green',
    'C': 'red',
    'G': 'blue',
    'T': 'yellow',
    ' ': 'orange'
}

def draw(df, colors, output_image, motif='CAG', height=200):
    """Render per-position symbol frequencies as a stacked-bar image.

    Every row of ``df`` becomes a 3-pixel-wide column. Within a column the
    symbols 'A', 'C', 'G', 'T' and ' ' are stacked from the top edge of the
    image downwards, each taking ``height * count / coverage_count`` pixels
    (rounded). The symbol expected at a position by the repeating ``motif`` is
    painted black; any other symbol uses its entry in ``colors``. A grey
    vertical line marks the start of every motif copy.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per position, in drawing order, with a 'coverage_count' column
        and one count column per symbol. A missing symbol column counts as 0.
    colors : mapping
        Symbol -> fill colour for symbols that differ from the expected base.
    output_image : str or path-like
        Destination passed to ``Image.save``.
    motif : str, default 'CAG'
        Repeat unit that defines the expected base at each position.
    height : int, default 200
        Image height in pixels; a symbol with 100 % frequency fills it.

    Raises
    ------
    ValueError
        If ``motif`` is empty.
    """
    if not motif:
        raise ValueError('motif must not be empty')

    symbols = ['A', 'C', 'G', 'T', ' ']
    bar_width = 3
    period = len(motif)
    n_positions = len(df)
    width = bar_width * n_positions
    print(height, width)
    image = Image.new('RGB', (width, height), 'white')
    canvas = ImageDraw.Draw(image)
    # Walk rows by position: the x coordinate is positional, so index labels
    # (which may be non-contiguous after filtering) must not be used here.
    for position in range(n_positions):
        row = df.iloc[position]
        left = bar_width * position
        coverage = row['coverage_count']
        top = 0
        # A position nobody covers has no frequencies to draw.
        if coverage > 0:
            for symbol in symbols:
                count = row.get(symbol, 0)
                bar = int(round(float(height * count / coverage)))
                if bar <= 0:
                    continue
                if symbol == motif[position % period]:
                    color = 'black'
                else:
                    color = colors[symbol]
                # Rectangle bounds are inclusive on both ends.
                canvas.rectangle(
                    [left, top, left + bar_width - 1, top + bar - 1],
                    fill=color,
                )
                top += bar
        if position % period == 0:
            canvas.line([(left, 0), (left, height)], fill='grey')
    image.save(output_image)

In [28]:
draw(result_df, colors, f'{output_path}test.png')

200 189


In [29]:
# df.sort_values(['len_ins_ext', 'ins_oriented'])['ins_oriented']