# STRAT - Short Tandem Repeat Analysis Tool

## 3. Collect statistics on on-target reads - gaps

### 3.1 Imports

In [1]:
from csv import QUOTE_NONE
import numpy as np
import pandas as pd
from string2string.alignment import NeedlemanWunsch

### 3.2 Arguments

In [2]:
# Run configuration for this notebook.
# motif: repeat unit on the forward strand; its reverse complement is derived
#   in the constants cell.
# input_path: tab-separated table of on-target reads, read by load().
# output_path: directory prefix for generated files. It is concatenated
#   directly with a file name later on, so keep the trailing slash.
# Exactly one input_path/output_path pair should be left uncommented.
motif = 'CAG'

# pcr2persons guppy
input_path = '/opt/data/pcr2persons/output/guppy/guppy.ontarget.tsv'
output_path = '/opt/data/pcr2persons/output/guppy/'

# jovan guppy
# input_path = '/opt/data/jovan/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/jovan/output/guppy/'

# dm108 guppy
# input_path = '/opt/data/dm108/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/dm108/output/guppy/'

# bc3_1 guppy
# input_path = '/opt/data/bc3_1/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/bc3_1/output/guppy/'

# bc3_2 guppy
# input_path = '/opt/data/bc3_2/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/bc3_2/output/guppy/'

# bc3_3 guppy
# input_path = '/opt/data/bc3_3/output/guppy/guppy.ontarget.tsv'
# output_path = '/opt/data/bc3_3/output/guppy/'

### 3.3 Constants

In [3]:
def rev_comp(seq, comps):
    """Return the reverse complement of ``seq``.

    ``seq`` is walked from its last symbol to its first and every symbol is
    replaced by its entry in ``comps``. Symbols that have no entry in
    ``comps`` are copied through unchanged.
    """
    flipped = []
    for base in reversed(seq):
        flipped.append(comps.get(base, base))
    return ''.join(flipped)


# Orientation labels; the same two strings are the keys of MOTIFS below and
# the values looked up from the 'direction' column.
DIRECTIONS = ['fwd', 'rev']

# Complementary base for each nucleotide, consumed by rev_comp().
COMPLEMENT = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A'
}

# The repeat unit as it appears in a read of each orientation: the configured
# motif for 'fwd', its reverse complement for 'rev'.
MOTIFS = {
    'fwd': motif,
    'rev': rev_comp(motif, COMPLEMENT)
}

# Column names assigned, in this order, to the headerless input table.
# The '*_q' columns presumably hold the per-base quality strings of the
# matching sequence columns - confirm against the producer of the TSV.
COLUMNS = [
    'direction',
    'id',
    'prefix_flank',
    'ins',
    'suffix_flank',
    'prefix_flank_q',
    'ins_q',
    'suffix_flank_q',
]

# Parallel lists passed to lengths(): sequence columns to measure and the
# names of the length columns created for them.
COLUMNS_SEQ_EXT = ['ins_ext']
COLUMNS_LEN_EXT = ['len_ins_ext']

### 3.4 Functions

In [4]:
def load(input_path, columns):
    """Read the headerless, tab-separated read table into a DataFrame.

    Every field is kept as the literal text found in the file:

    * ``dtype=str`` disables numeric inference;
    * ``quoting=QUOTE_NONE`` treats quote characters as ordinary data
      (quality strings may contain ``"`` and ``'``);
    * ``keep_default_na=False`` stops pandas from turning fields such as
      ``NA``, ``n/a`` or ``null`` - all legal quality/sequence text - and
      empty fields into NaN. Empty fields are returned as ``''`` so later
      string slicing and length computations keep working.

    Args:
        input_path: path (or file-like object) of the TSV file.
        columns: column names to assign, one per column in the file.

    Returns:
        DataFrame with string columns named after ``columns``.

    Raises:
        ValueError: if ``columns`` does not match the number of columns read.
    """
    df = pd.read_csv(
        input_path,
        sep='\t',
        header=None,
        dtype=str,
        quoting=QUOTE_NONE,
        keep_default_na=False,
    )
    df.columns = columns

    return df


def extend_ins(row):
    """Grow a read's insert into its flanks where the repeat continues.

    The pattern searched for is two consecutive copies of the motif that
    belongs to the read's direction (``MOTIFS[row['direction']]``). On each
    side a window of that length is slid across the flank/insert junction;
    at the first position where the window spells the pattern, the flank
    bases inside the window are attached to the insert. At most one
    extension is made per side, and the downstream check sees the insert as
    already extended upstream.

    Args:
        row: mapping with 'prefix_flank', 'ins', 'suffix_flank' and
            'direction' entries.

    Returns:
        The insert string, possibly extended on either side.
    """
    left = row['prefix_flank']
    insert = row['ins']
    right = row['suffix_flank']
    doubled = MOTIFS[row['direction']] * 2
    span = len(doubled)

    # Upstream junction: try the longest prefix tail first (span-1 .. 1 bases).
    for take in range(span - 1, 0, -1):
        tail = left[-take:]
        if tail + insert[:span - take] == doubled:
            insert = tail + insert
            break

    # Downstream junction: keep 1 .. span-1 insert bases in the window, i.e.
    # try the longest suffix head first.
    for keep in range(1, span):
        head = right[:span - keep]
        if insert[-keep:] + head == doubled:
            insert = insert + head
            break

    return insert


def lengths(df, columns_seq, columns_len, motif_len=3):
    """Add length columns for the given sequence columns (in place).

    For each ``(sequence column, length column)`` pair two columns are
    written to ``df``:

    * ``<length column>``: number of characters in the sequence;
    * ``<length column>_adj``: that length divided by ``motif_len`` and
      rounded to the nearest integer (number of repeat units).

    Args:
        df: DataFrame holding the sequence columns; modified in place.
        columns_seq: names of the string columns to measure.
        columns_len: names of the length columns to create, parallel to
            ``columns_seq``.
        motif_len: length of one repeat unit. Defaults to 3, the value that
            was previously hard-coded.

    Returns:
        The same ``df`` object, for chaining.
    """
    for seq_col, len_col in zip(columns_seq, columns_len):
        df[len_col] = df[seq_col].str.len()
        df[f'{len_col}_adj'] = (df[len_col] / motif_len).round().astype(int)
    return df


def orient_inserts(row):
    """Return the extended insert as read on the forward strand.

    Rows whose 'direction' is 'rev' get the reverse complement of
    'ins_ext'; every other row gets 'ins_ext' unchanged.
    """
    extended = row['ins_ext']
    if row['direction'] == 'rev':
        return rev_comp(extended, COMPLEMENT)
    # Debug aid left disabled: extended.replace('CAG', '>>>')
    return extended

### 3.5 Main

In [5]:
# Read the on-target read table and name its columns after COLUMNS.
df = load(input_path, COLUMNS)

In [6]:
df.head()

Unnamed: 0,direction,id,prefix_flank,ins,suffix_flank,prefix_flank_q,ins_q,suffix_flank_q
0,fwd,@8b642f7f-33ad-40c1-bd3b-423605bad973,TCGATCCCCC,CAGCAGCAGCA,GCAGCATTCC,'&%'(0245>,CBDECDD>>>7,663222334=
1,fwd,@662dfedb-470f-4c26-88a4-fe361f20af49,TGAATCCCCC,AGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCA,GCATTCCGGC,"2//,),47;A","AECBDDFGIGDFBBB-+,9:EDJPI{{FGIGG{F{D3331110","00'&&&,-1)"
2,fwd,@90c36059-b1b8-435f-912d-533eaf89cedd,GTGATCCCCC,CAGCAGCAGCAGC,AGCATTCCCG,3366<666<?,??CMDEJLHIBCD,888<ADBK==
3,rev,@f0796ffb-8fcb-4db2-af05-f4ba8a0ffd10,GCCGGGAATG,CTGCTGCTGCTGCT,GGGGGGATCG,&&&*3144)),*?@FGDINJDEBDL,AAAG{?=:7+
4,rev,@1ec715ce-7ef9-4e82-9bd5-40fd574178b0,GCCGGGAATG,CTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTGCTGC...,GCTGGGGGGA,",../755<67",889:HG>>>>G{HHLJEGDDEF{G{GC9999JKFHGKPJIGLFLLJ...,*+++/0==64


In [7]:
# Extend every insert into its flanks, one read (row) at a time.
df['ins_ext'] = df.apply(extend_ins, axis=1)
# Number of reads whose insert was changed by the extension.
sum(df['ins'] != df['ins_ext'])

125781

In [8]:
# Adds 'len_ins_ext' (characters) and 'len_ins_ext_adj' (length / 3, rounded).
df = lengths(df, COLUMNS_SEQ_EXT, COLUMNS_LEN_EXT)

In [9]:
# Bring all extended inserts to the same strand: 'rev' reads are
# reverse-complemented, 'fwd' reads are kept as they are.
df['ins_oriented'] = df.apply(orient_inserts, axis=1)

In [10]:
df[['ins_oriented', 'len_ins_ext']]

Unnamed: 0,ins_oriented,len_ins_ext
0,CAGCAGCAGCAGCAG,15
1,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAG,45
2,CAGCAGCAGCAGCAG,15
3,CAGCAGCAGCAGCAG,15
4,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,105
...,...,...
147786,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCGGCAGCAGCGGCG,47
147787,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,105
147788,CAGCAGTAGTAGCAG,15
147789,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCGGCAGCAGCAGCAG,48


In [11]:
# Collapse identical oriented inserts into one row each; 'count' is the number
# of non-null read ids that share that sequence and length.
dfg = df.groupby(['ins_oriented', 'len_ins_ext'])['id'].count().reset_index()
dfg.columns = ['ins_oriented', 'len_ins_ext', 'count']
# Keep only inserts of at most 300 characters.
cond = dfg['len_ins_ext'] <= 300
dfg = dfg[cond]
# Shortest inserts first, ties broken alphabetically by sequence.
dfg = dfg.sort_values(['len_ins_ext', 'ins_oriented'])
dfg

Unnamed: 0,ins_oriented,len_ins_ext,count
2532,C,1,1
1251,AG,2,2
32217,CGG,3,1
33144,GCA,3,1
436,ACAG,4,1
...,...,...,...
20310,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,300,1
22059,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,300,1
24329,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,300,1
25657,CAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGCAGC...,300,1


In [12]:
# Scoring scheme for the global (Needleman-Wunsch) aligner used by align().
match_weight = 10  # score added for a matching pair
mismatch_weight = -8  # score added for a mismatching pair
gap_weight = -9  # score added for a gap
NW = NeedlemanWunsch(
    match_weight=match_weight,  # weight for a match
    mismatch_weight=mismatch_weight,  # weight for a mismatch
    gap_weight=gap_weight,  # weight for a gap
    # Gap symbol is the empty string. NOTE(review): align() tests aligned cells
    # against ' ', so the library presumably pads an empty gap to one space -
    # confirm against the string2string implementation.
    gap_char=''
)

In [13]:
def align(row):
    """Project an oriented insert onto a perfect-repeat reference.

    The reference is the configured ``motif`` repeated often enough to be at
    least as long as the insert. The insert is globally aligned to it with
    ``NW`` and every alignment column in which the reference holds a gap is
    dropped, so the result is expressed in reference coordinates.

    The repeat-unit length is taken from ``len(motif)`` rather than a
    hard-coded 3, so the reference stays long enough for motifs of any size.

    Args:
        row: mapping with 'len_ins_ext' (insert length) and 'ins_oriented'
            (insert sequence) entries.

    Returns:
        The aligned insert with reference-gap columns removed.
    """
    # Enough whole motif copies to cover the insert.
    copies = int(np.ceil(row['len_ins_ext'] / len(motif)))
    source = copies * motif
    target = row['ins_oriented']
    aligned_source, aligned_target = NW.get_alignment(source, target, return_score_matrix=False)
    # The aligner returns the alignment columns joined by ' | '.
    source_cells = aligned_source.split(' | ')
    target_cells = aligned_target.split(' | ')
    # A ' ' cell on the reference side marks a gap there (see the gap_char
    # passed to NW); those columns carry bases the reference has no slot for.
    return ''.join(t for s, t in zip(source_cells, target_cells) if s != ' ')

In [14]:
%%time
# Align every distinct insert against its perfect-repeat reference. One
# alignment runs per row; the recorded run of this cell was interrupted.
dfg['ins_aligned'] = dfg.apply(align, axis=1)

KeyboardInterrupt: 

In [15]:
# Length of each aligned insert; needs 'ins_aligned' from the alignment cell.
dfg['len_ins_aligned'] = dfg['ins_aligned'].str.len()

KeyError: 'ins_aligned'

In [None]:
# Inserts that the alignment left unchanged.
cond = dfg['ins_aligned'] == dfg['ins_oriented']
dfg[cond]

In [None]:
# Re-sort by aligned length, ties broken alphabetically by aligned sequence.
dfg = dfg.sort_values(['len_ins_aligned', 'ins_aligned'])
dfg

In [None]:
# Undo the grouping: emit one record per read, i.e. repeat every distinct
# aligned insert 'count' times.
results = [
    {'ins_aligned': grouped['ins_aligned'], 'len_ins_aligned': grouped['len_ins_aligned']}
    for _, grouped in dfg.iterrows()
    for _ in range(grouped['count'])
]
len(results)

In [None]:
# One row per read: aligned insert and its length.
dfa = pd.DataFrame(results)
dfa

In [None]:
# stop

In [None]:
# Per-position tally over all aligned inserts. For every 0-based position i
# the record holds: the count of each symbol seen at i, 'insert_count' (inserts
# that end exactly at i) and 'coverage_count' (inserts that reach i).
results = []
longest = dfa['ins_aligned'].str.len().max()
for i in range(longest):
    reach = i + 1
    cond = dfa['len_ins_aligned'] >= reach
    row = dict(dfa[cond]['ins_aligned'].str[i].value_counts())
    row['insert_count'] = int((dfa['len_ins_aligned'] == reach).sum())
    row['coverage_count'] = int(cond.sum())
    results.append(row)

In [None]:
# One row per position; symbols never seen at a position become 0.
result_df = pd.DataFrame(results).fillna(0).astype(int)

In [None]:
result_df

In [None]:
# Positions reached by more than 10 inserts.
cond = result_df['coverage_count'] > 10
result_df[cond]

In [None]:
from PIL import Image, ImageDraw

In [None]:
# Fill colour for each symbol drawn by draw(); ' ' is the gap cell that
# align() can leave in an aligned insert.
colors = {
    'A': 'green',
    'C': 'red',
    'G': 'blue',
    'T': 'yellow',
    ' ': 'orange'
}

def draw(df, colors, output_image, motif='CAG'):
    """Render per-position symbol frequencies as a stacked-column image.

    Each row of ``df`` becomes one pixel column of a 100-pixel-high image.
    Within a column the symbols 'A', 'C', 'G', 'T' and ' ' are stacked in
    that order, starting at y = 0, each taking as many pixels as its rounded
    percentage of the row's 'coverage_count'. The symbol the perfect repeat
    expects at that position is drawn in black; all other symbols use their
    entry in ``colors``.

    Changes relative to the previous version:

    * a symbol column missing from ``df`` (for example no gap at any
      position) counts as 0 instead of raising ``KeyError``;
    * rows are addressed by position only, instead of feeding index labels
      to ``iloc``;
    * the expected motif is a parameter instead of a hard-coded 'CAG'.

    Args:
        df: per-position table with a 'coverage_count' column and, where
            present, one count column per symbol. Row ``x`` is taken to
            describe position ``x`` of the repeat.
        colors: mapping from symbol to a colour accepted by Pillow.
        output_image: path the image is saved to.
        motif: repeat unit used to decide which symbol is expected at each
            position.
    """
    symbols = ['A', 'C', 'G', 'T', ' ']
    width = len(df)
    height = 100
    # height = 100 + int((np.log2(df['insert_count'].max()) * 100).round())
    print(height, width)
    image = Image.new('RGB', (width, height), 'white')
    # Named 'canvas' so the drawing handle does not shadow this function.
    canvas = ImageDraw.Draw(image)
    for x in range(width):
        row = df.iloc[x]
        coverage = row['coverage_count']
        expected = motif[x % len(motif)]
        bottom = 0
        for symbol in symbols:
            count = row.get(symbol, 0)
            freq = int(round(100 * count / coverage))
            color = 'black' if symbol == expected else colors[symbol]
            for y in range(freq):
                canvas.point((x, bottom + y), fill=color)
            bottom += freq
    image.save(output_image)

In [None]:
# Write the frequency plot next to the other outputs; output_path is used as a
# plain string prefix, so it must end with a path separator.
draw(result_df, colors, f'{output_path}test.png')

In [None]:
# Per-read oriented inserts, shortest first, ties broken alphabetically.
df.sort_values(['len_ins_ext', 'ins_oriented'])['ins_oriented']