# STRAT - Short Tandem Repeat Analysis Tool

## 1. Plot various graphs on processed reads

In [1]:
from csv import QUOTE_NONE
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw

## 2. Arguments

In [2]:
# Directory part of the working location (kept with its trailing slash).
output_path = '/opt/data/workdir/'

# Tab-separated table of processed on-target reads for sample bc3_1
# (the file name indicates dorado basecalls).
input_path = '/opt/data/workdir/bc3_1.dorado.ontarget.processed.tsv'

# Repeat unit of the short tandem repeat under analysis.
motif = 'CAG'

## 3. Constants

In [3]:
# Fill colour used for each character when drawing the stacked columns.
# The single-space key is painted white.
COLORS = dict([
    ('A', '#3DA853'),  # green
    ('C', '#4285F4'),  # blue
    ('G', '#F8BC07'),  # yellow
    ('T', '#EA4334'),  # red
    (' ', 'white'),
])

## 4. Functions

In [4]:
def prepare_for_plotting(df, col_seq, col_len, col_cnt, col_cov):
    """Tabulate per-position character counts for a set of sequences.

    For every 0-based position ``i`` below the largest value found in
    ``df[col_len]`` one output row is produced. It holds:

    * one column per distinct character seen at position ``i`` of
      ``df[col_seq]`` among the rows with ``df[col_len] >= i + 1``; the
      value is the number of such rows showing that character;
    * ``col_cnt``: the number of rows whose length equals ``i + 1``;
    * ``col_cov``: the number of rows whose length is at least ``i + 1``.

    Args:
        df: Frame holding at least the ``col_seq`` and ``col_len`` columns.
        col_seq: Name of the column with the sequence strings.
        col_len: Name of the column with the sequence lengths.
        col_cnt: Name to give the "length equals position" output column.
        col_cov: Name to give the "length reaches position" output column.

    Returns:
        An integer DataFrame with one row per position. Characters that do
        not occur at a position are reported as 0. When ``df`` has no rows
        (or no length of at least 1) an empty frame carrying just the
        ``col_cnt`` and ``col_cov`` columns is returned.
    """
    lengths = df[col_len]
    max_len = lengths.max()

    # max() of an empty (or all-missing) column is NaN, which range() cannot
    # take; report "no positions" instead of raising.
    if pd.isna(max_len) or max_len < 1:
        return pd.DataFrame(columns=[col_cnt, col_cov], dtype=int)

    sequences = df[col_seq]
    results = []
    # int() lets a float-typed length column (e.g. one holding NaNs) work too.
    for i in range(int(max_len)):
        # Rows long enough to have a character at this position; the same
        # mask drives both the character tally and the coverage figure.
        covered = lengths >= i + 1

        row = dict(sequences[covered].str[i].value_counts())
        row[col_cnt] = int((lengths == i + 1).sum())
        row[col_cov] = int(covered.sum())

        results.append(row)

    # Rows have different character keys; missing ones become 0.
    return pd.DataFrame(results).fillna(0).astype(int)


def plot(df, col_seq, direction, *, motif='CAG',
         output_dir='/opt/data/workdir/images'):
    """Draw a per-position stacked-column image for one sequence column.

    Rows of ``df`` are kept when their ``'direction'`` value equals
    ``direction`` and their ``'len_' + col_seq`` value is at most 201. The
    kept rows are tabulated with ``prepare_for_plotting`` and every position
    becomes one 8-pixel-wide column, 10 pixels apart. Within a column the
    characters 'G', 'C', 'A', 'T' and ' ' are stacked in that order,
    starting at the top edge of the image (y = 0). A column's total height
    is proportional to the position's coverage, with the best-covered
    position spanning 500 pixels. A character equal to the motif character
    expected at that position is drawn black, any other one in its
    ``COLORS`` entry. Characters outside that list are not drawn.

    Thin vertical guide lines are added before every motif repeat (grey)
    and before every tenth repeat (white).

    The image is written to ``{output_dir}/test.{col_seq}.{direction}.png``.

    Args:
        df: Frame with a ``'direction'`` column plus the ``col_seq`` and
            ``'len_' + col_seq`` columns.
        col_seq: Name of the sequence column to plot.
        direction: Value of the ``'direction'`` column to select.
        motif: Repeat unit the sequences are compared against. The default
            reproduces the previously hard-coded 'CAG'; pass the
            module-level ``motif`` to follow the notebook arguments.
        output_dir: Directory that receives the PNG. It is not created here.

    Raises:
        ValueError: If ``motif`` is empty.
    """
    if not motif:
        raise ValueError('motif must not be empty')

    output_image = f'{output_dir}/test.{col_seq}.{direction}.png'
    col_len = 'len_' + col_seq
    col_cnt = 'cnt_' + col_seq
    col_cov = 'cov_' + col_seq

    selected = (df['direction'] == direction) & (df[col_len] <= 201)
    subset = df.loc[selected, [col_seq, col_len]]

    # Without matching reads there is nothing to tabulate.
    df_prep = None
    if not subset.empty:
        df_prep = prepare_for_plotting(subset, col_seq, col_len, col_cnt, col_cov)
    n_positions = 0 if df_prep is None else len(df_prep)

    step = 10           # horizontal distance between neighbouring positions
    plot_height = 500   # height of the best-covered column
    # Lengths up to 201 need about 2020 pixels; a fixed 1500-pixel canvas
    # would silently clip every position past ~149.
    width = max(1500, step * (n_positions + 1))
    height = 1000
    image = Image.new('RGB', (width, height), 'grey')

    if n_positions == 0:
        # Nothing to draw: still emit the (blank) canvas.
        image.save(output_image)
        return

    draw = ImageDraw.Draw(image)
    reach_max = df_prep[col_cov].max()
    unit = len(motif)

    for i in range(n_positions):
        x = i + 1
        row = df_prep.iloc[i]
        # Every tabulated position is reached by at least one read, so the
        # coverage used as a divisor below is never zero.
        cov = row[col_cov]
        reach = plot_height * cov / reach_max
        bottom = 0
        for n in ['G', 'C', 'A', 'T', ' ']:
            if n not in row:
                continue
            freq = int((reach * row[n] / cov).round())
            color = 'black' if n == motif[i % unit] else COLORS[n]
            draw.line([(step*x, bottom), (step*x, bottom+freq)], width=8, fill=color)
            bottom += freq

        # Guide lines sit just left of the column they introduce; the white
        # one is drawn last so it overrides the grey one at the same spot.
        if i % unit == 0:
            draw.line([(step*i+5, 0), (step*i+5, plot_height)], width=2, fill='#AAAAAA')

        if i % (10 * unit) == 0:
            draw.line([(step*i+5, 0), (step*i+5, plot_height)], width=2, fill='white')

    image.save(output_image)

## 5. Tryouts

In [5]:
# Load the processed-reads table (tab-separated) named by `input_path`.
# NOTE(review): pandas' default missing-value parsing applies here, so a
# field that reads literally "NA" would come back as NaN rather than as a
# two-character sequence - confirm that cannot occur in the input.
df = pd.read_csv(filepath_or_buffer=input_path, sep='\t')

In [6]:
# One image per sequence column for the 'fwd' reads.
for column in ('ins', 'ins_aln', 'ins_ext', 'ins_ext_aln'):
    plot(df, column, 'fwd')

In [7]:
# One image per sequence column for the 'rev' reads.
for column in ('ins', 'ins_aln', 'ins_ext', 'ins_ext_aln'):
    plot(df, column, 'rev')