# STRAT - Short Tandem Repeat Analysis Tool

## 1. Plot various graphs on processed reads

In [1]:
from csv import QUOTE_NONE
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw

## 2. Arguments

In [2]:
# Repeat unit of the short tandem repeat under analysis.
motif = 'CAG'

# Sample / basecaller run to analyse. Change only this value; both paths
# below are derived from it so the input and the output can never drift apart.
# Runs used so far:
#   pcr2persons.guppy   pcr2persons.dorado
#   jovan.guppy         jovan.dorado
#   dm108.guppy
#   bc3_1.guppy         bc3_1.dorado
#   bc3_2.guppy         bc3_2.dorado
#   bc3_3.guppy         bc3_3.dorado
dataset = 'pcr2persons.dorado'

workdir = '/opt/data/workdir'

# Tab-separated table of processed on-target reads (read by the load cell).
input_path = f'{workdir}/{dataset}.ontarget.processed.tsv'

# Prefix for generated images; plot() appends '<column>.png' to it.
output_path = f'{workdir}/images/{dataset}.ontarget.processed.'

## 3. Constants

In [3]:
# Fill colour per symbol, looked up by plot() for every bar segment that is
# not the expected motif base (that one is always drawn in black).
# NOTE(review): the ' ' entry presumably covers padding/gap characters in
# aligned sequences - confirm against the upstream processing step.
COLORS = dict(
    [
        ('A', '#3DA853'),  # green
        ('C', '#4285F4'),  # blue
        ('G', '#F8BC07'),  # yellow
        ('T', '#EA4334'),  # red
        (' ', 'white'),
    ]
)

## 4. Functions

In [4]:
def prepare_for_plotting(df, col_seq, col_len, col_cnt, col_cov):
    """Summarise, position by position, which characters the sequences contain.

    Args:
        df: DataFrame holding at least the columns ``col_seq`` (strings) and
            ``col_len`` (numeric lengths).
        col_seq: Name of the column with the sequences.
        col_len: Name of the column with the sequence lengths.
        col_cnt: Name to give the output column counting the sequences whose
            length is exactly ``position + 1``.
        col_cov: Name to give the output column counting the sequences whose
            length is at least ``position + 1``.

    Returns:
        An integer DataFrame with one row per 0-based position, from 0 up to
        the largest value in ``col_len`` minus one. Besides ``col_cnt`` and
        ``col_cov`` it has one column per character observed at any position,
        holding how many covering sequences show that character there
        (0 where the character does not occur). When there is no position to
        report (empty input, all-NaN or zero lengths) the result has zero
        rows and only the ``col_cnt`` and ``col_cov`` columns.
    """
    lengths = df[col_len]
    longest = lengths.max()
    # max() of an empty or all-NaN column is NaN, and a column that once held
    # NaN is float-typed; range() accepts neither, so normalise to a plain int.
    n_positions = 0 if pd.isna(longest) else int(longest)

    results = []
    for i in range(n_positions):
        covering = lengths >= i + 1
        # Characters found at position i among the sequences that reach it.
        row = dict(df.loc[covering, col_seq].str[i].value_counts())
        row[col_cnt] = int((lengths == i + 1).sum())
        row[col_cov] = int(covering.sum())
        results.append(row)

    if not results:
        # Keep the documented columns so callers can still index them.
        return pd.DataFrame(columns=[col_cnt, col_cov]).astype(int)

    return pd.DataFrame(results).fillna(0).astype(int)


def _draw_stacked_bar(draw, x, row, col_cov, symbols, colors,
                      half, reach_max, baseline, sign):
    """Draw the stacked bar of one read position, growing away from ``baseline``.

    ``row`` is one row of a ``prepare_for_plotting`` result. ``symbols`` gives
    the segments in drawing order; the last entry is painted black, every
    other one in its ``colors`` entry. ``sign`` is -1 to grow upwards and +1
    to grow downwards. The whole bar is ``half * coverage / reach_max`` pixels
    long and is split between the symbols in proportion to their counts.
    """
    cov = row[col_cov]
    reach = half * cov / reach_max
    edge = baseline
    last = len(symbols) - 1
    for j, symbol in enumerate(symbols):
        if symbol not in row:
            continue
        length = int((reach * row[symbol] / cov).round())
        color = 'black' if j == last else colors[symbol]
        # Always pass the upper end point first, whichever way the bar grows.
        top, bottom = sorted((edge, edge + sign * length))
        draw.line([(10 * x, top), (10 * x, bottom)], width=8, fill=color)
        edge += sign * length


def plot(df, col_seq):
    """Render per-position base composition of ``col_seq`` and save it as PNG.

    Forward reads (``direction == 'fwd'``) are drawn as bars growing upwards
    from the middle of the image, reverse reads (``'rev'``) as bars growing
    downwards. Each bar is one read position; its length is the coverage at
    that position relative to the highest coverage of either direction, and
    it is split into segments per observed character. The base expected from
    the module-level ``motif`` at that position is drawn last, in black; the
    other characters use ``COLORS``. A grey grid line marks every repeat unit
    and a white one every tenth repeat unit.

    Only reads whose ``'len_' + col_seq`` value is at most 303 are used. A
    direction without such reads is simply left blank.

    Args:
        df: DataFrame with the columns ``'direction'``, ``col_seq`` and
            ``'len_' + col_seq``.
        col_seq: Name of the sequence column to plot.

    Raises:
        ValueError: If the module-level ``motif`` is empty.

    Side effects:
        Writes the image to ``f'{output_path}{col_seq}.png'``.
    """
    if not motif:
        raise ValueError('motif must not be empty')

    output_image = f'{output_path}{col_seq}.png'
    col_len = 'len_' + col_seq
    col_cnt = 'cnt_' + col_seq
    col_cov = 'cov_' + col_seq

    max_positions = 303
    # NOTE(review): bars are 10 px apart, so positions beyond ~301 fall
    # outside the 3012 px wide canvas and are clipped - confirm this is intended.
    width = 3012
    height = 1002
    half = 500
    unit = len(motif)

    colors = COLORS
    color_set = colors.keys()

    # (prepared table, baseline y, growth direction) for each strand that has
    # data; forward first so the drawing order matches fwd-then-rev.
    stacks = []
    for direction, baseline, sign in (('fwd', 500, -1), ('rev', 502, 1)):
        cond = df['direction'] == direction
        cond &= df[col_len] <= max_positions
        subset = df[cond][[col_seq, col_len]]
        if subset.empty:
            continue
        prepared = prepare_for_plotting(subset, col_seq, col_len, col_cnt, col_cov)
        if prepared.empty:
            continue
        stacks.append((prepared, baseline, sign))

    # Highest coverage over both strands; it maps to the full bar length.
    reach_max = max((prepared[col_cov].max() for prepared, _, _ in stacks), default=0)

    image = Image.new('RGB', (width, height), 'grey')
    draw = ImageDraw.Draw(image)

    for i in range(max_positions):
        x = i + 1
        expected = motif[i % unit]
        # Unexpected characters first (alphabetically), expected base last.
        symbols = sorted(color_set - set(expected)) + [expected]

        for prepared, baseline, sign in stacks:
            if i < len(prepared):
                _draw_stacked_bar(
                    draw, x, prepared.iloc[i], col_cov, symbols, colors,
                    half, reach_max, baseline, sign,
                )

        if i % unit == 0:
            draw.line([(10 * i + 5, 0), (10 * i + 5, height)], width=2, fill='#AAAAAA')

        if i % (10 * unit) == 0:
            draw.line([(10 * i + 5, 0), (10 * i + 5, height)], width=2, fill='white')

    image.save(output_image)

## 5. Tryouts

In [5]:
# Load the processed-reads table (tab-separated) selected in the arguments cell.
df = pd.read_csv(input_path, delimiter='\t')

In [6]:
# Render one image per sequence column, in this order.
for column in ('ins', 'ins_aln', 'ins_ext', 'ins_ext_aln'):
    plot(df, column)