# Ultimate waterfall

## Imports

In [1]:
from os.path import isfile, join

import pandas as pd

import common

## Constants

In [2]:
COLORS = {
    '7': '#7777FF',  # lightest blue CAGCAG
    '6': '#6666FF',  # light blue CAG
    '5': '#5555FF',  # blue CAACAA
    '4': '#55FF55',  # green CAA/CCG
    '3': '#FF5555',  # red CCGCCG
    '2': '#FF6666',  # light red CTG
    '1': '#FF7777',  # lightest red CTGCTG
    '0': '#FFFFFF',  # white
    '_': '#333333',  # dark grey
}

## Datasets

In [3]:
datasets = [

    # Guppy
    {'dataset': 'bc6_2_05', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_2_06', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_2_07', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_2_08', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_2_09', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_2_10', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_3_05', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_3_06', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_3_07', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_3_08', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_3_09', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_3_10', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_4_05', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_4_06', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_4_07', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_4_08', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_4_09', 'caller': 'guppy', 'method': 'regex'},
    {'dataset': 'bc6_4_10', 'caller': 'guppy', 'method': 'regex'},

    # Guppy - kmers
    {'dataset': 'bc6_2_05', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_2_06', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_2_07', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_2_08', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_2_09', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_2_10', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_3_05', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_3_06', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_3_07', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_3_08', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_3_09', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_3_10', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_4_05', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_4_06', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_4_07', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_4_08', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_4_09', 'caller': 'guppy', 'method': 'kmers'},
    {'dataset': 'bc6_4_10', 'caller': 'guppy', 'method': 'kmers'},
]

## Folders

In [4]:
merged_path = '/opt/data_out/merged/'
kmers_merged_path = '/opt/data_out/kmers_merged/'
images_path = '/opt/data_out/waterfalls/'

## Functions

In [5]:
def prepare_row_for_waterfall(row):
    plt_seq = row['seq']

    if not pd.isna(row['prefix_flank']):
        # print(row['prefix_flank'])
        plt_seq = plt_seq.replace(row['prefix_flank'], '0000000000')
        plt_seq = plt_seq.replace(row['suffix_flank'], '0000000000')
    
    plt_seq = plt_seq.replace('CAGCAG', '777777')
    plt_seq = plt_seq.replace('CTGCTG', '111111')
    plt_seq = plt_seq.replace('CAG', '666')
    plt_seq = plt_seq.replace('CTG', '222')
    plt_seq = plt_seq.replace('CAACAA', '555555')
    plt_seq = plt_seq.replace('CCGCCG', '333333')
    plt_seq = plt_seq.replace('CAA', '444')
    plt_seq = plt_seq.replace('CCG', '444')

    plt_seq = plt_seq.replace('A', '_')
    plt_seq = plt_seq.replace('T', '_')
    plt_seq = plt_seq.replace('C', '_')
    plt_seq = plt_seq.replace('G', '_')

    return plt_seq


def prepare_df_for_waterfall(df, sample_size=4000, max_len_seq=3000):
    df['len_seq'] = df['seq'].str.len()
    cond = df['len_seq'] <= max_len_seq
    cond &= df['ins'].str.len() > 150
    cond &= df['direction'].notnull()
    plt_df = df[cond].sample(sample_size, replace=True)
    plt_df['plt_seq'] = plt_df.apply(prepare_row_for_waterfall, axis=1)
    plt_df['fwd'] = plt_df['plt_seq'].str.count('7') + plt_df['plt_seq'].str.count('6')
    plt_df['rev'] = plt_df['plt_seq'].str.count('1') + plt_df['plt_seq'].str.count('2')
    plt_df['dir'] = plt_df['fwd'] - plt_df['rev']
    cond = plt_df['dir'] > 0
    plt_df_fwd = plt_df[cond].sort_values(['len_seq', 'seq'])

    cond = plt_df['dir'] <= 0
    plt_df_rev = plt_df[cond].sort_values(['len_seq', 'seq'], ascending=[False, False])
    plt_df = pd.concat([plt_df_fwd, plt_df_rev])

    return plt_df


def plot_waterfall(df, col_len, col_seq, output_path):
    width = df[col_len].max()
    height = len(df)

    inss = list(df[col_seq])

    image = common.Image.new('RGB', (width, height), 'grey')
    draw = common.ImageDraw.Draw(image)
    bottom = 0
    for i, seq in enumerate(inss):
        y = i
        half = len(seq) / 2
        left = 0
        # left = (width - len(seq)) // 2
        right = width - len(seq)
        for j, n in enumerate(seq):
            color = COLORS[n]
            # draw.point([left+j, i], fill=color)
            if j < half:
                draw.point([left+j, i], fill=color)
            else:
                draw.point([right+j, i], fill=color)

    image.save(output_path)


def load_merged(path):
    return common.load_tsv(path)


def load(row):
    dataset = row['dataset']
    caller = row['caller']
    method = row['method']
    kmers = method == 'kmers'
    
    if kmers:
        path = f'{kmers_merged_path}{dataset}.{caller}.merged.tsv'
    else:
        path = f'{merged_path}{dataset}.{caller}.merged.tsv'
    df = load_merged(path)
    
    # Waterfall
    plt_df = prepare_df_for_waterfall(df)
    output_path = f'{images_path}{dataset}.{caller}.{method}.png'
    plot_waterfall(plt_df, 'len_seq', 'plt_seq', output_path)

In [6]:
df = pd.DataFrame(datasets)
df

Unnamed: 0,dataset,caller,method
0,bc6_2_05,guppy,regex
1,bc6_2_06,guppy,regex
2,bc6_2_07,guppy,regex
3,bc6_2_08,guppy,regex
4,bc6_2_09,guppy,regex
5,bc6_2_10,guppy,regex
6,bc6_3_05,guppy,regex
7,bc6_3_06,guppy,regex
8,bc6_3_07,guppy,regex
9,bc6_3_08,guppy,regex


In [7]:
df = pd.concat([df, df.apply(load, axis=1)], axis=1)

ValueError: a must be greater than 0 unless no samples are taken