# Ultimate dataset analysis

## Imports

In [1]:
from collections import Counter
from os.path import isfile, join

import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import pandas as pd
import seaborn as sns

import common

## Settings

In [2]:
# Limit how much of a DataFrame is rendered in cell outputs: at most
# 14 columns and 10 rows are shown, the rest is elided with "...".
pd.set_option('display.max_columns', 14)
pd.set_option('display.max_rows', 10)

## Constants

In [3]:
# Pixel colour for every symbol of a waterfall plot string. plot_waterfall()
# looks each character of the plotted sequence up in this table; the symbols
# are the ones written by prepare_row_for_waterfall().
COLORS = {
    '6': '#7777FF',  # light blue - 'CAGCAG' (two adjacent CAG units)
    '5': '#5555FF',  # blue       - a remaining single 'CAG'
    '4': '#FF5555',  # red        - a remaining single 'CTG'
    '3': '#FF7777',  # light red  - 'CTGCTG' (two adjacent CTG units)
    '0': '#FFFFFF',  # white      - masked prefix/suffix flank
    '_': '#333333',  # dark grey  - any other base
}

## Datasets

In [4]:
def _dataset_names(prefixes, numbers):
    """Return '<prefix>_<NN>' for every prefix (outer loop) and number (inner loop)."""
    return [f'{prefix}_{number:02d}' for prefix in prefixes for number in numbers]


def _entries(names, caller, **extra):
    """Build one dataset record per name for the given basecaller."""
    return [{'dataset': name, 'caller': caller, **extra} for name in names]


# Dataset name groups, in the order in which they are listed below.
_BC3 = _dataset_names(['bc3'], range(1, 4))
_BC6 = _dataset_names(['bc6'], range(5, 11))
_BC6_RUNS = _dataset_names(['bc6_2', 'bc6_3', 'bc6_4'], range(5, 11))
_BC7 = _dataset_names(['bc7_1', 'bc7_2'], range(18, 25))
_SCA8 = _dataset_names(['sca8_1', 'sca8_2'], [11, 12, 15, 16, 19, 20])

# One record per (dataset, caller) pair; records carrying 'method': 'kmers'
# are read from the k-mers folders, all others have no 'method' key.
datasets = [
    # Guppy
    *_entries(_BC3 + _BC6 + _BC6_RUNS + _BC7 + _SCA8, 'guppy'),

    # Guppy - kmers
    *_entries(_BC6_RUNS, 'guppy', method='kmers'),

    # Dorado
    *_entries(_BC3 + _BC6, 'dorado'),

    # Dorado HAC
    *_entries(_BC7 + _SCA8, 'dorado_hac'),

    # Dorado SUP
    *_entries(_BC7 + _SCA8, 'dorado_sup'),
]

# datasets = [
#     {'dataset': 'bc6_2_05', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_2_06', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_2_07', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_2_08', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_2_09', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_2_10', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_3_05', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_3_06', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_3_07', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_3_08', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_3_09', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_3_10', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_4_05', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_4_06', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_4_07', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_4_08', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_4_09', 'caller': 'guppy', 'method': 'kmers'},
#     {'dataset': 'bc6_4_10', 'caller': 'guppy', 'method': 'kmers'},
# ]

## Folders

In [5]:
# Input/output folders. Every path ends with '/' because file names are
# appended to it with plain f-string concatenation.

# Not referenced by the code visible in this notebook.
fastq_path = '/opt/data/'
# load() reads '<dataset>_<caller>.fastq.tsv' from here.
fastq_tsv_path = '/opt/data_out/fastq_tsv/'
# load() reads '<dataset>.<caller>.ontarget.tsv' from here (non-kmers datasets).
ontarget_path = '/opt/data_out/ontarget/'
# load() reads '<dataset>.<caller>.kmers.tsv' from here (kmers datasets).
kmers_path = '/opt/data_out/kmers/'
# load() reads '<dataset>.<caller>.ontarget.processed.tsv' from here.
processed_path = '/opt/data_out/processed/'
# load() reads '<dataset>.<caller>.kmers.processed.tsv' from here.
kmers_processed_path = '/opt/data_out/kmers_processed/'
# Only used by the commented-out plotting steps in load().
images_path = '/opt/data_out/images/'
kmers_images_path = '/opt/data_out/kmers_images/'
# Only used by the commented-out merge step in load().
merged_path = '/opt/data_out/merged/'
kmers_merged_path = '/opt/data_out/kmers_merged/'

## Waterfall functions

In [6]:
def plot_waterfall_processed(df, col_len, col_seq, stretch, grid, output_path):
    """Draw forward and reverse sequences as a waterfall image and save it.

    Forward rows (``direction == 'fwd'``) are sorted by ascending length and
    drawn first, followed by one separator row of ``'I'`` symbols and the
    reverse rows (``direction == 'rev'``) sorted by descending length.
    A symbol that equals the expected base of the repeating ``CAG`` pattern at
    its position is drawn black; every other symbol (including the separator)
    is coloured through ``common.COLORS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``direction``, ``col_len`` and ``col_seq``.
    col_len : str
        Column holding the sequence lengths.
    col_seq : str
        Column holding the sequences to draw.
    stretch : int
        Horizontal pixels per symbol.
    grid : int
        Line width of the vertical grid lines, in pixels.
    output_path : str
        File the image is saved to.
    """
    # Number of symbol columns on the canvas, capped at 1500.
    n_cols = int(min(1500, df[col_len].max()))

    is_fwd = df['direction'] == 'fwd'
    fwd = df.loc[is_fwd, [col_seq, col_len]].sort_values([col_len, col_seq], ascending=[True, True])

    is_rev = df['direction'] == 'rev'
    rev = df.loc[is_rev, [col_seq, col_len]].sort_values([col_len, col_seq], ascending=[False, False])

    rows = list(fwd[col_seq]) + [n_cols * 'I'] + list(rev[col_seq])

    width = n_cols * stretch + stretch
    height = len(rows)
    image = common.Image.new('RGB', (width, height), 'grey')
    draw = common.ImageDraw.Draw(image)
    for y, seq in enumerate(rows):
        for j, symbol in enumerate(seq):
            x = stretch * (j + 1)
            if symbol == 'CAG'[j % 3]:
                color = 'black'
            else:
                color = common.COLORS[symbol]
            draw.line([(x, y), (x, y + 1)], width=stretch, fill=color)

    # Vertical grid. Only columns whose line can touch the canvas are visited:
    # a line at x = stretch*i + stretch//2 with the given width lies completely
    # right of the image once stretch*i exceeds width + grid.
    grid_cols = min(width, (width + grid) // max(stretch, 1) + 1)
    for i in range(grid_cols):
        x = stretch * i + stretch // 2
        # Later lines overdraw earlier ones: black over white over light grey.
        if i % 3 == 0:
            draw.line([(x, 0), (x, height)], width=grid, fill='#AAAAAA')

        if i % 30 == 0:
            draw.line([(x, 0), (x, height)], width=grid, fill='white')

        if i % 300 == 0:
            draw.line([(x, 0), (x, height)], width=grid, fill='black')

    image.save(output_path)


def plot_waterfalls_processed(df, output_path):
    """Save up to two waterfall images of the ``ins_ext_aln`` sequences.

    Rows with ``len_ins_ext_aln`` <= 150 go to ``<output_path>.wtrf.50.png``
    (stretch 15, grid 2) and longer rows to ``<output_path>.wtrf.51.png``
    (stretch 7, grid 1). Each non-empty group is resampled to 1000 rows with
    replacement using a fixed seed; an empty group produces no image.
    """
    col_seq = 'ins_ext_aln'
    col_len = 'len_' + col_seq

    # (row mask, file tag, stretch, grid) - short group first, then long.
    variants = (
        (df[col_len] <= 150, '50', 15, 2),
        (df[col_len] > 150, '51', 7, 1),
    )
    for mask, tag, stretch, grid in variants:
        group = df[mask]
        if len(group) == 0:
            continue
        sampled = group.sample(n=1000, replace=True, random_state=42)
        target = f'{output_path}.wtrf.{tag}.png'
        plot_waterfall_processed(sampled, col_len, col_seq, stretch, grid, target)

## Functions

In [7]:
def prepare_row_for_waterfall(row):
    """Encode a read as the symbol string drawn by ``plot_waterfall``.

    The encoding is applied in this order:

    1. If ``prefix_flank`` is present, it and (when present) ``suffix_flank``
       are replaced by ten ``'0'`` characters each.
    2. ``CAGCAG`` -> ``666666`` and ``CTGCTG`` -> ``333333``.
    3. Remaining ``CAG`` -> ``555`` and ``CTG`` -> ``444``.
    4. Every other character becomes ``'_'``. This covers A/C/G/T as well as
       any other symbol (for example ``N``), so the result only ever contains
       characters that have an entry in ``COLORS``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Needs ``seq``, ``prefix_flank`` and ``suffix_flank``; the flanks may
        be NaN for reads without an on-target match.

    Returns
    -------
    str
        The encoded sequence.
    """
    flank_mask = '0000000000'
    plt_seq = row['seq']

    prefix_flank = row['prefix_flank']
    if not pd.isna(prefix_flank):
        plt_seq = plt_seq.replace(prefix_flank, flank_mask)
        # Guard the suffix separately: str.replace() raises on a NaN argument.
        suffix_flank = row['suffix_flank']
        if not pd.isna(suffix_flank):
            plt_seq = plt_seq.replace(suffix_flank, flank_mask)

    # Doubled units must be replaced before single units.
    replacements = (
        ('CAGCAG', '666666'),
        ('CTGCTG', '333333'),
        ('CAG', '555'),
        ('CTG', '444'),
    )
    for motif, symbols in replacements:
        plt_seq = plt_seq.replace(motif, symbols)

    # Anything that is not one of the plot symbols is shown as "other base".
    return ''.join(char if char in '03456' else '_' for char in plt_seq)


def prepare_df_for_waterfall(df, sample_size=4000, max_len_seq=2000, random_state=None):
    """Sample reads and order them for ``plot_waterfall``.

    Reads no longer than ``max_len_seq`` are sampled with replacement, encoded
    with ``prepare_row_for_waterfall`` and split by the sign of
    (number of ``'6'`` symbols - number of ``'3'`` symbols): rows with a
    positive difference come first, sorted by ascending length; all other
    rows follow, sorted by descending length.

    Note: a ``len_seq`` column is added to the caller's ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``seq``, ``prefix_flank`` and ``suffix_flank``.
    sample_size : int
        Number of rows to draw (with replacement).
    max_len_seq : int
        Longest read that may be sampled.
    random_state : int or None
        Seed forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous unseeded behaviour; pass an integer for reproducible plots.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with ``plt_seq``, ``fwd``, ``rev`` and ``dir`` added.

    Raises
    ------
    ValueError
        If rows are requested but no read is short enough to be sampled.
    """
    df['len_seq'] = df['seq'].str.len()
    candidates = df[df['len_seq'] <= max_len_seq]
    if len(candidates) == 0 and sample_size > 0:
        # DataFrame.sample() would also raise here, but with an opaque message.
        raise ValueError(f'no reads with len_seq <= {max_len_seq} to sample from')

    plt_df = candidates.sample(sample_size, replace=True, random_state=random_state)
    plt_df['plt_seq'] = plt_df.apply(prepare_row_for_waterfall, axis=1)
    plt_df['fwd'] = plt_df['plt_seq'].str.count('6')
    plt_df['rev'] = plt_df['plt_seq'].str.count('3')
    plt_df['dir'] = plt_df['fwd'] - plt_df['rev']

    is_fwd = plt_df['dir'] > 0
    plt_df_fwd = plt_df[is_fwd].sort_values(['len_seq', 'seq'])

    is_rev = plt_df['dir'] <= 0
    plt_df_rev = plt_df[is_rev].sort_values(['len_seq', 'seq'], ascending=[False, False])

    return pd.concat([plt_df_fwd, plt_df_rev])


def plot_waterfall(df, col_len, col_seq, output_path):
    """Draw one pixel row per sequence, horizontally centred, and save the image.

    The canvas is as wide as the largest value in ``col_len`` and as high as
    the number of rows. Each character of a sequence is one pixel coloured
    through ``COLORS``; characters without an entry there are drawn in the
    colour of ``'_'`` instead of raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to draw, in drawing order (top to bottom).
    col_len : str
        Column holding the sequence lengths.
    col_seq : str
        Column holding the encoded sequences.
    output_path : str
        File the image is saved to.
    """
    # int() keeps the image size valid when the length column is float typed.
    width = int(df[col_len].max())
    height = len(df)

    image = common.Image.new('RGB', (width, height), 'grey')
    draw = common.ImageDraw.Draw(image)
    fallback = COLORS['_']
    for y, seq in enumerate(df[col_seq]):
        # Centre the sequence on the canvas.
        left = (width - len(seq)) // 2
        for j, symbol in enumerate(seq):
            draw.point([left + j, y], fill=COLORS.get(symbol, fallback))

    image.save(output_path)


def plot_histogram(df, x, hue, base, output_histogram):
    """Save a stacked, discrete histogram of ``df[x]`` coloured by ``hue``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    x : str
        Column shown on the x axis.
    hue : str
        Column used to split and colour the stacked bars.
    base : float
        Spacing of the major x ticks.
    output_histogram : str
        File the figure is saved to.

    The figure is closed after saving, so calling this for many datasets does
    not accumulate open figures; it is therefore not displayed inline.
    """
    fig, ax = plt.subplots(figsize=(16, 10))
    try:
        sns.histplot(df, x=x, discrete=True, hue=hue, multiple='stack', ax=ax)
        ax.xaxis.set_major_locator(plticker.MultipleLocator(base=base))
        # Rotate through tick_params: set_xticklabels() would freeze the label
        # texts and detach them from the locator set above.
        ax.tick_params(axis='x', labelrotation=90)
        fig.savefig(output_histogram)
    finally:
        plt.close(fig)


def plot_histograms(df, x, output_path):
    """Save separate histograms for values of ``x`` up to 50 and above 50.

    Values <= 50 go to ``<output_path>.hist.50.png`` and values > 50 to
    ``<output_path>.hist.51.png``. Each group is sorted by ``direction`` in
    descending order before plotting; an empty group produces no file.
    """
    groups = (
        (df[x] <= 50, '.hist.50.png'),
        (df[x] > 50, '.hist.51.png'),
    )
    for mask, suffix in groups:
        subset = df[mask]
        if len(subset) == 0:
            continue
        ordered = subset.sort_values('direction', ascending=False)
        plot_histogram(ordered, x, 'direction', 5, f'{output_path}{suffix}')


def load_fastq_tsv(path):
    """Read a header-less, tab-separated file and name its two columns 'seq' and 'id'."""
    column_names = ['seq', 'id']
    table = pd.read_csv(path, header=None, sep='\t')
    table.columns = column_names
    return table


def load_ontarget(path):
    """Load an on-target TSV via ``common.load_tsv``.

    ``common.COLUMNS_PREPARED`` is passed as the second argument
    (presumably the column names - confirm in ``common``).
    """
    return common.load_tsv(path, common.COLUMNS_PREPARED)


def load_kmers(path):
    """Load a k-mers TSV via ``common.load_tsv``.

    The list below is passed as the second argument (presumably the column
    names of the file - confirm in ``common``).
    """
    kmer_columns = [
        'direction',
        'id',
        'prefix_flank',
        'ins',
        'suffix_flank',
        'start_cnt',
        'start_stdev',
        'end_cnt',
        'end_stdev',
    ]
    return common.load_tsv(path, kmer_columns)


def load_processed(path):
    """Load a processed on-target TSV via ``common.load_tsv`` (only ``path`` is passed)."""
    return common.load_tsv(path)


def load_kmers_processed(path):
    """Load a processed k-mers TSV via ``common.load_tsv`` (only ``path`` is passed).

    Currently identical to ``load_processed``; kept as a separate entry point.
    """
    return common.load_tsv(path)


def load(row):
    """Compute read counts and length statistics for one dataset record.

    Three tab-separated files are read, their names built from the record's
    ``dataset`` and ``caller``: the FASTQ table, the on-target table and the
    processed table. For records with ``method == 'kmers'`` the on-target and
    processed tables come from the k-mers folders instead.

    Parameters
    ----------
    row : pandas.Series or dict
        Needs ``dataset`` and ``caller``. ``method`` is optional: a missing
        key or a NaN value selects the default (non-kmers) files.

    Returns
    -------
    pandas.Series
        ``reads`` (rows in the FASTQ table), ``ontarget`` with its
        ``ontarget_fwd`` / ``ontarget_rev`` split, and for the processed rows
        with ``len`` <= 50 (``50``) and > 50 (``51``) the forward/reverse
        counts plus the 10th, 50th and 90th percentile of ``len``.
        ``len`` is ``len_ins_ext_aln / 3`` (presumably repeat units of three
        bases - confirm).
    """
    dataset = row['dataset']
    caller = row['caller']
    # .get() keeps this working when no dataset declares a method at all, in
    # which case the frame built from `datasets` has no 'method' column.
    kmers = row.get('method', 'regex') == 'kmers'

    path = f'{fastq_tsv_path}{dataset}_{caller}.fastq.tsv'
    df_fastq = load_fastq_tsv(path)
    reads = len(df_fastq)

    if kmers:
        path = f'{kmers_path}{dataset}.{caller}.kmers.tsv'
        df_ontarget = load_kmers(path)
    else:
        path = f'{ontarget_path}{dataset}.{caller}.ontarget.tsv'
        df_ontarget = load_ontarget(path)
    ontarget = len(df_ontarget)
    ontarget_fwd = (df_ontarget['direction'] == 'fwd').sum()
    # Everything that is not 'fwd' is counted as reverse.
    ontarget_rev = ontarget - ontarget_fwd

    # Merge FASTQ and on-target dataframes
    # df = pd.merge(df_fastq, df_ontarget, how="outer", on=["id", "id"])
    # if kmers:
    #     path = f'{kmers_merged_path}{dataset}.{caller}.merged.tsv'
    # else:
    #     path = f'{merged_path}{dataset}.{caller}.merged.tsv'
    # df.to_csv(path, sep='\t', index=False)

    # Waterfall
    # plt_df = prepare_df_for_waterfall(df)
    # if kmers:
    #     output_path = f'{kmers_images_path}{dataset}.{caller}.png'
    # else:
    #     output_path = f'{images_path}{dataset}.{caller}.png'
    # plot_waterfall(plt_df, 'len_seq', 'plt_seq', output_path)

    if kmers:
        path = f'{kmers_processed_path}{dataset}.{caller}.kmers.processed.tsv'
        df_processed = load_kmers_processed(path)
    else:
        path = f'{processed_path}{dataset}.{caller}.ontarget.processed.tsv'
        df_processed = load_processed(path)
    is_fwd = df_processed['direction'] == 'fwd'
    is_rev = df_processed['direction'] == 'rev'
    df_processed['len'] = df_processed['len_ins_ext_aln'] / 3

    is_short = df_processed['len'] <= 50
    processed_50_fwd = (is_short & is_fwd).sum()
    processed_50_rev = (is_short & is_rev).sum()
    # quantile() yields NaN for an empty group, so the lookups below stay valid.
    percentiles_50 = df_processed.loc[is_short, 'len'].quantile([0.1, 0.5, 0.9])

    is_long = df_processed['len'] > 50
    processed_51_fwd = (is_long & is_fwd).sum()
    processed_51_rev = (is_long & is_rev).sum()
    percentiles_51 = df_processed.loc[is_long, 'len'].quantile([0.1, 0.5, 0.9])

    # Histograms
    # if kmers:
    #     output_path = f'{kmers_images_path}{dataset}.{caller}'
    # else:
    #     output_path = f'{images_path}{dataset}.{caller}'
    # plot_histograms(df_processed, 'len', output_path)

    # Waterfalls
    # if kmers:
    #     output_path = f'{kmers_images_path}{dataset}.{caller}'
    # else:
    #     output_path = f'{images_path}{dataset}.{caller}'
    # plot_waterfalls_processed(df_processed, output_path)

    return pd.Series({
        # 'df_fastq': df_fastq,
        # 'df_ontarget': df_ontarget,
        'reads': reads,
        'ontarget': ontarget,
        'ontarget_fwd': ontarget_fwd,
        'ontarget_rev': ontarget_rev,
        'processed_50_fwd': processed_50_fwd,
        'processed_50_rev': processed_50_rev,
        '50_10th': percentiles_50[0.1],
        '50_50th': percentiles_50[0.5],
        '50_90th': percentiles_50[0.9],
        'processed_51_fwd': processed_51_fwd,
        'processed_51_rev': processed_51_rev,
        '51_10th': percentiles_51[0.1],
        '51_50th': percentiles_51[0.5],
        '51_90th': percentiles_51[0.9],
    })

## Main

In [8]:
# Print the path of every input file that load() will need but that does not
# exist; no output means all files are present.
for entry in datasets:
    dataset = entry['dataset']
    caller = entry['caller']
    method = entry.get('method', 'regex')

    expected = [f'{fastq_tsv_path}{dataset}_{caller}.fastq.tsv']
    if method == 'kmers':
        expected += [
            f'{kmers_path}{dataset}.{caller}.kmers.tsv',
            f'{kmers_processed_path}{dataset}.{caller}.kmers.processed.tsv',
        ]
    else:
        expected += [
            f'{ontarget_path}{dataset}.{caller}.ontarget.tsv',
            f'{processed_path}{dataset}.{caller}.ontarget.processed.tsv',
        ]

    for path in expected:
        if not isfile(path):
            print(path)

In [9]:
# One row per dataset record; `method` is NaN for records without that key.
df = pd.DataFrame(datasets)
df

Unnamed: 0,dataset,caller,method
0,bc3_01,guppy,
1,bc3_02,guppy,
2,bc3_03,guppy,
3,bc6_05,guppy,
4,bc6_06,guppy,
...,...,...,...
127,sca8_2_12,dorado_sup,
128,sca8_2_15,dorado_sup,
129,sca8_2_16,dorado_sup,
130,sca8_2_19,dorado_sup,


In [10]:
# Compute the statistics of every dataset with load() and append them as
# columns. The base frame is rebuilt from `datasets` so that re-running this
# cell does not append a second copy of the statistics columns.
df = pd.DataFrame(datasets)
df = pd.concat([df, df.apply(load, axis=1)], axis=1)

In [11]:
def reformat_dataset(row):
    """Split a dataset name into a sample name and a run number.

    A two-part name such as ``bc3_01`` is its own sample with run 1.
    Otherwise the second part is the run number and the sample is the first
    and third part joined by ``_`` (``bc6_2_05`` -> sample ``bc6_05``, run 2).
    """
    name = row['dataset']
    tokens = name.split('_')
    if len(tokens) == 2:
        sample, run = name, 1
    else:
        sample = '_'.join([tokens[0], tokens[2]])
        run = int(tokens[1])
    return pd.Series({'sample': sample, 'run': run})


# Append the `sample` and `run` columns derived from each dataset name.
# Existing copies are dropped first so that re-running this cell does not
# produce duplicate columns.
df = df.drop(columns=['sample', 'run'], errors='ignore')
df = pd.concat([df, df.apply(reformat_dataset, axis=1)], axis=1)

In [12]:
# Order the overview by sample, run, caller and method, then display it.
df = df.sort_values(['sample', 'run', 'caller', 'method'])
df

Unnamed: 0,dataset,caller,method,reads,ontarget,ontarget_fwd,ontarget_rev,...,processed_51_fwd,processed_51_rev,51_10th,51_50th,51_90th,sample,run
71,bc3_01,dorado,,66000.0,48348.0,25425.0,22923.0,...,361.0,208.0,67.0,101.0,156.0,bc3_01,1
0,bc3_01,guppy,,95490.0,69287.0,36729.0,32558.0,...,525.0,261.0,67.0,105.0,166.5,bc3_01,1
72,bc3_02,dorado,,3000.0,2227.0,1238.0,989.0,...,469.0,353.0,88.0,99.0,116.0,bc3_02,1
1,bc3_02,guppy,,212845.0,150981.0,86653.0,64328.0,...,34175.0,20486.0,89.0,100.0,120.0,bc3_02,1
73,bc3_03,dorado,,15049.0,10029.0,6012.0,4017.0,...,1812.0,434.0,116.0,141.0,203.0,bc3_03,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,sca8_1_20,dorado_sup,,37452.0,31544.0,24845.0,6699.0,...,6428.0,166.0,75.0,123.0,127.0,sca8_20,1
46,sca8_1_20,guppy,,38712.0,28785.0,21579.0,7206.0,...,3628.0,202.0,63.0,122.0,127.0,sca8_20,1
105,sca8_2_20,dorado_hac,,28454.0,16879.0,14661.0,2218.0,...,6525.0,150.0,92.0,124.0,127.0,sca8_20,2
131,sca8_2_20,dorado_sup,,28454.0,20115.0,17988.0,2127.0,...,9396.0,159.0,101.0,124.0,127.0,sca8_20,2


In [13]:
# Write the overview as a tab-separated file, without the index column.
output_path = '/opt/data_out/datasets_overview.tsv'
df.to_csv(output_path, sep='\t', index=False)

## Tryouts

In [14]:
# `stop` is not defined anywhere, so this cell raises NameError and halts
# "Run All" before the exploratory cells below (presumably on purpose).
stop

NameError: name 'stop' is not defined

In [None]:
# Load the merged FASTQ / on-target table of one dataset for exploration.
# NOTE(review): this rebinds `df`, which held the datasets overview above, and
# the file is only written by the commented-out merge step in load().
df = pd.read_csv('/opt/data_out/merged/bc3_01.guppy.merged.tsv', sep='\t')

In [None]:
# Keep only the rows that have a direction value.
cond = df['direction'].notnull()
df = df[cond]

In [None]:
def chop(row):
    """Return the parts of a read before the prefix flank and after the suffix flank.

    Parameters
    ----------
    row : mapping or pandas.Series
        Needs ``seq``, ``prefix_flank`` and ``suffix_flank``.

    Returns
    -------
    pandas.Series
        ``prefix``: everything before the first occurrence of the prefix
        flank. ``suffix``: everything after the first occurrence of the suffix
        flank. A part is ``None`` when its flank is missing (not a string) or
        does not occur in ``seq``; previously the ``-1`` returned by
        ``str.find`` was used as a slice bound and produced a wrong substring.
    """
    seq = row['seq']
    prefix_flank = row['prefix_flank']
    suffix_flank = row['suffix_flank']

    prefix = None
    if isinstance(prefix_flank, str):
        start = seq.find(prefix_flank)
        if start != -1:
            prefix = seq[:start]

    suffix = None
    if isinstance(suffix_flank, str):
        start = seq.find(suffix_flank)
        if start != -1:
            # Skip the flank itself, whatever its length.
            suffix = seq[start + len(suffix_flank):]

    return pd.Series({
        'prefix': prefix,
        'suffix': suffix,
    })

In [None]:
# Append the `prefix` and `suffix` columns computed by chop() for every row.
df = pd.concat([df, df.apply(chop, axis=1)], axis=1)

In [None]:
# Inspect the frame with the new `prefix` / `suffix` columns.
df

In [None]:
# Length of the part of each read before the prefix flank.
df['len_prefix'] = df['prefix'].str.len()

In [None]:
# Length of the part of each read after the suffix flank.
df['len_suffix'] = df['suffix'].str.len()

In [None]:
# Quartiles of the prefix length over all rows.
df['len_prefix'].quantile([0.25, 0.5, 0.75])

In [None]:
# Quartiles of the suffix length over all rows.
df['len_suffix'].quantile([0.25, 0.5, 0.75])

In [None]:
# Row mask for reverse reads; used by the following cells.
cond = df['direction'] == 'rev'

In [None]:
# Quartiles of the prefix length for the rows selected by `cond`.
df[cond]['len_prefix'].quantile([0.25, 0.5, 0.75])
# len(df[cond])

In [None]:
def count_kmers(seq, len_kmer=11):
    """Return the k-mers that occur exactly once in ``seq``.

    Parameters
    ----------
    seq : str
        Sequence to scan with a sliding window of ``len_kmer`` characters.
    len_kmer : int
        Window length; defaults to 11, the value that used to be hard-coded.

    Returns
    -------
    dict
        Maps every k-mer seen exactly once to ``1``, in order of first
        occurrence. Empty when ``seq`` is shorter than ``len_kmer``.

    Raises
    ------
    ValueError
        If ``len_kmer`` is smaller than 1.
    """
    if len_kmer < 1:
        raise ValueError('len_kmer must be a positive integer')

    counts = Counter(seq[i:i + len_kmer] for i in range(len(seq) - len_kmer + 1))
    return {kmer: count for kmer, count in counts.items() if count == 1}

In [None]:
# Unique k-mers of a fixed 100-character window of each selected read: here
# the window suffix[-360:-260]; the commented line uses prefix[280:380].
# kmers = df[cond].apply(lambda x: count_kmers(x['prefix'][280:380]), axis=1)
kmers = df[cond].apply(lambda x: count_kmers(x['suffix'][-360:-260]), axis=1)

In [None]:
# Add up the per-read dictionaries: because every per-read value is 1, the
# total of a k-mer is the number of reads in which it is unique.
# (Counter is imported in the import cell at the top of the notebook.)
merged_dict = Counter()

for read_kmers in kmers:
    merged_dict.update(read_kmers)

In [None]:
# Number of distinct k-mers collected over all reads.
len(merged_dict)

In [None]:
# The five k-mers with the highest totals, as (k-mer, total) pairs.
top_5 = merged_dict.most_common(5)

In [None]:
# Display the selected k-mers.
top_5

In [None]:
# 11-base marker sequences for forward and reverse reads, used below to search
# the raw reads (presumably picked from the `top_5` results above - confirm).
fwd_prefix = 'CACGGACGGCC'
fwd_suffix = 'TCGCCGGCCGC'
rev_prefix = 'TCCTGTGATCC'
rev_suffix = 'GCTGAGGCCCT'

In [None]:
# sns.scatterplot(df, x='len_prefix', y='len_suffix', hue='direction')

In [None]:
# df['len_prefix'].hist(bins=100)

In [None]:
# df['len_suffix'].hist(bins=100)

In [None]:
# Scratch check of negative slice bounds: evaluates to 'CD'.
'ABCDE'[-3:-1]

In [None]:
# Reload the full merged table, including the rows without a direction.
dfm = pd.read_csv('/opt/data_out/merged/bc3_01.guppy.merged.tsv', sep='\t')

In [None]:
# Count the reads that contain both forward markers but have no direction.
# NOTE(review): for a missing `seq`, str.find() yields NaN and `NaN != -1` is
# True, so such rows would pass both marker masks - confirm `seq` has no NaN.
cond_fwd_prefix = dfm['seq'].str.find(fwd_prefix) != -1
cond_fwd_suffix = dfm['seq'].str.find(fwd_suffix) != -1
cond_dir = dfm['direction'].isnull()
sum(cond_fwd_prefix & cond_fwd_suffix & cond_dir)
# df

In [None]:
# Number of forward reads in `df`, for comparison with the count above.
# Note that this rebinds `cond`.
cond = df['direction'] == 'fwd'
sum(cond)

In [None]:
# Read-length histogram of the reads that contain both forward markers but
# have no direction.
dfm['len_seq'] = dfm['seq'].str.len()
dfm[cond_fwd_prefix & cond_fwd_suffix & cond_dir]['len_seq'].hist(bins=100)

In [None]:
# Position of the reverse prefix marker in every read (-1 when absent).
# dfm['seq'].str.find(fwd_prefix).hist(bins=100)
dfm['seq'].str.find(rev_prefix)