# Investigate flanks

## Arguments

In [1]:
# Input/output locations. Every path ends with '/' because the cells below
# build file names by plain string formatting (e.g. f'{workdir}{dataset}...').
fastqs = '/opt/data/'
workdir = '/opt/data_out/L25E5M0_workdir/'
# Image output directory. `plot()` reads this name, so it has to be defined
# for that function to be callable.
outdir = '/opt/data_out/L25E5M0_images_rev/'

# Callers to process (the `{caller}` part of every file name).
# Uncomment an entry to include it.
callers = [
    # 'dorado',
    'guppy',
]

# Datasets to process (the `{dataset}` part of every file name).
# Uncomment an entry to include it.
datasets = [
    # 'bc3_01',
    # 'bc3_02',
    # 'bc3_03',
    # 'bc6_05',
    # 'bc6_06',
    # 'bc6_07',
    # 'bc6_08',
    # 'bc6_09',
    # 'bc6_10',
    # 'bc7_1_18',
    # 'bc7_1_19',
    # 'bc7_1_20',
    # 'bc7_1_21',
    # 'bc7_1_22',
    # 'bc7_1_23',
    # 'bc7_1_24',
    # 'bc7_2_18',
    # 'bc7_2_19',
    # 'bc7_2_20',
    # 'bc7_2_21',
    # 'bc7_2_22',
    # 'bc7_2_23',
    # 'bc7_2_24',
    'sca8_1_11',
    'sca8_1_12',
    'sca8_1_15',
    'sca8_1_16',
    'sca8_1_19',
    'sca8_1_20',
    # 'sca8_2_11',
    # 'sca8_2_12',
    # 'sca8_2_15',
    # 'sca8_2_16',
    # 'sca8_2_19',
    # 'sca8_2_20',
]

## Imports

In [2]:
import gzip
from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import pandas as pd
import seaborn as sns

import common

## Functions

In [3]:
def read_fastq(fastq_path):
    """Parse a plain or gzip-compressed FASTQ file into a list of records.

    Args:
        fastq_path: Path of the FASTQ file. A path ending in ``.gz`` is
            opened with ``gzip``; anything else is opened as plain text.

    Returns:
        A list with one dict per complete 4-line record, with keys ``id``
        (the header up to the first space, leading ``@`` included), ``seq``,
        ``opt`` (the ``+`` separator line) and ``qual``. An empty list is
        returned when ``fastq_path`` is not an existing file. A trailing
        record with fewer than four lines is not included.

    Raises:
        RuntimeError: If a header line does not start with ``@`` or a
            separator line does not start with ``+``. The message names the
            file and the zero-based line index.
    """
    if not isfile(fastq_path):
        return []

    opener = gzip.open if fastq_path.endswith('.gz') else open
    reads = []

    with opener(fastq_path, 'rt') as handle:
        for i, line in enumerate(handle):
            line = line.strip()
            field = i % 4  # position of this line inside its 4-line record
            if field == 0:
                if not line.startswith('@'):
                    raise RuntimeError(f'Error in {fastq_path} line {i} - not an ID line')
                read_id = line.split(' ')[0]
            elif field == 1:
                seq = line
            elif field == 2:
                if not line.startswith('+'):
                    raise RuntimeError(f'Error in {fastq_path} line {i} - not a + line')
                opt = line
            else:
                # The quality line completes the record.
                reads.append({
                    'id': read_id,
                    'seq': seq,
                    'opt': opt,
                    'qual': line,
                })

    return reads


def count_reads(input_path):
    """Count FASTQ files and reads found directly inside a directory.

    A directory entry is considered when its name contains ``fastq`` and it
    is a regular file; each such file is parsed with ``read_fastq``.

    Args:
        input_path: Directory to scan (not recursive).

    Returns:
        ``{'fastqs': <number of files>, 'reads': <total reads>}``, or both
        values set to ``-1`` when the directory does not exist.
    """
    try:
        entries = listdir(input_path)
    except FileNotFoundError:
        # Sentinel values flag a missing directory to the caller.
        return {'fastqs': -1, 'reads': -1}

    fastq_paths = []
    for entry in entries:
        candidate = join(input_path, entry)
        if 'fastq' in entry and isfile(candidate):
            fastq_paths.append(candidate)
    fastq_paths.sort()

    n_reads = sum(len(read_fastq(path)) for path in fastq_paths)

    return {'fastqs': len(fastq_paths), 'reads': n_reads}


def count_prepared(input_path):
    """Count forward and reverse rows in a prepared table.

    The table is loaded with ``common.load_tsv`` using
    ``common.COLUMNS_PREPARED``. Rows whose ``direction`` is neither
    ``'fwd'`` nor ``'rev'`` are not part of the total.

    Args:
        input_path: Path of the TSV file to load.

    Returns:
        Dict with keys ``prepared_fwd``, ``prepared_rev`` and
        ``prepared_tot`` (the sum of the two).
    """
    directions = common.load_tsv(input_path, common.COLUMNS_PREPARED)['direction']
    n_fwd = sum(directions == 'fwd')
    n_rev = sum(directions == 'rev')
    return {
        'prepared_fwd': n_fwd,
        'prepared_rev': n_rev,
        'prepared_tot': n_fwd + n_rev,
    }


def count_processed(input_path):
    """Count forward and reverse rows in a processed table.

    The table is loaded with ``common.load_tsv`` using that helper's default
    columns. Rows whose ``direction`` is neither ``'fwd'`` nor ``'rev'`` are
    not part of the total.

    Args:
        input_path: Path of the TSV file to load.

    Returns:
        Dict with keys ``processed_fwd``, ``processed_rev`` and
        ``processed_tot`` (the sum of the two).
    """
    directions = common.load_tsv(input_path)['direction']
    n_fwd = sum(directions == 'fwd')
    n_rev = sum(directions == 'rev')
    return {
        'processed_fwd': n_fwd,
        'processed_rev': n_rev,
        'processed_tot': n_fwd + n_rev,
    }


def plot_histogram(df, x, hue, base, output_histogram):
    """Save a stacked, discrete histogram of one column to an image file.

    Args:
        df: Table holding the data to plot.
        x: Name of the column whose values are binned.
        hue: Name of the column used to colour and stack the bars.
        base: Spacing of the major x ticks.
        output_histogram: Path of the image file to write.

    The figure is closed after saving so that repeated calls (one per
    dataset) do not keep every figure alive in memory.
    """
    fig, ax = plt.subplots(figsize=(16, 10))
    try:
        # Draw on the axes created above instead of relying on the implicit
        # "current axes" of the pyplot state machine.
        sns.histplot(df, x=x, discrete=True, hue=hue, multiple='stack', ax=ax)
        ax.xaxis.set_major_locator(plticker.MultipleLocator(base=base))
        # Rotate through tick_params: set_xticklabels(get_xticklabels()) would
        # freeze the label texts while the locator still chooses the ticks,
        # so labels could end up attached to the wrong positions.
        ax.tick_params(axis='x', labelrotation=90)
        fig.savefig(output_histogram)
    finally:
        plt.close(fig)


def plot_histograms(df, x, output_path):
    """Plot one column, divided by 3, as two histograms split at 50.

    The values of column ``x`` are divided by 3 (presumably converting a
    nucleotide length into repeat units - TODO confirm) on a copy, so the
    caller's frame is left untouched and repeated calls give the same result.

    Args:
        df: Table with column ``x`` and a ``direction`` column.
        x: Name of the numeric column to plot.
        output_path: Prefix for the output files. Rows with a scaled value
            ``<= 50`` go to ``<output_path>.hist.50.png`` (tick spacing 5),
            rows ``> 50`` go to ``<output_path>.hist.51.png`` (tick spacing
            10). A file is only written when its subset is non-empty.
    """
    scaled = df.copy()
    scaled[x] = scaled[x] / 3

    low = scaled[scaled[x] <= 50]
    if len(low) > 0:
        plot_histogram(low.sort_values('direction', ascending=False), x, 'direction', 5, f'{output_path}.hist.50.png')

    high = scaled[scaled[x] > 50]
    if len(high) > 0:
        plot_histogram(high.sort_values('direction', ascending=False), x, 'direction', 10, f'{output_path}.hist.51.png')


def plot(dataset, caller):
    """Plot the ``len_ins_ext_aln`` histograms for one dataset/caller pair.

    Reads ``<workdir><dataset>.<caller>.ontarget.processed.tsv`` and hands
    the table to ``plot_histograms`` with the output prefix
    ``<outdir><dataset>.<caller>``. Both ``workdir`` and ``outdir`` are
    notebook-level settings and must be defined before this is called.

    Args:
        dataset: Dataset name used in the file names.
        caller: Caller name used in the file names.
    """
    source_tsv = f'{workdir}{dataset}.{caller}.ontarget.processed.tsv'
    image_prefix = f'{outdir}{dataset}.{caller}'
    table = pd.read_csv(source_tsv, sep='\t')
    # Only this one length column is plotted.
    plot_histograms(table, 'len_ins_ext_aln', image_prefix)

## Main

In [4]:
# Collect one record per (caller, dataset) pair whose prepared on-target
# table exists in `workdir`; pairs without a table are reported and skipped.
#
# Optional per-dataset steps that are currently switched off:
#   - count_reads(f'{fastqs}{dataset}/fastq/{caller}/')
#   - count_prepared(<.ontarget.tsv>) / count_processed(<.ontarget.processed.tsv>)
#   - plot(dataset, caller)
#   - common.plot_range(input_path, 'ins_aln', lo, hi, f'{workdir}images/{dataset}.{caller}.<tag>.')
#     with tags/ranges 1wtp: 1..42*3+1, 2pre: 42*3+1..74*3+1, 3mut: 74*3+1..3001
#     for datasets containing 'sca', and 1wtp: 1..37*3+1, 2pre: 37*3+1..50*3+1,
#     3mut: 50*3+1..3001 otherwise.
res = []

for caller in callers:
    for dataset in datasets:
        input_path = f'{workdir}{dataset}.{caller}.ontarget.tsv'
        if isfile(input_path):
            res.append({
                'dataset': dataset,
                'caller': caller,
                'df': common.load_tsv(input_path, common.COLUMNS_PREPARED),
            })
        else:
            print(f'Skipping: {dataset}.{caller}')

In [5]:
# Number of (caller, dataset) pairs for which a table was loaded.
len(res)

6

In [6]:
# Fields stored in each record (inspected on the first one).
res[0].keys()

dict_keys(['dataset', 'caller', 'df'])

In [7]:
# Stack the per-dataset tables into a single frame. Index labels of the
# individual tables are kept as they are (pd.concat default), so labels may
# repeat across datasets.
dfs = list(map(lambda record: record['df'], res))

df = pd.concat(dfs)

In [8]:
# Non-null values per column for each read direction, over all loaded tables.
df.groupby('direction').count()

Unnamed: 0_level_0,id,prefix_flank,ins,suffix_flank,prefix_flank_q,ins_q,suffix_flank_q
direction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
fwd,176531,176531,176531,176531,176531,176531,176531
rev,72296,72296,72296,72296,72296,72296,72296


In [9]:
# Rows per (direction, id) pair, largest first: a count above 1 at the top
# would mean the same id occurs more than once within a direction.
(
    df
    .groupby(['direction', 'id'])
    .count()['ins']
    .reset_index()
    .sort_values('ins', ascending=False)
)

Unnamed: 0,direction,id,ins
0,fwd,@000001ba-09b2-472a-9ab1-fe95bd4693c3,1
165904,fwd,@f08f718f-215f-4e9b-83c3-1dbab91b3962,1
165876,fwd,@f083d43c-ce3e-4be1-a03c-d80e57dcb8e4,1
165877,fwd,@f0843058-fe8c-4d32-bb14-d6041f95b26d,1
165878,fwd,@f08498f3-d30c-41ad-a8f2-6c4e66cfe2ff,1
...,...,...,...
82947,fwd,@77d77c6b-a909-4b65-b3cc-00348da44ef3,1
82948,fwd,@77d793f0-5017-437c-98b2-399c70c8f45c,1
82949,fwd,@77d802c1-8978-415b-92d2-fc45535d633e,1
82950,fwd,@77d8441e-1a8b-4fc1-a9c3-7e6727cd2fec,1


In [10]:
def extract_breaks(row):
    """Return the sequence around both flank/insert junctions of one row.

    Args:
        row: Mapping with string fields ``prefix_flank``, ``ins`` and
            ``suffix_flank``.

    Returns:
        A ``pd.Series`` with, in this order:
        ``b00`` - last 8 characters of the prefix flank,
        ``b01`` - first 6 characters of the insert,
        ``b10`` - last 6 characters of the insert,
        ``b11`` - first 8 characters of the suffix flank.
    """
    prefix = row['prefix_flank']
    insert = row['ins']
    suffix = row['suffix_flank']

    breaks = {
        'b00': prefix[-8:],
        'b01': insert[:6],
        'b10': insert[-6:],
        'b11': suffix[:8],
    }
    return pd.Series(breaks)

In [11]:
# Attach the junction columns produced by extract_breaks to the table.
# Columns of the same name left over from an earlier run of this cell are
# dropped first, so re-running the cell replaces them instead of appending a
# duplicate set.
breaks = df.apply(extract_breaks, axis=1)
df = pd.concat([df.drop(columns=breaks.columns, errors='ignore'), breaks], axis=1)

In [16]:
# Twenty most frequent prefix-flank / insert-start combinations (b00, b01)
# among the reverse rows.
cond = df['direction'] == 'rev'
(
    df[cond]
    .groupby(['direction', 'b00', 'b01'])
    .count()['ins']
    .reset_index()
    .sort_values('ins', ascending=False)
    .head(20)
)

Unnamed: 0,direction,b00,b01,ins
814,rev,TGGCTTTA,CTACTA,57739
910,rev,TGGCTTTA,TTACTA,2774
815,rev,TGGCTTTA,CTACTC,738
826,rev,TGGCTTTA,CTATTA,463
817,rev,TGGCTTTA,CTACTT,462
1012,rev,TGGCTTTC,TTACTA,455
972,rev,TGGCTTTC,TACTAC,454
859,rev,TGGCTTTA,CTTACT,450
320,rev,GGGCTTTA,CTACTA,374
816,rev,TGGCTTTA,CTACTG,334


In [13]:
def extend_ins(prefix, ins, motif='CAG'):
    """Extend an insert to the left when a double motif spans the junction.

    The end of ``prefix`` and the start of ``ins`` are checked for two
    consecutive copies of ``motif`` straddling the prefix/insert boundary.
    Overlaps are tried from the longest prefix tail (``2 * len(motif) - 1``
    characters) down to a single character; on the first match that tail is
    prepended to ``ins`` and the search stops. Only the prefix side is
    handled here; the suffix flank is not considered.

    Args:
        prefix: Sequence immediately before the insert.
        ins: Insert sequence.
        motif: Repeat unit to look for. Defaults to ``'CAG'``.

    Returns:
        ``ins`` with the matching prefix tail prepended, or ``ins`` unchanged
        when no overlap forms the double motif.
    """
    target = 2 * motif
    window = len(target)

    # i = number of characters taken from the end of the prefix.
    for i in range(window - 1, 0, -1):
        tail = prefix[-i:]
        if tail + ins[:window - i] == target:
            return tail + ins

    return ins

extend_ins('GATGCCAG', 'CAGCAG')

'CAGCAGCAG'

In [14]:
# Twenty most frequent insert-end / suffix-flank combinations (b10, b11)
# among the forward rows.
cond = df['direction'] == 'fwd'
(
    df[cond]
    .groupby(['direction', 'b10', 'b11'])
    .count()['ins']
    .reset_index()
    .sort_values('ins', ascending=False)
    .head(20)
)

Unnamed: 0,direction,b10,b11,ins
507,fwd,AGTAGT,AGTAGTAA,91186
3672,fwd,GTAGTA,GTAGTAAA,17402
5791,fwd,TAGTAG,TAGTAAAG,12587
484,fwd,AGTAGT,AGTAAAGC,10482
3612,fwd,GTAGTA,GTAAAGCC,4292
5649,fwd,TAGTAG,TAAAGCCA,2384
5812,fwd,TAGTAG,TAGTAGTA,570
512,fwd,AGTAGT,AGTAGTGA,532
554,fwd,AGTAGT,CATAGTAA,501
3674,fwd,GTAGTA,GTAGTAAG,465


In [15]:
# Sanity check: a negative slice start keeps the last six characters.
'12345678'[-6:]

'345678'

In [27]:
def fit_target(t, len_motif):
    """Pad a sequence with ``'I'`` up to a multiple of the motif length.

    Args:
        t: Sequence to pad. ``None`` and empty sequences are returned as is.
        len_motif: Length the result must be a multiple of.

    Returns:
        ``t`` followed by as many ``'I'`` characters as are needed to reach
        the next multiple of ``len_motif``; ``t`` itself when it already has
        such a length.
    """
    if t is None or len(t) == 0:
        return t

    remainder = len(t) % len_motif
    if remainder == 0:
        return t

    return t + 'I' * (len_motif - remainder)


def fit(row, column_seq, motif):
    """Pad every non-motif stretch of a sequence to a multiple of the motif length.

    The sequence is split on ``motif``; each piece between motif copies is
    padded with ``fit_target`` and the pieces are joined back with ``motif``.

    Args:
        row: Mapping holding the sequence.
        column_seq: Key of the sequence inside ``row``.
        motif: Repeat unit used as the split separator.

    Returns:
        The re-assembled sequence string.

    Raises:
        ValueError: If ``motif`` is empty (raised by ``str.split``).
    """
    len_motif = len(motif)
    # str.split with a separator always yields at least one piece, so there
    # is no empty-result case to handle.
    pieces = row[column_seq].split(motif)
    return motif.join(fit_target(piece, len_motif) for piece in pieces)

In [29]:
# Example: no 'CATG' in the sequence, so the whole string is padded to length 8.
fit({'a': '12345'}, 'a', 'CATG')

'12345III'