# Raw reads analysis

## Imports

In [1]:
import common
import pandas as pd
import seaborn as sns

## Constants

In [2]:
# Pixel colour for each symbol of an encoded read (see process_fastq_tsv):
#   '6' <- 'CAGCAG', '5' <- 'CAG', '4' <- 'CTG', '3' <- 'CTGCTG',
#   '_' <- any remaining A/T/C/G base.
# '0' is only produced by replacements that are currently commented out.
COLORS = {
    '6': '#7777FF',  # light blue
    '5': '#5555FF',  # blue
    '4': '#FF5555',  # red
    '3': '#FF7777',  # light red
    '0': '#FFFFFF',  # white
    '_': '#333333',  # dark grey
}

## Arguments

In [3]:
# Input and output directory. It is concatenated directly with file names
# (f'{folder}{dataset}_{caller}.fastq.tsv'), so the trailing '/' is required.
folder = '/opt/data_out/fastq_tsv/'
# Basecaller suffixes to process; each one is combined with every dataset.
callers = [
    # 'dorado_hac',
    # 'dorado_sup',
    'guppy',
]

# Dataset name prefixes to process; commented-out entries are skipped.
datasets = [
    # 'bc3_01',
    # 'bc3_02',
    # 'bc3_03',
    'bc6_05',
    'bc6_06',
    'bc6_07',
    'bc6_08',
    'bc6_09',
    'bc6_10',
    'bc6_2_05',
    'bc6_2_06',
    'bc6_2_07',
    'bc6_2_08',
    'bc6_2_09',
    'bc6_2_10',
    # 'bc7_1_18',
    # 'bc7_1_19',
    # 'bc7_1_20',
    # 'bc7_1_21',
    # 'bc7_1_22',
    # 'bc7_1_23',
    # 'bc7_1_24',
    # 'bc7_2_18',
    # 'bc7_2_19',
    # 'bc7_2_20',
    # 'bc7_2_21',
    # 'bc7_2_22',
    # 'bc7_2_23',
    # 'bc7_2_24',
    # 'sca8_1_11',
    # 'sca8_1_12',
    # 'sca8_1_15',
    # 'sca8_1_16',
    # 'sca8_1_19',
    # 'sca8_1_20',
    # 'sca8_2_11',
    # 'sca8_2_12',
    # 'sca8_2_15',
    # 'sca8_2_16',
    # 'sca8_2_19',
    # 'sca8_2_20',
]

## Functions

In [4]:
def plot_waterfall(df, col_len, col_seq, output_path):
    """Draw encoded reads as an image, one pixel row per read, and save it.

    Each character of a read becomes one pixel whose colour is looked up in
    ``COLORS``. Reads are centred horizontally on a grey canvas whose width
    is the largest value found in ``col_len``.

    Args:
        df: Frame holding the reads; rows are drawn top to bottom in order.
        col_len: Name of the column holding each read's length.
        col_seq: Name of the column holding the encoded read string. Every
            character must be a key of ``COLORS``, otherwise KeyError.
        output_path: Destination passed to ``image.save``.

    Returns:
        None. Nothing is written when ``df`` has no rows.
    """
    height = len(df)
    if height == 0:
        # max() of an empty column is NaN, which cannot be used as an
        # image width; there is nothing to draw anyway.
        return

    # Cast the numpy scalar to a plain int before handing it to the imaging
    # library.
    width = int(df[col_len].max())

    # NOTE(review): common.Image / common.ImageDraw presumably re-export
    # Pillow's modules - confirm against common.py.
    image = common.Image.new('RGB', (width, height), 'grey')
    draw = common.ImageDraw.Draw(image)

    for row, seq in enumerate(df[col_seq]):
        # Centre the read so length differences show symmetrically.
        left = (width - len(seq)) // 2
        for col, symbol in enumerate(seq):
            draw.point([left + col, row], fill=COLORS[symbol])

    image.save(output_path)

In [5]:
def process_fastq_tsv(dataset, caller, folder, sample_size=4000):
    """Encode the reads of one dataset/caller pair and plot them as a PNG.

    Reads ``{folder}{dataset}_{caller}.fastq.tsv`` (tab-separated, no
    header, two columns: sequence and read id), keeps reads shorter than
    1500 characters, encodes repeat motifs as digit symbols, keeps reads
    with more than three '6' or more than three '3' symbols, samples
    ``sample_size`` of them with replacement and passes the sorted sample to
    ``plot_waterfall``, which writes ``{folder}{dataset}_{caller}.png``.

    Args:
        dataset: Dataset name used as the file-name prefix.
        caller: Basecaller name used as the file-name suffix.
        folder: Directory prefix; concatenated as-is, so it must end with a
            path separator.
        sample_size: Number of rows drawn (with replacement, so rows may
            repeat) for the plot.

    Returns:
        None. A message is printed and nothing is plotted when the file
        cannot be read or when no read survives the filters.
    """
    path = f'{folder}{dataset}_{caller}.fastq.tsv'
    out_path = f'{folder}{dataset}_{caller}.png'
    try:
        df = pd.read_csv(path, sep='\t', header=None)
    except Exception as e:
        # Deliberately broad: a missing or broken file must not abort the
        # whole batch, it is reported and skipped.
        print(f'{dataset} - {caller}: {e}')
        return

    df.columns = ['seq', 'id']
    df['len_seq'] = df['seq'].str.len()

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the original (SettingWithCopyWarning).
    df = df[df['len_seq'] < 1500].copy()

    # Order matters: doubled motifs are encoded before single ones, and the
    # leftover bases are blanked last. regex=False pins literal matching,
    # because the default of str.replace differs between pandas versions.
    replacements = (
        ('CAGCAG', '666666'),
        ('CTGCTG', '333333'),
        ('CAG', '555'),
        ('CTG', '444'),
        ('A', '_'),
        ('T', '_'),
        ('C', '_'),
        ('G', '_'),
    )
    plt_seq = df['seq']
    for pattern, symbols in replacements:
        plt_seq = plt_seq.str.replace(pattern, symbols, regex=False)
    df['plt_seq'] = plt_seq

    # Character counts of the doubled-motif symbols, and their difference.
    df['fwd'] = df['plt_seq'].str.count('6')
    df['rev'] = df['plt_seq'].str.count('3')
    df['dir'] = df['fwd'] - df['rev']

    df = df[(df['fwd'] > 3) | (df['rev'] > 3)]
    if df.empty:
        # Sampling with replacement from an empty frame raises ValueError.
        print(f'{dataset} - {caller}: no reads left after filtering')
        return

    plt_df = df.sample(sample_size, replace=True)

    threshold = 0
    # Reads dominated by '6' go on top, sorted ascending ...
    fwd_mask = plt_df['dir'] > threshold
    plt_df_fwd = plt_df[fwd_mask].sort_values(['len_seq', 'fwd', 'seq'])

    # ... and the remaining reads below them, sorted descending.
    rev_mask = plt_df['dir'] <= -1 * threshold
    plt_df_rev = plt_df[rev_mask].sort_values(
        ['len_seq', 'rev', 'seq'], ascending=[False, False, False]
    )

    plt_df = pd.concat([plt_df_fwd, plt_df_rev])

    plot_waterfall(plt_df, 'len_seq', 'plt_seq', out_path)

In [6]:
# Render one waterfall PNG for every (dataset, caller) combination.
for dataset in datasets:
    for caller in callers:
        process_fastq_tsv(dataset, caller, folder, sample_size=4000)

## Try stuff out

In [7]:
# NOTE(review): `stop` is undefined, so this cell raises NameError -
# presumably a deliberate guard that halts "Run All" before the scratch
# cells below; confirm.
stop

NameError: name 'stop' is not defined

In [None]:
# Scratch: try `regex.findall` on a toy string.
s = 'ABCABCABC'
import regex

regex.findall('AB', s)

In [None]:
def find_indices(haystack, needle):
    """Return the start index of every occurrence of needle in haystack.

    Occurrences may overlap: the search resumes one character after the
    previous hit, so ``find_indices('AAAA', 'AA')`` yields ``[0, 1, 2]``.

    Args:
        haystack: String to search in.
        needle: Substring to look for.

    Returns:
        List of indices in ascending order. Empty when either argument is
        empty, when needle is longer than haystack, or when there is no
        match.
    """
    indices = []

    # The empty-needle guard is needed: str.find('') matches at every
    # position.
    if not haystack or not needle or len(needle) > len(haystack):
        return indices

    idx = haystack.find(needle)
    while idx != -1:
        indices.append(idx)
        idx = haystack.find(needle, idx + 1)

    return indices

In [None]:
# Scratch: str.find returns -1 when the substring is absent (with
# s = 'ABCABCABC' there is no 'G').
s.find('G', 2)

In [None]:
# Scratch: with s = 'ABCABCABC' this should give [0, 3, 6].
find_indices(s, 'AB')

## Most common read

In [None]:
# Load one or more fastq.tsv files and encode CAG / CTG triplets per read.
# NOTE: this rebinds the notebook-level `datasets` (dataset names) to a list
# of file names, so re-run the "Arguments" cell before re-running the
# waterfall loop.
datasets = [
    'bc6_2_05_guppy.fastq.tsv',
    # 'bc3_02_guppy.fastq.tsv',
    # 'bc3_03_guppy.fastq.tsv',
]

dfs = []

for dataset in datasets:
    path = f'{folder}{dataset}'
    df = pd.read_csv(path, sep='\t', header=None)
    df.columns = ['seq', 'id']
    df['len_seq'] = df['seq'].str.len()

    # Triplets are encoded first, then every leftover base is blanked.
    # regex=False pins literal matching, because the default of str.replace
    # differs between pandas versions.
    plt_seq = df['seq']
    for pattern, symbols in (
        ('CAG', '555'),
        ('CTG', '444'),
        ('A', '_'),
        ('T', '_'),
        ('C', '_'),
        ('G', '_'),
    ):
        plt_seq = plt_seq.str.replace(pattern, symbols, regex=False)
    df['plt_seq'] = plt_seq

    # Number of encoded CAG / CTG triplets per read, and their difference.
    df['fwd'] = df['plt_seq'].str.count('555')
    df['rev'] = df['plt_seq'].str.count('444')
    df['dir'] = df['fwd'] - df['rev']

    dfs.append(df)

# ignore_index keeps row labels unique when several files are loaded, so
# label-based lookups such as kmers[0] stay scalar.
dfs = pd.concat(dfs, ignore_index=True)

In [None]:
# Per-read CAG ('fwd') versus CTG ('rev') triplet counts for all loaded files.
sns.scatterplot(dfs, x='fwd', y='rev')

In [None]:
# Histogram of fwd - rev per read.
# NOTE(review): this uses `df` (the last file loaded by the loop), not the
# concatenated `dfs` - confirm that is intended.
df['dir'].hist(bins=100)

In [None]:
# Group identical read sequences; after .count() every remaining column
# holds the number of non-null rows in that group, i.e. how often the
# sequence occurs.
dfc = df.groupby('seq').count().reset_index()

In [None]:
# Most frequent reads first. In dfc, 'len_seq' is the per-sequence count
# produced by groupby(...).count(), not a sequence length.
dfc.sort_values('len_seq', ascending=False)

In [None]:
# Display the concatenated frame.
dfs

In [None]:
def count_kmers(seq, len_kmer=9, skip=100):
    """Return the k-mers that occur exactly once in seq, past a prefix.

    Despite the name, the result does not contain full counts: only k-mers
    seen exactly once are kept, each mapped to 1.

    Args:
        seq: Sequence to scan.
        len_kmer: Length of each k-mer window (default 9).
        skip: Number of leading characters ignored before the first window
            (default 100).

    Returns:
        Dict mapping every singleton k-mer to 1, in order of first
        occurrence. Empty when seq is shorter than ``skip + len_kmer``.

    Raises:
        ValueError: If len_kmer is smaller than 1 or skip is negative.
    """
    # Local import: the notebook only imports Counter in a later cell.
    from collections import Counter

    if len_kmer < 1:
        raise ValueError('len_kmer must be at least 1')
    if skip < 0:
        raise ValueError('skip must not be negative')

    # Slide a window of len_kmer over seq[skip:]; overlapping windows count.
    counts = Counter(
        seq[start:start + len_kmer]
        for start in range(skip, len(seq) - len_kmer + 1)
    )

    return {kmer: 1 for kmer, n in counts.items() if n == 1}

In [None]:
# Scratch check on a toy string. It is shorter than the 100-character
# prefix count_kmers skips, so the result is an empty dict.
seq = 'ABCABCABCABCABCABC'
count_kmers(seq)

In [None]:
# Row-wise: call count_kmers on each read's 'seq' value.
kmers = dfs.apply(lambda x: count_kmers(x['seq']), axis=1)

In [None]:
# Size of the count_kmers result for index label 0.
len(kmers[0])

In [None]:
from collections import Counter

# Sum the per-read dicts. count_kmers only returns k-mers with value 1, so
# merged_dict[k] ends up as the number of reads in which k was returned.
merged_dict = Counter()

for k in kmers:
    merged_dict.update(k)

In [None]:
# Number of distinct k-mers collected across all reads.
len(merged_dict)

In [None]:
# Five (k-mer, count) pairs with the highest counts; ties keep the order in
# which the k-mers were first inserted into the Counter.
top_5 = merged_dict.most_common(5)

In [None]:
# Display the five most frequent k-mers.
top_5