# Count and Review

## Arguments

In [1]:
# Root directory holding the per-dataset FASTQ folders; the main loop reads
# from f'{fastqs}{dataset}/fastq/{caller}/'.
fastqs = '/opt/data/'

# Directory with the per-dataset TSV files. Paths are built by plain string
# concatenation, so the trailing slash is required.
workdir = '/opt/data/workdir/'

# Output directory; currently identical to the working directory.
outdir = workdir

# Basecallers whose results are summarised.
callers = ['dorado', 'guppy']

# Datasets to process. Commented-out entries are kept so they can be
# re-enabled by removing the leading '#'.
datasets = [
    # 'bc3_1',
    # 'bc3_2',
    # 'bc3_3',
    # 'bc6_1',
    # 'bc6_2',
    # 'bc6_3',
    # 'bc6_4',
    # 'bc6_5',
    # 'bc6_6',
    # 'bc6_7',
    # 'bc6_8',
    # 'bc6_9',
    # 'bc6_10',
    # 'bc6_11',
    # 'bc6_12',
    # 'bc7_1_18',
    # 'bc7_1_19',
    # 'bc7_1_20',
    # 'bc7_1_21',
    # 'bc7_1_22',
    # 'bc7_1_23',
    # 'bc7_1_24',
    # 'bc7_2_18',
    # 'bc7_2_19',
    # 'bc7_2_20',
    # 'bc7_2_21',
    # 'bc7_2_22',
    # 'bc7_2_23',
    # 'bc7_2_24',
    'sca8_11', 'sca8_12',
    'sca8_15', 'sca8_16',
    'sca8_19', 'sca8_20',
    'sca8_2_11', 'sca8_2_12',
    'sca8_2_15', 'sca8_2_16',
    'sca8_2_19', 'sca8_2_20',
]

## Imports

In [2]:
import gzip
from os import listdir
from os.path import isfile, join

import pandas as pd

import common

## Functions

In [3]:
def read_fastq(fastq_path):
    """Parse a FASTQ file into a list of read records.

    The file is opened with ``gzip.open`` when its name ends in ``.gz`` and
    with the builtin ``open`` otherwise. Every group of four lines is treated
    as one record: ID line, sequence, ``+`` separator line, quality string.

    Args:
        fastq_path: Path to a plain or gzip-compressed FASTQ file.

    Returns:
        A list of dicts with keys ``'id'`` (the ID line up to the first
        space, including the leading ``@``), ``'seq'``, ``'opt'`` (the full
        ``+`` line) and ``'qual'``. An empty list is returned when
        ``fastq_path`` is not an existing file. A trailing record with fewer
        than four lines is not included.

    Raises:
        ValueError: If an ID line does not start with ``@`` or a separator
            line does not start with ``+``. The message names the file and
            the 1-based line number.
    """
    records = []

    if not isfile(fastq_path):
        return records

    opener = gzip.open if fastq_path.endswith('.gz') else open

    read_id = seq = opt = None
    with opener(fastq_path, 'rt') as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            position = (line_no - 1) % 4  # 0: ID, 1: sequence, 2: '+', 3: quality

            if position == 0:
                if not line.startswith('@'):
                    # A bare `raise` here would only produce
                    # "RuntimeError: No active exception to reraise".
                    raise ValueError(
                        f'Error in {fastq_path} line {line_no} - not an ID line'
                    )
                read_id = line.split(' ')[0]
            elif position == 1:
                seq = line
            elif position == 2:
                if not line.startswith('+'):
                    raise ValueError(
                        f'Error in {fastq_path} line {line_no} - not a + line'
                    )
                opt = line
            else:
                records.append({
                    'id': read_id,
                    'seq': seq,
                    'opt': opt,
                    'qual': line,
                })

    return records


def count_reads(input_path):
    """Count FASTQ files and their reads directly inside a directory.

    A directory entry is counted when its name contains ``'fastq'`` and it is
    a regular file. Each such file is parsed with ``read_fastq`` and the
    number of returned records is added to the total.

    Args:
        input_path: Directory to scan (not recursive).

    Returns:
        ``{'fastqs': <number of files>, 'reads': <total reads>}``. Both
        values are ``-1`` when the directory does not exist.
    """
    try:
        entries = listdir(input_path)
    except FileNotFoundError:
        # Sentinel values mark a missing directory in the summary table.
        return {'fastqs': -1, 'reads': -1}

    fastq_paths = []
    for entry in entries:
        candidate = join(input_path, entry)
        if 'fastq' in entry and isfile(candidate):
            fastq_paths.append(candidate)
    fastq_paths.sort()

    n_reads = sum(len(read_fastq(path)) for path in fastq_paths)

    return {'fastqs': len(fastq_paths), 'reads': n_reads}


def count_prepared(input_path):
    """Count forward and reverse rows in a prepared TSV.

    The table is loaded with ``common.load_tsv`` using
    ``common.COLUMNS_PREPARED``, and its ``'direction'`` column is compared
    against ``'fwd'`` and ``'rev'``.

    Args:
        input_path: Path of the TSV file to load.

    Returns:
        ``{'fwd': ..., 'rev': ..., 'tot': ...}`` where ``tot`` is
        ``fwd + rev``; rows with any other direction value are not counted.
    """
    table = common.load_tsv(input_path, common.COLUMNS_PREPARED)
    direction = table['direction']

    n_fwd = sum(direction == 'fwd')
    n_rev = sum(direction == 'rev')

    return {'fwd': n_fwd, 'rev': n_rev, 'tot': n_fwd + n_rev}


def count_processed(input_path):
    """Count forward and reverse rows in a processed TSV.

    The table is loaded with ``common.load_tsv`` (no explicit column list),
    and its ``'direction'`` column is compared against ``'fwd'`` and
    ``'rev'``.

    Args:
        input_path: Path of the TSV file to load.

    Returns:
        ``{'fwd': ..., 'rev': ..., 'tot': ...}`` where ``tot`` is
        ``fwd + rev``; rows with any other direction value are not counted.
    """
    table = common.load_tsv(input_path)
    direction = table['direction']

    n_fwd = sum(direction == 'fwd')
    n_rev = sum(direction == 'rev')

    return {'fwd': n_fwd, 'rev': n_rev, 'tot': n_fwd + n_rev}

## Main

In [4]:
# (file label, lower bound, upper bound) for each 'ins_ext_aln' plot.
# NOTE(review): 37 and 50 look like repeat-count thresholds multiplied by a
# 3-base unit, and the labels like wild-type / pre-mutation / mutation
# ranges - confirm with the author.
plot_windows = [
    ('1wtp', 1, 37*3+1),
    ('2pre', 37*3+1, 50*3+1),
    ('3mut', 50*3+1, 3001),
]

res = []

for caller in callers:
    for dataset in datasets:
        run_name = f'{dataset}.{caller}'
        ontarget_path = f'{workdir}{run_name}.ontarget.tsv'

        # Only combinations with an on-target table are summarised.
        if not isfile(ontarget_path):
            print(f'Skipping: {run_name}')
            continue

        row = {'dataset': dataset, 'caller': caller}
        row.update(count_reads(f'{fastqs}{dataset}/fastq/{caller}/'))
        row.update(count_prepared(ontarget_path))

        # One plot per window, all read from the processed table.
        processed_path = f'{workdir}{run_name}.ontarget.processed.tsv'
        for label, lower, upper in plot_windows:
            image_prefix = f'{workdir}images/{run_name}.{label}.'
            common.plot_range(processed_path, 'ins_ext_aln', lower, upper, image_prefix)

        res.append(row)

Skipping: sca8_11.dorado
Skipping: sca8_12.dorado
Skipping: sca8_15.dorado
Skipping: sca8_16.dorado
Skipping: sca8_19.dorado
Skipping: sca8_20.dorado
Skipping: sca8_2_11.dorado
Skipping: sca8_2_12.dorado
Skipping: sca8_2_15.dorado
Skipping: sca8_2_16.dorado
Skipping: sca8_2_19.dorado
Skipping: sca8_2_20.dorado


In [5]:
# Collect the per-dataset/caller count dictionaries from `res` into one table.
df = pd.DataFrame(res)

In [6]:
# Show the summary table as this cell's output.
df

Unnamed: 0,dataset,caller,fastqs,reads,fwd,rev,tot
0,sca8_11,guppy,15,43572,21969,9537,31506
1,sca8_12,guppy,20,58373,31628,16554,48182
2,sca8_15,guppy,15,43633,24666,8446,33112
3,sca8_16,guppy,25,72899,40902,18373,59275
4,sca8_19,guppy,21,61509,35787,12180,47967
5,sca8_20,guppy,13,38712,21579,7206,28785
6,sca8_2_11,guppy,8,21981,7883,1520,9403
7,sca8_2_12,guppy,8,23268,11636,3544,15180
8,sca8_2_15,guppy,5,12622,5662,711,6373
9,sca8_2_16,guppy,23,66388,37852,11549,49401
