# FASTQ to TSV

## Arguments

In [1]:
fastqs = '/opt/data/'
outdir = '/opt/data_out/fastq_tsv/'

callers = [
    # 'dorado',
    # 'dorado_hac',
    # 'dorado_sup',
    'guppy',
]

datasets = [
    # 'bc3_01',
    # 'bc3_02',
    # 'bc3_03',
    # 'bc6_05',
    # 'bc6_06',
    # 'bc6_07',
    # 'bc6_08',
    # 'bc6_09',
    # 'bc6_10',
    # 'bc6_2_05',
    # 'bc6_2_06',
    # 'bc6_2_07',
    # 'bc6_2_08',
    # 'bc6_2_09',
    # 'bc6_2_10',
    # 'bc6_3_05',
    # 'bc6_3_06',
    # 'bc6_3_07',
    # 'bc6_3_08',
    # 'bc6_3_09',
    # 'bc6_3_10',
    'bc6_4_05',
    'bc6_4_06',
    'bc6_4_07',
    'bc6_4_08',
    'bc6_4_09',
    'bc6_4_10',
    # 'bc7_1_18',
    # 'bc7_1_19',
    # 'bc7_1_20',
    # 'bc7_1_21',
    # 'bc7_1_22',
    # 'bc7_1_23',
    # 'bc7_1_24',
    # 'bc7_2_18',
    # 'bc7_2_19',
    # 'bc7_2_20',
    # 'bc7_2_21',
    # 'bc7_2_22',
    # 'bc7_2_23',
    # 'bc7_2_24',
    # 'sca8_1_11',
    # 'sca8_1_12',
    # 'sca8_1_15',
    # 'sca8_1_16',
    # 'sca8_1_19',
    # 'sca8_1_20',
    # 'sca8_2_11',
    # 'sca8_2_12',
    # 'sca8_2_15',
    # 'sca8_2_16',
    # 'sca8_2_19',
    # 'sca8_2_20',
]

## Imports

In [2]:
import gzip
from os import listdir
from os.path import isfile, join

## Functions

In [3]:
def read_fastq(fastq_path):
    fastq_name = fastq_path.split('/')[-1]
    gzipped = fastq_name.endswith('.gz')
    openner = gzip.open if gzipped else open

    reads = []

    if not isfile(fastq_path):
        return reads

    with openner(fastq_path, 'rt') as f:
        for i, line in enumerate(f):
            line = line.strip()
            if i%4 == 0:
                if line.startswith('@'):
                    id = line.split(' ')[0]
                else:
                    print(f'Error in {fastq_path} line {i} - not an ID line')
                    raise
            elif i%4 == 1:
                seq = line
            elif i%4 == 2:
                if line.startswith('+'):
                    opt = line
                else:
                    print(f'Error in {fastq_path} line {i} - not a + line')
                    raise
            elif i%4 == 3:
                qual = line
                reads.append({
                    'id': id,
                    'seq': seq,
                    'qual': qual,
                })

    return reads


def write_reads(input_path, output_path):
    try:
        fastq_paths = sorted(join(input_path, f) for f in listdir(input_path) if 'fastq' in f and isfile(join(input_path, f)))
    except FileNotFoundError:
        return {
            'fastqs': -1,
            'reads': -1,
        }

    with open(output_path, 'wt') as o:
    
        for fastq_path in fastq_paths:
            reads = read_fastq(fastq_path)
            for read in reads:
                row = f"{read['seq']}\t{read['id']}\n"
                o.write(row)

## Main

In [4]:
for caller in callers:
    for dataset in datasets:
        input_path = f'{fastqs}{dataset}/fastq/{caller}/'
        output_path = f'{outdir}{dataset}_{caller}.fastq.tsv'
        print(f'{dataset} - {caller}')
        print(write_reads(input_path, output_path))

bc6_4_05 - guppy
None
bc6_4_06 - guppy
None
bc6_4_07 - guppy
None
bc6_4_08 - guppy
None
bc6_4_09 - guppy
None
bc6_4_10 - guppy
None
