# SCA8 flank analysis

In [10]:
import gzip
from os import listdir
from os.path import isfile, join

# Directory holding the basecalled FASTQ files for this sample.
input_path = '/opt/data/sca8_11/fastq/guppy/'

# Full paths of every regular file in `input_path` whose name contains
# 'fastq', in lexicographic order (so "_10" sorts before "_2").
fastq_paths = sorted(
    join(input_path, name)
    for name in listdir(input_path)
    if 'fastq' in name and isfile(join(input_path, name))
)

In [11]:
# Show the FASTQ paths that were discovered, in sorted order.
fastq_paths

['/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_0.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_1.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_10.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_11.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_12.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_13.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_14.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_2.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_3.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_4.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_5.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb

In [39]:
# Base -> complementary base lookup; each letter in 'ACGT' is paired with
# the letter at the same position in 'TGCA'.
COMPLEMENT = dict(zip('ACGT', 'TGCA'))

def rev_comp(seq, comps):
    """Return the reverse complement of `seq`.

    Args:
        seq: Sequence to transform (walked from its last element to its first).
        comps: Mapping from each symbol to its complement. Symbols missing
            from the mapping are copied through unchanged.

    Returns:
        str: The complemented symbols of `seq` in reverse order.
    """
    complemented = []
    for base in reversed(seq):
        complemented.append(comps.get(base, base))
    return ''.join(complemented)

In [14]:
def process_fastq(fastq_path):
    """Read the sequence line of every record in a FASTQ file.

    The file is read as strict four-line records (ID line, sequence line,
    ``+`` line, quality line); records whose sequence spans several lines
    are not supported. Only the sequences are kept: the ID and ``+`` lines
    are checked for their leading character and the quality line is ignored.

    Args:
        fastq_path: Path of the FASTQ file. A path ending in ``.gz`` is
            opened with ``gzip``; anything else is opened as plain text.

    Returns:
        list[str]: One whitespace-stripped sequence per record, in file order.

    Raises:
        ValueError: If a line in the ID position does not start with ``@``
            or a line in the separator position does not start with ``+``.
            The message reports the 0-based line index.
    """
    gzipped = str(fastq_path).endswith('.gz')
    opener = gzip.open if gzipped else open

    seqs = []
    with opener(fastq_path, 'rt') as f:
        for i, line in enumerate(f):
            line = line.strip()
            field = i % 4  # position of this line within its 4-line record
            if field == 0:
                if not line.startswith('@'):
                    # An explicit exception is required here: a bare `raise`
                    # with no active exception only produces an unrelated
                    # "No active exception to reraise" RuntimeError.
                    raise ValueError(f'Error in {fastq_path} line {i} - not an ID line')
            elif field == 1:
                seqs.append(line)
            elif field == 2:
                if not line.startswith('+'):
                    raise ValueError(f'Error in {fastq_path} line {i} - not a + line')
            # field == 3 is the quality string, which is not used.
    return seqs

In [15]:
# Parse every FASTQ file; the result holds one list of sequences per file.
reads = list(map(process_fastq, fastq_paths))

In [16]:
# Number of per-file read lists, i.e. how many FASTQ files were parsed.
len(reads)

15

In [19]:
# Flatten the per-file lists into a single list of read sequences.
# A comprehension is linear in the number of reads, whereas sum(lists, [])
# copies the accumulated list on every addition. The isinstance guard makes
# the cell safe to re-run once `reads` is already flat.
if reads and isinstance(reads[0], list):
    reads = [seq for file_reads in reads for seq in file_reads]

In [20]:
# Total number of reads across all files, now that `reads` has been flattened.
len(reads)

43572

In [26]:
# How many reads contain three consecutive TAG units.
len([read for read in reads if 'TAGTAGTAG' in read])

27287

In [34]:
def count_flank_prefixes(reads, motif, flank_len=25):
    """Count the sequences found immediately before the first `motif` hit.

    Args:
        reads: Iterable of read sequences (strings).
        motif: Substring to locate; only its first occurrence in each read
            is used.
        flank_len: Number of bases to take directly before the motif.

    Returns:
        dict[str, int]: Each distinct `flank_len`-base prefix mapped to the
        number of reads in which it precedes the motif.
    """
    counts = {}
    for read in reads:
        ix = read.find(motif)
        # Skip reads without the motif (find returns -1) and reads where the
        # motif starts fewer than `flank_len` bases in: slicing from
        # ix - flank_len would use a negative start index there, which wraps
        # around to the end of the read and yields an empty or wrong prefix.
        if ix < flank_len:
            continue
        prefix = read[ix - flank_len:ix]
        counts[prefix] = counts.get(prefix, 0) + 1
    return counts


# Prefixes before the first 'CTACTACTA' and before the first 'CAGCAGCAG'.
fwd_prefixes = count_flank_prefixes(reads, 'CTACTACTA')
rev_prefixes = count_flank_prefixes(reads, 'CAGCAGCAG')

In [29]:
# Number of distinct prefixes recorded ahead of the 'CTACTACTA' motif.
len(fwd_prefixes)

2349

In [35]:
# Number of distinct prefixes recorded ahead of the 'CAGCAGCAG' motif.
len(rev_prefixes)

5506

In [36]:
# Display every prefix -> count pair recorded for 'CAGCAGCAG'.
# NOTE(review): this prints the whole dictionary; consider showing only the
# most frequent entries to keep the output readable.
rev_prefixes

{'TAAGATAATATATTTTTAAAAAATG': 8407,
 'AAATAAGATAATATATTAAAAAATG': 1407,
 'AATAAGATAATATCTTTAAAAAATG': 154,
 'AATAAGATAATATATTTAAAAAATG': 2031,
 'TGGATTAATATATTTTTAAAAAATG': 8,
 'TAAGATGATGTATTTTTAAAAAATG': 35,
 'ATAAGATAATATATTTTAAAAAATG': 3470,
 'AAATAAGATGATCTGTTAAAAAATG': 140,
 'ATAAGATAATATCCTTTAAAAAATG': 82,
 'AAATAAGATAAATATCTAAAAAATG': 1,
 'TAAATAAGATAATATCTAAAAAATG': 24,
 'AAGATAATATATTTAGAAAAAAATG': 2,
 'AAGATAATATATTTTTTAAAAAATG': 708,
 'AAGATCATGTATTTTTTAAAAAATG': 2,
 'TAAGATAATATACTTTTAAAAAATG': 157,
 'AGATAATATATTTTAAAAAATGCGG': 15,
 'GATAATCTTGACTTTTTAAAAAATG': 1,
 'ATAAGATAATATATTTTTAAAAATG': 176,
 'TATATTTTAAAAAATGCAACAGCGG': 1,
 'GCTGCTGCTGCTGCTGCTGCTGCTG': 46,
 'GATAATATATTTTTTCATTTGCGGA': 1,
 'AGATGATATTATTTTTTAAAAAATG': 15,
 'TGATGTGTTTGAAAAAATGCAGCGG': 1,
 'AGATAATATATATTTTTAAAAAATG': 77,
 'AATAAGATAATATGTACAAAAAATG': 1,
 'ATAAGATAATATACCTTAAAAAATG': 5,
 'TAAGATAATATATATTTAAAAAATG': 52,
 'GAATAAGATGATCTGTTAAAAAATG': 4,
 'CAAGATAATATATTTTTAAAAAATG': 36,
 'AAGAATAATA

In [37]:
# Most frequent key(s) of `fwd_prefixes`; ties are all reported.
# The maximum is computed once up front instead of once per entry, and
# default=None keeps an empty dictionary yielding an empty list.
max_count = max(fwd_prefixes.values(), default=None)
max_keys = [key for key, value in fwd_prefixes.items() if value == max_count]
print("Keys with maximum value:", max_keys)

Keys with maximum value: ['CTTCATGTTAGAAAACCTGGCTTTA']


In [38]:
# Most frequent key(s) of `rev_prefixes`; ties are all reported.
# The maximum is computed once up front instead of once per entry, and
# default=None keeps an empty dictionary yielding an empty list.
max_count = max(rev_prefixes.values(), default=None)
max_keys = [key for key, value in rev_prefixes.items() if value == max_count]
print("Keys with maximum value:", max_keys)

Keys with maximum value: ['TAAGATAATATATTTTTAAAAAATG']


In [40]:
# Reverse-complement the most frequent `fwd_prefixes` key. The sequence is
# hard-coded (copied from the printed result of an earlier cell), so it must
# be updated by hand if the input data changes.
rev_comp('CTTCATGTTAGAAAACCTGGCTTTA', COMPLEMENT)

'TAAAGCCAGGTTTTCTAACATGAAG'