# SCA8 flank analysis

In [1]:
import gzip
from os import listdir
from os.path import isfile, join

# SCA8
# Motifs searched for in each read (and in its reverse complement) when the
# flanking sequences are counted further down.
fwd_start, rev_start = 'CAGCAGCAG', 'CTACTACTA'
fwd_end, rev_end = 'CTACTACTA', 'CAGCAGCAG'

# Directory holding the FASTQ files; every regular file whose name contains
# 'fastq' is picked up, in lexicographic path order.
input_path = '/opt/data/sca8_11/fastq/guppy/'
fastq_paths = sorted(
    path
    for path in (join(input_path, name) for name in listdir(input_path) if 'fastq' in name)
    if isfile(path)
)

# DMPK1
# fwd_start = 'CAGCAGCAG'
# rev_start = 'CTGCTGCTG'

# fwd_end = 'CTGCTGCTG'
# rev_end = 'CAGCAGCAG'

# input_path = '/opt/data/bc7_1_18/fastq/guppy/'
# fastq_paths = sorted(join(input_path, f) for f in listdir(input_path) if 'fastq' in f and isfile(join(input_path, f)))


In [2]:
fastq_paths

['/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_0.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_1.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_10.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_11.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_12.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_13.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_14.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_2.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_3.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_4.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb5b_bf372731_5.fastq.gz',
 '/opt/data/sca8_11/fastq/guppy/ASF635_pass_barcode11_08b5eb

In [3]:
# Base -> complementary base lookup used by rev_comp (A<->T, C<->G).
COMPLEMENT = dict(zip('ACGT', 'TGCA'))

def rev_comp(seq, comps):
    """Return the reverse complement of ``seq``.

    ``seq`` is walked from its last symbol to its first and each symbol is
    replaced by ``comps[symbol]``; symbols with no entry in ``comps`` are
    kept unchanged.
    """
    complemented = []
    for base in seq[::-1]:
        complemented.append(comps.get(base, base))
    return ''.join(complemented)

In [4]:
def process_fastq(fastq_path):
    """Read the sequence lines from a FASTQ file.

    The file is treated as consecutive 4-line records (``@id`` line,
    sequence, ``+`` line, quality). Only the sequence line of each record
    is kept; the ID and ``+`` lines are checked for their leading marker and
    the quality line is ignored.

    Args:
        fastq_path: Path to the FASTQ file as a string. A path ending in
            ``.gz`` is opened with ``gzip``; anything else as plain text.

    Returns:
        list of str: the whitespace-stripped sequence lines, in file order.

    Raises:
        ValueError: if the first line of a record does not start with ``@``
            or the third line does not start with ``+``. (Previously a bare
            ``raise`` was used here, which produced an uninformative
            ``RuntimeError: No active exception to reraise``.)
    """
    opener = gzip.open if fastq_path.endswith('.gz') else open

    seqs = []
    with opener(fastq_path, 'rt') as f:
        for i, line in enumerate(f):
            line = line.strip()
            field = i % 4  # position of this line inside its 4-line record
            if field == 0:
                if not line.startswith('@'):
                    raise ValueError(f'Error in {fastq_path} line {i} - not an ID line')
            elif field == 1:
                seqs.append(line)
            elif field == 2:
                if not line.startswith('+'):
                    raise ValueError(f'Error in {fastq_path} line {i} - not a + line')
            # field == 3 is the quality string, which is not used.
    return seqs

In [5]:
# One list of sequences per FASTQ file, in the same order as fastq_paths.
reads = list(map(process_fastq, fastq_paths))

In [6]:
# At this point `reads` holds one list per FASTQ file, so this is the file count.
len(reads)

15

In [7]:
# Flatten the per-file lists into a single list of read sequences.
# NOTE: this rebinds `reads`, so the cell is not idempotent - running it a
# second time tries to add strings to a list and raises TypeError.
reads = sum(reads, [])

In [8]:
# After flattening, `reads` is a flat list of sequences: total read count.
len(reads)

43572

In [9]:
# Number of reads containing 'TAGTAGTAG' (the reverse complement of 'CTACTACTA').
sum('TAGTAGTAG' in read for read in reads)

27287

In [10]:
# Number of bases kept immediately upstream of each motif hit.
FLANK_LEN = 15


def _count_flank(read, motif, counts, flank_len=FLANK_LEN):
    """Tally the bases immediately preceding the first `motif` hit in `read`.

    Does nothing when `motif` does not occur in `read`. Otherwise the up to
    `flank_len` bases directly before the first occurrence are used as a key
    in `counts`, whose value is incremented (starting from 1).

    The slice start is clamped at 0: with a plain `read[ix - flank_len:ix]`
    a hit closer than `flank_len` to the start of the read produces a
    negative start index, which Python interprets relative to the END of the
    read and so returns an empty or unrelated string. With the clamp such
    hits contribute the (shorter) real upstream sequence instead.
    """
    ix = read.find(motif)
    if ix == -1:
        return
    flank = read[max(0, ix - flank_len):ix]
    counts[flank] = counts.get(flank, 0) + 1


# flank sequence -> number of reads in which it precedes the motif
fwd_prefixes = {}
rev_prefixes = {}
fwd_suffixes = {}
rev_suffixes = {}
for read in reads:
    # Flanks upstream of the start motifs, on the read as sequenced.
    _count_flank(read, fwd_start, fwd_prefixes)
    _count_flank(read, rev_start, rev_prefixes)

    # The "suffix" tables are filled from the reverse-complemented read, so
    # their keys are in reverse-complement orientation relative to the read.
    read_rc = rev_comp(read, COMPLEMENT)
    _count_flank(read_rc, fwd_end, fwd_suffixes)
    _count_flank(read_rc, rev_end, rev_suffixes)


In [11]:
len(fwd_prefixes)

2675

In [12]:
len(rev_prefixes)

985

In [13]:
rev_prefixes

{'GAAAACCTGGCTTTA': 7013,
 'GCTTTACTACTACTG': 17,
 'AGAAACCTGGCTTTA': 401,
 'CTGGCTTTACTATTA': 44,
 'CTACTCTTACTATTA': 1,
 'GCTTTACTACTACTT': 11,
 'TTTGCTCTTTTCTTA': 1,
 'AAAAACCTGGCTTTA': 149,
 'GGGCTTGCTACTTTA': 1,
 'TACTTACTACTATTA': 4,
 'CTTTACTTCTACTCT': 1,
 'ACAAACCTGGCTTTA': 29,
 'AGAAAACTGGCTTTA': 7,
 'GTTCAACCTAGCTTA': 1,
 'AGAACCTTGGCTTTA': 16,
 'TCAAACCTGGCTTTA': 20,
 'TAAAACCTGGCTTTA': 37,
 'AAGAACCTGGCTTTA': 229,
 'CCTAACCTGGCTTTA': 12,
 'AACCTGGCTTTATTA': 361,
 'AATAAAGGCCTTGCT': 1,
 'TAAAACTTGGCTTTA': 1,
 'CTGGCTTTATTATTA': 5,
 'CAAGACCTGGCTTTA': 66,
 'GCTTTACTACTATTA': 83,
 'AACCTGGCTTTCTTA': 48,
 'AACCTCACTTTGTTA': 2,
 'TTATTACTATTATTA': 1,
 'AAGTTGGCTTGTTTA': 1,
 'AAAACTCTGGCTTTA': 4,
 'CTGTTACTACTACTG': 1,
 'GTAGAACCAGCTTTA': 1,
 'CTAAACCTGGCTTTA': 43,
 'AAAACTTCGGCTTTA': 1,
 'AAACCTGGCTTTCCT': 1,
 'TGCTCCTTACTCTCT': 1,
 'CTTTACTTACTACTG': 1,
 'TGGCTTTATTCTTCT': 1,
 'GAAAGCCTGGCTTTA': 19,
 'ACCTGGCTTTCCTTA': 4,
 'AAACCTTTGGCTTTA': 5,
 'CTCTGGCTTTGCTCT': 1,
 'ACCTGGCT

In [14]:
# All fwd_prefixes keys that share the highest count (ties are all reported).
# The maximum is computed once rather than once per item; default=0 keeps an
# empty table producing an empty list instead of raising.
top_count = max(fwd_prefixes.values(), default=0)
max_keys = [key for key, value in fwd_prefixes.items() if value == top_count]
print("Keys with maximum value:", max_keys)

Keys with maximum value: ['TATTTTTAAAAAATG']


In [15]:
# All rev_prefixes keys that share the highest count, reverse-complemented
# before printing. The maximum is computed once rather than once per item;
# default=0 keeps an empty table producing an empty list instead of raising.
top_count = max(rev_prefixes.values(), default=0)
max_keys = [rev_comp(key, COMPLEMENT) for key, value in rev_prefixes.items() if value == top_count]
print("Keys with maximum value:", max_keys)

Keys with maximum value: ['TAAAGCCAGGTTTTC']


In [16]:
# All fwd_suffixes keys that share the highest count, reverse-complemented
# before printing. The maximum is computed once rather than once per item;
# default=0 keeps an empty table producing an empty list instead of raising.
top_count = max(fwd_suffixes.values(), default=0)
max_keys = [rev_comp(key, COMPLEMENT) for key, value in fwd_suffixes.items() if value == top_count]
print("Keys with maximum value:", max_keys)

Keys with maximum value: ['TAAAGCCAGGTTTTC']


In [17]:
# All rev_suffixes keys that share the highest count (ties are all reported).
# The maximum is computed once rather than once per item; default=0 keeps an
# empty table producing an empty list instead of raising.
top_count = max(rev_suffixes.values(), default=0)
max_keys = [key for key, value in rev_suffixes.items() if value == top_count]
print("Keys with maximum value:", max_keys)

Keys with maximum value: ['TATTTTTAAAAAATG']


In [18]:
# Reverse-complement a 25-base sequence whose last 15 bases
# ('GAAAACCTGGCTTTA') are the highest-count key in the saved rev_prefixes output.
rev_comp('CTTCATGTTAGAAAACCTGGCTTTA', COMPLEMENT)

'TAAAGCCAGGTTTTCTAACATGAAG'