# WS_ch03C.ipynb
# WESmith 11/10/22
## WS created this notebook to follow along with the chapter 3 code from the book
# 'Bioinformatics with Python Cookbook' by Tiago Antao
### Each recipe will have its own notebook, suffixed by A, B, etc.

# WORKING WITH MODERN SEQUENCE FORMATS

## get the dataset (the cells below expect it under data/ch03_data/) with
!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265.filt.fastq.gz


In [None]:
from Bio import SeqIO
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
import os
from collections import defaultdict

### 1) OPEN THE FILE

In [None]:
# Where the FASTQ download lives: directory first, then the gzipped file name.
data_dir, file = 'data/ch03_data', 'SRR003265.filt.fastq.gz'

In [None]:
# Open the gzipped FASTQ in text mode and wrap it in a record iterator.
# NOTE(review): the gzip handle is never explicitly closed in the visible cells.
recs = SeqIO.parse(
    gzip.open(os.path.join(data_dir, file), mode='rt', encoding='utf-8'),
    'fastq',
)

In [None]:
# pull the first record off the iterator so it can be inspected below
rec = next(recs)

In [None]:
# show the record's id, description, and sequence
print(rec.id, rec.description, rec.seq)

In [None]:
# show the per-letter annotations; later cells read the 'phred_quality' entry
print(rec.letter_annotations)

### 2) LOOK AT DISTRIBUTION OF NUCLEOTIDE READS

In [None]:
# start over with a fresh record iterator (the earlier one was already advanced)
recs = SeqIO.parse(
    gzip.open(os.path.join(data_dir, file), mode='rt', encoding='utf-8'),
    'fastq',
)

In [None]:
# Tally every base letter seen across all reads.
cnt = defaultdict(int)
for rec in recs:
    for letter in rec.seq:
        cnt[letter] += 1

# Report each letter with its percentage of all calls and its raw count.
tot = sum(cnt.values())
# Use a separate loop name so the `cnt` table is not rebound to an int
# and remains available after this cell has run.
for letter, letter_cnt in cnt.items():
    print('%s: %.2f %d' % (letter, 100. * letter_cnt / tot, letter_cnt))

### 3) LOOK AT THE DISTRIBUTION OF N'S (UNKNOWN BASES)

In [None]:
# start over with a fresh record iterator (the previous loop exhausted the old one)
recs = SeqIO.parse(
    gzip.open(os.path.join(data_dir, file), mode='rt', encoding='utf-8'),
    'fastq',
)

In [None]:
# Count, for each 1-based read position, how many reads have an 'N' there.
n_cnt = defaultdict(int)
# Track the longest read directly: deriving the length from the largest
# position that happened to contain an 'N' truncates the x-axis when the
# tail of the reads is clean, and max() fails when there are no 'N's at all.
seq_len = 0
for rec in recs:
    seq_len = max(seq_len, len(rec.seq))
    for pos, letter in enumerate(rec.seq, start=1):
        if letter == 'N':
            n_cnt[pos] += 1
# Every read position from 1 to the longest read, used as the plot's x values.
positions = range(1, seq_len + 1)

In [None]:
# WS line plot of the number of 'N' calls recorded at each read position
fig, ax = plt.subplots(figsize=(16, 5))
n_per_position = [n_cnt[x] for x in positions]
ax.plot(positions, n_per_position)
fig.suptitle('Number of unknown calls as a function of the distance from start of the sequencer read')
ax.set(xlim=(1, seq_len), xlabel='Read Distance', ylabel='Number of unknown calls')
ax.grid(True)  # WS

### 4) LOOK AT THE DISTRIBUTION OF PHRED SCORES (THE QUALITY OF THE READS)

In [None]:
# start over with a fresh record iterator (the previous loop exhausted the old one)
recs = SeqIO.parse(
    gzip.open(os.path.join(data_dir, file), mode='rt', encoding='utf-8'),
    'fastq',
)

In [None]:
# Histogram of PHRED scores over all reads, ignoring the first 25 positions.
cnt_qual = defaultdict(int)
for rec in recs:
    for i, qual in enumerate(rec.letter_annotations['phred_quality']):
        # Per the original note, the first 25 positions of the reads in this
        # dataset have no quality problems, so only later positions are tallied.
        if i >= 25:
            cnt_qual[qual] += 1

In [None]:
# WS report each observed PHRED score, its share of all tallied calls, and its count
tot = sum(cnt_qual.values())
row_fmt = 'phred: {:02d}   % occuring: {:5.2f}  counts: {:8d}'
for qual, cnt in cnt_qual.items():
    pct = 100.*cnt/tot
    print(row_fmt.format(qual, pct, cnt))

In [None]:
# WS print a small table of example phred values using the ws_utils helper
# (presumably it converts a phred score to a percent accuracy; see ws_utils)
import ws_utils as ws

phred_fmt = 'phred: {:02d}  % accurate: {:7.4f}'
for k in (0, 1, 5, 10, 20, 30, 40, 50, 60):
    print(phred_fmt.format(k, ws.phred_to_percent_accurate(k)))

### 5) PLOT THE PHRED QUALITY VS READ POSITION

In [None]:
# start over with a fresh record iterator (the previous loop exhausted the old one)
recs = SeqIO.parse(
    gzip.open(os.path.join(data_dir, file), mode='rt', encoding='utf-8'),
    'fastq',
)

In [None]:
# Collect, per 1-based read position, the PHRED scores worth plotting:
# positions before the cutoff and scores equal to 40 are left out.
cutoff = 25  # WS
qual_pos = defaultdict(list)
for rec in recs:
    for i, qual in enumerate(rec.letter_annotations['phred_quality']):
        if i >= cutoff and qual != 40:  # WS skip low-error reads
            pos = i + 1
            qual_pos[pos].append(qual)

# Score lists ordered by ascending read position, one list per box in the plots.
poses = sorted(qual_pos.keys())
vps = [qual_pos[p] for p in poses]

In [None]:
# WS seaborn boxplot: one box per read position held in vps
fig, ax = plt.subplots(figsize=(16,9))
sns.boxplot(data=vps, ax=ax)
# Label each box with the position it was built from. vps follows the order
# of poses, so this stays correct even when some position has no entry in
# qual_pos (a contiguous range would then miscount or shift the labels).
ax.set_xticklabels([str(x) for x in poses])
ax.set_xlabel('Read Distance')
ax.set_ylabel('PHRED Score')
fig.suptitle('Distribution of PHRED scores as a function of read distance')
ax.grid(True)
plt.show()
# WS the median is the horizontal bar (see below): the quartiles are shown

In [None]:
# WS matplotlib boxplot: one box per read position held in vps
fig, ax = plt.subplots(figsize=(16,9))
ax.boxplot(vps)
# Label each box with the position it was built from. vps follows the order
# of poses, so this stays correct even when some position has no entry in
# qual_pos (a contiguous range would then miscount or shift the labels).
ax.set_xticklabels([str(x) for x in poses])
ax.set_xlabel('Read Distance')
ax.set_ylabel('PHRED Score')
fig.suptitle('Distribution of PHRED scores as a function of read distance')
plt.show()

In [None]:
# WS median of the first box; vps[0] belongs to the lowest position in poses
# (position 26 when cutoff is 25 and that position has plotted scores)
np.median(vps[0])

In [None]:
# number of read positions that have a box in the plots above
len(vps)

In [None]:
# WS IPython help lookup for seaborn's boxplot
sns.boxplot?

In [None]:
# NEXT: look at the 'there's more' section