# WS_ch05B.ipynb
# WESmith 11/25/22
## WS created this notebook to follow along with the code in the book:
## 'Bioinformatics with Python Cookbook' by Tiago Antao
## Each recipe will have its own notebook, suffixed by A, B, etc.

# RECIPE: 
# WORKING WITH LOW-QUALITY REFERENCE GENOMES
## also see book code in Chapter05/Low_Quality.py

In [None]:
import gzip
import os
import numpy as np
import matplotlib.pyplot as plt
from Bio import SeqIO, SeqUtils

In [None]:
# Large input files are kept under ~/Downloads so that backups skip them.
remote_data_dir = '/home/smithw/Downloads/bioinformatics/ch05_data'

# Genome FASTA file names inside remote_data_dir:
#   gambiae    - the most important mosquito vector for malaria (Sub-Saharan Africa)
#   atroparvus - another mosquito vector for malaria (Europe)
gambiae_name, atroparvus_name = (
    'VectorBase-57_AgambiaePEST_Genome.fasta',
    'VectorBase-57_AatroparvusEBRO_Genome.fasta',
)

### 1) LIST THE CHROMOSOMES OF GAMBIAE

In [None]:
# Full path of the gambiae genome FASTA file; g_name is reused by the
# N-run scan further down.
g_name = os.path.join(remote_data_dir, gambiae_name)
# Open the file in a with-block so the handle is closed as soon as the
# listing is done (a bare open() passed straight to SeqIO.parse is never
# closed explicitly).
with open(g_name, 'rt', encoding='utf-8') as g_file:
    # SeqIO.parse returns an iterator over the handle, so the records have
    # to be consumed while the file is still open.
    recs = SeqIO.parse(g_file, 'fasta')
    for rec in recs:
        # one description line per FASTA record
        print(rec.description)
# Do not do this with atroparvus
# NOTE(review): presumably because that assembly is split into a very large
# number of scaffolds, which would flood the output - confirm.

### 2) GET THE UNCALLED POSITIONS (Ns) AND THEIR DISTRIBUTION IN GAMBIAE

In [None]:
# Scan every gambiae chromosome for runs of uncalled bases (N or n).
# Results, keyed by chromosome name:
#   chrom_Ns    - lengths of the N runs, in sequence order
#   chrom_sizes - total sequence length
# The earlier per-nucleotide Python loop took about 40 seconds; the regex
# scan below finds the same maximal runs without a Python-level inner loop.
import re  # cell-local import: only this cell needs it

n_run = re.compile('[Nn]+')  # one maximal run of uncalled bases
chrom_Ns    = {}
chrom_sizes = {}
# the with-block closes the FASTA handle once the scan is finished
with open(g_name, 'rt', encoding='utf-8') as g_file:
    recs = SeqIO.parse(g_file, 'fasta')
    for rec in recs:
        # skip records whose description mentions 'supercontig'
        if 'supercontig' in rec.description:
            continue
        print(rec.description)
        # the chromosome name is the second '_'-separated field of the id
        chrom = rec.id.split('_')[1]
        if chrom == 'UNKN':
            continue
        seq_text = str(rec.seq)
        # each match is one run; its span length is the run length
        chrom_Ns[chrom] = [m.end() - m.start() for m in n_run.finditer(seq_text)]
        chrom_sizes[chrom] = len(seq_text)

In [None]:
# Print one summary line per chromosome: its size, the percentage of Ns,
# the number of N runs, and the longest run ('NA' when there are no runs).
report_fmt = 'chrom {:2}, size: {:8}, percent Ns: {:4.1f}, # of runs of Ns: {:4}, max N run: {:5}'
for chrom in chrom_Ns:
    n_runs = chrom_Ns[chrom]
    size = chrom_sizes[chrom]
    longest_run = max(n_runs) if n_runs else 'NA'
    pct_Ns = round(100*sum(n_runs)/size, 1)
    print(report_fmt.format(chrom, size, pct_Ns, len(n_runs), longest_run))

### 3) GET THE SCAFFOLDS (SUPERCONTIGS) AND SCAFFOLD SIZES OF ATROPARVUS

In [None]:
# Collect, for every record of the atroparvus FASTA file, its size and its
# fraction of uncalled bases.
#   sizes  - record lengths, in file order
#   size_N - (record length, fraction of N/n bases) pairs, in file order
# The earlier per-nucleotide counting loop took about 20 seconds.
a_name = os.path.join(remote_data_dir, atroparvus_name)
sizes  = []
size_N = []
# the with-block closes the FASTA handle once all records have been read
with open(a_name, 'rt', encoding='utf-8') as a_file:
    recs = SeqIO.parse(a_file, 'fasta')
    for rec in recs:
        seq_text = str(rec.seq)
        size = len(seq_text)
        sizes.append(size)
        # str.count replaces the per-character Python loop
        count_N = seq_text.count('N') + seq_text.count('n')
        # an empty record would otherwise raise ZeroDivisionError
        size_N.append((size, count_N / size if size else 0.0))

In [None]:
# Summary statistics of the scaffold sizes collected in `sizes`:
# count, median, mean, extremes, and the 10th / 90th percentiles.
n_scaf = len(sizes)
size_median = np.median(sizes)
size_mean = np.mean(sizes)
largest = max(sizes)
smallest = min(sizes)
pct_10 = np.percentile(sizes, 10)
pct_90 = np.percentile(sizes, 90)
stats_fmt = '{} scaf, med: {}, mean: {:8.1f}, max: {}, min: {}, 10th %: {:4.1f}, 90th %: {:4.1f}'
txt = stats_fmt.format(n_scaf, size_median, size_mean, largest, smallest, pct_10, pct_90)
print(txt)

### 4) PLOT FRACTION OF Ns PER CONTIG SIZE

In [None]:
# Scatter the N content of every (size, fraction) pair in size_N against its
# size, in three side-by-side panels (sizes up to small_split, between the
# two splits, and above large_split) that share one y axis.
small_split = 4800
large_split = 540000
fig, axs = plt.subplots(1, 3, figsize=(16, 9), dpi=300, squeeze=False, sharey=True)
# one membership test per panel, left to right
panel_filters = [
    lambda size: size <= small_split,
    lambda size: small_split < size <= large_split,
    lambda size: size > large_split,
]
for ax, keep in zip(axs[0], panel_filters):
    # size_N stores fractions; scale them to percent for plotting
    points = [(x, 100 * y) for x, y in size_N if keep(x)]
    # zip(*[]) cannot be unpacked into xs, ys, so an empty bin is left blank
    if points:
        xs, ys = zip(*points)
        ax.plot(xs, ys, '.')
    ax.grid()
axs[0, 1].set_xlim(small_split, large_split)
# the plotted values are 100 * fraction, so the axis is labelled as percent
axs[0, 0].set_ylabel('Percent of Ns', fontsize=12)
axs[0, 1].set_xlabel('Contig size', fontsize=12)
fig.suptitle('Fraction of Ns per contig size', fontsize=26)
plt.show()