# WS_ch05A.ipynb
# WESmith 11/24/22
## WS created this notebook to follow along with the code in the book:
## 'Bioinformatics with Python Cookbook' by Tiago Antao
## Each recipe will have its own notebook, suffixed by A, B, etc.

# RECIPE: 
# WORKING WITH HIGH-QUALITY REFERENCE GENOMES
## also see book code in Chapter05/Reference_Genome.py

In [None]:
from IPython.core.display import Image
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio import SeqIO
from Bio import SeqUtils
from Bio.Graphics import BasicChromosome
import os

In [None]:
# source of the Plasmodium falciparum genome
#!wget http://plasmodb.org/common/downloads/release-13.0/Pfalciparum3D7/fasta/data/PlasmoDB-13.0_Pfalciparum3D7_Genome.fasta

In [None]:
# Directory (relative path) holding the downloaded genome and, later, the output figure.
data_dir  = 'data/ch05_data'
# File name of the genome FASTA; matches the wget URL shown in the cell above.
genome = 'PlasmoDB-13.0_Pfalciparum3D7_Genome.fasta'
# Full path to the FASTA file; this is what the SeqIO.parse calls below read.
file   = os.path.join(data_dir, genome)

### 1) INSPECT THE SEQUENCES IN THE FASTA FILE

In [None]:
# Print the description line of every record in the FASTA file so the
# naming scheme of the sequences can be inspected by eye.
# The parser is consumed by this loop; the GC cell below calls
# SeqIO.parse again to read the file a second time.
recs = SeqIO.parse(file, 'fasta')
for rec in recs:
    print(rec.description)

### 2) 3) COMPUTE THE GC FRACTION PER BLOCK OF BASE PAIRS

In [None]:
# Compute the GC content of every chromosome in consecutive,
# non-overlapping blocks of block_size bases.
#   chrom_sizes   : chromosome number -> sequence length
#   chrom_GC      : chromosome number -> list of per-block GC values, in order
#   min_GC/max_GC : smallest / largest GC value over all non-empty blocks
# NOTE(review): SeqUtils.GC appears to be deprecated since Biopython 1.80 in
# favour of SeqUtils.gc_fraction (which returns a 0-1 fraction rather than a
# percentage) - confirm against the installed Biopython version.
chrom_sizes = {}
chrom_GC = {}
recs = SeqIO.parse(file, 'fasta')
block_size = 50000
min_GC = 100.0
max_GC = 0.0
for rec in recs:
    # Only records whose description is tagged as a chromosome are analysed.
    if 'SO=chromosome' not in rec.description:
        continue
    # The chromosome number is the second '_'-separated field of the description.
    chrom = int(rec.description.split('_')[1])
    chrom_GC[chrom] = []
    size = len(rec.seq)
    chrom_sizes[chrom] = size
    # One entry per full block plus one for the remainder.  When size is an
    # exact multiple of block_size the remainder block is empty; it is still
    # recorded so that the plotting loop, which gives the final block a scale
    # of chrom_size % block_size, stays consistent with this list.
    num_blocks = size // block_size + 1
    for block in range(num_blocks):
        start = block_size * block
        # Slices are half-open, so the block ends at start + block_size
        # (clamped to the sequence length for the final block).
        end = min(start + block_size, size)
        block_seq = rec.seq[start:end]
        block_GC = SeqUtils.GC(block_seq)
        chrom_GC[chrom].append(block_GC)
        # An empty block carries no information, so it must not influence
        # the global extremes.
        if end > start:
            min_GC = min(min_GC, block_GC)
            max_GC = max(max_GC, block_GC)

In [None]:
# Show the lowest and highest per-block GC values found by the loop above.
min_GC, max_GC

### 4) PLOT

In [None]:
# Chromosome numbers in ascending order, followed by the length of the
# longest chromosome (assigned to scale_num of every chromosome when plotting).
chroms = sorted(chrom_sizes)
biggest_chrom = max(chrom_sizes.values())

In [None]:
# Drawing canvas and colour-scale settings for the chromosome figure.
my_genome = BasicChromosome.Organism(output_format="png")
# WS: page is 11 x 8.5 inches (2.54 cm per inch) in reportlab units;
# the earlier setting was (29.7*cm, 21*cm).
my_genome.page_size = (11.0 * 2.54 * cm, 8.5 * 2.54 * cm)
# Scale assigned to each telomere segment in the plotting loop.
telomere_length = 10
# WS: GC values at the bottom and top of the colour ramp; blocks outside
# this range get their own fixed colours in the plotting loop.
# A data-driven alternative that was tried:
#   bottom_GC = (1 + 0.1) * min_GC
#   top_GC    = (1 - 0.3) * max_GC
bottom_GC, top_GC = 17.5, 22.0

In [None]:
# Show the colour-ramp limits currently in effect.
bottom_GC, top_GC

In [None]:
# Build one drawable chromosome per entry in chroms and add it to my_genome.
# Each chromosome is a telomere segment, one coloured body segment per GC
# block, and a closing (inverted) telomere segment.
for chrom in chroms:
    chrom_size = chrom_sizes[chrom]
    chrom_representation = BasicChromosome.Chromosome('Cr %d' % chrom)
    # Every chromosome gets the longest chromosome's length as scale_num,
    # presumably so that all of them are drawn to a common scale.
    chrom_representation.scale_num = biggest_chrom

    tel = BasicChromosome.TelomereSegment()
    tel.scale = telomere_length
    chrom_representation.add(tel)

    num_blocks = len(chrom_GC[chrom])
    for block, my_GC in enumerate(chrom_GC[chrom]):
        body = BasicChromosome.ChromosomeSegment()
        # WS experimented with the colours (R, G, B components):
        # above the ramp -> (1, 1, 1), below the ramp -> (0, 0, 0),
        # inside the ramp -> from (0, 0, 1) at bottom_GC to (1, 1, 1) at top_GC.
        if my_GC > top_GC:
            body.fill_color = colors.Color(1, 1, 1)
        elif my_GC < bottom_GC:
            body.fill_color = colors.Color(0, 0, 0)
        else:
            my_color = (my_GC - bottom_GC) / (top_GC - bottom_GC)
            body.fill_color = colors.Color(my_color, my_color, 1)
        # Every block but the last spans block_size bases; the last one holds
        # the remainder (the GC cell creates size // block_size + 1 blocks).
        if block < num_blocks - 1:
            body.scale = block_size
        else:
            body.scale = chrom_size % block_size
        chrom_representation.add(body)

    tel = BasicChromosome.TelomereSegment(inverted=True)
    tel.scale = telomere_length
    chrom_representation.add(tel)

    my_genome.add(chrom_representation)

In [None]:
# Path of the output image, placed in the same data directory as the genome.
out = os.path.join(data_dir,"falciparum.png")
# Render all chromosomes added to my_genome into that file; the second
# argument is presumably the figure title - confirm against Organism.draw.
my_genome.draw(out, "Plasmodium falciparum")

In [None]:
# Load the image file written above so the notebook shows it as this cell's output.
Image(out)