# Computing %GC
Compute the overall %GC of all CDS regions and for the whole genomes

This only depends on the reference sequences.
CDS regions are identified from the GTF using the same code as used for the primary analysis in `exons_Cas9_vs_Cpf1.ipynb`

Change `SETNAME` below to run for AgamGF or AAEL

In [12]:
import sys; print(sys.version)
import os
from collections import OrderedDict
import pickle

import subprocess
from collections import Counter

import numpy as np; print('numpy', np.__version__)
import scipy; print('scipy', scipy.__version__)
import pandas as pd; print('pandas',pd.__version__)
import allel; print('scikit-allel', allel.__version__)
import zarr; print('zarr', zarr.__version__)

import matplotlib as mpl
import matplotlib.pyplot as plt

import statsmodels; print('statsmodels', statsmodels.__version__)
import statsmodels.api as sm

from IPython.display import display, HTML

3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:34:09) [GCC 12.3.0]
numpy 1.26.0
scipy 1.12.0
pandas 2.1.1
scikit-allel 1.3.7
zarr 2.16.1
statsmodels 0.14.1


In [2]:
# %matplotlib inline
mpl.rcParams['figure.facecolor'] = '#BBBBBB'

# bit of code to showing all a dataframe
from IPython.display import display, HTML
def display_all(df):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', None):
        display(df)
def display_all_cols(df, max_rows=60):
    with pd.option_context('display.max_rows', max_rows, 'display.max_columns', None, 'display.width', None):
        display(df)

In [3]:
SETNAME = 'AAEL'

if SETNAME == 'AgamGF':
    ## AgamGF ##########
    GENOME = "AgamP4.12"
    GENE_PREFIX = "AGAP"
    SAMPLE_LIST_FN = None
    CHROMS = ['2R','2L','3R','3L','X']
    REFFN = "data/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa"
    CHROMSBEDFN = 'AgamP4_chroms.bed'
    GFFFN = "data/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3"
    TRANSFN = "data/AgamP4.12.transcript_list"

elif SETNAME == 'AAEL':
    ## AaegL5 #########
    GENOME = "AaegL5.1"
    GENE_PREFIX = "AAEL"
    SAMPLE_LIST_FN = None
    CHROMS = ['1','2','3']
    REFFN = "/data2/data/reference/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa"
    CHROMSBEDFN = 'AaegL5_chroms.bed'
    GFFFN = "data/Aedes-aegypti-LVP_AGWG_BASEFEATURES_AaegL5.1.gff3"
    TRANSFN = "data/Aedes-aegypti-LVP_AGWG_BASEFEATURES_AaegL5.1.transcript_list"


In [4]:
translist = pd.read_csv(TRANSFN, header=None, names=['transcript_id'])
t = translist['transcript_id'].str.rsplit('-', n=1, expand=True)
t.columns = ['gene', 'splice_id']
translist = pd.concat((translist,t), axis=1)
translist

Unnamed: 0,transcript_id,gene,splice_id
0,AAEL008159-RA,AAEL008159,RA
1,AAEL026895-RA,AAEL026895,RA
2,AAEL020319-RA,AAEL020319,RA
3,AAEL025965-RB,AAEL025965,RB
4,AAEL025965-RA,AAEL025965,RA
...,...,...,...
29330,AAEL012106-RA,AAEL012106,RA
29331,AAEL012106-RH,AAEL012106,RH
29332,AAEL012106-RG,AAEL012106,RG
29333,AAEL012102-RA,AAEL012102,RA


In [5]:
#
# Get the table of CDSs from the gff3 file
#

t = translist.copy(deep=True)

assert t.shape[0] == t['transcript_id'].nunique(), "trascript list should have no duplicte transcript_ids"

t.set_index('transcript_id', inplace=True)

gffdf = pd.read_csv(GFFFN, sep='\t', comment='#', header=None,
            names=['seqid',
                   'source',
                   'type',
                   'start',
                   'end',
                   'score',
                   'strand',
                   'phase',
                   'attributes'],
                dtype={'seqid':str})

# total number of genes (just accounting / checking)
# note: assumes one entry per gene & pseudogene in gff
tmp = (gffdf['type'] == 'gene').sum()
print("total number of genes in gtf:", tmp)
tmp2 = (gffdf['type'] == 'pseudogene').sum()
print("total number of pseudogenes in gtf:", tmp2)
# results_dict['total pseudogenes in gtf'] = tmp2
assert tmp+tmp2 == t['gene'].nunique(), "number of genes+pseudogenes in gtf should equal number of genes in gene_table"

# just look at CDS entries
cds = gffdf.loc[gffdf['type']=='CDS' ,:].copy(deep=True)
# add transcript_id to Exon entries
cds['transcript_id'] = cds['attributes'].apply(
    lambda x: [x3[1].strip("'") for x3 in [x2.split('=') for x2 in x.split(';')] if x3[0] == 'Parent' ][0])

# join with the transcript list to get the gene_ids
num_CDSs = cds.shape[0]
cds = cds.join(translist.set_index('transcript_id'), on='transcript_id', how='outer')

print(f"CDSs not matching a transcript from translist: {(cds['gene'].isna()).sum()}") # should be 0
assert cds['gene'].isna().sum()==0, "CDS not matching a transcript in translist"

print(f"translist entries without a CDS (presumably pseudogenes): {(cds['seqid'].isna()).sum()}")
# drop the non-matching translist entries
cds = cds.loc[~cds['seqid'].isna(),:]

# cast positions to ints
cds['start'] = cds['start'].astype(int)
cds['end'] = cds['end'].astype(int)

# Only take CDSs in the CRHOMS of interest
tmp = cds.shape[0]
cds = cds[cds['seqid'].isin(CHROMS)]
print(f'CDSs not in CHROMS {CHROMS}: {tmp-cds.shape[0]}')

cds.sort_values(['seqid','start','splice_id'], inplace=True)
cds.reset_index(drop=True, inplace=True)

cds

total number of genes in gtf: 14626
total number of pseudogenes in gtf: 382
CDSs not matching a transcript from translist: 0
translist entries without a CDS (presumably pseudogenes): 1019
CDSs not in CHROMS ['1', '2', '3']: 4067


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transcript_id,gene,splice_id
0,1,VectorBase,CDS,31666,31761,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA
1,1,VectorBase,CDS,31666,31761,.,+,0,ID=AAEL012102-PB;Parent=AAEL012102-RB;protein_...,AAEL012102-RB,AAEL012102,RB
2,1,VectorBase,CDS,31832,32404,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA
3,1,VectorBase,CDS,31832,32404,.,+,0,ID=AAEL012102-PB;Parent=AAEL012102-RB;protein_...,AAEL012102-RB,AAEL012102,RB
4,1,VectorBase,CDS,32495,32673,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA
...,...,...,...,...,...,...,...,...,...,...,...,...
168730,3,VectorBase,CDS,409622594,409622821,.,-,0,ID=AAEL010311-PA;Parent=AAEL010311-RA;protein_...,AAEL010311-RA,AAEL010311,RA
168731,3,VectorBase,CDS,409622594,409622821,.,-,0,ID=AAEL010311-PB;Parent=AAEL010311-RB;protein_...,AAEL010311-RB,AAEL010311,RB
168732,3,VectorBase,CDS,409688525,409688690,.,+,0,ID=AAEL010318-PA;Parent=AAEL010318-RA;protein_...,AAEL010318-RA,AAEL010318,RA
168733,3,VectorBase,CDS,409695468,409696934,.,+,2,ID=AAEL010318-PA;Parent=AAEL010318-RA;protein_...,AAEL010318-RA,AAEL010318,RA


In [6]:
# drop CDSs which have length 1 (start==end)
print(f"DROPPING {(cds['start'] == cds['end']).sum()} CDSs where start==end")
cds = cds[cds['start'] != cds['end']].copy()


# look for overlapping CDS with the SAME GENE
print(f"number of CDS which are simple splice variants (same start,end,strand,gene): {cds.duplicated(['seqid','start','end','strand','gene'], keep='first').sum()}")

# drop simple splice variants
tmp = cds.shape[0]
cds.drop_duplicates(['seqid','start','end','strand','gene'], keep='first', inplace=True)
print(f"DROPPING {tmp-cds.shape[0]} simple spliace variants")

# overlap groups with same gene
overlap = ((cds['start']<=cds['end'].shift(1)) & (cds['seqid']==cds['seqid'].shift(1)) & (cds['gene']==cds['gene'].shift(1)))
cds['overlap_group'] = (~overlap).cumsum()
gb = cds.groupby('overlap_group', dropna=False)

assert (gb['gene'].nunique() == 1).all(), "all groups should contain only the same gene"

print(f"number of same-gene overlap groups with multiple CDSs: {(gb['gene'].count()>1).sum()}")
print(f"number of same-gene overlapping CDSs: {gb['gene'].count()[gb['gene'].count()>1].sum()}")

foo = gb.nth(0).copy()
foo['start'] = gb['start'].min()
foo['end'] = gb['end'].max()
foo['CDS cnt'] = gb.size()
foo['splice_id'] = gb['splice_id'].agg(lambda x: ','.join(x))

foo['full_len'] = gb['end'].max()-gb['start'].min()+1
foo['overlap_len'] = gb['end'].min()-gb['start'].max()+1

print(f"number of same-gene groups with CDSs of different lengths: {foo[foo['overlap_len'] != foo['full_len']].shape[0]}")

print("** number of protein coding genes:", cds['gene'].nunique())

cds_grps = foo
cds_grps

DROPPING 79 CDSs where start==end
number of CDS which are simple splice variants (same start,end,strand,gene): 103728
DROPPING 103728 simple spliace variants
number of same-gene overlap groups with multiple CDSs: 2667
number of same-gene overlapping CDSs: 5684
number of same-gene groups with CDSs of different lengths: 40203
** number of protein coding genes: 13601


Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transcript_id,gene,splice_id,overlap_group,CDS cnt,full_len,overlap_len
0,1,VectorBase,CDS,,,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,,1,,,
2,1,VectorBase,CDS,31832.0,32404.0,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA,2,1.0,573.0,573.0
4,1,VectorBase,CDS,32732.0,33596.0,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA,3,1.0,865.0,865.0
6,1,VectorBase,CDS,41767.0,41958.0,.,+,1,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA,4,1.0,192.0,192.0
8,1,VectorBase,CDS,44349.0,44471.0,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA,5,1.0,123.0,123.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168728,3,VectorBase,CDS,,,.,-,0,ID=AAEL010311-PA;Parent=AAEL010311-RA;protein_...,AAEL010311-RA,AAEL010311,,61907,,,
168730,3,VectorBase,CDS,,,.,-,0,ID=AAEL010311-PA;Parent=AAEL010311-RA;protein_...,AAEL010311-RA,AAEL010311,,61908,,,
168732,3,VectorBase,CDS,,,.,+,0,ID=AAEL010318-PA;Parent=AAEL010318-RA;protein_...,AAEL010318-RA,AAEL010318,,61909,,,
168733,3,VectorBase,CDS,,,.,+,2,ID=AAEL010318-PA;Parent=AAEL010318-RA;protein_...,AAEL010318-RA,AAEL010318,,61910,,,


In [7]:
# computing the cumulatilve length (bp) of the CDSs accounting for overlaps
overlap = ((cds['start']<=cds['end'].shift(1)) & (cds['seqid']==cds['seqid'].shift(1)))
cds['overlap_group_ignoring_gene'] = (~overlap).cumsum()

gb = cds.groupby('overlap_group_ignoring_gene', dropna=False)

print(f"number of overlap groups with multiple CDSs: {(gb['gene'].count()>1).sum()}")
print(f"number of overlapping CDSs: {gb['gene'].count()[gb['gene'].count()>1].sum()}")

total_coding_len = (gb['end'].max()-gb['start'].min()+1).sum()
print(f"** total coding length [bp]: {total_coding_len}")

print(f"total coding len possibly multiple counting non-same-gene overlaps: {cds_grps['full_len'].sum()}")

number of overlap groups with multiple CDSs: 2681
number of overlapping CDSs: 5714
** total coding length [bp]: 23255031
total coding len possibly multiple counting non-same-gene overlaps: 8370802.0


In [8]:
cds

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,transcript_id,gene,splice_id,overlap_group,overlap_group_ignoring_gene
0,1,VectorBase,CDS,31666,31761,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA,1,1
2,1,VectorBase,CDS,31832,32404,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA,2,2
4,1,VectorBase,CDS,32495,32673,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA,3,3
6,1,VectorBase,CDS,32732,33596,.,+,1,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA,4,4
8,1,VectorBase,CDS,33720,33905,.,+,0,ID=AAEL012102-PA;Parent=AAEL012102-RA;protein_...,AAEL012102-RA,AAEL012102,RA,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168728,3,VectorBase,CDS,409621462,409622532,.,-,0,ID=AAEL010311-PA;Parent=AAEL010311-RA;protein_...,AAEL010311-RA,AAEL010311,RA,61907,61891
168730,3,VectorBase,CDS,409622594,409622821,.,-,0,ID=AAEL010311-PA;Parent=AAEL010311-RA;protein_...,AAEL010311-RA,AAEL010311,RA,61908,61892
168732,3,VectorBase,CDS,409688525,409688690,.,+,0,ID=AAEL010318-PA;Parent=AAEL010318-RA;protein_...,AAEL010318-RA,AAEL010318,RA,61909,61893
168733,3,VectorBase,CDS,409695468,409696934,.,+,2,ID=AAEL010318-PA;Parent=AAEL010318-RA;protein_...,AAEL010318-RA,AAEL010318,RA,61910,61894


In [9]:
# write out bed file of all CDSs
t = cds[['seqid','start','end']].copy()
t['start'] -= 1 # BED is 0-based half-open
display(t)
BEDFN = f"{SETNAME}_CDSs.bed"
t.to_csv(BEDFN, sep='\t', index=False, header=False)

Unnamed: 0,seqid,start,end
0,1,31665,31761
2,1,31831,32404
4,1,32494,32673
6,1,32731,33596
8,1,33719,33905
...,...,...,...
168728,3,409621461,409622532
168730,3,409622593,409622821
168732,3,409688524,409688690
168733,3,409695467,409696934


In [10]:
cmd = f"bedtools getfasta -fi {REFFN} -bed {BEDFN} | grep -v '^>'"# | tr -d '[:space:]'"

print(f"running:\n{cmd}")
with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) as p:
    c = Counter()
    for line in p.stdout:
        c += Counter(line)

raw_cnts = {}
for k,v in c.items():
    raw_cnts[chr(k)]=v
display(raw_cnts)

c = {}
sum = 0
for nuc in ['A','G','C','T']:
    c[nuc] = raw_cnts[nuc]+raw_cnts[nuc.lower()]
    sum += c[nuc]
display(c)
display(sum)

print(f"CDSs only %GC = {(c['G']+c['C'])/sum}")

running:
bedtools getfasta -fi /data2/data/reference/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa -bed AAEL_CDSs.bed | grep -v '^>'


{'A': 5925009,
 'T': 5965905,
 'G': 5911759,
 'C': 5911333,
 '\n': 64928,
 'c': 172479,
 'a': 185045,
 'g': 173422,
 't': 193548}

{'A': 6110054, 'G': 6085181, 'C': 6083812, 'T': 6159453}

24438500

CDSs only %GC = 0.49794353172248706


In [11]:
# Whole reference (selected chroms only)
cmd = f"bedtools getfasta -fi {REFFN} -bed {CHROMSBEDFN} | grep -v '^>'"# | tr -d '[:space:]'"

print(f"running:\n{cmd}")
with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) as p:
    c = Counter()
    for line in p.stdout:
        c += Counter(line)

raw_cnts = {}
for k,v in c.items():
    raw_cnts[chr(k)]=v
display(raw_cnts)

c = {}
sum = 0
for nuc in ['A','G','C','T']:
    c[nuc] = raw_cnts[nuc]+raw_cnts[nuc.lower()]
    sum += c[nuc]
display(c)
display(sum)

print(f"Whole genome %GC = {(c['G']+c['C'])/sum}")

running:
bedtools getfasta -fi /data2/data/reference/Aedes-aegypti-LVP_AGWG_CHROMOSOMES_AaegL5.fa -bed AaegL5_chroms.bed | grep -v '^>'


{'G': 51784094,
 'A': 79491128,
 'T': 79527252,
 'C': 51810939,
 'a': 289997861,
 'c': 176232833,
 't': 289937805,
 'g': 176231161,
 'n': 17335,
 '\n': 3}

{'A': 369488989, 'G': 228015255, 'C': 228043772, 'T': 369465057}

1195013073

Whole genome %GC = 0.3816351781450344
