# Get PAM sites in whole-genome coding regions

This notebook uses the Python package `gffutils` to parse CDS features, and `re` to find the positions of NGG PAMs.

This serves as the first layer of the reference-genome predictions.


In [1]:
# change into the CROTONdb repository directory so that `backend` can be imported
# NOTE(review): this is a machine-specific absolute path, not a relative
# "up one level" move; adjust it when running on another machine
%cd /common/zhangz2lab/jling/CROTONdb/

/common/zhangz2lab/jling/CROTONdb


In [2]:
import backend
# for watermark
import gffutils
import pandas as pd
import numpy as np
import os
import json
from pyfaidx import Fasta
from tqdm import tqdm
import time

In [3]:
# display the data directory that backend is configured to use
backend.configs.DATA_DIR

'/common/zhangz2lab/jling/CROTONdb/backend/../frontend/data'

In [4]:
# Build the gffutils SQLite database for the annotation file, but only when the
# annotation exists on disk and the database has not been created yet.
gtf_db_path = backend.configs.GTF_DB_PATH
print("GTF_DB_PATH:", gtf_db_path)

# The annotation file is the database path minus its '.gtf_sqldb' suffix.
# str.rstrip() takes a *set of characters*, not a suffix, so
# 'x.gtf.gtf_sqldb'.rstrip('.gtf_sqldb') would over-strip to 'x'.
# Slice the suffix off explicitly instead.
sqldb_suffix = '.gtf_sqldb'
if gtf_db_path.endswith(sqldb_suffix):
    gtf_path = gtf_db_path[:-len(sqldb_suffix)]
else:
    # No recognizable suffix: the check below then compares the path with
    # itself and is never true, so nothing gets built.
    gtf_path = gtf_db_path

if os.path.isfile(gtf_path) and not os.path.isfile(gtf_db_path):
    t0 = time.time()
    print("found GTF file but not SQL database, building with gffutils; this will only run once.")
    # see here: https://daler.github.io/gffutils/autodocs/gffutils.create.create_db.html
    gffutils.create.create_db(
        data=gtf_path,
        dbfn=gtf_db_path,
        merge_strategy="merge",
    )
    print("took %.3f seconds.." % (time.time() - t0))

GTF_DB_PATH: /common/zhangz2lab/jling/CROTONdb/backend/../frontend/data/genomes/Small_Test_Genome.CROTONTest.gff3.gtf_sqldb
found GTF file but not SQL database, building with gffutils; this will only run once.
took 0.100 seconds..


In [5]:
# Build the CDS table from the gffutils database; the displayed frame has
# genename, chrom, start, end and strand columns.
cds_df = backend.get_CDS_df(db_path=backend.configs.GTF_DB_PATH)

cds_df

reading gtf_db..


100%|██████████| 2/2 [00:00<00:00, 13751.82it/s]


Unnamed: 0,genename,chrom,start,end,strand
0,CROTONTestGene,chr1,0,30,+
1,CROTONTestGene,chr1,12,40,+


In [6]:
# Add a `seq` column to the CDS table, read from the genome at GENOME_FA_PATH.
# This rebinds `cds_df`, so re-running only this cell passes the already
# augmented frame back in.
# NOTE(review): confirm get_CDS_seq tolerates a frame that already has `seq`.
cds_df = backend.get_CDS_seq(df=cds_df, genome_path=backend.configs.GENOME_FA_PATH)

cds_df

100%|██████████| 2/2 [00:00<00:00, 3433.73it/s]


Unnamed: 0,genename,chrom,start,end,strand,seq
0,CROTONTestGene,chr1,0,30,+,ATGACGGCGACGAACAAGCACCTCATACGA
1,CROTONTestGene,chr1,12,40,+,AACAAGCACCTCATACGAAAGCACCTCA


In [7]:
# Locate PAMs in each CDS sequence; the resulting frame gains the `pams` and
# `rc_pams` columns (presumably forward and reverse-complement hits; verify in backend).
pam_df = backend.get_PAM_coords(df=cds_df)

100%|██████████| 2/2 [00:00<00:00, 2629.66it/s]


In [8]:
# preview the first rows of the PAM table
pam_df.head()

Unnamed: 0,genename,chrom,start,end,strand,seq,pams,rc_pams
0,CROTONTestGene,chr1,0,30,+,ATGACGGCGACGAACAAGCACCTCATACGA,[4],[23]
1,CROTONTestGene,chr1,12,40,+,AACAAGCACCTCATACGAAAGCACCTCA,[],"[38, 23]"


In [9]:
# To make the 60bp input for CROTON, pad 33bp to the left of each PAM and 27bp
# to the right (33 + 27 = 60). The call returns the PAM window table together
# with the genes that were skipped.
CDSpamsbed_df, skipped_genes = backend.make_CDSpamsbed(pam_df=pam_df, pam_left=33, pam_right=27)

100%|██████████| 1/1 [00:00<00:00, 354.10it/s]

processed PAMs in CDS: (3, 7)
skipped genes: []





In [10]:
# Add the `ref_seq` column for each PAM window, read from the genome at
# GENOME_FA_PATH. This rebinds `CDSpamsbed_df` in place of the previous frame.
CDSpamsbed_df = backend.get_ref_PAM_seq(df=CDSpamsbed_df, genome_path=backend.configs.GENOME_FA_PATH)

100%|██████████| 3/3 [00:00<00:00, 5167.52it/s]


In [11]:
# Preview the PAM windows with their reference sequences.
# NOTE(review): in the recorded test-genome output, some windows have a negative
# `start` and a `ref_seq` shorter than 60bp, which looks like windows running
# past the chromosome ends. Confirm whether these should be dropped or clamped upstream.
CDSpamsbed_df.head()

Unnamed: 0,start,end,strand,#,genename,num,pamid,ref_seq
2,-29,31,+,1,CROTONTestGene,1,CROTONTestGene|1,GAACAAGCACCTCATACGAA
1,-4,56,-,1,CROTONTestGene,2,CROTONTestGene|2,TGAG
0,11,71,-,1,CROTONTestGene,3,CROTONTestGene|3,TGAGGTGCTTTCGTATGAGGTGCTTGTTC


In [15]:
# Split the PAM table into subsets and save them under CDS_PAM_DIR; the printed
# labels (X, 1-22) are presumably the chromosomes being processed; verify in backend.
backend.subset_CDSpams(CDSpamsbed_df, save_path=backend.configs.CDS_PAM_DIR, chrom_type='number')

X
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


In [13]:
# Record the run date, the Python/IPython versions and the versions of the
# imported packages for reproducibility.
%load_ext watermark
%watermark -n -u -v -iv -w

Last updated: Wed Dec 14 2022

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.5.0

gffutils: 0.11.1
json    : 2.0.9
pandas  : 1.5.2
backend : 0.0.1
numpy   : 1.23.4

Watermark: 2.3.1

