# Download, pick, and generate data of specific genomic contexts.
To pick regions of interest in specific genomic contexts and make predictions on them, this example uses GRCh38 chromosomes [2, 4, 6, 8, 10, 12]. (The paper originally used Ensembl build 104; the files downloaded below come from Ensembl release 109.)

There are three types of raw files:
- Regulatory Features: promoter, enhancers, TFBS, open chromatin, CTCF BS...
- Feature types: Exons, CDS...
- CpG features: CpG islands, CpG shores...

Each type of file is processed in three steps:
- Download
- Pick the regions of interest and the corresponding indices
- Generate the dataset from the new indices

In [37]:
import numpy as np
import os
import requests
import gzip
import shutil
import pandas as pd
import bisect
import subprocess
from tqdm import tqdm
# Chromosomes to process. IDs are kept as strings because they are concatenated
# into file names and into the 'chr<ID>' keys used to index `y` and `pos` below.
target_chromosome = [str(chrom) for chrom in (2, 4, 6, 8, 10, 12)]
# Load the Hemato dataset archives (adjust both paths to your local copies).
# Each archive is later indexed by chromosome name, e.g. y['chr2'] / pos['chr2'].
y = np.load("y_Hemato.npz")
pos = np.load("pos_Hemato.npz")

## Regulatory Features

Available regions: {'TF_binding_site', 'open_chromatin_region', 'CTCF_binding_site', 'promoter', 'enhancer'}

### Download

In [38]:
# Define the URL and the local names of the compressed / decompressed file
url = 'https://ftp.ensembl.org/pub/release-109/regulation/homo_sapiens/homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20221007.gff.gz'
filename = 'homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20221007.gff.gz'
unzip_filename = 'homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20221007.gff'

# Skip everything when the decompressed file is already present.
if not os.path.exists(unzip_filename):
    # Stream the archive into the current directory instead of holding the
    # whole response body in memory.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail here with the HTTP error rather than later with a confusing
        # "file not found" from gzip when the download did not succeed.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

    # Decompress to a temporary name and rename only once complete, so an
    # interrupted run cannot leave a truncated file that the existence check
    # above would mistake for a finished download.
    partial_filename = unzip_filename + '.part'
    with gzip.open(filename, 'rb') as f_in:
        with open(partial_filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.replace(partial_filename, unzip_filename)

### Pick regions of interest and generate dataset

In [39]:
# Load the first five GFF columns into a pandas DataFrame.
# The sequence-ID column is read as str: if type inference parsed numeric
# chromosome names as integers they would never compare equal to the string
# IDs in `target_chromosome`, and those chromosomes would silently come out empty.
df = pd.read_csv(unzip_filename, sep='\t', header=None, usecols=range(5), dtype={0: str})
column_names = ['chr_id', 'Regulatory_build', 'type', 'start', 'end']
df.columns = column_names
for pick_type_id in ['TF_binding_site', 'open_chromatin_region', 'CTCF_binding_site', 'promoter', 'enhancer']:
    pick_type = pick_type_id
    # Each region type is written to its own pair of archives; skip types
    # whose output already exists.
    if not os.path.exists('y_'+pick_type+'.npz'):
        y_picked = {}
        pos_picked = {}
        for target_chromosome_id in tqdm(target_chromosome):
            # Regions of the requested type on this chromosome, ordered by start
            df_chr = df[df['chr_id'] == target_chromosome_id][['type', 'start', 'end']]
            df_chr_pick = df_chr[df_chr['type'] == pick_type][['start', 'end']]
            df_chr_pick = df_chr_pick.sort_values(by='start').reset_index(drop=True)
            chr_pick = df_chr_pick.values

            chromosome_name = 'chr'+target_chromosome_id
            # Dataset rows and their positions for this chromosome.
            # NOTE(review): bisect assumes pos_chr is 1-D and sorted ascending — confirm.
            y_chr = y[chromosome_name]
            pos_chr = pos[chromosome_name]

            # Collect array slices and join them once at the end instead of
            # growing Python lists row by row.
            y_slices = []
            pos_slices = []
            last_start, last_end = 0, 0
            for start, end in chr_pick:
                # Skip a region that starts where the last kept region started,
                # or before that region ended.
                if last_start == start or start < last_end:
                    continue
                # Half-open index range [start_pick, end_pick) of the positions
                # lying in [start, end]; bisect_right already yields the
                # exclusive slice end, so the last position <= end is kept.
                start_pick = bisect.bisect_left(pos_chr, start)
                end_pick = bisect.bisect_right(pos_chr, end)
                if end_pick == 0:
                    # The whole region lies before the first dataset position
                    continue
                y_slices.append(y_chr[start_pick:end_pick, :])
                pos_slices.append(pos_chr[start_pick:end_pick])
                last_start, last_end = start, end

            if y_slices:
                y_chr_picked = np.concatenate(y_slices)
                pos_chr_picked = np.concatenate(pos_slices)
            else:
                # Nothing picked: keep dtype and column dimension of the inputs
                y_chr_picked = y_chr[:0]
                pos_chr_picked = pos_chr[:0]

            y_picked[chromosome_name] = y_chr_picked
            pos_picked[chromosome_name] = pos_chr_picked

        # save as npz, one entry per chromosome
        np.savez_compressed('y_'+pick_type+'.npz', **y_picked)
        np.savez_compressed('pos_'+pick_type+'.npz', **pos_picked)

100%|██████████| 6/6 [00:00<00:00,  6.32it/s]
100%|██████████| 6/6 [00:01<00:00,  5.43it/s]
100%|██████████| 6/6 [00:01<00:00,  5.40it/s]
100%|██████████| 6/6 [00:01<00:00,  5.45it/s]
100%|██████████| 6/6 [00:01<00:00,  3.99it/s]


# Feature types

Available regions: {'exon', 'three_prime_UTR', 'CDS', 'ncRNA_gene', 'V_gene_segment', 'J_gene_segment', 'rRNA', 'scRNA', 'pseudogenic_transcript', 'transcript', 'miRNA', 'biological_region', 'gene', 'mRNA', 'C_gene_segment', 'snRNA', 'unconfirmed_transcript', 'snoRNA', 'pseudogene', 'ncRNA', 'lnc_RNA', 'chromosome', 'five_prime_UTR'}

### Download

In [89]:
for target_chromosome_id in tqdm(target_chromosome):
    # Define the URL and the local names of the compressed / decompressed file
    unzip_filename = 'Homo_sapiens.GRCh38.109.chromosome.'+target_chromosome_id+'.gff3'
    filename = unzip_filename+'.gz'
    url = 'https://ftp.ensembl.org/pub/release-109/gff3/homo_sapiens/'+filename

    # Skip chromosomes whose decompressed file is already present.
    if not os.path.exists(unzip_filename):
        # Stream the archive into the current directory instead of holding
        # the whole response body in memory.
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail here with the HTTP error rather than later with a confusing
            # "file not found" from gzip when the download did not succeed.
            response.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)

        # Decompress to a temporary name and rename only once complete, so an
        # interrupted run cannot leave a truncated file that the existence
        # check above would mistake for a finished download.
        partial_filename = unzip_filename+'.part'
        with gzip.open(filename, 'rb') as f_in:
            with open(partial_filename, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.replace(partial_filename, unzip_filename)

100%|██████████| 6/6 [00:00<00:00, 49932.19it/s]


### Pick regions of interest and generate dataset

In [28]:
for pick_type_id in ['CDS', 'exon']:
    pick_type = pick_type_id
    # Each feature type is written to its own pair of archives; skip types
    # whose output already exists.
    if not os.path.exists('y_'+pick_type+'.npz'):
        y_picked = {}
        pos_picked = {}
        for target_chromosome_id in tqdm(target_chromosome):
            # Load feature type, start and end (GFF3 columns 3-5) of this
            # chromosome's file; lines starting with '#' are skipped.
            df = pd.read_csv('Homo_sapiens.GRCh38.109.chromosome.'+target_chromosome_id+'.gff3', sep='\t', header=None, comment='#', usecols=range(2,5))
            column_names = ['feature_type', 'start', 'end']
            df.columns = column_names

            # Features of the requested type, ordered by start
            df_chr_pick = df[df['feature_type'] == pick_type][['start', 'end']]
            df_chr_pick = df_chr_pick.sort_values(by='start').reset_index(drop=True)
            chr_pick = df_chr_pick.values

            chromosome_name = 'chr'+target_chromosome_id
            # Dataset rows and their positions for this chromosome.
            # NOTE(review): bisect assumes pos_chr is 1-D and sorted ascending — confirm.
            y_chr = y[chromosome_name]
            pos_chr = pos[chromosome_name]

            # Collect array slices and join them once at the end instead of
            # growing Python lists row by row.
            y_slices = []
            pos_slices = []
            last_start, last_end = 0, 0
            for start, end in chr_pick:
                # Skip a feature that starts where the last kept feature
                # started, or before that feature ended.
                if last_start == start or start < last_end:
                    continue
                # Half-open index range [start_pick, end_pick) of the positions
                # lying in [start, end]; bisect_right already yields the
                # exclusive slice end, so the last position <= end is kept.
                start_pick = bisect.bisect_left(pos_chr, start)
                end_pick = bisect.bisect_right(pos_chr, end)
                if end_pick == 0:
                    # The whole feature lies before the first dataset position
                    continue
                y_slices.append(y_chr[start_pick:end_pick, :])
                pos_slices.append(pos_chr[start_pick:end_pick])
                last_start, last_end = start, end

            if y_slices:
                y_chr_picked = np.concatenate(y_slices)
                pos_chr_picked = np.concatenate(pos_slices)
            else:
                # Nothing picked: keep dtype and column dimension of the inputs
                y_chr_picked = y_chr[:0]
                pos_chr_picked = pos_chr[:0]

            y_picked[chromosome_name] = y_chr_picked
            pos_picked[chromosome_name] = pos_chr_picked
        # save as npz, one entry per chromosome
        np.savez_compressed('y_'+pick_type+'.npz', **y_picked)
        np.savez_compressed('pos_'+pick_type+'.npz', **pos_picked)

100%|██████████| 6/6 [00:01<00:00,  3.88it/s]
100%|██████████| 6/6 [00:01<00:00,  3.09it/s]


# CpG islands and CpG shores
 
CpG shores are the regions extending 0-2000 positions upstream and downstream of each CpG island.

['CpG_island', 'CpG_shores']

## Download

In [87]:
# Define the URL and the local names of the compressed / decompressed file
url = 'http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cpgIslandExt.txt.gz'
filename = 'cpgIslandExt.txt.gz'
unzip_filename = 'cpgIslandExt.txt'

# Skip everything when the decompressed file is already present.
if not os.path.exists(unzip_filename):
    # Stream the archive into the current directory instead of holding the
    # whole response body in memory.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail here with the HTTP error rather than later with a confusing
        # "file not found" from gzip when the download did not succeed.
        response.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)

    # Decompress to a temporary name and rename only once complete, so an
    # interrupted run cannot leave a truncated file that the existence check
    # above would mistake for a finished download.
    partial_filename = unzip_filename + '.part'
    with gzip.open(filename, 'rb') as f_in:
        with open(partial_filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.replace(partial_filename, unzip_filename)

## Pick regions of interest and generate dataset

In [32]:
# Load chromosome name, start and end (columns 1-3) of the CpG island table
df = pd.read_csv('cpgIslandExt.txt', sep='\t', header=None, usecols=range(1,4))
column_names = ['chr_id', 'start', 'end']
df.columns = column_names

for pick_type_id in ['shores', 'islands']:
    pick_type = pick_type_id
    # Shores and islands are written to separate archives; skip a type whose
    # output already exists.
    if not os.path.exists('y_CpG_'+pick_type+'.npz'):
        y_picked = {}
        pos_picked = {}
        for target_chromosome_id in tqdm(target_chromosome):
            chromosome_name = 'chr'+target_chromosome_id
            # Islands on this chromosome, ordered by start
            df_chr = df[df['chr_id'] == chromosome_name][['start', 'end']]
            df_chr_pick = df_chr.sort_values(by='start').reset_index(drop=True)
            chr_pick = df_chr_pick.values

            # Dataset rows and their positions for this chromosome.
            # NOTE(review): bisect assumes pos_chr is 1-D and sorted ascending — confirm.
            y_chr = y[chromosome_name]
            pos_chr = pos[chromosome_name]

            # Collect array slices and join them once at the end instead of
            # growing Python lists row by row.
            y_slices = []
            pos_slices = []
            last_start, last_end = 0, 0
            for start, end in chr_pick:
                if pick_type == 'shores': # CpG shores
                    # Skip an island whose left shore starts where the last kept
                    # span started, or before that span ended.
                    if last_start == start-2000 or start-2000 < last_end:
                        continue
                    # Half-open index ranges of the positions in the left shore
                    # [start-2000, start] and the right shore [end, end+2000];
                    # bisect_right already yields the exclusive slice end, so
                    # the last position of each shore is kept.
                    startL_pick = bisect.bisect_left(pos_chr, start-2000)
                    endL_pick = bisect.bisect_right(pos_chr, start)
                    startR_pick = bisect.bisect_left(pos_chr, end)
                    endR_pick = bisect.bisect_right(pos_chr, end+2000)
                    if endR_pick == 0:
                        # Both shores lie before the first dataset position
                        continue
                    y_slices.append(y_chr[startL_pick:endL_pick, :])
                    y_slices.append(y_chr[startR_pick:endR_pick, :])
                    pos_slices.append(pos_chr[startL_pick:endL_pick])
                    pos_slices.append(pos_chr[startR_pick:endR_pick])
                    last_start, last_end = start-2000, end+2000
                else:
                    # Skip an island that starts where the last kept island
                    # started, or before that island ended.
                    if last_start == start or start < last_end:
                        continue
                    # Half-open index range of the positions in [start, end]
                    start_pick = bisect.bisect_left(pos_chr, start)
                    end_pick = bisect.bisect_right(pos_chr, end)
                    if end_pick == 0:
                        # The whole island lies before the first dataset position
                        continue
                    y_slices.append(y_chr[start_pick:end_pick, :])
                    pos_slices.append(pos_chr[start_pick:end_pick])
                    last_start, last_end = start, end

            if y_slices:
                y_chr_picked = np.concatenate(y_slices)
                pos_chr_picked = np.concatenate(pos_slices)
            else:
                # Nothing picked: keep dtype and column dimension of the inputs
                y_chr_picked = y_chr[:0]
                pos_chr_picked = pos_chr[:0]

            y_picked[chromosome_name] = y_chr_picked
            pos_picked[chromosome_name] = pos_chr_picked

        # save as npz, one entry per chromosome
        np.savez_compressed('y_CpG_'+pick_type+'.npz', **y_picked)
        np.savez_compressed('pos_CpG_'+pick_type+'.npz', **pos_picked)
    

100%|██████████| 6/6 [00:00<00:00,  6.30it/s]
100%|██████████| 6/6 [00:00<00:00,  6.28it/s]
