In [1]:
from multiprocessing import Pool
import pandas as pd
import numpy as np
import allel
import pysam
import json
import os
from datetime import datetime
pd.set_option('display.max_columns', None)
from urllib import request

In [2]:
# Project root = parent of the notebook's working directory. '__file__' is a literal string
# here, so abspath() resolves it against the current working directory.
work_dir = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
raw_data_dir = os.path.join(work_dir, 'raw_data')
result_data_dir = os.path.join(work_dir, 'data')

# version_date = datetime.today().strftime('%Y-%m-%d')
version_date = '2020-07-09'
version_date_dir = os.path.join(raw_data_dir, version_date)

# makedirs(exist_ok=True) creates missing parents and tolerates directories that already
# exist, replacing the separate exists()/mkdir() pairs.
for directory in (raw_data_dir, result_data_dir, version_date_dir):
    os.makedirs(directory, exist_ok=True)

In [10]:
# Remote locations of the source data sets.
gene_info_url = 'https://ftp.ncbi.nih.gov/refseq/H_sapiens/Homo_sapiens.gene_info.gz'
ref_gene_url = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz'
clingen_gene_curation_url = 'https://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh37.tsv'
clingene_region_curation_url = 'https://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh37.tsv'
clinvar_url = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz'
hi_pred_url = 'https://decipher.sanger.ac.uk/files/downloads/HI_Predictions_Version3.bed.gz'
gnomad_lof_url = 'https://storage.googleapis.com/gnomad-public/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz'
gnomad_control_only_url = 'https://storage.googleapis.com/gnomad-public/papers/2019-sv/gnomad_v2.1_sv.controls_only.sites.bed.gz'
hgnc_gene_fam_url = 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/csv/genefamily_db_tables/family.csv'
# Local paths of the downloaded originals, kept under the dated snapshot directory.
gene_info_ori_file = os.path.join(version_date_dir, 'Homo_sapiens.gene_info.gz')
ref_gene_ori_file = os.path.join(version_date_dir, 'refGene.txt.gz')
clingen_gene_ori_file = os.path.join(version_date_dir, 'clingen_gene_hg19.tsv')
clingen_region_ori_file = os.path.join(version_date_dir, 'clingen_region_hg19.tsv')
clinvar_ori_vcf_file = os.path.join(version_date_dir, 'clinvar.vcf.gz')
hi_pred_ori_file = os.path.join(version_date_dir, 'HI_Predictions_Version3.bed.gz')
gnomad_lof_ori_file = os.path.join(version_date_dir, 'gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz')
gnomad_control_ori_file = os.path.join(version_date_dir, 'gnomad_v2.1_sv.controls_only.sites.bed.gz')
# Inputs with no matching URL above: they are expected to be placed in raw_data_dir by hand
# (the OMIM gene list is opened later without any download step).
omim_gene_list_file = os.path.join(raw_data_dir, 'omim-gene-list.txt')
dgv_ori_file = os.path.join(raw_data_dir, 'DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3')
# Result files, written to result_data_dir.
gene_file = os.path.join(result_data_dir, 'gene.sorted.bed')
omim_gene_file = os.path.join(result_data_dir, 'omim-gene.sorted.bed')
clinvar_file = os.path.join(result_data_dir, 'clinvar-pathogenic.sorted.vcf')
decipher_gene_file = os.path.join(result_data_dir, 'decipher-gene.sorted.bed')
dgv_gain_file = os.path.join(result_data_dir, 'dgv-gain.sorted.bed')
dgv_loss_file = os.path.join(result_data_dir, 'dgv-loss.sorted.bed')
# NOTE(review): no '.bed' suffix, unlike the sibling outputs - confirm this is intended.
func_region_file = os.path.join(result_data_dir, 'func-region.sorted')
gnomad_del_file = os.path.join(result_data_dir, 'gnomad-del.sorted.bed')
gnomad_dup_file = os.path.join(result_data_dir, 'gnomad-dup.sorted.bed')
hi_cds_file = os.path.join(result_data_dir, 'hi-cds.sorted.bed')
hi_exon_file = os.path.join(result_data_dir, 'hi-exon.sorted.bed')
hi_gene_file = os.path.join(result_data_dir, 'hi-gene.sorted.bed')
hi_region_file = os.path.join(result_data_dir, 'hi-region.sorted.bed')
ts_gene_file = os.path.join(result_data_dir, 'ts-gene.sorted.bed')
ts_region_file = os.path.join(result_data_dir, 'ts-region.sorted.bed')
uhi_gene_file = os.path.join(result_data_dir, 'uhi-gene.sorted.bed')
uhi_region_file = os.path.join(result_data_dir, 'uhi-region.sorted.bed')
uts_gene_file = os.path.join(result_data_dir, 'uts-gene.sorted.bed')
uts_region_file = os.path.join(result_data_dir, 'uts-region.sorted.bed')
# NOTE: this one is a download target in version_date_dir, not a result file.
hgnc_gene_fam_file = os.path.join(version_date_dir, 'family.csv')

In [11]:
# Download the required data (each file is fetched only when it is not present yet).
def _download_if_missing(url, dest, label):
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    The payload is written to ``dest + '.part'`` and renamed into place only after the
    transfer completed, so an interrupted download is not mistaken for a finished file
    on the next run. Any exception raised by the transfer propagates to the caller.

    :param url: remote location to fetch.
    :param dest: local target path.
    :param label: human-readable name used in the progress message.
    """
    if os.path.exists(dest):
        return
    print(f'downloading {label} to {dest}')
    partial = dest + '.part'
    try:
        request.urlretrieve(url, partial)
        os.replace(partial, dest)
    finally:
        # Only present here when the transfer or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)
    print('done!')


for _url, _dest, _label in [
    (gene_info_url, gene_info_ori_file, 'gene info'),
    (ref_gene_url, ref_gene_ori_file, 'ref gene'),
    (clingen_gene_curation_url, clingen_gene_ori_file, 'clingen gene list'),
    (clingene_region_curation_url, clingen_region_ori_file, 'clingen region file'),
    # The original message for this entry wrongly said "clingen region file".
    (clinvar_url, clinvar_ori_vcf_file, 'clinvar vcf file'),
    (hi_pred_url, hi_pred_ori_file, 'hi prediction file'),
    (gnomad_lof_url, gnomad_lof_ori_file, 'pLoF file from GnomAD'),
    (gnomad_control_only_url, gnomad_control_ori_file, 'gnomad control only file'),
    (hgnc_gene_fam_url, hgnc_gene_fam_file, 'gene family file'),
]:
    _download_if_missing(_url, _dest, _label)



downloading gene family file to /Users/zhonghua/workspace/python-space/acit/raw_data/2020-07-09/family.csv
done!


In [12]:
# Column names for the refGene dump, which is read without a header row.
cols = [
    'bin', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds',
    'score', 'name2', 'cdsStartStat', 'cdsEndStat', 'ExonFrames']
# Keep one transcript per gene symbol (name2): the one with the longest cdsEnd - cdsStart span.
# Rows whose chrom ends in 'fix' are dropped first. The chain derives `length` with assign()
# instead of setting a column on a boolean-filtered frame, which is what provokes pandas'
# SettingWithCopyWarning.
refgene = (
    pd.read_csv(ref_gene_ori_file, sep='\t', names=cols)
    .loc[lambda df: ~df['chrom'].str.match(r'.*fix$')]
    .assign(length=lambda df: df['cdsEnd'] - df['cdsStart'])
    .sort_values('length', ascending=False)
    .drop_duplicates('name2', keep='first')
)

In [13]:
refgene.head()

Unnamed: 0,bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,ExonFrames,length
14492,26,NM_001351365,chr1,+,144146811,146467744,144146846,146466121,93,"144146811,144148789,144149726,144150981,144151...","144147021,144148892,144149941,144151054,144151...",0,NBPF19,cmpl,cmpl,"0,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,...",2319275
10422,26,NM_001278267,chr1,+,144146810,146467744,144158383,146466121,131,"144146810,144148789,144149726,144150981,144151...","144147021,144148892,144149941,144151054,144151...",0,NBPF20,cmpl,cmpl,"-1,-1,-1,-1,-1,-1,-1,-1,0,2,1,2,1,2,1,2,1,2,1,...",2307738
10842,12,NM_000109,chrX,-,31137340,33357505,31140035,33357382,79,"31137340,31144758,31152218,31164407,31165391,3...","31140047,31144790,31152311,31164531,31165635,3...",0,DMD,cmpl,cmpl,"0,1,1,0,2,2,2,2,2,0,2,0,1,2,1,1,2,1,0,0,1,0,2,...",2217347
28606,2,NM_001351274,chr11,-,83166055,85339417,83170860,85337661,27,"83166055,83173044,83177750,83180243,83182668,8...","83170967,83173136,83177860,83180416,83182770,8...",0,DLG2,cmpl,cmpl,"1,2,0,1,1,1,0,2,2,1,0,2,2,2,1,0,1,2,0,0,0,0,0,...",2166801
36946,9,NM_033225,chr8,-,2792882,4852436,2796106,4851938,70,"2792882,2799993,2806820,2807752,2808635,281174...","2796266,2800126,2806908,2807865,2808797,281179...",0,CSMD1,cmpl,cmpl,"2,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",2055832


In [14]:
# low_memory=False makes pandas parse every column in a single pass, so each dtype is inferred
# from the whole file rather than chunk by chunk. NOTE(review): the saved output of this cell
# is the tail of a warning, presumably the mixed-dtype warning this option addresses - confirm.
gene_info = pd.read_csv(gene_info_ori_file, sep='\t', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
gene_info.head()

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20200601,-
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20200705,-
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000256069,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20200601,-
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171428,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20200601,-
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156006,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20200621,-


In [None]:
# Inner-join each kept refGene transcript with its gene_info record by gene symbol;
# transcripts whose symbol has no gene_info entry are dropped.
refgene_info = pd.merge(refgene, gene_info, left_on='name2', right_on='Symbol')

In [None]:
refgene_info.head()

In [17]:
# Gene families that name a typical gene; the family `id` column is exposed as `fam`.
gene_fam = (
    pd.read_csv(hgnc_gene_fam_file, sep=',')
    .dropna(subset=['typical_gene'])
    .rename(columns={'id': 'fam'})
)
# One row per typical gene, holding the set of family ids it represents.
gene_fam_grp = gene_fam.groupby('typical_gene').agg({'fam': set})

In [None]:
# Protein-coding genes only, reduced to the BED-style columns and ordered by position
# (chrom is compared as text).
gene = (
    refgene_info[refgene_info['type_of_gene'].eq('protein-coding')]
    [['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']]
    .sort_values(by=['chrom', 'txStart', 'txEnd'])
)

In [None]:
gene.head()

In [None]:
# Write the gene table tab-separated, without the index, under BED-style column names.
gene.rename(
    columns=dict(
        chrom='#chrom', txStart='start', txEnd='end',
        GeneID='gene_id', name2='symbol', name='transcript',
    )
).to_csv(gene_file, sep='\t', index=False)

In [None]:
!bgzip -cf {gene_file} > {gene_file}.gz

In [None]:
!tabix -fp bed {gene_file}.gz

In [None]:
!rm {gene_file}

In [None]:
# Read the OMIM gene symbols, one per line. Surrounding whitespace is trimmed and blank lines
# are skipped, so a trailing newline no longer adds an empty-string "symbol" to the set.
with open(omim_gene_list_file) as f:
    omim_gene = {symbol.strip() for symbol in f if symbol.strip()}

In [None]:
# Genomic records of the OMIM genes, ordered by position.
# NOTE: `omim_gene` is rebound here from the symbol collection to a DataFrame, so this cell
# must not be re-run without re-running the cell that loads the symbol list first.
omim_gene = (
    refgene_info[refgene_info['name2'].isin(omim_gene)]
    [['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']]
    .sort_values(by=['chrom', 'txStart', 'txEnd'])
)

In [None]:
omim_gene.head()

In [None]:
# Write the OMIM gene table tab-separated, without the index, under BED-style column names.
omim_gene.rename(
    columns=dict(
        chrom='#chrom', txStart='start', txEnd='end',
        GeneID='gene_id', name2='symbol', name='transcript',
    )
).to_csv(omim_gene_file, sep='\t', index=False)

In [None]:
!bgzip -cf {omim_gene_file} > {omim_gene_file}.gz

In [None]:
!tabix -fp bed {omim_gene_file}.gz
!rm {omim_gene_file}

In [None]:
# Load the ClinGen gene curation list with every column kept as str (scores are compared as
# strings later). The first 5 lines of the file are skipped - presumably a preamble that
# precedes the header row; confirm against the downloaded file.
curation_gene = pd.read_csv(clingen_gene_ori_file, sep='\t', dtype=str, skiprows=5)

In [None]:
curation_gene.head()

In [None]:
# Symbols of the genes whose ClinGen haploinsufficiency score is exactly '3'.
hi_genes = set(
    curation_gene['#Gene Symbol'][curation_gene['Haploinsufficiency Score'] == '3']
)

In [None]:
# Genomic records of the haploinsufficient genes, ordered by position.
hi_gene = (
    refgene_info[refgene_info['name2'].isin(hi_genes)]
    [['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']]
    .sort_values(by=['chrom', 'txStart', 'txEnd'])
)

In [None]:
hi_gene.head()

In [None]:
# Write the haploinsufficient gene table tab-separated, without the index.
hi_gene.rename(
    columns=dict(
        chrom='#chrom', txStart='start', txEnd='end',
        GeneID='gene_id', name2='symbol', name='transcript',
    )
).to_csv(hi_gene_file, sep='\t', index=False)

In [None]:
!bgzip -cf {hi_gene_file} > {hi_gene_file}.gz

In [None]:
!tabix -fp bed {hi_gene_file}.gz
!rm {hi_gene_file}

In [None]:
# Coding span (cdsStart..cdsEnd) of each haploinsufficient gene; transcripts whose coding
# length is 0 are left out.
hi_cds = (
    refgene_info[refgene_info['name2'].isin(hi_genes) & refgene_info['length'].ne(0)]
    [['chrom', 'cdsStart', 'cdsEnd', 'GeneID', 'name2', 'name']]
    .sort_values(by=['chrom', 'cdsStart', 'cdsEnd'])
)

In [None]:
hi_cds.head()

In [None]:
# Write the coding-span table tab-separated, without the index.
hi_cds.rename(
    columns=dict(
        chrom='#chrom', cdsStart='start', cdsEnd='end',
        GeneID='gene_id', name2='symbol', name='transcript',
    )
).to_csv(hi_cds_file, sep='\t', index=False)

In [None]:
!bgzip -cf {hi_cds_file} > {hi_cds_file}.gz

In [None]:
!tabix -fp bed {hi_cds_file}.gz
!rm {hi_cds_file}

In [None]:
# Independent copy of the exon coordinate strings of the haploinsufficient genes; a copy is
# taken because its columns are overwritten afterwards.
hi_exons = (
    refgene_info[refgene_info['name2'].isin(hi_genes)]
    [['chrom', 'exonStarts', 'exonEnds', 'GeneID', 'name2', 'name', 'strand']]
    .copy()
)

In [None]:
hi_exons.head()

In [None]:
# Remove the trailing comma of the exon coordinate lists so that a later split(',') does not
# yield an empty last element. str.rstrip is used instead of str.replace(r',$', ''): the latter
# only works while str.replace treats its pattern as a regex by default, which is no longer the
# case in pandas 2.x.
hi_exons['exonStarts'] = hi_exons['exonStarts'].str.rstrip(',')
hi_exons['exonEnds'] = hi_exons['exonEnds'].str.rstrip(',')

In [None]:
hi_exons.head()

In [None]:
# One row per exon. explode() expands each comma-separated list straight into a long Series,
# avoiding the wide NaN-padded frame that apply(pd.Series).stack() materialises.
# `row` is the hi_exons index label of the transcript the exon belongs to.
start = (
    hi_exons['exonStarts'].str.split(',').explode().astype(int)
    .rename('start').rename_axis('row').reset_index()
)
end = (
    hi_exons['exonEnds'].str.split(',').explode().astype(int)
    .rename('end').reset_index(drop=True)
)
# Starts and ends are paired positionally: both were expanded in the same row order.
position = start.join(end)

In [None]:
position.head()

In [None]:
# Attach the transcript annotations to every exon (matched through `row`, the hi_exons index
# label) and order the exons by position.
exon = pd.merge(
    position,
    hi_exons[['chrom', 'GeneID', 'name2', 'name', 'strand']],
    how='left', left_on='row', right_index=True,
).sort_values(by=['chrom', 'start', 'end'])

In [None]:
exon.head()

In [None]:
# Number the exons of each transcript by start coordinate in both directions:
# '+' counts upwards from the smallest start, '-' counts upwards from the largest start.
exon['+'] = (
    exon.groupby(by=['name2', 'name'])['start']
    .rank(method='first', ascending=True)
    .astype(int)
)
exon['-'] = (
    exon.groupby(by=['name2', 'name'])['start']
    .rank(method='first', ascending=False)
    .astype(int)
)

In [None]:
exon.head()

In [None]:
exon.tail()

In [None]:
# Exon number in transcription order: the '+' numbering for plus-strand rows and the '-'
# numbering for minus-strand rows; the concatenated pieces realign on the row index.
exon['exon'] = pd.concat([
    exon['+'][exon['strand'] == '+'],
    exon['-'][exon['strand'] == '-'],
])

In [None]:
exon.head()

In [None]:
exon.tail()

In [None]:
# Flag the exon carrying the highest exon number within its transcript.
exon['last_exon'] = exon['exon'].eq(
    exon.groupby(by=['name2', 'name'])['exon'].transform('max')
)

In [None]:
exon.head()

In [None]:
exon.tail()

In [None]:
# Order by position, then keep only the columns that go into the exon file
# (the helper columns 'row', 'strand', '+' and '-' are dropped).
exon = exon.sort_values(by=['chrom', 'start', 'end'])[
    ['chrom', 'start', 'end', 'GeneID', 'name2', 'name', 'exon', 'last_exon']
]

In [None]:
exon.head()

In [None]:
# Write the exon table tab-separated, without the index.
exon.rename(
    columns=dict(chrom='#chrom', GeneID='gene_id', name2='symbol', name='transcript')
).to_csv(hi_exon_file, sep='\t', index=False)

In [None]:
!bgzip -cf {hi_exon_file} > {hi_exon_file}.gz

In [None]:
!tabix -fp bed {hi_exon_file}.gz
!rm {hi_exon_file}

In [None]:
# Rows flagged as the last exon of their transcript.
last_exon = exon.loc[exon['last_exon']]

In [None]:
last_exon.head()

In [None]:
# Region strings of the form 'chrom:start-end' with every 'chr' removed, used to query the
# ClinVar VCF region by region.
last_exon_region = (
    last_exon['chrom'] + ':' + last_exon['start'].astype(str) + '-' + last_exon['end'].astype(str)
).str.replace('chr', '')

In [None]:
last_exon_region.head()

In [None]:
# VCF fields requested from scikit-allel when reading ClinVar: the variant coordinates and
# alleles, three allele-frequency annotations (AF_ESP, AF_EXAC, AF_TGP) and the clinical
# significance (CLNSIG).
need_fields = [
    'variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT',
    'variants/AF_ESP', 'variants/AF_EXAC', 'variants/AF_TGP', 'variants/CLNSIG'
]

In [None]:
# Collect the pathogenic / likely pathogenic ClinVar variants that lie in a last-exon region
# and whose allele frequencies are all missing or below 1%, and write them to clinvar_file.
with open(clinvar_file, 'w') as f:
    # The output starts with the header lines of the source ClinVar VCF.
    headers = allel.read_vcf_headers(clinvar_ori_vcf_file)
    f.write(''.join(headers.headers))
    
    def fetch_variants(region):
        """Return the normalised, filtered variants of one region string.

        Only the first chunk yielded for the region is processed, because the loop returns
        inside its first iteration; when no chunk is yielded the function returns None.
        """
        fields, samples, headers, it = allel.iter_vcf_chunks(
            clinvar_ori_vcf_file, fields=need_fields, alt_number=1, region=region
        )
        for variants, *_ in it:
            # Each frequency filter passes a variant when the value is missing (NaN) or < 0.01.
            # The comparison is evaluated only on the non-NaN entries.
            esp_filter = np.isnan(variants['variants/AF_ESP'])
            esp_filter[~esp_filter] |= variants['variants/AF_ESP'][~esp_filter] < 0.01

            exac_filter = np.isnan(variants['variants/AF_EXAC'])
            exac_filter[~exac_filter] |= variants['variants/AF_EXAC'][~exac_filter] < 0.01

            tgp_filter = np.isnan(variants['variants/AF_TGP'])
            tgp_filter[~tgp_filter] |= variants['variants/AF_TGP'][~tgp_filter] < 0.01

            # Keep only the three listed clinical significance values.
            pathogenic_filter = np.isin(
                variants['variants/CLNSIG'], ['Likely_pathogenic', 'Pathogenic', 'Pathogenic/Likely_pathogenic']
            )

            # Despite its name, this mask combines the frequency filters AND the
            # pathogenicity filter.
            af_filter = esp_filter & exac_filter & tgp_filter & pathogenic_filter

            filtered_variants = {k: v[af_filter] for k, v in variants.items()}

            # Prefix the chromosome names with 'chr' (the region strings were built without it).
            filtered_variants['variants/CHROM'] = 'chr' + filtered_variants['variants/CHROM']

            return allel.normalize_callset(filtered_variants)
    
    # Query all regions with 7 worker processes.
    # NOTE(review): fetch_variants is defined inside this cell; handing it to the workers
    # presumably relies on the 'fork' start method - confirm on platforms defaulting to 'spawn'.
    with Pool(processes=7) as pool:
        variants = pool.map(fetch_variants, last_exon_region)
    
    # Skip regions that returned None and append the others to the output file. The last
    # argument maps every requested field to NaN - presumably the fill value written for
    # missing entries; confirm against the write_vcf_data signature.
    for names, callset in filter(lambda x: x is not None, variants):
        allel.write_vcf_data(f, names, callset, None, {field: np.nan for field in need_fields})

In [101]:
!sed -i '' 's/AF_.\+=nan;//' {clinvar_file}

In [102]:
!bgzip -cf {clinvar_file} > {clinvar_file}.gz

In [103]:
!tabix -fp vcf {clinvar_file}.gz
# !rm {clinvar_file}

In [104]:
# Symbols of the genes whose ClinGen haploinsufficiency score is exactly '40'.
uhi_genes = set(
    curation_gene['#Gene Symbol'][curation_gene['Haploinsufficiency Score'] == '40']
)

In [105]:
# Genomic records of the score-'40' genes, ordered by position.
uhi_gene = (
    refgene_info[refgene_info['name2'].isin(uhi_genes)]
    [['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']]
    .sort_values(by=['chrom', 'txStart', 'txEnd'])
)

In [106]:
uhi_gene.head()

Unnamed: 0,chrom,txStart,txEnd,GeneID,name2,name,strand
8358,chr1,55505220,55530525,255738,PCSK9,NM_174936,+
18742,chr13,20761608,20767077,2706,GJB2,NM_004004,-


In [107]:
# Write the score-'40' gene table tab-separated, without the index.
uhi_gene.rename(
    columns=dict(
        chrom='#chrom', txStart='start', txEnd='end',
        GeneID='gene_id', name2='symbol', name='transcript',
    )
).to_csv(uhi_gene_file, sep='\t', index=False)

In [108]:
!bgzip -cf {uhi_gene_file} > {uhi_gene_file}.gz

In [109]:
!tabix -fp bed {uhi_gene_file}.gz
!rm {uhi_gene_file}

In [110]:
# Load the ClinGen region curation list with every column kept as str (scores are compared as
# strings later). The first 5 lines of the file are skipped - presumably a preamble that
# precedes the header row; confirm against the downloaded file.
region = pd.read_csv(clingen_region_ori_file, sep='\t', skiprows=5, dtype=str)

In [111]:
region.head()

Unnamed: 0,#ISCA ID,ISCA Region Name,cytoBand,Genomic Location,Haploinsufficiency Score,Haploinsufficiency Description,Haploinsufficiency PMID1,Haploinsufficiency PMID2,Haploinsufficiency PMID3,Triplosensitivity Score,Triplosensitivity Description,Triplosensitivity PMID1,Triplosensitivity PMID2,Triplosensitivity PMID3,Date Last Evaluated,Loss phenotype OMIM ID,Triplosensitive phenotype OMIM ID
0,ISCA-46300,15q24 recurrent region (C-D) (includes SIN3A),,chr15: 75631787-75972909,3,Sufficient evidence for dosage pathogenicity,27399968.0,22180641.0,,0,No evidence available,,,,2020-01-13,613406.0,
1,ISCA-46299,Xp11.22 region (includes HUWE1),Xp11.22,chrX:53363456-53793054,0,No evidence available,,,,3,Sufficient evidence for dosage pathogenicity,22840365.0,20655035.0,26692240.0,2018-11-19,,
2,ISCA-46297,16p12.2 recurrent region (includes OTOA) (dist...,16p12.2,chr16:21570113-21740423,30,Gene associated with autosomal recessive pheno...,,,,40,Dosage sensitivity unlikely,,,,2019-11-03,607039.0,
3,ISCA-46296,15q24 recurrent region (A-C),15q24.1-q24.2,chr15:72963715-75508312,3,Sufficient evidence for dosage pathogenicity,22180641.0,19921647.0,,1,Little evidence for dosage pathogenicity,19557438.0,20860070.0,,2020-01-03,,
4,ISCA-46295,15q13.3 recurrent region (D-CHRNA7 to BP5) (in...,15q13.3,chr15:32019621-32445405,3,Sufficient evidence for dosage pathogenicity,19898479.0,20236110.0,22775350.0,40,Dosage sensitivity unlikely,26968334.0,22420048.0,,2018-05-10,,


In [112]:
# Parse 'chrN: start-end' locations (optional whitespace around ':' and '-') into separate
# columns, with start and end converted to integers.
# NOTE: this rebinds `position`, which earlier held the exon coordinates.
position = (
    region['Genomic Location']
    .str.extract(r'(?P<chrom>chr\w+)\s*:\s*(?P<start>\d+)\s*-\s*(?P<end>\d+)')
    .astype({'start': int, 'end': int})
)

In [113]:
position.head()

Unnamed: 0,chrom,start,end
0,chr15,75631787,75972909
1,chrX,53363456,53793054
2,chr16,21570113,21740423
3,chr15,72963715,75508312
4,chr15,32019621,32445405


In [114]:
# Put the parsed coordinates next to the curation columns, matching rows by index.
region_pos = pd.merge(region, position, how='left', left_index=True, right_index=True)

In [115]:
region_pos.head()

Unnamed: 0,#ISCA ID,ISCA Region Name,cytoBand,Genomic Location,Haploinsufficiency Score,Haploinsufficiency Description,Haploinsufficiency PMID1,Haploinsufficiency PMID2,Haploinsufficiency PMID3,Triplosensitivity Score,Triplosensitivity Description,Triplosensitivity PMID1,Triplosensitivity PMID2,Triplosensitivity PMID3,Date Last Evaluated,Loss phenotype OMIM ID,Triplosensitive phenotype OMIM ID,chrom,start,end
0,ISCA-46300,15q24 recurrent region (C-D) (includes SIN3A),,chr15: 75631787-75972909,3,Sufficient evidence for dosage pathogenicity,27399968.0,22180641.0,,0,No evidence available,,,,2020-01-13,613406.0,,chr15,75631787,75972909
1,ISCA-46299,Xp11.22 region (includes HUWE1),Xp11.22,chrX:53363456-53793054,0,No evidence available,,,,3,Sufficient evidence for dosage pathogenicity,22840365.0,20655035.0,26692240.0,2018-11-19,,,chrX,53363456,53793054
2,ISCA-46297,16p12.2 recurrent region (includes OTOA) (dist...,16p12.2,chr16:21570113-21740423,30,Gene associated with autosomal recessive pheno...,,,,40,Dosage sensitivity unlikely,,,,2019-11-03,607039.0,,chr16,21570113,21740423
3,ISCA-46296,15q24 recurrent region (A-C),15q24.1-q24.2,chr15:72963715-75508312,3,Sufficient evidence for dosage pathogenicity,22180641.0,19921647.0,,1,Little evidence for dosage pathogenicity,19557438.0,20860070.0,,2020-01-03,,,chr15,72963715,75508312
4,ISCA-46295,15q13.3 recurrent region (D-CHRNA7 to BP5) (in...,15q13.3,chr15:32019621-32445405,3,Sufficient evidence for dosage pathogenicity,19898479.0,20236110.0,22775350.0,40,Dosage sensitivity unlikely,26968334.0,22420048.0,,2018-05-10,,,chr15,32019621,32445405


In [116]:
# Regions whose haploinsufficiency OR triplosensitivity score is '1', '2' or '3',
# ordered by position.
func_region = region_pos.loc[
    region_pos[['Haploinsufficiency Score', 'Triplosensitivity Score']]
    .isin(['1', '2', '3'])
    .any(axis=1),
    ['chrom', 'start', 'end', '#ISCA ID', 'ISCA Region Name']
].sort_values(by=['chrom', 'start', 'end'])

In [117]:
func_region.head()

Unnamed: 0,chrom,start,end,#ISCA ID,ISCA Region Name
43,chr1,834083,6289973,ISCA-37434,1p36 terminal region (includes GABRD)
48,chr1,145386507,145748064,ISCA-37428,1q21.1 recurrent (TAR syndrome) region (BP2-BP...
51,chr1,146577486,147394506,ISCA-37421,"1q21.1 recurrent region (BP3-BP4, distal) (inc..."
18,chr1,243287730,245318287,ISCA-37493,1q43q44 terminal region (includes AKT3)
50,chr10,81682843,88739388,ISCA-37424,10q22.3q23.2 recurrent region (LCR-3/4-flanked...


In [118]:
# Write the scored regions tab-separated, without the index.
func_region.rename(
    columns={'chrom': '#chrom', '#ISCA ID': 'isca_id', 'ISCA Region Name': 'name'}
).to_csv(func_region_file, index=False, sep='\t')

In [120]:
!bgzip -cf {func_region_file} > {func_region_file}.gz

In [121]:
!tabix -fp bed {func_region_file}.gz
!rm {func_region_file}

In [122]:
# Regions whose haploinsufficiency score is exactly '3', ordered by position.
hi_region = (
    region_pos[region_pos['Haploinsufficiency Score'].eq('3')]
    [['chrom', 'start', 'end', '#ISCA ID', 'ISCA Region Name']]
    .sort_values(by=['chrom', 'start', 'end'])
)

In [123]:
hi_region.head()

Unnamed: 0,chrom,start,end,#ISCA ID,ISCA Region Name
43,chr1,834083,6289973,ISCA-37434,1p36 terminal region (includes GABRD)
51,chr1,146577486,147394506,ISCA-37421,"1q21.1 recurrent region (BP3-BP4, distal) (inc..."
18,chr1,243287730,245318287,ISCA-37493,1q43q44 terminal region (includes AKT3)
50,chr10,81682843,88739388,ISCA-37424,10q22.3q23.2 recurrent region (LCR-3/4-flanked...
62,chr11,31803509,32510988,ISCA-37401,11p13 (WAGR syndrome) region


In [124]:
def _omim_genes_in_region(row):
    """Return the comma-joined symbols of the `omim_gene` rows overlapping ``row``.

    A gene overlaps when it lies on the same chromosome and its [txStart, txEnd] interval
    intersects the region's [start, end] interval (both ends inclusive).
    """
    overlapping = (
        (omim_gene['chrom'] == row['chrom'])
        & (omim_gene['txEnd'] >= row['start'])
        & (omim_gene['txStart'] <= row['end'])
    )
    return ','.join(omim_gene.loc[overlapping, 'name2'])


hi_region['omim_genes'] = hi_region.apply(_omim_genes_in_region, axis=1)

In [125]:
hi_region.head()

Unnamed: 0,chrom,start,end,#ISCA ID,ISCA Region Name,omim_genes
43,chr1,834083,6289973,ISCA-37434,1p36 terminal region (includes GABRD),"ISG15,AGRN,TNFRSF4,B3GALT6,DVL1,ATAD3A,TMEM240..."
51,chr1,146577486,147394506,ISCA-37421,"1q21.1 recurrent region (BP3-BP4, distal) (inc...","GJA5,GJA8"
18,chr1,243287730,245318287,ISCA-37493,1q43q44 terminal region (includes AKT3),"SDCCAG8,AKT3,ZBTB18,COX20,HNRNPU"
50,chr10,81682843,88739388,ISCA-37424,10q22.3q23.2 recurrent region (LCR-3/4-flanked...,"ANXA11,MAT1A,CDHR1,RGR,LDB3,BMPR1A"
62,chr11,31803509,32510988,ISCA-37401,11p13 (WAGR syndrome) region,"ELP4,PAX6,WT1"


In [126]:
# Write the score-'3' regions (with their OMIM genes) tab-separated, without the index.
hi_region.rename(
    columns={'chrom': '#chrom', '#ISCA ID': 'isca_id', 'ISCA Region Name': 'name'}
).to_csv(hi_region_file, index=False, sep='\t')

In [127]:
!bgzip -cf {hi_region_file} > {hi_region_file}.gz

In [128]:
!tabix -fp bed {hi_region_file}.gz
!rm {hi_region_file}

In [129]:
# Regions whose haploinsufficiency score is exactly '40', ordered by position.
uhi_region = (
    region_pos[region_pos['Haploinsufficiency Score'].eq('40')]
    [['chrom', 'start', 'end', '#ISCA ID', 'ISCA Region Name']]
    .sort_values(by=['chrom', 'start', 'end'])
)

In [130]:
uhi_region.head()

Unnamed: 0,chrom,start,end,#ISCA ID,ISCA Region Name
29,chr1,148867551,149768855,ISCA-37469,1q21.2 region (polymorphic region)
25,chr14,22111109,23021097,ISCA-37476,14q11.2 region (TCRA region)
24,chr14,106050000,107289540,ISCA-37477,1 copy: 14q telomere; 3 copies: 14q telomere
21,chr16,34202088,35147508,ISCA-37481,3 copies: 16p centromere
20,chr19,43242796,43741310,ISCA-37483,19q13.3 region (PSG gene cluster)


In [131]:
def _genes_in_region(row):
    """Return the comma-joined symbols of the `gene` rows overlapping ``row``.

    A gene overlaps when it lies on the same chromosome and its [txStart, txEnd] interval
    intersects the region's [start, end] interval (both ends inclusive).
    """
    overlapping = (
        (gene['chrom'] == row['chrom'])
        & (gene['txEnd'] >= row['start'])
        & (gene['txStart'] <= row['end'])
    )
    return ','.join(gene.loc[overlapping, 'name2'])


uhi_region['genes'] = uhi_region.apply(_genes_in_region, axis=1)

In [132]:
uhi_region.head()

Unnamed: 0,chrom,start,end,#ISCA ID,ISCA Region Name,genes
29,chr1,148867551,149768855,ISCA-37469,1q21.2 region (polymorphic region),"H3-2,PPIAL4C,H2BC18,FCGR1A"
25,chr14,22111109,23021097,ISCA-37476,14q11.2 region (TCRA region),"OR4E2,OR4E1"
24,chr14,106050000,107289540,ISCA-37477,1 copy: 14q telomere; 3 copies: 14q telomere,
21,chr16,34202088,35147508,ISCA-37481,3 copies: 16p centromere,
20,chr19,43242796,43741310,ISCA-37483,19q13.3 region (PSG gene cluster),"PSG3,PSG8,PSG1,PSG6,PSG7,PSG11,PSG2,PSG5,PSG4"


In [133]:
# Write the score-'40' regions (with their genes) tab-separated, without the index.
uhi_region.rename(
    columns={'chrom': '#chrom', '#ISCA ID': 'isca_id', 'ISCA Region Name': 'name'}
).to_csv(uhi_region_file, index=False, sep='\t')

In [134]:
!bgzip -cf {uhi_region_file} > {uhi_region_file}.gz

In [135]:
!tabix -fp bed {uhi_region_file}.gz
!rm {uhi_region_file}

In [136]:
# Read only the 4th column of the HI prediction file (first line skipped), split its
# 'symbol|hi_score|hi_index' value, turn the percentage into a float and attach the genomic
# record of each protein-coding gene by symbol (inner join).
decipher = (
    pd.read_csv(hi_pred_ori_file, sep='\t', skiprows=1, header=None, usecols=[3,])[3]
    .str.split('|', expand=True)
    .rename(columns={0: 'symbol', 1: 'hi_score', 2: 'hi_index'})
    .assign(hi_index=lambda df: df['hi_index'].str.replace('%', '').astype(float))
    .merge(gene, left_on='symbol', right_on='name2')
)

In [137]:
decipher.head()

Unnamed: 0,symbol,hi_score,hi_index,chrom,txStart,txEnd,GeneID,name2,name,strand
0,ANXA2R,4.7349e-05,100.0,chr5,43039181,43040447,389289,ANXA2R,NM_001014279,-
1,SCGB1D1,5.4551e-05,99.99,chr11,61957687,61961011,10648,SCGB1D1,NM_006552,+
2,IL31,5.7228e-05,99.99,chr12,122656575,122658768,386653,IL31,NM_001014336,-
3,BPIFA2,6.1264e-05,99.98,chr20,31749575,31769218,140683,BPIFA2,NM_001319164,+
4,MUCL1,6.7779e-05,99.98,chr12,55248299,55252171,118430,MUCL1,NM_058173,+


In [140]:
# Load the gnomAD constraint table, indexed by its first column (the gene symbol, per the
# displayed index name). compression='gzip' is passed explicitly for the '.bgz' file -
# presumably because pandas does not infer the compression from that suffix; confirm.
gnomad = pd.read_csv(gnomad_lof_ori_file, sep='\t', index_col=0, compression='gzip')

In [141]:
gnomad.head()

Unnamed: 0_level_0,transcript,obs_mis,exp_mis,oe_mis,mu_mis,possible_mis,obs_mis_pphen,exp_mis_pphen,oe_mis_pphen,possible_mis_pphen,obs_syn,exp_syn,oe_syn,mu_syn,possible_syn,obs_lof,mu_lof,possible_lof,exp_lof,pLI,pNull,pRec,oe_lof,oe_syn_lower,oe_syn_upper,oe_mis_lower,oe_mis_upper,oe_lof_lower,oe_lof_upper,constraint_flag,syn_z,mis_z,lof_z,oe_lof_upper_rank,oe_lof_upper_bin,oe_lof_upper_bin_6,n_sites,classic_caf,max_af,no_lofs,obs_het_lof,obs_hom_lof,defined,p,exp_hom_lof,classic_caf_afr,classic_caf_amr,classic_caf_asj,classic_caf_eas,classic_caf_fin,classic_caf_nfe,classic_caf_oth,classic_caf_sas,p_afr,p_amr,p_asj,p_eas,p_fin,p_nfe,p_oth,p_sas,transcript_type,gene_id,transcript_level,cds_length,num_coding_exons,gene_type,gene_length,exac_pLI,exac_obs_lof,exac_exp_lof,exac_oe_lof,brain_expression,chromosome,start_position,end_position
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1
MED13,ENST00000397786,871,1117.8,0.77921,5.6e-05,14195,314.0,529.75,0.59273,6708.0,422,387.53,1.089,1.9e-05,4248,0.0,5e-06,1257.0,98.429,1.0,8.9436e-40,1.8383e-16,0.0,1.005,1.18,0.736,0.824,0.0,0.03,,-1.3765,2.6232,9.1935,0.0,0.0,0.0,2.0,1.2e-05,8e-06,124782.0,3.0,0.0,124785.0,1.2e-05,1.8e-05,0.0,0.0,0.0,0.0,9.3e-05,9e-06,0.0,0.0,0.0,0.0,0.0,0.0,9.3e-05,9e-06,0.0,0.0,protein_coding,ENSG00000108510,2,6522,30,protein_coding,122678,1.0,0.0,64.393,0.0,,17,60019966,60142643
NIPBL,ENST00000282516,846,1441.5,0.58688,7.4e-05,18540,158.0,543.1,0.29092,7135.0,496,495.01,1.002,2.5e-05,5211,1.0,9e-06,1781.0,150.32,1.0,2.9773e-59,3.5724e-24,0.006653,0.93,1.079,0.554,0.621,0.001,0.032,,-0.035119,5.5737,11.286,1.0,0.0,0.0,2.0,1.2e-05,8e-06,125693.0,3.0,0.0,125696.0,1.2e-05,1.8e-05,0.0,0.0,9.9e-05,0.0,0.0,0.0,0.0,6.5e-05,0.0,0.0,9.9e-05,0.0,0.0,0.0,0.0,6.5e-05,protein_coding,ENSG00000164190,2,8412,46,protein_coding,189655,1.0,1.0,110.57,0.009044,,5,36876861,37066515
SMC3,ENST00000361804,178,630.07,0.28251,3.2e-05,8109,21.0,182.52,0.11506,2197.0,215,203.25,1.0578,1e-05,2091,0.0,5e-06,937.0,79.49,1.0,2.7853e-32,2.1914e-13,0.0,0.946,1.184,0.249,0.32,0.0,0.037,,-0.64776,6.3999,8.2618,2.0,0.0,0.0,8.0,3.2e-05,4e-06,125731.0,8.0,0.0,125739.0,3.2e-05,0.000127,0.0,0.0,9.9e-05,5.4e-05,0.0,4.4e-05,0.0,3.3e-05,0.0,0.0,9.9e-05,5.4e-05,0.0,4.4e-05,0.0,3.3e-05,protein_coding,ENSG00000108055,2,3651,29,protein_coding,36946,1.0,0.0,58.523,0.0,,10,112327449,112364394
CNOT1,ENST00000317147,561,1295.9,0.4329,6.9e-05,15670,51.0,290.68,0.17545,3560.0,470,456.03,1.0306,2.4e-05,4564,1.0,7e-06,1440.0,125.03,1.0,2.9924e-49,4.5628999999999995e-20,0.007998,0.955,1.112,0.403,0.464,0.002,0.038,,-0.5141,7.2546,10.279,3.0,0.0,0.0,5.0,2e-05,4e-06,125740.0,4.0,0.0,125744.0,1.6e-05,3.2e-05,0.0,2.9e-05,0.0,5.5e-05,0.0,2.6e-05,0.0,0.0,0.0,2.9e-05,0.0,5.4e-05,0.0,1.8e-05,0.0,0.0,protein_coding,ENSG00000125107,2,7128,48,protein_coding,109936,1.0,3.0,90.13,0.033285,,16,58553855,58663790
RLF,ENST00000372771,669,972.87,0.68766,4.7e-05,12682,107.0,321.14,0.33319,4151.0,358,352.62,1.0153,1.7e-05,3482,0.0,4e-06,1024.0,73.222,1.0,8.4055e-30,2.2842e-12,0.0,0.93,1.108,0.645,0.733,0.0,0.04,,-0.22518,3.462,7.9294,4.0,0.0,0.0,1.0,4e-06,4e-06,125122.0,1.0,0.0,125123.0,4e-06,2e-06,6.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,protein_coding,ENSG00000117000,2,5742,8,protein_coding,79549,1.0,0.0,43.607,0.0,,1,40627045,40706593


In [142]:
# Look up pLI and oe_lof_upper for each gene in ONE join. Joining the two columns one after
# the other would, for any gene symbol that occurs more than once in the gnomAD index,
# multiply the rows twice and pair a pLI with an oe_lof_upper taken from a different entry.
# With a unique index the result is identical to the two separate joins.
decipher = decipher.join(gnomad[['pLI', 'oe_lof_upper']], on='name2')

In [143]:
decipher.head()

Unnamed: 0,symbol,hi_score,hi_index,chrom,txStart,txEnd,GeneID,name2,name,strand,pLI,oe_lof_upper
0,ANXA2R,4.7349e-05,100.0,chr5,43039181,43040447,389289,ANXA2R,NM_001014279,-,,
1,SCGB1D1,5.4551e-05,99.99,chr11,61957687,61961011,10648,SCGB1D1,NM_006552,+,1.294e-07,1.968
2,IL31,5.7228e-05,99.99,chr12,122656575,122658768,386653,IL31,NM_001014336,-,0.011445,1.715
3,BPIFA2,6.1264e-05,99.98,chr20,31749575,31769218,140683,BPIFA2,NM_001319164,+,8.0124e-07,1.495
4,MUCL1,6.7779e-05,99.98,chr12,55248299,55252171,118430,MUCL1,NM_058173,+,3.0923e-05,1.908


In [144]:
# Keep rows with pLI >= 0.9, oe_lof_upper < 0.35 and hi_index < 10,
# reduce to the columns that get exported, and sort by position.
constraint_ok = (decipher['pLI'] >= 0.9) & (decipher['oe_lof_upper'] < 0.35)
hi_index_ok = decipher['hi_index'] < 10
decipher_export_cols = ['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'pLI', 'hi_score']
decipher = (
    decipher
    .loc[constraint_ok & hi_index_ok, decipher_export_cols]
    .sort_values(['chrom', 'txStart', 'txEnd'])
)

In [145]:
# Preview the filtered, position-sorted `decipher` table.
decipher.head()

Unnamed: 0,chrom,txStart,txEnd,GeneID,name2,name,pLI,hi_score
17613,chr1,6845513,7829766,23261,CAMTA1,NM_001349609,1.0,0.951338773
17671,chr1,8412463,8877699,473,RERE,NM_012102,1.0,0.962236687
17743,chr1,9711788,9789172,5293,PIK3CD,NM_005026,0.99999,0.977549203
16486,chr1,10271673,10441661,23095,KIF1B,NM_001365952,1.0,0.70541308
17920,chr1,11166591,11322608,2475,MTOR,NM_004958,1.0,0.997612705


In [146]:
# Write the table as tab-separated text with BED-style column names
# (the leading '#' turns the header row into a comment line).
decipher_bed_names = {
    'chrom': '#chrom', 'txStart': 'start', 'txEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}
decipher_bed = decipher.rename(columns=decipher_bed_names)
decipher_bed.to_csv(decipher_gene_file, sep='\t', index=False)

In [147]:
# Compress the table with bgzip (-c: write to stdout, -f: force); the plain file is kept for now.
!bgzip -cf {decipher_gene_file} > {decipher_gene_file}.gz

In [148]:
# Build the tabix index (BED preset; -f overwrites an existing index),
# then delete the uncompressed intermediate file.
!tabix -fp bed {decipher_gene_file}.gz
!rm {decipher_gene_file}

In [149]:
# Unique gene symbols whose 'Triplosensitivity Score' is the string '3'.
ts_score_is_3 = curation_gene['Triplosensitivity Score'] == '3'
ts_genes = set(curation_gene.loc[ts_score_is_3, '#Gene Symbol'])

In [150]:
# Look up the selected symbols in `refgene_info` and sort the hits by position.
ts_gene_cols = ['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']
in_ts_genes = refgene_info['name2'].isin(ts_genes)
ts_gene = (
    refgene_info
    .loc[in_ts_genes, ts_gene_cols]
    .sort_values(['chrom', 'txStart', 'txEnd'])
)

In [151]:
# Preview the genes selected by triplosensitivity score '3'.
ts_gene.head()

Unnamed: 0,chrom,txStart,txEnd,GeneID,name2,name,strand
4349,chr5,126112827,126172712,4001,LMNB1,NM_005573,+
10881,chrX,103031433,103047547,5354,PLP1,NM_001128834,+
3809,chrX,153287023,153363174,4204,MECP2,NM_001110792,-


In [152]:
# Write the table as tab-separated text with BED-style column names
# (the leading '#' turns the header row into a comment line).
ts_gene_bed_names = {
    'chrom': '#chrom', 'txStart': 'start', 'txEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}
ts_gene_bed = ts_gene.rename(columns=ts_gene_bed_names)
ts_gene_bed.to_csv(ts_gene_file, index=False, sep='\t')

In [153]:
# Compress the table with bgzip (-c: write to stdout, -f: force); the plain file is kept for now.
!bgzip -cf {ts_gene_file} > {ts_gene_file}.gz

In [154]:
# Build the tabix index (BED preset; -f overwrites an existing index),
# then delete the uncompressed intermediate file.
!tabix -fp bed {ts_gene_file}.gz
!rm {ts_gene_file}

In [160]:
# Unique gene symbols whose 'Triplosensitivity Score' is the string '40'.
ts_score_is_40 = curation_gene['Triplosensitivity Score'] == '40'
uts_genes = set(curation_gene.loc[ts_score_is_40, '#Gene Symbol'])

In [161]:
# Look up the selected symbols in `refgene_info` and sort the hits by position.
uts_gene_cols = ['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']
in_uts_genes = refgene_info['name2'].isin(uts_genes)
uts_gene = (
    refgene_info
    .loc[in_uts_genes, uts_gene_cols]
    .sort_values(['chrom', 'txStart', 'txEnd'])
)

In [162]:
# Preview the genes selected by triplosensitivity score '40'.
uts_gene.head()

Unnamed: 0,chrom,txStart,txEnd,GeneID,name2,name,strand
18642,chrX,6451658,6453159,51481,VCX3A,NM_016379,-


In [163]:
# Write the table as tab-separated text with BED-style column names
# (the leading '#' turns the header row into a comment line).
uts_gene_bed_names = {
    'chrom': '#chrom', 'txStart': 'start', 'txEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}
uts_gene_bed = uts_gene.rename(columns=uts_gene_bed_names)
uts_gene_bed.to_csv(uts_gene_file, index=False, sep='\t')

In [164]:
# Compress the table with bgzip (-c: write to stdout, -f: force); the plain file is kept for now.
!bgzip -cf {uts_gene_file} > {uts_gene_file}.gz

In [165]:
# Build the tabix index (BED preset; -f overwrites an existing index),
# then delete the uncompressed intermediate file.
!tabix -fp bed {uts_gene_file}.gz
!rm {uts_gene_file}

In [166]:
# Regions whose 'Triplosensitivity Score' is the string '3', sorted by position.
region_cols = ['chrom', 'start', 'end', '#ISCA ID', 'ISCA Region Name']
region_score_is_3 = region_pos['Triplosensitivity Score'] == '3'
ts_region = (
    region_pos
    .loc[region_score_is_3, region_cols]
    .sort_values(['chrom', 'start', 'end'])
)

In [167]:
# Preview the regions selected by triplosensitivity score '3'.
ts_region.head()

Unnamed: 0,chrom,start,end,#ISCA ID,ISCA Region Name
51,chr1,146577486,147394506,ISCA-37421,"1q21.1 recurrent region (BP3-BP4, distal) (inc..."
61,chr15,22832519,28379874,ISCA-37404,"15q11q13 recurrent (PWS/AS) region (BP1-BP3, C..."
23,chr15,23747996,28379874,ISCA-37478,"15q11q13 recurrent (PWS/AS) region (BP2-BP3, C..."
63,chr16,29649997,30199852,ISCA-37400,"16p11.2 recurrent region (proximal, BP4-BP5) (..."
46,chr17,1247833,2588909,ISCA-37430,17p13.3 (Miller-Dieker syndrome) region (inclu...


In [168]:
def _overlapping_omim_genes(region):
    """Return the comma-joined `name2` values of `omim_gene` rows that overlap `region`.

    A row overlaps when it is on the same `chrom` and its [txStart, txEnd]
    span intersects the region's [start, end] span (bounds compared inclusively).
    """
    same_chrom = omim_gene['chrom'] == region['chrom']
    ends_after_start = omim_gene['txEnd'] >= region['start']
    starts_before_end = omim_gene['txStart'] <= region['end']
    return ','.join(omim_gene.loc[same_chrom & ends_after_start & starts_before_end, 'name2'])


ts_region['omim_genes'] = ts_region.apply(_overlapping_omim_genes, axis=1)

In [169]:
# Preview `ts_region` with the new `omim_genes` column.
ts_region.head()

Unnamed: 0,chrom,start,end,#ISCA ID,ISCA Region Name,omim_genes
51,chr1,146577486,147394506,ISCA-37421,"1q21.1 recurrent region (BP3-BP4, distal) (inc...","GJA5,GJA8"
61,chr15,22832519,28379874,ISCA-37404,"15q11q13 recurrent (PWS/AS) region (BP1-BP3, C...","NIPA1,MKRN3,MAGEL2,NDN,SNRPN,UBE3A,GABRB3,GABR..."
23,chr15,23747996,28379874,ISCA-37478,"15q11q13 recurrent (PWS/AS) region (BP2-BP3, C...","MKRN3,MAGEL2,NDN,SNRPN,UBE3A,GABRB3,GABRA5,OCA..."
63,chr16,29649997,30199852,ISCA-37400,"16p11.2 recurrent region (proximal, BP4-BP5) (...","KIF22,PRRT2,ALDOA,TBX6,CORO1A"
46,chr17,1247833,2588909,ISCA-37430,17p13.3 (Miller-Dieker syndrome) region (inclu...,"INPP5K,PRPF8,WDR81,SERPINF2,SERPINF1,DPH1,PAFA..."


In [170]:
# Write the regions as tab-separated text; '#chrom' makes the header a comment line.
ts_region_bed_names = {
    'chrom': '#chrom',
    '#ISCA ID': 'isca_id', 'ISCA Region Name': 'name'
}
ts_region_bed = ts_region.rename(columns=ts_region_bed_names)
ts_region_bed.to_csv(ts_region_file, sep='\t', index=False)

In [171]:
# Compress the table with bgzip (-c: write to stdout, -f: force); the plain file is kept for now.
!bgzip -cf {ts_region_file} > {ts_region_file}.gz

In [172]:
# Build the tabix index (BED preset; -f overwrites an existing index),
# then delete the uncompressed intermediate file.
!tabix -fp bed {ts_region_file}.gz
!rm {ts_region_file}

In [173]:
# Regions whose 'Triplosensitivity Score' is the string '40', sorted by position.
uts_region_cols = ['chrom', 'start', 'end', '#ISCA ID', 'ISCA Region Name']
region_score_is_40 = region_pos['Triplosensitivity Score'] == '40'
uts_region = (
    region_pos
    .loc[region_score_is_40, uts_region_cols]
    .sort_values(['chrom', 'start', 'end'])
)

In [174]:
# Preview the regions selected by triplosensitivity score '40'.
uts_region.head()

Unnamed: 0,chrom,start,end,#ISCA ID,ISCA Region Name
29,chr1,148867551,149768855,ISCA-37469,1q21.2 region (polymorphic region)
25,chr14,22111109,23021097,ISCA-37476,14q11.2 region (TCRA region)
24,chr14,106050000,107289540,ISCA-37477,1 copy: 14q telomere; 3 copies: 14q telomere
33,chr15,22832519,23090897,ISCA-37448,15q11.2 recurrent region (BP1-BP2) (includes N...
4,chr15,32019621,32445405,ISCA-46295,15q13.3 recurrent region (D-CHRNA7 to BP5) (in...


In [175]:
def _genes_overlapping_region(region):
    """Return the comma-joined `name2` values of `gene` rows that overlap `region`.

    A row overlaps when it is on the same `chrom` and its [txStart, txEnd]
    span intersects the region's [start, end] span (bounds compared inclusively).
    """
    same_chrom = gene['chrom'] == region['chrom']
    ends_after_start = gene['txEnd'] >= region['start']
    starts_before_end = gene['txStart'] <= region['end']
    return ','.join(gene.loc[same_chrom & ends_after_start & starts_before_end, 'name2'])


uts_region['genes'] = uts_region.apply(_genes_overlapping_region, axis=1)

In [176]:
# Preview `uts_region` with the new `genes` column.
uts_region.head()

Unnamed: 0,chrom,start,end,#ISCA ID,ISCA Region Name,genes
29,chr1,148867551,149768855,ISCA-37469,1q21.2 region (polymorphic region),"H3-2,PPIAL4C,H2BC18,FCGR1A"
25,chr14,22111109,23021097,ISCA-37476,14q11.2 region (TCRA region),"OR4E2,OR4E1"
24,chr14,106050000,107289540,ISCA-37477,1 copy: 14q telomere; 3 copies: 14q telomere,
33,chr15,22832519,23090897,ISCA-37448,15q11.2 recurrent region (BP1-BP2) (includes N...,"TUBGCP5,CYFIP1,NIPA2,NIPA1"
4,chr15,32019621,32445405,ISCA-46295,15q13.3 recurrent region (D-CHRNA7 to BP5) (in...,"OTUD7A,CHRNA7"


In [177]:
# Write the regions as tab-separated text; '#chrom' makes the header a comment line.
uts_region_bed_names = {
    'chrom': '#chrom',
    '#ISCA ID': 'isca_id', 'ISCA Region Name': 'name'
}
uts_region_bed = uts_region.rename(columns=uts_region_bed_names)
uts_region_bed.to_csv(uts_region_file, sep='\t', index=False)

In [178]:
# Compress the table with bgzip (-c: write to stdout, -f: force); the plain file is kept for now.
!bgzip -cf {uts_region_file} > {uts_region_file}.gz

In [179]:
# Build the tabix index (BED preset; -f overwrites an existing index),
# then delete the uncompressed intermediate file.
!tabix -fp bed {uts_region_file}.gz
!rm {uts_region_file}

In [180]:
# Read columns 0 and 8 of the DGV file (named `chrom` and `info`) and keep
# one row per distinct `info` string.
dgv = pd.read_csv(dgv_ori_file, sep='\t', names=['chrom', 'info'], usecols=[0, 8])
dgv = dgv.drop_duplicates('info')

# Pull the needed key=value attributes out of `info`. The pattern relies on
# the attributes appearing in this order within the string.
dgv_info_pattern = r'ID=(?P<id>[^;]+).*variant_sub_type=(?P<type>[^;]+).*inner_start=(?P<start>[^;]+).*inner_end=(?P<end>[^;]+).*Frequency=(?P<freq>\S+?)%;.*num_unique_samples_tested=(?P<sample>[^;]+)'
info = dgv['info'].str.extract(dgv_info_pattern)
info = info.astype({'start': int, 'end': int, 'sample': int, 'freq': float})

# Attach the parsed fields by index, convert the percentage to a fraction,
# keep variants with sample >= 1000, and sort by position.
dgv = (
    dgv.merge(info, left_index=True, right_index=True)
    .assign(af=lambda frame: frame['freq'] / 100)
    .loc[lambda frame: frame['sample'] >= 1000]
    .sort_values(['chrom', 'start', 'end'])
)

In [181]:
# Preview the parsed and filtered DGV table.
dgv.head()

Unnamed: 0,chrom,info,id,type,start,end,freq,sample,af
12,chr1,ID=gssvG9;Name=gssvG9;variant_type=CNV;variant...,gssvG9,Gain,49911,222421,1.13,1149,0.0113
54,chr1,ID=gssvG28;Name=gssvG28;variant_type=CNV;varia...,gssvG28,Gain,60905,97505,0.61,2609,0.0061
18651,chr1,ID=gssvL18;Name=gssvL18;variant_type=CNV;varia...,gssvL18,Loss,60905,97505,0.61,2609,0.0061
18627,chr1,ID=gssvL4;Name=gssvL4;variant_type=CNV;variant...,gssvL4,Loss,61724,346583,0.3,12364,0.003
48,chr1,ID=gssvG24;Name=gssvG24;variant_type=CNV;varia...,gssvG24,Gain,125400,176984,0.73,1510,0.0073


In [182]:
def fetch_gene(gene, chrom, start, end):
    """Return the comma-joined `name2` values of the rows in `gene` overlapping an interval.

    Parameters:
        gene: DataFrame with `chrom`, `txStart`, `txEnd` and `name2` columns.
        chrom: chromosome label compared for equality with `gene['chrom']`.
        start, end: interval bounds; a row matches when `txEnd >= start`
            and `txStart <= end` (both comparisons inclusive).

    Returns:
        The matching `name2` values joined with ',' in row order, or an
        empty string when nothing overlaps.
    """
    on_chrom = gene['chrom'] == chrom
    ends_after_start = gene['txEnd'] >= start
    starts_before_end = gene['txStart'] <= end
    hits = gene.loc[on_chrom & ends_after_start & starts_before_end, 'name2']
    return ','.join(hits)

In [183]:
# Annotate every DGV interval with the symbols of overlapping genes, in
# parallel. The three coordinate columns are zipped directly rather than
# going through iterrows(), which builds a whole Series per row (including
# the long `info` string) just to read three fields.
dgv_intervals = zip(dgv['chrom'], dgv['start'], dgv['end'])
with Pool(processes=7) as pool:
    dgv['genes'] = pool.starmap(
        fetch_gene,
        ((gene, chrom, start, end) for chrom, start, end in dgv_intervals),
        chunksize=70,
    )

In [None]:
# Preview `dgv` with the new `genes` column.
dgv.head()

In [None]:
# Export the rows whose `type` is 'Gain' as tab-separated text;
# '#chrom' makes the header a comment line.
dgv_gain = dgv.loc[dgv['type'] == 'Gain', ['chrom', 'start', 'end', 'id', 'genes', 'af']]
dgv_gain = dgv_gain.rename(columns={'chrom': '#chrom'})
dgv_gain.to_csv(dgv_gain_file, sep='\t', index=False)

In [None]:
# Compress the table with bgzip (-c: write to stdout, -f: force); the plain file is kept for now.
!bgzip -cf {dgv_gain_file} > {dgv_gain_file}.gz

In [None]:
# Build the tabix index (BED preset; -f overwrites an existing index),
# then delete the uncompressed intermediate file.
!tabix -fp bed {dgv_gain_file}.gz
!rm {dgv_gain_file}

In [None]:
# Export the rows whose `type` is 'Loss' as tab-separated text;
# '#chrom' makes the header a comment line.
dgv_loss = dgv.loc[dgv['type'] == 'Loss', ['chrom', 'start', 'end', 'id', 'genes', 'af']]
dgv_loss = dgv_loss.rename(columns={'chrom': '#chrom'})
dgv_loss.to_csv(dgv_loss_file, sep='\t', index=False)

In [None]:
# Compress the table with bgzip (-c: write to stdout, -f: force); the plain file is kept for now.
!bgzip -cf {dgv_loss_file} > {dgv_loss_file}.gz

In [None]:
# Build the tabix index (BED preset; -f overwrites an existing index),
# then delete the uncompressed intermediate file.
!tabix -fp bed {dgv_loss_file}.gz
!rm {dgv_loss_file}

In [None]:
# Load the structural-variant site table with every value as a string.
# Columns are picked by position; the names used below ('#chrom', 'start',
# 'end', 'svtype', 'FILTER', the *_N_BI_GENOS counts, ...) are expected to
# be among them.
# NOTE: this rebinds `gnomad`, which earlier cells use for a different table.
gnomad = pd.read_csv(
    gnomad_control_ori_file, sep='\t', dtype=str,
    usecols=[0, 1, 2, 3, 4, 37, 38, 73, 74, 107, 108, 141, 142, 175, 176, 241]
)

# Keep only PASS records of type DEL or DUP.
gnomad = gnomad[
    (gnomad['FILTER'] == 'PASS') & gnomad['svtype'].isin(['DEL', 'DUP'])
]

# Keep sites where at least one of the genotype-count columns reaches 1000.
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
genotype_count_cols = [
    'N_BI_GENOS', 'AFR_N_BI_GENOS', 'AMR_N_BI_GENOS', 'EAS_N_BI_GENOS', 'EUR_N_BI_GENOS'
]
enough_genotypes = (gnomad[genotype_count_cols].astype(int) >= 1000).any(axis=1)
gnomad = gnomad[enough_genotypes].copy()

# Prefix chromosome names with 'chr' and make the coordinates numeric.
gnomad['#chrom'] = 'chr' + gnomad['#chrom']
gnomad['start'] = gnomad['start'].astype(int)
gnomad['end'] = gnomad['end'].astype(int)

In [None]:
# Preview the filtered structural-variant table.
gnomad.head()

In [None]:
# Annotate every structural variant with the symbols of overlapping genes,
# in parallel. The three coordinate columns are zipped directly rather than
# going through iterrows(), which builds a whole Series per row just to
# read three fields.
gnomad_intervals = zip(gnomad['#chrom'], gnomad['start'], gnomad['end'])
with Pool(processes=7) as pool:
    gnomad['genes'] = pool.starmap(
        fetch_gene,
        ((gene, chrom, start, end) for chrom, start, end in gnomad_intervals),
        chunksize=70,
    )

In [None]:
# Preview `gnomad` with the new `genes` column.
gnomad.head()

In [None]:
# Export the DEL records as tab-separated text with lower-case AF column names.
# NOTE(review): `genomad_del_file` is spelled differently from `gnomad_dup_file`;
# it is defined outside this section, so confirm the name before changing it.
gnomad_del_cols = ['#chrom', 'start', 'end', 'name', 'genes', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF']
gnomad_del_af_names = {
    'AF': 'af', 'AFR_AF': 'af_afr', 'AMR_AF': 'af_amr', 'EAS_AF': 'af_eas', 'EUR_AF': 'af_eur'
}
gnomad_del = gnomad.loc[gnomad['svtype'] == 'DEL', gnomad_del_cols]
gnomad_del = gnomad_del.rename(columns=gnomad_del_af_names)
gnomad_del.to_csv(genomad_del_file, sep='\t', index=False)

In [None]:
# Compress the table with bgzip (-c: write to stdout, -f: force); the plain file is kept for now.
!bgzip -cf {genomad_del_file} > {genomad_del_file}.gz

In [None]:
# Build the tabix index (BED preset; -f overwrites an existing index),
# then delete the uncompressed intermediate file.
!tabix -fp bed {genomad_del_file}.gz
!rm {genomad_del_file}

In [None]:
# Export the DUP records as tab-separated text with lower-case AF column names.
gnomad_dup_cols = ['#chrom', 'start', 'end', 'name', 'genes', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF']
gnomad_dup_af_names = {
    'AF': 'af', 'AFR_AF': 'af_afr', 'AMR_AF': 'af_amr', 'EAS_AF': 'af_eas', 'EUR_AF': 'af_eur'
}
gnomad_dup = gnomad.loc[gnomad['svtype'] == 'DUP', gnomad_dup_cols]
gnomad_dup = gnomad_dup.rename(columns=gnomad_dup_af_names)
gnomad_dup.to_csv(gnomad_dup_file, sep='\t', index=False)

In [None]:
# Compress the table with bgzip (-c: write to stdout, -f: force); the plain file is kept for now.
!bgzip -cf {gnomad_dup_file} > {gnomad_dup_file}.gz

In [None]:
# Build the tabix index (BED preset; -f overwrites an existing index),
# then delete the uncompressed intermediate file.
!tabix -fp bed {gnomad_dup_file}.gz
!rm {gnomad_dup_file}

In [None]:
!tabix -fp bed {gnomad_dup_file}.gz
!rm {gnomad_dup_file}