In [77]:
from multiprocessing import Pool
import pandas as pd
import numpy as np
import allel
import pysam
import json
import os
from datetime import datetime
pd.set_option('display.max_columns', None)
from urllib import request
from tqdm import tqdm

In [2]:
work_dir = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
raw_data_dir = os.path.join(work_dir, 'raw_data')
result_data_dir = os.path.join(work_dir, 'data')
if not os.path.exists(raw_data_dir):
    os.mkdir(raw_data_dir)
if not os.path.exists(result_data_dir):
    os.mkdir(result_data_dir)

# version_date = datetime.today().strftime('%Y-%m-%d')
version_date = '2021-10-26'
version_date_dir = os.path.join(raw_data_dir, version_date)
if not os.path.exists(version_date_dir):
    os.mkdir(version_date_dir)

In [3]:
gene_info_url = 'https://ftp.ncbi.nih.gov/refseq/H_sapiens/Homo_sapiens.gene_info.gz'
ref_gene_url = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/refGene.txt.gz'
clingen_gene_curation_url = 'https://ftp.clinicalgenome.org/ClinGen_gene_curation_list_GRCh37.tsv'
clingene_region_curation_url = 'https://ftp.clinicalgenome.org/ClinGen_region_curation_list_GRCh37.tsv'
clinvar_url = 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz'
hi_pred_url = 'https://decipher.sanger.ac.uk/files/downloads/HI_Predictions_Version3.bed.gz'
gnomad_lof_url = 'https://datasetgnomad.blob.core.windows.net/dataset/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz'
gnomad_control_only_url = 'https://datasetgnomad.blob.core.windows.net/dataset/papers/2019-sv/gnomad_v2.1_sv.controls_only.sites.bed.gz'
hgnc_gene_fam_url = 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/csv/genefamily_db_tables/family.csv'
# ori file
gene_info_ori_file = os.path.join(version_date_dir, 'Homo_sapiens.gene_info.gz')
ref_gene_ori_file = os.path.join(version_date_dir, 'refGene.txt.gz')
clingen_gene_ori_file = os.path.join(version_date_dir, 'clingen_gene_hg19.tsv')
clingen_region_ori_file = os.path.join(version_date_dir, 'clingen_region_hg19.tsv')
clinvar_ori_vcf_file = os.path.join(version_date_dir, 'clinvar.vcf.gz')
hi_pred_ori_file = os.path.join(version_date_dir, 'HI_Predictions_Version3.bed.gz')
gnomad_lof_ori_file = os.path.join(version_date_dir, 'gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz')
gnomad_control_ori_file = os.path.join(version_date_dir, 'gnomad_v2.1_sv.controls_only.sites.bed.gz')
# prep an external omim gene list
omim_gene_list_file = os.path.join(raw_data_dir, 'gene-list-key-lte3.xlsx')
dgv_ori_file = os.path.join(raw_data_dir, 'DGV.GS.March2016.50percent.GainLossSep.Final.hg19.gff3')
# result file
gene_file = os.path.join(result_data_dir, 'gene.sorted.bed')
omim_gene_file = os.path.join(result_data_dir, 'omim-gene.sorted.bed')
clinvar_file = os.path.join(result_data_dir, 'clinvar-pathogenic.sorted.vcf')
decipher_gene_file = os.path.join(result_data_dir, 'decipher-gene.sorted.bed')
dgv_gain_file = os.path.join(result_data_dir, 'dgv-gain.sorted.bed')
dgv_loss_file = os.path.join(result_data_dir, 'dgv-loss.sorted.bed')
func_region_file = os.path.join(result_data_dir, 'func-region.sorted')
gnomad_del_file = os.path.join(result_data_dir, 'gnomad-del.sorted.bed')
gnomad_dup_file = os.path.join(result_data_dir, 'gnomad-dup.sorted.bed')
hi_cds_file = os.path.join(result_data_dir, 'hi-cds.sorted.bed')
hi_exon_file = os.path.join(result_data_dir, 'hi-exon.sorted.bed')
hi_gene_file = os.path.join(result_data_dir, 'hi-gene.sorted.bed')
hi_region_file = os.path.join(result_data_dir, 'hi-region.sorted.bed')
ts_gene_file = os.path.join(result_data_dir, 'ts-gene.sorted.bed')
ts_region_file = os.path.join(result_data_dir, 'ts-region.sorted.bed')
uhi_gene_file = os.path.join(result_data_dir, 'uhi-gene.sorted.bed')
uhi_region_file = os.path.join(result_data_dir, 'uhi-region.sorted.bed')
uts_gene_file = os.path.join(result_data_dir, 'uts-gene.sorted.bed')
uts_region_file = os.path.join(result_data_dir, 'uts-region.sorted.bed')
hgnc_gene_fam_file = os.path.join(version_date_dir, 'family.csv')

In [4]:
# Download require data
if not os.path.exists(gene_info_ori_file):
    print(f'downloading gene info to {gene_info_ori_file}')
    request.urlretrieve(gene_info_url, gene_info_ori_file)
    print('done!')

if not os.path.exists(ref_gene_ori_file):
    print(f'downloading ref gene to {ref_gene_ori_file}')
    request.urlretrieve(ref_gene_url, ref_gene_ori_file)
    print('done!')

if not os.path.exists(clingen_gene_ori_file):
    print(f'downloading clingen gene list to {clingen_gene_ori_file}')
    request.urlretrieve(clingen_gene_curation_url, clingen_gene_ori_file)
    print('done!')

if not os.path.exists(clingen_region_ori_file):
    print(f'downloading clingen region file to {clingen_region_ori_file}')
    request.urlretrieve(clingene_region_curation_url, clingen_region_ori_file)
    print('done!')

if not os.path.exists(clinvar_ori_vcf_file):
    print(f'downloading clingen region file to {clinvar_ori_vcf_file}')
    request.urlretrieve(clinvar_url, clinvar_ori_vcf_file)
    print('done!')

if not os.path.exists(hi_pred_ori_file):
    print(f'downloading hi prediction file to {hi_pred_ori_file}')
    request.urlretrieve(hi_pred_url, hi_pred_ori_file)
    print('done!')
if not os.path.exists(gnomad_lof_ori_file):
    print(f'downloading pLoF file from GnomAD to {gnomad_lof_ori_file}')
    request.urlretrieve(gnomad_lof_url, gnomad_lof_ori_file)
    print('done!')
if not os.path.exists(gnomad_control_ori_file):
    print(f'downloading gnomad control only file to {gnomad_control_ori_file}')
    request.urlretrieve(gnomad_control_only_url, gnomad_control_ori_file)
    print('done!')

if not os.path.exists(hgnc_gene_fam_file):
    print(f'downloading gene family file to {hgnc_gene_fam_file}')
    request.urlretrieve(hgnc_gene_fam_url, hgnc_gene_fam_file)
    print('done!')



In [5]:
cols = [
    'bin', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds',
    'score', 'name2', 'cdsStartStat', 'cdsEndStat', 'ExonFrames']
refgene = pd.read_csv(ref_gene_ori_file, sep='\t', names=cols)
refgene = refgene[~refgene['chrom'].str.match(r'.*fix$')]
refgene['length'] = refgene['cdsEnd'] - refgene['cdsStart']
refgene = refgene.sort_values('length', ascending=False)
refgene = refgene.drop_duplicates('name2', keep='first')

In [6]:
refgene.head()

Unnamed: 0,bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,ExonFrames,length
10573,26,NM_001351365,chr1,+,144146811,146467744,144146846,146466121,93,"144146811,144148789,144149726,144150981,144151...","144147021,144148892,144149941,144151054,144151...",0,NBPF19,cmpl,cmpl,"0,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,...",2319275
10459,26,NM_001278267,chr1,+,144146810,146467744,144158383,146466121,131,"144146810,144148789,144149726,144150981,144151...","144147021,144148892,144149941,144151054,144151...",0,NBPF20,cmpl,cmpl,"-1,-1,-1,-1,-1,-1,-1,-1,0,2,1,2,1,2,1,2,1,2,1,...",2307738
18617,12,NM_000109,chrX,-,31137340,33357505,31140035,33357382,79,"31137340,31144758,31152218,31164407,31165391,3...","31140047,31144790,31152311,31164531,31165635,3...",0,DMD,cmpl,cmpl,"0,1,1,0,2,2,2,2,2,0,2,0,1,2,1,1,2,1,0,0,1,0,2,...",2217347
29979,2,NM_001351274,chr11,-,83166055,85339417,83170860,85337661,27,"83166055,83173044,83177750,83180243,83182668,8...","83170967,83173136,83177860,83180416,83182770,8...",0,DLG2,cmpl,cmpl,"1,2,0,1,1,1,0,2,2,1,0,2,2,2,1,0,1,2,0,0,0,0,0,...",2166801
68033,9,NM_033225,chr8,-,2792882,4852436,2796106,4851938,70,"2792882,2799993,2806820,2807752,2808635,281174...","2796266,2800126,2806908,2807865,2808797,281179...",0,CSMD1,cmpl,cmpl,"2,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",2055832


In [7]:
gene_info = pd.read_csv(gene_info_ori_file, sep='\t')

In [8]:
gene_info.head()

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20210708,-
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20211009,-
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000256069,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20210611,-
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171428,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20210926,-
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156006,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20210926,-


In [9]:
refgene_info = refgene.merge(gene_info, left_on='name2', right_on='Symbol')

In [10]:
refgene_info.head()

Unnamed: 0,bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,ExonFrames,length,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,26,NM_001351365,chr1,+,144146811,146467744,144146846,146466121,93,"144146811,144148789,144149726,144150981,144151...","144147021,144148892,144149941,144151054,144151...",0,NBPF19,cmpl,cmpl,"0,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,...",2319275,9606,101060226,NBPF19,-,-,MIM:614006|HGNC:HGNC:31999|Ensembl:ENSG0000027...,1,1q21.2,NBPF member 19,protein-coding,NBPF19,NBPF member 19,O,neuroblastoma breakpoint family member 19,20210912,-
1,26,NM_001278267,chr1,+,144146810,146467744,144158383,146466121,131,"144146810,144148789,144149726,144150981,144151...","144147021,144148892,144149941,144151054,144151...",0,NBPF20,cmpl,cmpl,"-1,-1,-1,-1,-1,-1,-1,-1,0,2,1,2,1,2,1,2,1,2,1,...",2307738,9606,100288142,NBPF20,-,-,MIM:614007|HGNC:HGNC:32000|Ensembl:ENSG0000016...,1,1q21.1,NBPF member 20,protein-coding,NBPF20,NBPF member 20,O,neuroblastoma breakpoint family member 20,20210611,-
2,12,NM_000109,chrX,-,31137340,33357505,31140035,33357382,79,"31137340,31144758,31152218,31164407,31165391,3...","31140047,31144790,31152311,31164531,31165635,3...",0,DMD,cmpl,cmpl,"0,1,1,0,2,2,2,2,2,0,2,0,1,2,1,1,2,1,0,0,1,0,2,...",2217347,9606,1756,DMD,-,BMD|CMD3B|DXS142|DXS164|DXS206|DXS230|DXS239|D...,MIM:300377|HGNC:HGNC:2928|Ensembl:ENSG00000198947,X,Xp21.2-p21.1,dystrophin,protein-coding,DMD,dystrophin,O,dystrophin|truncated dystrophin,20210906,-
3,2,NM_001351274,chr11,-,83166055,85339417,83170860,85337661,27,"83166055,83173044,83177750,83180243,83182668,8...","83170967,83173136,83177860,83180416,83182770,8...",0,DLG2,cmpl,cmpl,"1,2,0,1,1,1,0,2,2,1,0,2,2,2,1,0,1,2,0,0,0,0,0,...",2166801,9606,1740,DLG2,-,PPP1R58|PSD-93|PSD93|chapsyn-110,MIM:603583|HGNC:HGNC:2901|Ensembl:ENSG00000150672,11,11q14.1,discs large MAGUK scaffold protein 2,protein-coding,DLG2,discs large MAGUK scaffold protein 2,O,disks large homolog 2|channel-associated prote...,20211016,-
4,9,NM_033225,chr8,-,2792882,4852436,2796106,4851938,70,"2792882,2799993,2806820,2807752,2808635,281174...","2796266,2800126,2806908,2807865,2808797,281179...",0,CSMD1,cmpl,cmpl,"2,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",2055832,9606,64478,CSMD1,-,PPP1R24,MIM:608397|HGNC:HGNC:14026|Ensembl:ENSG0000018...,8,8p23.2,CUB and Sushi multiple domains 1,protein-coding,CSMD1,CUB and Sushi multiple domains 1,O,CUB and sushi domain-containing protein 1|prot...,20210919,-


In [11]:
gene_fam = pd.read_csv(hgnc_gene_fam_file, sep=',')
gene_fam = gene_fam[gene_fam['typical_gene'].notna()].rename(columns={'id': 'fam'})
gene_fam_grp = gene_fam.groupby('typical_gene').agg({'fam': set})

In [12]:
gene = refgene_info.loc[
    refgene_info['type_of_gene'] == 'protein-coding',
    ['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']
].sort_values(['chrom', 'txStart', 'txEnd'])

In [13]:
gene.head()

Unnamed: 0,chrom,txStart,txEnd,GeneID,name2,name,strand
18372,chr1,69090,70008,79501,OR4F5,NM_001005484,+
18188,chr1,367658,368597,729759,OR4F29,NM_001005221,+
18185,chr1,621095,622034,81399,OR4F16,NM_001005277,-
9220,chr1,859302,879954,148398,SAMD11,NM_001385641,+
10563,chr1,879582,894636,26155,NOC2L,NM_015658,-


In [14]:
gene.rename(columns={
    'chrom': '#chrom', 'txStart': 'start', 'txEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}).to_csv(gene_file, index=False, sep='\t')

In [15]:
!bgzip -cf {gene_file} > {gene_file}.gz

In [16]:
!tabix -fp bed {gene_file}.gz

In [17]:
!rm {gene_file}

In [18]:
raw_omim_df = pd.read_excel(omim_gene_list_file)
omim_gene = set(raw_omim_df['gene_symbol'].dropna())
# with open(omim_gene_list_file) as f:
#     omim_gene = set(f.read().split('\n'))

In [19]:
omim_gene = refgene_info.loc[
    refgene_info['name2'].isin(omim_gene),
    ['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']
].sort_values(['chrom', 'txStart', 'txEnd'])

In [20]:
omim_gene.head()

Unnamed: 0,chrom,txStart,txEnd,GeneID,name2,name,strand
18397,chr1,948876,949919,9636,ISG15,NM_005101,+
6532,chr1,955499,991494,375790,AGRN,NM_001305275,+
16101,chr1,1146719,1149533,7293,TNFRSF4,NM_003327,-
17883,chr1,1167616,1170420,126792,B3GALT6,NM_080605,+
11098,chr1,1270657,1284798,1855,DVL1,NM_004421,-


In [21]:
omim_gene.rename(columns={
    'chrom': '#chrom', 'txStart': 'start', 'txEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}).to_csv(omim_gene_file, index=False, sep='\t')

In [22]:
!bgzip -cf {omim_gene_file} > {omim_gene_file}.gz

In [23]:
!tabix -fp bed {omim_gene_file}.gz
!rm {omim_gene_file}

In [24]:
curation_gene = pd.read_csv(clingen_gene_ori_file, sep='\t', dtype=str, skiprows=5)

In [25]:
curation_gene.head()

Unnamed: 0,#Gene Symbol,Gene ID,cytoBand,Genomic Location,Haploinsufficiency Score,Haploinsufficiency Description,Haploinsufficiency PMID1,Haploinsufficiency PMID2,Haploinsufficiency PMID3,Haploinsufficiency PMID4,Haploinsufficiency PMID5,Haploinsufficiency PMID6,Triplosensitivity Score,Triplosensitivity Description,Triplosensitivity PMID1,Triplosensitivity PMID2,Triplosensitivity PMID3,Triplosensitivity PMID4,Triplosensitivity PMID5,Triplosensitivity PMID6,Date Last Evaluated,Loss phenotype OMIM ID,Triplosensitive phenotype OMIM ID
0,A4GALT,53947,22q13.2,chr22:43088121-43117307,30,Gene associated with autosomal recessive pheno...,,,,,,,0,No evidence available,,,,,,,2014-12-11,111400.0,
1,AAGAB,79719,15q23,chr15:67493013-67547536,3,Sufficient evidence for dosage pathogenicity,23064416.0,23000146.0,,,,,0,No evidence available,,,,,,,2013-02-28,148600.0,
2,AARS1,16,16q22.1,chr16:70286297-70323412,0,No evidence available,,,,,,,0,No evidence available,,,,,,,2018-01-11,,
3,AARS2,57505,6p21.1,chr6:44266463-44281063,30,Gene associated with autosomal recessive pheno...,,,,,,,Not yet evaluated,Not yet evaluated,,,,,,,2016-08-22,615889.0,
4,AASS,10157,7q31.32,chr7:121713598-121784344,30,Gene associated with autosomal recessive pheno...,,,,,,,Not yet evaluated,Not yet evaluated,,,,,,,2016-08-22,238700.0,


In [26]:
hi_genes = set(
    curation_gene.loc[
        curation_gene['Haploinsufficiency Score'] == '3', '#Gene Symbol'
    ]
)

In [27]:
hi_gene = refgene_info.loc[
    refgene_info['name2'].isin(hi_genes),
    ['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']
].sort_values(['chrom', 'txStart', 'txEnd'])

In [28]:
hi_gene.head()

Unnamed: 0,chrom,txStart,txEnd,GeneID,name2,name,strand
51,chr1,6845513,7829766,23261,CAMTA1,NM_001349609,+
6499,chr1,17345220,17380527,6390,SDHB,NM_003000,-
3068,chr1,27022505,27108595,8289,ARID1A,NM_139135,+
14667,chr1,27860755,27930655,27245,AHDC1,NM_001371928,-
7013,chr1,43391025,43424539,6513,SLC2A1,NM_006516,-


In [29]:
hi_gene.rename(columns={
    'chrom': '#chrom', 'txStart': 'start', 'txEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}).to_csv(hi_gene_file, index=False, sep='\t')

In [30]:
!bgzip -cf {hi_gene_file} > {hi_gene_file}.gz

In [31]:
!tabix -fp bed {hi_gene_file}.gz
!rm {hi_gene_file}

In [32]:
hi_cds = refgene_info.loc[
    (refgene_info['name2'].isin(hi_genes)) & (refgene_info['length'] != 0),
    ['chrom', 'cdsStart', 'cdsEnd', 'GeneID', 'name2', 'name']
].sort_values(['chrom', 'cdsStart', 'cdsEnd'])

In [33]:
hi_cds.head()

Unnamed: 0,chrom,cdsStart,cdsEnd,GeneID,name2,name
51,chr1,6845590,7826582,23261,CAMTA1,NM_001349609
6499,chr1,17345375,17380514,6390,SDHB,NM_003000
3068,chr1,27022894,27107247,8289,ARID1A,NM_139135
14667,chr1,27873814,27878626,27245,AHDC1,NM_001371928
7013,chr1,43392711,43424322,6513,SLC2A1,NM_006516


In [34]:
hi_cds.rename(columns={
    'chrom': '#chrom', 'cdsStart': 'start', 'cdsEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}).to_csv(hi_cds_file, index=False, sep='\t')

In [35]:
!bgzip -cf {hi_cds_file} > {hi_cds_file}.gz

In [36]:
!tabix -fp bed {hi_cds_file}.gz
!rm {hi_cds_file}

In [37]:
hi_exons = refgene_info.loc[
    refgene_info['name2'].isin(hi_genes), ['chrom', 'exonStarts', 'exonEnds', 'GeneID', 'name2', 'name', 'strand']
].copy()

In [38]:
hi_exons.head()

Unnamed: 0,chrom,exonStarts,exonEnds,GeneID,name2,name,strand
2,chrX,"31137340,31144758,31152218,31164407,31165391,3...","31140047,31144790,31152311,31164531,31165635,3...",1756,DMD,NM_000109,-
26,chr7,"69063460,69364271,69583117,69599521,69900737,7...","69064948,69364484,69583219,69599557,69900767,7...",26053,AUTS2,NM_015570,+
31,chrX,"28605562,28807436,29301054,29414374,29417271,2...","28606164,28807542,29301334,29414561,29417425,2...",11141,IL1RAPL1,NM_014271,+
44,chr2,"50145642,50170841,50280408,50282092,50318460,5...","50149389,50170929,50280728,50282182,50318632,5...",9378,NRXN1,NM_001330084,-
51,chr1,"6845513,6880240,6885151,7151363,7309550,752788...","6845635,6880310,6885270,7151431,7309686,752796...",23261,CAMTA1,NM_001349609,+


In [39]:
hi_exons['exonStarts'] = hi_exons['exonStarts'].str.replace(r',$', '')
hi_exons['exonEnds'] = hi_exons['exonEnds'].str.replace(r',$', '')

  hi_exons['exonStarts'] = hi_exons['exonStarts'].str.replace(r',$', '')
  hi_exons['exonEnds'] = hi_exons['exonEnds'].str.replace(r',$', '')


In [40]:
hi_exons.head()

Unnamed: 0,chrom,exonStarts,exonEnds,GeneID,name2,name,strand
2,chrX,"31137340,31144758,31152218,31164407,31165391,3...","31140047,31144790,31152311,31164531,31165635,3...",1756,DMD,NM_000109,-
26,chr7,"69063460,69364271,69583117,69599521,69900737,7...","69064948,69364484,69583219,69599557,69900767,7...",26053,AUTS2,NM_015570,+
31,chrX,"28605562,28807436,29301054,29414374,29417271,2...","28606164,28807542,29301334,29414561,29417425,2...",11141,IL1RAPL1,NM_014271,+
44,chr2,"50145642,50170841,50280408,50282092,50318460,5...","50149389,50170929,50280728,50282182,50318632,5...",9378,NRXN1,NM_001330084,-
51,chr1,"6845513,6880240,6885151,7151363,7309550,752788...","6845635,6880310,6885270,7151431,7309686,752796...",23261,CAMTA1,NM_001349609,+


In [41]:
start = hi_exons['exonStarts'].str.split(',').apply(pd.Series).stack().reset_index()
start = start.rename(columns={'level_0': 'row', 0: 'start'})[['row', 'start']]
start['start'] = start['start'].astype(int)
end = hi_exons['exonEnds'].str.split(',').apply(pd.Series).stack().reset_index()
end = end.rename(columns={0: 'end'})['end'].astype(int)
position = start.join(end)

In [42]:
position.head()

Unnamed: 0,row,start,end
0,2,31137340,31140047
1,2,31144758,31144790
2,2,31152218,31152311
3,2,31164407,31164531
4,2,31165391,31165635


In [43]:
exon = position.merge(
    hi_exons[['chrom', 'GeneID', 'name2', 'name', 'strand']], how='left', left_on='row', right_index=True
)
exon = exon.sort_values(['chrom', 'start', 'end'])

In [44]:
exon.head()

Unnamed: 0,row,start,end,chrom,GeneID,name2,name,strand
128,51,6845513,6845635,chr1,23261,CAMTA1,NM_001349609,+
129,51,6880240,6880310,chr1,23261,CAMTA1,NM_001349609,+
130,51,6885151,6885270,chr1,23261,CAMTA1,NM_001349609,+
131,51,7151363,7151431,chr1,23261,CAMTA1,NM_001349609,+
132,51,7309550,7309686,chr1,23261,CAMTA1,NM_001349609,+


In [45]:
exon['+'] = exon.groupby(['name2', 'name'])['start'].rank('first', ascending=True).astype(int)
exon['-'] = exon.groupby(['name2', 'name'])['start'].rank('first', ascending=False).astype(int)

In [46]:
exon.head()

Unnamed: 0,row,start,end,chrom,GeneID,name2,name,strand,+,-
128,51,6845513,6845635,chr1,23261,CAMTA1,NM_001349609,+,1,23
129,51,6880240,6880310,chr1,23261,CAMTA1,NM_001349609,+,2,22
130,51,6885151,6885270,chr1,23261,CAMTA1,NM_001349609,+,3,21
131,51,7151363,7151431,chr1,23261,CAMTA1,NM_001349609,+,4,20
132,51,7309550,7309686,chr1,23261,CAMTA1,NM_001349609,+,5,19


In [47]:
exon.tail()

Unnamed: 0,row,start,end,chrom,GeneID,name2,name,strand,+,-
1520,1195,154227753,154227875,chrX,2157,F8,NM_000132,-,25,2
1521,1195,154250684,154250998,chrX,2157,F8,NM_000132,-,26,1
5077,15481,154487519,154490514,chrX,116442,RAB39B,NM_171998,-,1,2
5078,15481,154493358,154493776,chrX,116442,RAB39B,NM_171998,-,2,1
5126,18757,2654895,2655723,chrY,6736,SRY,NM_003140,-,1,1


In [48]:
exon['exon'] = pd.concat([exon.loc[exon['strand'] == '+', '+'], exon.loc[exon['strand'] == '-', '-']])

In [49]:
exon.head()

Unnamed: 0,row,start,end,chrom,GeneID,name2,name,strand,+,-,exon
128,51,6845513,6845635,chr1,23261,CAMTA1,NM_001349609,+,1,23,1
129,51,6880240,6880310,chr1,23261,CAMTA1,NM_001349609,+,2,22,2
130,51,6885151,6885270,chr1,23261,CAMTA1,NM_001349609,+,3,21,3
131,51,7151363,7151431,chr1,23261,CAMTA1,NM_001349609,+,4,20,4
132,51,7309550,7309686,chr1,23261,CAMTA1,NM_001349609,+,5,19,5


In [50]:
exon.tail()

Unnamed: 0,row,start,end,chrom,GeneID,name2,name,strand,+,-,exon
1520,1195,154227753,154227875,chrX,2157,F8,NM_000132,-,25,2,2
1521,1195,154250684,154250998,chrX,2157,F8,NM_000132,-,26,1,1
5077,15481,154487519,154490514,chrX,116442,RAB39B,NM_171998,-,1,2,2
5078,15481,154493358,154493776,chrX,116442,RAB39B,NM_171998,-,2,1,1
5126,18757,2654895,2655723,chrY,6736,SRY,NM_003140,-,1,1,1


In [51]:
exon['last_exon'] = exon.groupby(['name2', 'name'])['exon'].transform('max') == exon['exon']

In [52]:
exon.head()

Unnamed: 0,row,start,end,chrom,GeneID,name2,name,strand,+,-,exon,last_exon
128,51,6845513,6845635,chr1,23261,CAMTA1,NM_001349609,+,1,23,1,False
129,51,6880240,6880310,chr1,23261,CAMTA1,NM_001349609,+,2,22,2,False
130,51,6885151,6885270,chr1,23261,CAMTA1,NM_001349609,+,3,21,3,False
131,51,7151363,7151431,chr1,23261,CAMTA1,NM_001349609,+,4,20,4,False
132,51,7309550,7309686,chr1,23261,CAMTA1,NM_001349609,+,5,19,5,False


In [53]:
exon.tail()

Unnamed: 0,row,start,end,chrom,GeneID,name2,name,strand,+,-,exon,last_exon
1520,1195,154227753,154227875,chrX,2157,F8,NM_000132,-,25,2,2,False
1521,1195,154250684,154250998,chrX,2157,F8,NM_000132,-,26,1,1,False
5077,15481,154487519,154490514,chrX,116442,RAB39B,NM_171998,-,1,2,2,True
5078,15481,154493358,154493776,chrX,116442,RAB39B,NM_171998,-,2,1,1,False
5126,18757,2654895,2655723,chrY,6736,SRY,NM_003140,-,1,1,1,True


In [54]:
exon = exon[
    ['chrom', 'start', 'end', 'GeneID', 'name2', 'name', 'exon', 'last_exon']
].sort_values(['chrom', 'start', 'end'])

In [55]:
exon.head()

Unnamed: 0,chrom,start,end,GeneID,name2,name,exon,last_exon
128,chr1,6845513,6845635,23261,CAMTA1,NM_001349609,1,False
129,chr1,6880240,6880310,23261,CAMTA1,NM_001349609,2,False
130,chr1,6885151,6885270,23261,CAMTA1,NM_001349609,3,False
131,chr1,7151363,7151431,23261,CAMTA1,NM_001349609,4,False
132,chr1,7309550,7309686,23261,CAMTA1,NM_001349609,5,False


In [56]:
exon.rename(columns={
    'chrom': '#chrom',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}).to_csv(hi_exon_file, index=False, sep='\t')

In [57]:
!bgzip -cf {hi_exon_file} > {hi_exon_file}.gz

In [58]:
!tabix -fp bed {hi_exon_file}.gz
!rm {hi_exon_file}

In [59]:
last_exon = exon[exon['last_exon']]

In [60]:
last_exon.head()

Unnamed: 0,chrom,start,end,GeneID,name2,name,exon,last_exon
150,chr1,7826518,7829766,23261,CAMTA1,NM_001349609,23,True
3882,chr1,17345220,17345453,6390,SDHB,NM_003000,8,True
2565,chr1,27105513,27108595,8289,ARID1A,NM_139135,20,True
5029,chr1,27860755,27861427,27245,AHDC1,NM_001371928,9,True
4072,chr1,43391025,43392912,6513,SLC2A1,NM_006516,10,True


In [61]:
last_exon_region = last_exon['chrom'] + ':' + last_exon['start'].astype(str) + '-' + last_exon['end'].astype(str)
last_exon_region = last_exon_region.str.replace('chr', '')

In [62]:
last_exon_region.head()

150       1:7826518-7829766
3882    1:17345220-17345453
2565    1:27105513-27108595
5029    1:27860755-27861427
4072    1:43391025-43392912
dtype: object

In [67]:
need_fields = [
    'variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT',
    'variants/AF_ESP', 'variants/AF_EXAC', 'variants/AF_TGP', 'variants/CLNSIG'
]

In [76]:
!tabix -f {clinvar_ori_vcf_file}

In [79]:

with open(clinvar_file, 'w') as f:
    headers = allel.read_vcf_headers(clinvar_ori_vcf_file)
    f.write(''.join(headers.headers))
    
    def fetch_variants(region):
        fields, samples, headers, it = allel.iter_vcf_chunks(
            clinvar_ori_vcf_file, fields=need_fields, alt_number=1, region=region
        )
        for variants, *_ in it:
            esp_filter = np.isnan(variants['variants/AF_ESP'])
            esp_filter[~esp_filter] |= variants['variants/AF_ESP'][~esp_filter] < 0.01

            exac_filter = np.isnan(variants['variants/AF_EXAC'])
            exac_filter[~exac_filter] |= variants['variants/AF_EXAC'][~exac_filter] < 0.01

            tgp_filter = np.isnan(variants['variants/AF_TGP'])
            tgp_filter[~tgp_filter] |= variants['variants/AF_TGP'][~tgp_filter] < 0.01

            pathogenic_filter = np.isin(
                variants['variants/CLNSIG'], ['Likely_pathogenic', 'Pathogenic', 'Pathogenic/Likely_pathogenic']
            )

            af_filter = esp_filter & exac_filter & tgp_filter & pathogenic_filter

            filtered_variants = {k: v[af_filter] for k, v in variants.items()}

            filtered_variants['variants/CHROM'] = 'chr' + filtered_variants['variants/CHROM']

            return allel.normalize_callset(filtered_variants)
    
    with Pool(processes=7) as pool:
        variants = pool.map(fetch_variants, last_exon_region)
    
    for names, callset in tqdm(filter(lambda x: x is not None, variants)):
        allel.write_vcf_data(f, names, callset, None, {field: np.nan for field in need_fields})

KeyboardInterrupt: 

In [None]:
!sed -i '' 's/AF_.\+=nan;//' {clinvar_file}

In [None]:
!bgzip -cf {clinvar_file} > {clinvar_file}.gz

In [None]:
!tabix -fp vcf {clinvar_file}.gz
# !rm {clinvar_file}

In [None]:
uhi_genes = set(
    curation_gene.loc[curation_gene['Haploinsufficiency Score'] == '40', '#Gene Symbol']
)

In [None]:
uhi_gene = refgene_info.loc[
    refgene_info['name2'].isin(uhi_genes),
    ['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']
].sort_values(['chrom', 'txStart', 'txEnd'])

In [None]:
uhi_gene.head()

In [None]:
uhi_gene.rename(columns={
    'chrom': '#chrom', 'txStart': 'start', 'txEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}).to_csv(uhi_gene_file, index=False, sep='\t')

In [None]:
!bgzip -cf {uhi_gene_file} > {uhi_gene_file}.gz

In [None]:
!tabix -fp bed {uhi_gene_file}.gz
!rm {uhi_gene_file}

In [None]:
region = pd.read_csv(clingen_region_ori_file, sep='\t', skiprows=5, dtype=str)

In [None]:
region.head()

In [None]:
position = region['Genomic Location'].str.extract(r'(?P<chrom>chr\w+)\s*:\s*(?P<start>\d+)\s*-\s*(?P<end>\d+)')
position['start'] = position['start'].astype(int)
position['end'] = position['end'].astype(int)

In [None]:
position.head()

In [None]:
region_pos = region.merge(position, how='left', left_index=True, right_index=True)

In [None]:
region_pos.head()

In [None]:
func_region = region_pos.loc[
    (region_pos['Haploinsufficiency Score'].isin(['1', '2','3']))
     | (region_pos['Triplosensitivity Score'].isin(['1', '2','3'])),
     ['chrom', 'start', 'end', '#ISCA ID', 'ISCA Region Name']
].sort_values(['chrom', 'start', 'end'])

In [None]:
func_region.head()

In [None]:
func_region.rename(columns={
    'chrom': '#chrom',
    '#ISCA ID': 'isca_id', 'ISCA Region Name': 'name'
}).to_csv(func_region_file, sep='\t', index=False)

In [None]:
!bgzip -cf {func_region_file} > {func_region_file}.gz

In [None]:
!tabix -fp bed {func_region_file}.gz
!rm {func_region_file}

In [None]:
hi_region = region_pos.loc[
    region_pos['Haploinsufficiency Score'] == '3',
    ['chrom', 'start', 'end', '#ISCA ID', 'ISCA Region Name']
].sort_values(['chrom', 'start', 'end'])

In [None]:
hi_region.head()

In [None]:
hi_region['omim_genes'] = hi_region.apply(
    lambda row: ','.join(omim_gene.loc[
        (omim_gene['chrom'] == row['chrom'])
        & (omim_gene['txEnd'] >= row['start'])
        & (omim_gene['txStart'] <= row['end']),
        'name2'
    ]),
    axis=1
)

In [None]:
hi_region.head()

In [None]:
hi_region.rename(columns={
    'chrom': '#chrom',
    '#ISCA ID': 'isca_id', 'ISCA Region Name': 'name'
}).to_csv(hi_region_file, sep='\t', index=False)

In [None]:
!bgzip -cf {hi_region_file} > {hi_region_file}.gz

In [None]:
!tabix -fp bed {hi_region_file}.gz
!rm {hi_region_file}

In [None]:
uhi_region = region_pos.loc[
    region_pos['Haploinsufficiency Score'] == '40',
    ['chrom', 'start', 'end', '#ISCA ID', 'ISCA Region Name']
].sort_values(['chrom', 'start', 'end'])

In [None]:
uhi_region.head()

In [None]:
uhi_region['genes'] = uhi_region.apply(
    lambda row: ','.join(gene.loc[
        (gene['chrom'] == row['chrom'])
        & (gene['txEnd'] >= row['start'])
        & (gene['txStart'] <= row['end']),
        'name2'
    ]),
    axis=1
)

In [None]:
uhi_region.head()

In [None]:
uhi_region.rename(columns={
    'chrom': '#chrom',
    '#ISCA ID': 'isca_id', 'ISCA Region Name': 'name'
}).to_csv(uhi_region_file, sep='\t', index=False)

In [None]:
!bgzip -cf {uhi_region_file} > {uhi_region_file}.gz

In [None]:
!tabix -fp bed {uhi_region_file}.gz
!rm {uhi_region_file}

In [None]:
decipher = pd.read_csv(hi_pred_ori_file, sep='\t',skiprows=1, header=None, usecols=[3,])
decipher = decipher[3].str.split('|', expand=True).rename(columns={0: 'symbol', 1: 'hi_score', 2: 'hi_index'})
decipher['hi_index'] = decipher['hi_index'].str.replace('%', '').astype(float)
decipher = decipher.merge(gene, left_on='symbol', right_on='name2')

In [None]:
decipher.head()

In [None]:
gnomad = pd.read_csv(gnomad_lof_ori_file, sep='\t', index_col=0, compression='gzip')

In [None]:
gnomad.head()

In [None]:
decipher = decipher.join(gnomad['pLI'], on='name2')
decipher = decipher.join(gnomad['oe_lof_upper'], on='name2')

In [None]:
decipher.head()

In [None]:
decipher = decipher.loc[
    (decipher['pLI'] >= 0.9) & (decipher['hi_index'] < 10) & (decipher['oe_lof_upper'] < 0.35),
    ['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'pLI', 'hi_score']
].sort_values(['chrom', 'txStart', 'txEnd'])

In [None]:
decipher.head()

In [None]:
decipher.rename(columns={
    'chrom': '#chrom', 'txStart': 'start', 'txEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}).to_csv(decipher_gene_file, sep='\t', index=False)

In [None]:
!bgzip -cf {decipher_gene_file} > {decipher_gene_file}.gz

In [None]:
!tabix -fp bed {decipher_gene_file}.gz
!rm {decipher_gene_file}

In [None]:
ts_genes = set(
    curation_gene.loc[
        curation_gene['Triplosensitivity Score'] == '3', '#Gene Symbol'
    ]
)

In [None]:
ts_gene = refgene_info.loc[
    refgene_info['name2'].isin(ts_genes),
    ['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']
].sort_values(['chrom', 'txStart', 'txEnd'])

In [None]:
ts_gene.head()

In [None]:
ts_gene.rename(columns={
    'chrom': '#chrom', 'txStart': 'start', 'txEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}).to_csv(ts_gene_file, index=False, sep='\t')

In [None]:
!bgzip -cf {ts_gene_file} > {ts_gene_file}.gz

In [None]:
!tabix -fp bed {ts_gene_file}.gz
!rm {ts_gene_file}

In [None]:
uts_genes = set(
    curation_gene.loc[curation_gene['Triplosensitivity Score'] == '40', '#Gene Symbol']
)

In [None]:
uts_gene = refgene_info.loc[
    refgene_info['name2'].isin(uts_genes),
    ['chrom', 'txStart', 'txEnd', 'GeneID', 'name2', 'name', 'strand']
].sort_values(['chrom', 'txStart', 'txEnd'])

In [None]:
uts_gene.head()

In [None]:
uts_gene.rename(columns={
    'chrom': '#chrom', 'txStart': 'start', 'txEnd': 'end',
    'GeneID': 'gene_id', 'name2': 'symbol', 'name': 'transcript'
}).to_csv(uts_gene_file, index=False, sep='\t')

In [None]:
!bgzip -cf {uts_gene_file} > {uts_gene_file}.gz

In [None]:
!tabix -fp bed {uts_gene_file}.gz
!rm {uts_gene_file}

In [None]:
ts_region = region_pos.loc[
    region_pos['Triplosensitivity Score'] == '3',
    ['chrom', 'start', 'end', '#ISCA ID', 'ISCA Region Name']
].sort_values(['chrom', 'start', 'end'])

In [None]:
ts_region.head()

In [None]:
ts_region['omim_genes'] = ts_region.apply(
    lambda row: ','.join(omim_gene.loc[
        (omim_gene['chrom'] == row['chrom'])
        & (omim_gene['txEnd'] >= row['start'])
        & (omim_gene['txStart'] <= row['end']),
        'name2'
    ]),
    axis=1
)

In [None]:
ts_region.head()

In [None]:
ts_region.rename(columns={
    'chrom': '#chrom',
    '#ISCA ID': 'isca_id', 'ISCA Region Name': 'name'
}).to_csv(ts_region_file, sep='\t', index=False)

In [None]:
!bgzip -cf {ts_region_file} > {ts_region_file}.gz

In [None]:
!tabix -fp bed {ts_region_file}.gz
!rm {ts_region_file}

In [None]:
uts_region = region_pos.loc[
    region_pos['Triplosensitivity Score'] == '40',
    ['chrom', 'start', 'end', '#ISCA ID', 'ISCA Region Name']
].sort_values(['chrom', 'start', 'end'])

In [None]:
uts_region.head()

In [None]:
uts_region['genes'] = uts_region.apply(
    lambda row: ','.join(gene.loc[
        (gene['chrom'] == row['chrom'])
        & (gene['txEnd'] >= row['start'])
        & (gene['txStart'] <= row['end']),
        'name2'
    ]),
    axis=1
)

In [None]:
uts_region.head()

In [None]:
uts_region.rename(columns={
    'chrom': '#chrom',
    '#ISCA ID': 'isca_id', 'ISCA Region Name': 'name'
}).to_csv(uts_region_file, sep='\t', index=False)

In [None]:
!bgzip -cf {uts_region_file} > {uts_region_file}.gz

In [None]:
!tabix -fp bed {uts_region_file}.gz
!rm {uts_region_file}

In [None]:
dgv = pd.read_csv(
    dgv_ori_file,
    sep='\t', names=['chrom', 'info'], usecols=[0, 8]
).drop_duplicates('info')
info = dgv['info'].str.extract(
    r'ID=(?P<id>[^;]+).*variant_sub_type=(?P<type>[^;]+).*inner_start=(?P<start>[^;]+).*inner_end=(?P<end>[^;]+).*Frequency=(?P<freq>\S+?)%;.*num_unique_samples_tested=(?P<sample>[^;]+)'
).astype({'start': int, 'end': int, 'sample': int, 'freq': float})
dgv = dgv.merge(info, left_index=True, right_index=True)
dgv['af'] = dgv['freq'] / 100
dgv = dgv[dgv['sample'] >= 1000].sort_values(['chrom', 'start', 'end'])

In [None]:
dgv.head()

In [None]:
def fetch_gene(gene, chrom, start, end):
    return ','.join(gene.loc[
        (gene['chrom'] == chrom) & (gene['txEnd'] >= start) & (gene['txStart'] <= end), 'name2'
    ])

In [None]:
with Pool(processes=7) as pool:
    dgv['genes'] = pool.starmap(fetch_gene, (
        (gene, row['chrom'], row['start'], row['end']) for _, row in dgv.iterrows()
    ), chunksize=70)

In [None]:
dgv.head()

In [None]:
dgv.loc[
    dgv['type'] == 'Gain', ['chrom', 'start', 'end', 'id', 'genes', 'af']
].rename(columns={'chrom': '#chrom'}).to_csv(dgv_gain_file, sep='\t', index=False)

In [None]:
!bgzip -cf {dgv_gain_file} > {dgv_gain_file}.gz

In [None]:
!tabix -fp bed {dgv_gain_file}.gz
!rm {dgv_gain_file}

In [None]:
dgv.loc[
    dgv['type'] == 'Loss', ['chrom', 'start', 'end', 'id', 'genes', 'af']
].rename(columns={'chrom': '#chrom'}).to_csv(dgv_loss_file, sep='\t', index=False)

In [None]:
!bgzip -cf {dgv_loss_file} > {dgv_loss_file}.gz

In [None]:
!tabix -fp bed {dgv_loss_file}.gz
!rm {dgv_loss_file}

In [None]:
gnomad = pd.read_csv(
    gnomad_control_ori_file, sep='\t',  dtype=str,
    usecols=[0, 1, 2, 3, 4, 37, 38, 73, 74, 107, 108, 141, 142, 175, 176, 241]
)
gnomad = gnomad[
    (gnomad['FILTER'] == 'PASS') & gnomad['svtype'].isin(['DEL', 'DUP'])
]
gnomad = gnomad[
    (gnomad['N_BI_GENOS'].astype(int) >= 1000) |
    (gnomad['AFR_N_BI_GENOS'].astype(int) >= 1000) |
    (gnomad['AMR_N_BI_GENOS'].astype(int) >= 1000) |
    (gnomad['EAS_N_BI_GENOS'].astype(int) >= 1000) |
    (gnomad['EUR_N_BI_GENOS'].astype(int) >= 1000)
]
gnomad['#chrom'] = 'chr' + gnomad['#chrom']
gnomad['start'] = gnomad['start'].astype(int)
gnomad['end'] = gnomad['end'].astype(int)

In [None]:
gnomad.head()

In [None]:
with Pool(processes=7) as pool:
    gnomad['genes'] = pool.starmap(fetch_gene, (
        (gene, row['#chrom'], row['start'], row['end']) for _, row in gnomad.iterrows()
    ), chunksize=70)

In [None]:
# gnomad['genes'] = pool.starmap(fetch_gene, (
#         (gene, row['#chrom'], row['start'], row['end']) for _, row in gnomad.iterrows()
#     ), chunksize=70)
gnomad['gene'] = gnomad.apply(lambda row: fetch_gene(gene, row['#chrom'], row['start'], row['end']), axis=1)

In [None]:
gnomad.head()

In [None]:
gnomad[
    gnomad['svtype'] == 'DEL'
][['#chrom', 'start', 'end', 'name', 'gene', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF']].rename(columns={
    'AF': 'af', 'AFR_AF': 'af_afr', 'AMR_AF': 'af_amr', 'EAS_AF': 'af_eas', 'EUR_AF': 'af_eur'
}).to_csv(gnomad_del_file, sep='\t', index=False)

In [None]:
!bgzip -cf {d} > {gnomad_del_file}.gz

In [None]:
!tabix -fp bed {gnomad_del_file}.gz
!rm {gnomad_del_file}

In [None]:
gnomad.loc[
    gnomad['svtype'] == 'DUP',
    ['#chrom', 'start', 'end', 'name', 'genes', 'AF', 'AFR_AF', 'AMR_AF', 'EAS_AF', 'EUR_AF']
].rename(columns={
    'AF': 'af', 'AFR_AF': 'af_afr', 'AMR_AF': 'af_amr', 'EAS_AF': 'af_eas', 'EUR_AF': 'af_eur'
}).to_csv(gnomad_dup_file, sep='\t', index=False)

In [None]:
!bgzip -cf {gnomad_dup_file} > {gnomad_dup_file}.gz

In [None]:
!tabix -fp bed {gnomad_dup_file}.gz
!rm {gnomad_dup_file}

In [None]:
!tabix -fp bed {gnomad_dup_file}.gz
!rm {gnomad_dup_file}