In [1]:
import numpy as np
import pandas as pd

In [None]:
def extract_elof_genes(row):
    """Return True when a gene row passes all three constraint cut-offs.

    The row (any mapping-like object indexable by column name) must satisfy
    ``pLI > 0.9``, ``oe_lof_upper < 0.35`` and ``pHaplo > 0.86``. The checks
    run in that order and stop at the first failure, so later columns are
    only read when the earlier ones pass. A NaN in any column compares as
    False and therefore yields False.
    """
    if not row['pLI'] > 0.9:
        return False
    if not row['oe_lof_upper'] < 0.35:
        return False
    return bool(row['pHaplo'] > 0.86)

def extract_g2pLoF_genes(x):
    """Flag a G2P 'mutation consequence' value that indicates loss of function.

    Parameters
    ----------
    x : str or missing value
        One cell of the 'mutation consequence' column. Empty CSV fields
        arrive as NaN (a float), which does not support the ``in`` operator.

    Returns
    -------
    str
        'PASS' when the text contains 'absent gene product', otherwise '.'.
        Missing values (NaN, None, pd.NA, ...) are treated as "no match" and
        return '.' instead of raising TypeError.
    """
    try:
        is_lof = 'absent gene product' in x
    except TypeError:
        # x is not a container/string (e.g. NaN from an empty cell)
        return '.'
    return 'PASS' if is_lof else '.'

def extract_definitive(x):
    """Return 'PASS' for the confidence category 'definitive', else '.'."""
    return 'PASS' if x == 'definitive' else '.'

def extract_strong(x):
    """Return 'PASS' for confidence 'definitive' or 'strong', else '.'."""
    accepted = ('definitive', 'strong')
    # Compare in the same order as the tier list; stops at the first match.
    return 'PASS' if any(x == level for level in accepted) else '.'

def extract_moderate(x):
    """Return 'PASS' for confidence 'definitive', 'strong' or 'moderate', else '.'."""
    accepted = ('definitive', 'strong', 'moderate')
    # Compare in the same order as the tier list; stops at the first match.
    return 'PASS' if any(x == level for level in accepted) else '.'

In [16]:
# Curation data
# All paths below are relative, so the files are resolved against the
# notebook's current working directory.
ajhg_path = 'AJHG_ClinGenLoF_genes.txt.gz'
hi_path = 'ClinGen_haploinsufficiency_gene_GRCh37.bed'
gnomad_path = 'gnomad.v2.1.1.lof_metrics.by_gene.txt.gz'
phaplo_path = 'Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz'

# G2P data
# One CSV per G2P panel; each is read into its own DataFrame further down.
dd = './DDG2P.csv.gz'
eye = './EyeG2P.csv.gz'
skin = './SkinG2P.csv.gz'
cancer = './CancerG2P.csv.gz'
cardiac = './CardiacG2P.csv.gz'
skeletal = './SkeletalG2P.csv.gz'

# Validation set
am_path = 'AlphaMissense_gene_hg19.tsv.gz'
genovo_path = 'GenovoLOEUF.txt.gz'
gnocchi_path = 'EnhancerGnocchi.txt.gz'

# Load data
# AJHG ClinGen LoF list: the first line is skipped and the five columns are
# named explicitly; only gene and RefSeq are kept.
ajhg = pd.read_csv(
    ajhg_path,
    sep='\t',
    header=None,
    skiprows=1,
    names=['gene', 'pLI', 'RefSeq', 'SpliceAcceptor', 'SpliceDonor'],
    usecols=['gene', 'RefSeq'],
)

# gnomAD per-gene LoF metrics: column names come from the file header.
gnomad = pd.read_csv(
    gnomad_path,
    sep='\t',
    compression='gzip',
    usecols=['gene', 'pLI', 'oe_lof_upper', 'transcript', 'cds_length'],
)

# ClinGen haploinsufficiency BED: first line skipped, columns named here.
higene = pd.read_csv(
    hi_path,
    sep='\t',
    header=None,
    skiprows=1,
    names=['CHROM', 'start', 'end', 'gene', 'HIscore'],
    usecols=['gene', 'HIscore'],
)

# Dosage-sensitivity scores: only pHaplo is kept (pTriplo is dropped).
phaplo = pd.read_csv(
    phaplo_path,
    sep='\t',
    header=None,
    compression='gzip',
    skiprows=1,
    names=['gene', 'pHaplo', 'pTriplo'],
    usecols=['gene', 'pHaplo'],
)

genovo = pd.read_csv(
    genovo_path,
    sep='\t',
    compression='gzip',
    usecols=['enstID', 'enstVersion', 'genovo_LOEUF_corrected'],
)

gnocchi = pd.read_csv(
    gnocchi_path,
    sep='\t',
    compression='gzip',
    usecols=['gene', 'enhancer_Gnocchi'],
)

# AlphaMissense: skip the three leading lines, then split the versioned
# transcript_id ("<id>.<version>") into two string columns and keep only the
# columns used downstream. `version` stays a string because it comes from
# str.split.
am = (
    pd.read_csv(am_path, sep='\t', compression='gzip', skiprows=3)
    .assign(
        enstID=lambda tbl: tbl['transcript_id'].str.split('.').str[0],
        version=lambda tbl: tbl['transcript_id'].str.split('.').str[1],
    )
    [['enstID', 'version', 'mean_am_pathogenicity']]
)

# G2P panels: every CSV is read with the same options, in the order
# DD, Eye, Skin, Cancer, Cardiac, Skeletal. `dfs` holds the same six
# DataFrame objects that are bound to the individual names below.
dfs = [
    pd.read_csv(panel_csv, sep=',', header=0, compression='infer')
    for panel_csv in (dd, eye, skin, cancer, cardiac, skeletal)
]
dddf, eyedf, skidf, candf, cardf, skedf = dfs


# Maps the space-separated G2P CSV headers to snake_case column names.
# Note that 'gene symbol' becomes plain 'gene', matching the key column used
# by the other tables.
rename_cols = {
    'gene symbol': 'gene',
    'prev symbols': 'prev_symbols', 
    'hgnc id': 'hgnc_id',
    'gene mim': 'gene_mim', 
    'mutation consequence': 'mutation_consequence',
    'confidence category': 'confidence_category',
    'allelic requirement': 'allelic_requirement', 
    'disease name': 'disease_name'
    }

# Columns kept from each renamed G2P panel. The last four (g2pLoF and the
# *FILTER flags) are not in the CSVs; they are added to the frames before the
# rename. 'gene_mim' is renamed above but not kept here.
# NOTE(review): 'panel' has no entry in rename_cols, so it is presumably
# already present under that exact name in every CSV; confirm.
pickup_cols = [
    'gene', 'prev_symbols', 'hgnc_id',  
    'mutation_consequence', 'confidence_category', 
    'allelic_requirement', 'panel', 'disease_name', 
    'g2pLoF', 'definitiveFILTER', 'strongFILTER','moderateFILTER'
    ]



In [None]:
# Flag every row of each G2P panel. This runs before the rename, so the
# original space-separated CSV headers are used as input columns.
#   g2pLoF           : 'PASS' when the mutation consequence mentions
#                      'absent gene product'
#   definitiveFILTER : 'PASS' for confidence 'definitive'
#   strongFILTER     : 'PASS' for 'definitive' or 'strong'
#   moderateFILTER   : 'PASS' for 'definitive', 'strong' or 'moderate'
for df in dfs:
    df['g2pLoF'] = df['mutation consequence'].apply(extract_g2pLoF_genes)
    df['definitiveFILTER'] = df['confidence category'].apply(extract_definitive)
    df['strongFILTER'] = df['confidence category'].apply(extract_strong)
    df['moderateFILTER'] = df['confidence category'].apply(extract_moderate)

### Rename columns and extract columns for analysis
# Derived from the raw frames in `dfs` rather than from the previously bound
# names, so re-running this cell always starts from freshly flagged data.
dddf, eyedf, skidf, candf, cardf, skedf = [
    panel.rename(columns=rename_cols)[pickup_cols] for panel in dfs
]

### Extract LoF genes
lof_dd, lof_eye, lof_ski, lof_can, lof_car, lof_ske = [
    panel[panel['g2pLoF'] == 'PASS']
    for panel in (dddf, eyedf, skidf, candf, cardf, skedf)
]

### Concatenate dataframes
# Missing values are replaced with '.' to match the '.' used by the flag
# columns; fillna returns a new frame, so no in-place mutation is needed.
alldf = pd.concat(
    [lof_dd, lof_eye, lof_ski, lof_can, lof_car, lof_ske]
).fillna(value='.')

### Extract genes with strong evidence
# Keep rows whose confidence is 'definitive' or 'strong', then keep the first
# row per gene (a gene can appear in several panels).
alldf_filterd = alldf[alldf['strongFILTER'] == 'PASS']
alldf_filterd = alldf_filterd.drop_duplicates(subset='gene')

# Gene lists from 4 sources
ajhg_genes = ajhg['gene']
dcpr_genes = alldf_filterd['gene']
# The ClinGen haploinsufficiency table is bound to `higene`; there is no
# DataFrame called `hi` in this notebook.
hi_genes = higene['gene']
phaplo_genes = phaplo['gene']

In [17]:
# Curation set
# For each table: banner, first three rows, number of distinct genes.
print('Curation set')
for banner, frame in (
    ('--- AJHG ---', ajhg),
    ('\n--- gnomAD ---', gnomad),
    ('\n--- HI ---', higene),
    ('\n--- Phaplo ---', phaplo),
):
    print(banner)
    print(frame.head(3))
    print(frame['gene'].nunique())


Curation set
--- AJHG ---
   gene       RefSeq
0  AAAS  NM_015665.5
1  AARS  NM_001605.2
2  AASS  NM_005763.3
1043

--- gnomAD ---
    gene       transcript  pLI  oe_lof_upper  cds_length
0  MED13  ENST00000397786  1.0         0.030        6522
1  NIPBL  ENST00000282516  1.0         0.032        8412
2   SMC3  ENST00000361804  1.0         0.037        3651
19658

--- HI ---
      gene  HIscore
0     AGRN       30
1  B3GALT6       30
2     GNB1        1
1500

--- Phaplo ---
      gene    pHaplo
0  CACNA1C  0.998982
1   ZNF462  1.000000
2     CHD8  0.991650
18641


In [19]:
# Validation set
# For each table: banner, first three rows, number of distinct keys
# (transcript ID for AlphaMissense/Genovo, gene symbol for Gnocchi).
print('Validation set')
for banner, frame, key in (
    ('--- AlphaMissense ---', am, 'enstID'),
    ('\n--- Gnocchi ---', gnocchi, 'gene'),
    ('\n--- Genovo ---', genovo, 'enstID'),
):
    print(banner)
    print(frame.head(3))
    print(frame[key].nunique())

Validation set
--- AlphaMissense ---
            enstID version  mean_am_pathogenicity
0  ENST00000000233       5               0.742270
1  ENST00000000412       3               0.378343
2  ENST00000001008       4               0.422290
18761

--- Gnocchi ---
     gene  enhancer_Gnocchi
0   NOC2L          1.509945
1  SAMD11          1.579239
2  RNF223          2.725941
15541

--- Genovo ---
            enstID  enstVersion  genovo_LOEUF_corrected
0  ENST00000394484            1                0.000000
1  ENST00000329516            3                0.039973
2  ENST00000330331            5                0.071441
57308


In [21]:
# Inner-join AlphaMissense and Genovo on the unversioned transcript ID.
# The two version columns ('version' from AlphaMissense, 'enstVersion' from
# Genovo) are carried through side by side; they are not part of the join key.
am_genovo = am.merge(genovo, how='inner', on='enstID')
print('\n--- AlphaMissense + Genovo ---')
print(am_genovo.head(3))
print(am_genovo['enstID'].nunique())



--- AlphaMissense + Genovo ---
            enstID version  mean_am_pathogenicity  enstVersion  \
0  ENST00000000233       5               0.742270            5   
1  ENST00000000412       3               0.378343            3   
2  ENST00000001008       4               0.422290            4   

   genovo_LOEUF_corrected  
0                0.109528  
1                0.787049  
2                0.524644  
16355


In [73]:
# Genes that have a row for the 'Merged' tissue.
# NOTE(review): this needs a 'tissue' column, but the loading cell reads
# `gnocchi` with usecols=['gene', 'enhancer_Gnocchi'] only. Presumably
# `gnocchi` was rebound to a per-tissue table in a cell that is not part of
# this notebook any more; confirm before relying on Restart & Run All.
merged_gnocchi = gnocchi.loc[gnocchi['tissue'].eq('Merged')]
genelist = list(merged_gnocchi['gene'])

In [76]:
# Every distinct gene symbol in the table, in order of first appearance.
raw_genelist = gnocchi['gene'].unique().tolist()

In [78]:
# Comparison of gene lists: genes taken from the 'Merged' tissue rows
# (genelist) vs. every distinct gene in the table (raw_genelist)
print(len(genelist))
print(len(raw_genelist))

# Subtraction of gene lists: genes present in raw_genelist but missing from
# genelist. The result goes through a set, so its element order is arbitrary.
diff = list(set(raw_genelist) - set(genelist))
print(diff)

15541
15860
['DCAF4L2', 'TAS2R20', 'ADAM8', 'SPTLC1', 'CALML5', 'USP17L17', 'C11orf94', 'CNOT3', 'NRG1', 'SPHAR', 'DDX11', 'C3orf56', 'ZNF382', 'SLC25A11', 'ZDHHC24', 'OR2M5', 'OR1N1', 'OR2T8', 'PAGR1', 'KRTAP12-1', 'SUMO4', 'CSF3', 'LCE6A', 'SPATA12', 'CCDC15', 'AGBL3', 'IFNA4', 'TAS2R42', 'LCE4A', 'HUS1B', 'CTAGE4', 'GPR84', 'ZNF497', 'COPS6', 'OR14A16', 'CEMP1', 'DIRAS3', 'TAS2R31', 'FAM170A', 'STH', 'TAS2R14', 'ZNF860', 'DNPH1', 'CCAR2', 'NR2E3', 'CTAGE8', 'IFNA8', 'FITM1', 'HIST2H3C', 'LENG9', 'OR7C1', 'UFC1', 'TIAF1', 'IFNB1', 'TYSND1', 'DVL2', 'KMT2B', 'CHRNB1', 'CARD6', 'OGDHL', 'HIST2H4B', 'OR14K1', 'IFNA21', 'TAS2R1', 'CCT6A', 'SYTL1', 'SRRM5', 'FPR1', 'HRCT1', 'MRGPRD', 'KIR3DX1', 'PTPN6', 'MANEAL', 'TAF1C', 'PTTG2', 'FAM218A', 'NDUFB10', 'TAS2R60', 'VN1R1', 'KRTAP4-3', 'CCDC177', 'KRTAP21-2', 'PM20D1', 'FAM71D', 'KRTDAP', 'PHGR1', 'MRGPRX1', 'PSAPL1', 'TRIM73', 'LRRN4CL', 'MRGPRX2', 'FPR3', 'ANAPC15', 'BLID', 'FRMD1', 'LCN6', 'NDUFA7', 'NPIPB5', 'PARP2', 'CFC1B', 'ZNF219', 

In [84]:
# All rows for the gene ASCL2; the bare name on the last line displays them.
ascl2 = gnocchi.loc[gnocchi['gene'].eq('ASCL2')]
ascl2

Unnamed: 0,tissue,gene,ENSG,z
2126,ESC,ASCL2,ENSG00000183734,5.063194
31916,iPSC,ASCL2,ENSG00000183734,5.063194
47167,Blood_T-cell,ASCL2,ENSG00000183734,6.084149
47168,Digestive,ASCL2,ENSG00000183734,6.084149
67741,HSC_B-cell,ASCL2,ENSG00000183734,6.084149
108892,Epithelial,ASCL2,ENSG00000183734,5.521794
121319,Thymus,ASCL2,ENSG00000183734,6.084149
182268,PLACENTA,ASCL2,ENSG00000183734,5.521794
192625,PANCREAS,ASCL2,ENSG00000183734,5.569333
199282,LUNG,ASCL2,ENSG00000183734,6.084149


In [83]:
# Mean z over the ASCL2 rows, leaving out any row whose tissue is 'Merged'.
ascl2_without_merged = ascl2.loc[ascl2['tissue'].ne('Merged')]
mean_ascl2 = ascl2_without_merged['z'].mean()

print(mean_ascl2)

5.64469795825
