### Feature annotation of inactive genes

In [1]:
import pandas as pd 
import pysam
import numpy as np
from pybedtools import BedTool
from io import StringIO
from functools import reduce
import openpyxl
from pathlib import Path

In [2]:
# Project root: every input and output path below is resolved relative to it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

### inputs
# list of inactive gene IDs
inactive_genes_path = wkdir_path / "2_misexp_qc/misexp_gene_cov_corr/gene_id_post_tech_cov_qc_8650.txt"
# TPM expression matrix
tpm_mtx_path = wkdir_path / "1_rna_seq_qc/tpm_mtx/tpm_4568samples_59144genes_smpl_qc.csv"
# GENCODE annotation (.gtf, bgzipped)
gencode_path = wkdir_path / "reference/gencode/gencode.v31.annotation.sorted.gtf.gz"
# gnomAD constraint metrics, one row per transcript
gnomad_constraint_path = wkdir_path / "reference/gnomad/gnomad.v2.1.1.lof_metrics.by_transcript.txt"
# pHaplo and pTriplo scores
ptriplo_phaplo_path = wkdir_path / "reference/phaplo_ptriplo/1-s2.0-S0092867422007887-mmc7.xlsx"
# GERP conserved elements
gerp_elem_path = wkdir_path / "reference/conservation/gerp/gerpElements_hg38_multiz120Mammals.bed"
# Enhancer domain scores (EDS)
eds_path = wkdir_path / "reference/eds/1-s2.0-S0002929720300124-mmc2.xlsx"
# Episcore
episcore_path = wkdir_path / "reference/episcore/41467_2018_4552_MOESM5_ESM.xlsx"
# GTEx genes passing eQTL QC
gtex_pass_eqtl_count_path = wkdir_path / "1_rna_seq_qc/gtex_pass_eqtl_qc/gene_pass_eqtl_count.csv"
# GTEx median gene TPM per tissue
gtex_median_tpm_path = wkdir_path / "reference/gtex/median_tpm/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.tsv"
# GM12878 A/B compartments
gm12878_ab_compartments_path = wkdir_path / "reference/4d_nucleome/gm12878_hi_c/compartments/4DNFILYQ1PAY.bg"
# gnomAD gene sets
gnomad_gene_sets_path = wkdir_path / "reference/gnomad/supplement_2020/supplement/supplementary_dataset_13_gene_lists.tsv.gz"
# COSMIC Cancer Gene Census
cosmic_genes_path = wkdir_path / "reference/cosmic_v97/cancer_gene_census.csv"
# OMIM gene map
omim_genes_path = wkdir_path / "reference/omim/genemap2.txt"
# TAD boundaries (shared boundary set)
# NOTE(review): the file name lists imr90/huvec/hnek/hmec rather than GM12878 - confirm which cell types are covered
tad_boundaries_path = wkdir_path / "reference/4d_nucleome/shared_boundaries/4DNFIVK5JOFU_imr90_huvec_hnek_hmec.bed"
# Open Targets approved drug targets
approved_drug_targets_path = wkdir_path / "reference/opentargets/targets/open_targets_approved_drugs.txt"
# Decipher (DDG2P) genes
decipher_genes_path = wkdir_path / "reference/decipher/DDG2P.csv.gz"
# chromHMM state overlap per gene (PBMCs)
chromhmm_fraction_path = wkdir_path / "2_misexp_qc/chrom_hmm/gene_chromhmm_state_overlap.csv"
# phyloP over the gene body
phylop_gene_body_path = wkdir_path / "3_misexp_genes/phylop/inactive_gene_phylop_gene_body.tsv"

### outputs
# output directory, created if it does not exist yet
out_dir_path = wkdir_path / "3_misexp_genes"
out_dir_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Inactive gene IDs: header-less file, column 0 holds the gene ID
inactive_gene_id_df = pd.read_csv(inactive_genes_path, sep="\t", header=None)
inactive_gene_id_df = inactive_gene_id_df.rename(columns={0: "gene_id"})
inactive_gene_id_set = set(inactive_gene_id_df["gene_id"].unique())
print(f"Number of inactive genes: {len(inactive_gene_id_set)}")
gene_features_df = inactive_gene_id_df.copy()

# Row-wise median TPM per gene; genes with median TPM > 0.5 are treated as active
tpm_mtx_df = pd.read_csv(tpm_mtx_path)
tpm_mtx_idx_df = tpm_mtx_df.set_index("gene_id")
tpm_median_df = tpm_mtx_idx_df.median(axis=1).to_frame("median_tpm").reset_index()
is_median_tpm_above_05 = tpm_median_df["median_tpm"] > 0.5
intrvl_median_tpm05_gene_ids = tpm_median_df.loc[is_median_tpm_above_05, "gene_id"].tolist()
print(f"INTERVAL, Number of gene IDs with median TPM > 0.5: {len(intrvl_median_tpm05_gene_ids)}")

# Sanity check: the active and inactive gene sets are expected to be disjoint
overlap_active_inactive = inactive_gene_id_set.intersection(intrvl_median_tpm05_gene_ids)
print(f"Overlap between median TPM > 0.5 and inactive gene set: {len(overlap_active_inactive)}")

Number of inactive genes: 8650
INTERVAL, Number of gene IDs with median TPM > 0.5: 17418
Overlap between median TPM > 0.5 and inactive gene set: 0


In [4]:
# Write three BED files from the GENCODE "gene" records - all genes, inactive genes and
# active genes (median TPM > 0.5) - and collect gene name and gene length for every inactive gene.
inactive_gene_info_dict = {}
bed_file_dir = out_dir_path.joinpath("bed_files")
bed_file_dir.mkdir(parents=True, exist_ok=True)
inactive_genes_bed_path = bed_file_dir.joinpath("inactive_genes.bed")
active_genes_bed_path = bed_file_dir.joinpath("genes_median_tpm_grtr0.5.bed")
all_genes_bed_path = bed_file_dir.joinpath("all_genes.bed")

# Set lookup: the membership test below runs once per gene record in the GTF,
# so testing against the original list would scan it for every gene.
active_gene_id_set = set(intrvl_median_tpm05_gene_ids)

gencode_tabix = pysam.TabixFile(str(gencode_path))
try:
    with open(inactive_genes_bed_path, "w") as f_inactive, open(active_genes_bed_path, "w") as f_active, open(all_genes_bed_path, "w") as f_all:
        for gtf in gencode_tabix.fetch(parser=pysam.asGTF()):
            if gtf.feature != "gene":
                continue
            # drop the version suffix so IDs match the un-versioned gene lists
            gene_id = gtf.gene_id.split(".")[0]
            chrom, start, end, strand = gtf.contig, gtf.start, gtf.end, gtf.strand
            bed_line = f"{chrom}\t{start}\t{end}\t{gene_id}\t0\t{strand}\n"
            # every gene goes to the all-genes bed file
            f_all.write(bed_line)
            if gene_id in inactive_gene_id_set:
                gene_len = end - start
                inactive_gene_info_dict[gene_id] = [gtf.gene_name, gene_len]
                # inactive genes are additionally written to their own bed file
                f_inactive.write(bed_line)
            if gene_id in active_gene_id_set:
                # active genes are additionally written to their own bed file
                f_active.write(bed_line)
finally:
    # release the tabix file handle even if writing fails
    gencode_tabix.close()

# Every inactive gene must have been found in the annotation.
# (The original message referenced an undefined name and raised NameError instead.)
missing_inactive_gene_ids = inactive_gene_id_set - set(inactive_gene_info_dict.keys())
if set(inactive_gene_info_dict.keys()) != inactive_gene_id_set:
    raise ValueError(f"Inactive genes missing from input gencode: {missing_inactive_gene_ids}")

In [5]:
### genomic features

## gene length
# one row per inactive gene: gene_id, gene_name, gene_length (end - start from the annotation)
inactive_gene_info_df = pd.DataFrame.from_dict(inactive_gene_info_dict, orient="index", columns=["gene_name", "gene_length"])
inactive_gene_info_df = inactive_gene_info_df.reset_index().rename(columns={"index": "gene_id"})

## distance to closest gene
# load bed files
inactive_genes_bed = BedTool(inactive_genes_bed_path)
all_genes_bed = BedTool(all_genes_bed_path)

# get closest gene distance, -d reports distance, -N requires different names for query and closest
gene_gene_distance_str = StringIO(str(inactive_genes_bed.closest(all_genes_bed, d=True, N=True)))
# column layout of the closest output: 6 bed columns for the query, 6 for the hit, then the distance
# (also reused for the active-gene distance further down)
gene_gene_dist_cols = {0:"chrom_a", 1:"start_a", 2:"end_a", 3:"gene_id_a", 4:"score", 5:"strand",
                       6:"chrom_b", 7:"start_b", 8:"end_b", 9:"gene_id_b", 10:"score", 11:"strand",
                       12:"gene_distance_min"
                      }
gene_gene_distance_df = pd.read_csv(gene_gene_distance_str, sep="\t", header=None).rename(columns=gene_gene_dist_cols)
# minimum distance per inactive gene; the value column keeps the name "gene_distance_min"
# (the original also tried to rename a non-existent "distance" column, which had no effect)
inactive_gene_distance_df = (
    gene_gene_distance_df.groupby("gene_id_a")["gene_distance_min"].min().reset_index()
    .rename(columns={"gene_id_a": "gene_id"})
)

# check no identical gene overlaps
identical_gene_overlap = gene_gene_distance_df[gene_gene_distance_df["gene_id_a"] == gene_gene_distance_df["gene_id_b"]].shape[0]
if identical_gene_overlap != 0:
    raise ValueError("Identical genes found in dataframe in closest gene dataframe.")

## gene density
# Count number of genes in +/-1Mb window
gene_density_window = 1000000
gene_gene_density_str = StringIO(str(inactive_genes_bed.window(all_genes_bed, w=gene_density_window)))
gene_gene_density_cols = {0:"chrom_a", 1:"start_a", 2:"end_a", 3:"gene_id_a", 4:"score", 5:"strand",
                       6:"chrom_b", 7:"start_b", 8:"end_b", 9:"gene_id_b", 10:"score", 11:"strand"}
gene_gene_density_df = pd.read_csv(gene_gene_density_str, sep="\t", header=None).rename(columns=gene_gene_density_cols)
# remove the query gene itself from its own window
gene_gene_density_df = gene_gene_density_df[gene_gene_density_df.gene_id_a != gene_gene_density_df.gene_id_b]

# count unique genes in window (avoid duplicate gene IDs)
inactive_gene_density_df = (
    gene_gene_density_df.groupby("gene_id_a")["gene_id_b"].nunique().reset_index()
    .rename(columns={"gene_id_a": "gene_id", "gene_id_b": "gene_count_1Mb"})
)
# fill in missing genes: a gene with no other gene in its window has a count of 0.
# Without this it is absent from the table and would be dropped by the inner merge of all
# features (the active-gene density is zero-filled in the same way).
inactive_gene_density_df = (
    pd.merge(inactive_gene_info_df[["gene_id"]], inactive_gene_density_df, on="gene_id", how="left")
    .fillna({"gene_count_1Mb": 0})
    .astype({"gene_count_1Mb": int})
)

In [6]:
### conservation and constraint

## constraint
# gnomAD: transcript-level metrics, restricted to rows flagged as canonical
gnomad_constraint_df = pd.read_csv(gnomad_constraint_path, sep="\t")
gnomad_constraint_canon_df = gnomad_constraint_df[gnomad_constraint_df["canonical"]]
print(f"Number of canonical transcripts with gnomAD metrics: {len(gnomad_constraint_canon_df)}")
gnomad_metric_cols = ["gene_id", "pLI", "pNull", "pRec", "oe_lof_upper", "oe_mis_upper"]
gnomad_constraint_canon_metrics_df = gnomad_constraint_canon_df[gnomad_metric_cols]
# left join: every inactive gene is kept, genes without gnomAD metrics get NaN
inactive_gene_gnomad_df = inactive_gene_info_df[["gene_id"]].merge(
    gnomad_constraint_canon_metrics_df, on="gene_id", how="left"
)
inactive_gene_with_gnomad_info = int(inactive_gene_gnomad_df["oe_lof_upper"].notna().sum())
print(f"Inactive genes with gnomAD metrics: {inactive_gene_with_gnomad_info}")

## pHaplo, pTriplo (Collins et al., 2022)
# this table is keyed by gene name, so it is joined on gene_name rather than gene_id
ptriplo_phaplo_full_df = pd.read_excel(ptriplo_phaplo_path).rename(columns={"Gene": "gene_name"})
print(f"Number of genes with pHaplo and pTriplo data: {len(ptriplo_phaplo_full_df)}")
ptriplo_phaplo_df = ptriplo_phaplo_full_df[["gene_name", "pHaplo", "pTriplo"]]
ptriplo_phaplo_gene_id_df = inactive_gene_info_df[["gene_id", "gene_name"]].merge(
    ptriplo_phaplo_df, on="gene_name", how="left"
)
inactive_gene_with_phalpo_ptriplo_info = int(ptriplo_phaplo_gene_id_df["pHaplo"].notna().sum())
print(f"Inactive genes with pHaplo and pTriplo data: {inactive_gene_with_phalpo_ptriplo_info}")

## EDS (enhancer domain score)
# NOTE(review): the "GeneSymbol" column is joined against gene_id, so it presumably holds
# Ensembl gene IDs rather than symbols - confirm against the spreadsheet
eds_df = pd.read_excel(eds_path).rename(columns={"GeneSymbol": "gene_id"})
# EDS plus the enhancer counts from proximity- and activity-based linking
eds_columns_to_keep = [
    'gene_id', 'EDS',
    'ActivityLinking_Conserved_nt_count', 'ActivityLinking_nt_count',
    'ProximityLinking_Conserved_nt_count', 'ProximityLinking_nt_count',
    'ActivityLinking_EnhancerNumber', 'ActivityLinking_NumberConservedElements',
    'ProximityLinking_EnhancerNumber', 'ProximityLinking_NumberConservedElements',
]
gene_features_eds_df = gene_features_df.merge(eds_df, on="gene_id", how="left")
gene_features_eds_df = gene_features_eds_df[eds_columns_to_keep].drop_duplicates()
inactive_genes_with_eds = int(gene_features_eds_df["EDS"].notna().sum())
print(f"Number of inactive genes with EDS: {inactive_genes_with_eds}")

## Episcore
episcore_df = pd.read_excel(episcore_path)
# the first data row of the sheet holds the real column names: promote it to the header
header = episcore_df.iloc[0]
episcore_df = episcore_df[1:]
episcore_df.columns = header
episcore_clean_df = episcore_df[["EnsembleID", "Episcore"]].rename(columns={"EnsembleID": "gene_id"})
# annotate the inactive gene set
gene_features_episcore_df = inactive_gene_info_df[["gene_id"]].merge(
    episcore_clean_df, on="gene_id", how="left"
).drop_duplicates()
# scores were read alongside the header row, so cast the column to float explicitly
gene_features_episcore_df = gene_features_episcore_df.astype({"Episcore": float})
inactive_gene_with_episcore = int(gene_features_episcore_df["Episcore"].notna().sum())
print(f"Number of genes with episcore: {inactive_gene_with_episcore}")

Number of canonical transcripts with gnomAD metrics: 19704
Inactive genes with gnomAD metrics: 2934
Number of genes with pHaplo and pTriplo data: 18641
Inactive genes with pHaplo and pTriplo data: 2850


  for idx, row in parser.parse():


Number of inactive genes with EDS: 3140
Number of genes with episcore: 3092


In [7]:
## conservation
# GERP conserved elements per base pair within +/-10kb of each gene
gerp_elem_window = 10000
gerp_elem_bed = BedTool(gerp_elem_path)

gerp_elem_intersect_str = StringIO(str(inactive_genes_bed.window(gerp_elem_bed, w=gerp_elem_window)))
# 6 bed columns for the gene followed by the first 4 columns of the element record
gerp_elem_intersect_cols = dict(enumerate([
    "chrom_gene", "start_gene", "end_gene", "gene_id", "score", "strand",
    "chrom_elem", "start_elem", "end_elem", "elem_id",
]))
gerp_elem_intersect_df = pd.read_csv(gerp_elem_intersect_str, sep="\t", header=None)
gerp_elem_intersect_df = gerp_elem_intersect_df.rename(columns=gerp_elem_intersect_cols)
# number of elements found in each gene's window
gerp_elem_count_df = gerp_elem_intersect_df.groupby("gene_id", as_index=False)["elem_id"].count()
# genes without any element in their window are absent above: add them back with a count of 0
gerp_elem_count_df = (
    inactive_gene_info_df[["gene_id", "gene_length"]]
    .merge(gerp_elem_count_df, on="gene_id", how="left")
    .fillna(0)
    .rename(columns={"elem_id": "gerp_element_count"})
)
# normalise by the searched span: gene body plus the flanking window on both sides
gerp_elem_count_df["window_length"] = gerp_elem_count_df["gene_length"] + 2 * gerp_elem_window
gerp_elem_count_df["gerp_element_per_bp"] = (
    gerp_elem_count_df["gerp_element_count"] / gerp_elem_count_df["window_length"]
)
gerp_elem_count_per_bp_df = gerp_elem_count_df[["gene_id", "gerp_element_per_bp"]]

## mean 100-way phyloP score gene body
# read from a precomputed table; only the per-gene mean is kept
phylop_gene_body_df = pd.read_csv(phylop_gene_body_path, sep="\t")
gene_mean_phylop_df = phylop_gene_body_df[["gene_id", "phylop_mean"]]

In [8]:
### Gene expression features
gene_features_expression_df = inactive_gene_info_df[["gene_id"]].copy()

# GTEx median TPM per tissue, indexed by un-versioned gene ID
gtex_median_tpm_tissue_trunc_df = (
    pd.read_csv(gtex_median_tpm_path, sep="\t")
    .drop(columns=["Description"])
    .rename(columns={"Name": "gene_id"})
)
gtex_median_tpm_tissue_trunc_df["gene_id"] = gtex_median_tpm_tissue_trunc_df["gene_id"].str.split(".").str[0]
gtex_median_tpm_tissue_trunc_df = gtex_median_tpm_tissue_trunc_df.set_index("gene_id")

## number of tissues gene is expressed
# the two cell-line columns are excluded before counting
tissues_to_drop = ['Cells - Cultured fibroblasts', 'Cells - EBV-transformed lymphocytes']
gtex_median_tpm_tissue_trunc_df = gtex_median_tpm_tissue_trunc_df.drop(columns=tissues_to_drop)
# per gene: number of remaining tissue columns with median TPM > 0.5
tissue_expression_count_df = (
    (gtex_median_tpm_tissue_trunc_df > 0.5).sum(axis=1).to_frame("tissue_expression").reset_index()
)
inactive_tissue_expression_df = inactive_gene_info_df[["gene_id"]].merge(
    tissue_expression_count_df, on="gene_id", how="left"
)

## expression by tissue
# group the tissue columns by organ: the organ is the part of the column name before " - "
gtex_organs_to_tissue_dict = {}
for column in gtex_median_tpm_tissue_trunc_df.columns:
    organ_name = column.split(" - ")[0]
    if organ_name not in gtex_organs_to_tissue_dict:
        gtex_organs_to_tissue_dict[organ_name] = []
    gtex_organs_to_tissue_dict[organ_name].append(column)
# remove blood and cells
for organ in ("Cells", "Whole Blood"):
    gtex_organs_to_tissue_dict.pop(organ, None)
# a gene counts as expressed in an organ if median TPM > 0.5 in any of its sub-tissues
for organ, tissues in gtex_organs_to_tissue_dict.items():
    gtex_tpm_organ_df = gtex_median_tpm_tissue_trunc_df[tissues].copy()
    # binary indicator column (1 = expressed), named after the lower-cased organ
    gene_tpm_binary_df = (gtex_tpm_organ_df > 0.5).any(axis=1).astype(int).to_frame(f"{organ.lower()}_expression")
    # gene_tpm_binary_df carries gene_id as its index name, which the merge key resolves against
    gene_features_expression_df = pd.merge(gene_features_expression_df, gene_tpm_binary_df, on="gene_id", how="left")

In [9]:
## distance to nearest active gene
active_genes_bed = BedTool(active_genes_bed_path)
# same closest call and column layout as for the closest-gene distance, but against active genes only
active_gene_distance_str = StringIO(str(inactive_genes_bed.closest(active_genes_bed, d=True, N=True)))
active_gene_distance_df = pd.read_csv(active_gene_distance_str, sep="\t", header=None)
active_gene_distance_df = active_gene_distance_df.rename(columns=gene_gene_dist_cols)
# minimum distance per inactive gene
active_gene_to_gene_distance_df = (
    active_gene_distance_df.groupby("gene_id_a")["gene_distance_min"].min().reset_index()
    .rename(columns={"gene_id_a": "gene_id", "gene_distance_min": "active_gene_distance"})
)

# check no identical gene overlaps
is_same_gene = active_gene_distance_df["gene_id_a"] == active_gene_distance_df["gene_id_b"]
identical_gene_overlap = active_gene_distance_df[is_same_gene].shape[0]
if identical_gene_overlap != 0:
    raise ValueError("Identical genes found in dataframe in closest gene dataframe.")
# subset to inactive genes
inactive_gene_active_distance_df = inactive_gene_info_df[["gene_id"]].merge(
    active_gene_to_gene_distance_df, on="gene_id", how="left"
)

## active gene density
# Count number of active genes in +/-1Mb window
active_gene_gene_density_str = StringIO(str(inactive_genes_bed.window(active_genes_bed, w=gene_density_window)))
active_gene_gene_density_df = pd.read_csv(active_gene_gene_density_str, sep="\t", header=None)
active_gene_gene_density_df = active_gene_gene_density_df.rename(columns=gene_gene_density_cols)
# remove rows pairing a gene with itself
active_gene_gene_density_df = active_gene_gene_density_df[
    active_gene_gene_density_df["gene_id_a"] != active_gene_gene_density_df["gene_id_b"]
]
# count unique genes in window (avoid duplicate gene IDs)
active_gene_density_df = (
    active_gene_gene_density_df.groupby("gene_id_a")["gene_id_b"].nunique().reset_index()
    .rename(columns={"gene_id_a": "gene_id", "gene_id_b": "active_gene_count_1Mb"})
)
# fill in missing genes: no active gene in the window means a count of 0
active_gene_density_all_genes_df = inactive_gene_info_df[["gene_id"]].merge(
    active_gene_density_df, on="gene_id", how="left"
).fillna(0)

In [10]:
### regulatory features

## fraction of gene body overlapping chromatin state in PBMCs
chromhmm_fraction_df = pd.read_csv(chromhmm_fraction_path)
# drop the version suffix so IDs match the un-versioned inactive gene IDs
chromhmm_fraction_df["gene_id"] = chromhmm_fraction_df.gene_id.str.split(".").str[0]
chromhmm_fraction_inactive_df = pd.merge(inactive_gene_info_df[["gene_id"]], chromhmm_fraction_df, on="gene_id", how="left")

chromhmm_15states = ['1_TssA', '2_TssAFlnk', '3_TxFlnk', '4_Tx', '5_TxWk', '6_EnhG', '7_Enh', '8_ZNF/Rpts', '9_Het',
                     '10_TssBiv', '11_BivFlnk', '12_EnhBiv', '13_ReprPC', '14_ReprPCWk', '15_Quies']
# "<n>_<State>" -> "gene_fraction_<State>", with "/" removed from the state name
chrom_col_replace = {col:f"gene_fraction_{col.split('_')[1].replace('/', '')}" for col in chromhmm_15states}
chromhmm_fraction_inactive_df = chromhmm_fraction_inactive_df.rename(columns=chrom_col_replace)

# fraction of gene body overlap A/B compartment in GM12878
inactive_gene_bed = BedTool(inactive_genes_bed_path)
gm12878_ab_bed = BedTool(gm12878_ab_compartments_path)
# wo=True: each gene/compartment pair is reported with the number of overlapping base pairs appended
inactive_gene_intersect_ab_str = StringIO(str(inactive_gene_bed.intersect(gm12878_ab_bed, wo=True)))
gene_ab_cols = {0:"chrom_gene", 1:"start_gene", 2:"end_gene", 3:"gene_id", 4:"score", 5:"strand",
                6: 'compartment_chrom', 7: 'compartment_start', 8: 'compartment_end',
                9: 'compartment_score', 10: 'overlap'}
inactive_gene_intersect_ab_df = pd.read_csv(inactive_gene_intersect_ab_str, sep="\t", header=None).rename(columns=gene_ab_cols)
# label compartment types
# A-compartment: score >= 0, B-compartment: score < 0, missing score: Unassigned
conditions = [(inactive_gene_intersect_ab_df.compartment_score >= 0) & (~inactive_gene_intersect_ab_df.compartment_score.isnull()),
              (inactive_gene_intersect_ab_df.compartment_score < 0) & (~inactive_gene_intersect_ab_df.compartment_score.isnull()),
              (inactive_gene_intersect_ab_df.compartment_score.isnull())]

values = ["A", "B", "Unassigned"]
# The three conditions cover every row, so the default is never selected. It is still given
# as a string because np.select's implicit default is the integer 0, which has no common
# dtype with the string labels (recent NumPy versions raise a TypeError for that).
inactive_gene_intersect_ab_df["compartment_type"] = np.select(conditions, values, default="Unassigned")

# total overlapping base pairs per gene and compartment type in a single grouped pass
# (replaces re-filtering the whole table once per gene and compartment)
gene_compartment_overlap_df = (
    inactive_gene_intersect_ab_df
    .groupby(["gene_id", "compartment_type"])["overlap"].sum()
    .unstack(fill_value=0)
    # guarantee all three columns exist even if a compartment type never occurs
    .reindex(columns=["A", "B", "Unassigned"], fill_value=0)
    .rename(columns={"A": "overlap_A", "B": "overlap_B", "Unassigned": "overlap_unassigned"})
    .rename_axis(columns=None)
    .reset_index()
)
# inner join: only genes that intersect at least one compartment interval are kept
gene_compartment_overlap_df = pd.merge(inactive_gene_info_df[["gene_id", "gene_length"]],
                                       gene_compartment_overlap_df,
                                       on="gene_id",
                                       how="inner"
                                      )
# convert overlapping base pairs to a fraction of the gene length
gene_compartment_overlap_df["fraction_A"] = gene_compartment_overlap_df.overlap_A / gene_compartment_overlap_df.gene_length
gene_compartment_overlap_df["fraction_B"] = gene_compartment_overlap_df.overlap_B / gene_compartment_overlap_df.gene_length
gene_compartment_overlap_df["fraction_unassigned"] = gene_compartment_overlap_df.overlap_unassigned / gene_compartment_overlap_df.gene_length
gene_compartment_overlap_df = gene_compartment_overlap_df[["gene_id", "fraction_A", "fraction_B", "fraction_unassigned"]]

In [11]:
## TAD boundary distance
tad_boundaries_bed = BedTool(tad_boundaries_path)
# closest boundary per gene; d=True appends the distance as the last column
inactive_gene_tad_distance_str = StringIO(str(inactive_gene_bed.closest(tad_boundaries_bed, d=True)))
# 6 bed columns for the gene, 5 columns for the boundary record, then the distance
gene_tad_distance_cols = dict(enumerate([
    "chrom", "start", "end", "gene_id", "score", "strand",
    "chrom_tad", "start_tad", "end_tad", "tad_strength", "tad_score",
    "tad_distance",
]))
inactive_gene_tad_distance_df = pd.read_csv(inactive_gene_tad_distance_str, sep="\t", header=None)
inactive_gene_tad_distance_df = inactive_gene_tad_distance_df.rename(columns=gene_tad_distance_cols)
# a gene can appear on several rows; keep its smallest distance
inactive_gene_tad_distance_min_df = inactive_gene_tad_distance_df.groupby("gene_id", as_index=False)["tad_distance"].min()

### Gene Sets 

In [12]:
# gnomAD gene lists (e.g. olfactory, autosomal dominant, haploinsufficient, autosomal recessive)
gnomad_gene_set_df = pd.read_csv(gnomad_gene_sets_path, sep="\t")
inactive_gene_gnomad_sets_df = inactive_gene_info_df[["gene_id", "gene_name"]].copy()
# one 0/1 membership column per gene list, matched on gene name
for gnomad_gene_set in gnomad_gene_set_df["gene_list"].unique():
    in_gene_list = gnomad_gene_set_df["gene_list"] == gnomad_gene_set
    gene_set = gnomad_gene_set_df.loc[in_gene_list, "gene"].unique()
    # column name: "gnomad_" + lower-cased list name with spaces replaced by underscores
    gene_set_colname = f"gnomad_{gnomad_gene_set.lower().replace(' ', '_')}"
    inactive_gene_gnomad_sets_df[gene_set_colname] = inactive_gene_info_df["gene_name"].isin(gene_set).astype(int)
inactive_gene_gnomad_sets_df = inactive_gene_gnomad_sets_df.drop(columns=["gene_name"])

In [13]:
# oncogenes, cosmic v97, downloaded Cancer Gene Census
cosmic_cancer_gene_df = pd.read_csv(cosmic_genes_path)
# select Tier 1, dominant oncogenes
is_tier1 = cosmic_cancer_gene_df["Tier"] == 1
is_dominant = cosmic_cancer_gene_df["Molecular Genetics"] == "Dom"
# na=False: a gene without a "Role in Cancer" entry is explicitly treated as not an oncogene
is_oncogene = cosmic_cancer_gene_df["Role in Cancer"].str.contains("oncogene", na=False)
oncogenes_df = cosmic_cancer_gene_df[is_tier1 & is_dominant & is_oncogene]
# get ENSG IDs from the comma-separated synonyms of each oncogene
oncogene_gene_id_list = []
for synonyms in oncogenes_df.Synonyms.tolist():
    # an empty Synonyms cell is read as NaN (a float), which cannot be searched for a substring
    if isinstance(synonyms, str) and "ENSG" in synonyms:
        for synonym in synonyms.split(","):
            # tolerate whitespace after the separating comma
            synonym = synonym.strip()
            if synonym.startswith("ENSG"):
                # drop the version suffix
                oncogene_gene_id_list.append(synonym.split(".")[0])
    else:
        # placeholder for oncogenes without an ENSG synonym: keeps them in the printed count
        # and never matches a gene_id below
        oncogene_gene_id_list.append(np.nan)
print(f"Number of oncogenes in list: {len(oncogene_gene_id_list)}")
# 0/1 indicator for the inactive genes
oncogene_info_df = inactive_gene_info_df[["gene_id"]].copy()
oncogene_info_df["oncogene"] = np.where(oncogene_info_df.gene_id.isin(oncogene_gene_id_list), 1, 0)

Number of oncogenes in list: 245


In [14]:
# Decipher genes - curated list of genes associated with developmental disorders
# https://www.deciphergenomics.org/ddd/ddgenes
decipher_genes_df = pd.read_csv(decipher_genes_path)
# keep only entries whose confidence category is strong, definitive or moderate
decipher_confidence_levels = ["strong", "definitive", "moderate"]
has_required_confidence = decipher_genes_df["confidence category"].isin(decipher_confidence_levels)
decipher_genes_conf_df = decipher_genes_df[has_required_confidence]
decipher_genes = decipher_genes_conf_df["gene symbol"].unique()
# from here on decipher_genes_df is the per-gene indicator table, matched on gene name
decipher_genes_df = inactive_gene_info_df[["gene_name", "gene_id"]].copy()
decipher_genes_df["decipher_gene"] = decipher_genes_df["gene_name"].isin(decipher_genes).astype(int)
decipher_genes_df = decipher_genes_df.drop(columns=["gene_name"])

In [15]:
# OpenTargets approved drugs: header-less file, column 0 holds the target IDs
approved_targets = pd.read_csv(approved_drug_targets_path, header=None)[0].tolist()
print(f"Number of approved drug targets: {len(approved_targets)}")
# 0/1 indicator for the inactive genes, matched on gene_id
inactive_approved_targets_df = inactive_gene_info_df[["gene_id"]].copy()
inactive_approved_targets_df["approved_target"] = (
    inactive_approved_targets_df["gene_id"].isin(approved_targets).astype(int)
)
inactive_gene_approved_target = int((inactive_approved_targets_df["approved_target"] == 1).sum())
print(f"Number of approved targets in inactive gene set: {inactive_gene_approved_target}")

Number of approved drug targets: 929
Number of approved targets in inactive gene set: 213


In [16]:
# OMIM genes
# collect every entry of the 7th tab-separated field (a ", "-separated list) from the non-comment lines
omim_genes_list = []
with open(omim_genes_path, 'r') as f_in:
    for line in f_in:
        if not line.startswith("#"):
            omim_genes_list.extend(line.split("\t")[6].split(", "))
omim_genes_set = set(omim_genes_list)
print(f"Number of OMIM genes: {len(omim_genes_set)}")
# 0/1 indicator for the inactive genes, matched on gene name
inactive_omim_df = inactive_gene_info_df[["gene_id", "gene_name"]].copy()
inactive_omim_df["omim_gene"] = inactive_omim_df["gene_name"].isin(omim_genes_set).astype(int)
inactive_omim_df = inactive_omim_df.drop(columns=["gene_name"])

Number of OMIM genes: 38458


### Merge features 

In [17]:
# All per-gene feature tables to be combined; each one carries a "gene_id" column.
# The order of this list determines the column order of the merged table.
feature_df_list = [inactive_gene_info_df[["gene_id", "gene_length"]],
                    inactive_gene_density_df, 
                    inactive_gene_distance_df, 
                    inactive_tissue_expression_df, 
                    gene_features_expression_df,
                    inactive_gene_active_distance_df, 
                    active_gene_density_all_genes_df, 
                    chromhmm_fraction_inactive_df,
                    gene_compartment_overlap_df, 
                    inactive_gene_gnomad_sets_df, 
                    oncogene_info_df,
                    gene_features_eds_df, 
                    gene_features_episcore_df,
                    ptriplo_phaplo_gene_id_df[["gene_id", "pHaplo", "pTriplo"]],
                    inactive_gene_gnomad_df, 
                    gerp_elem_count_per_bp_df, 
                    inactive_gene_tad_distance_min_df, 
                    gene_mean_phylop_df, 
                    inactive_approved_targets_df, 
                    decipher_genes_df, 
                    inactive_omim_df, 
                   ]

In [18]:
# merge features
def _merge_on_gene_id(left, right):
    """Inner-join two feature tables on gene_id (genes missing from either side are dropped)."""
    return pd.merge(left, right, on="gene_id", how='inner')

# fold the list left to right into a single table
gene_features_merged_df = reduce(_merge_on_gene_id, feature_df_list)

In [19]:
# rename features for statsmodels
# spaces become underscores; newlines, hyphens and dots are removed
colname_char_map = str.maketrans({" ": "_", "\n": "", "-": "", ".": ""})
adj_colnames = {col: col.translate(colname_char_map) for col in gene_features_merged_df.columns}
gene_features_merged_adj_colnames_df = gene_features_merged_df.rename(columns=adj_colnames)
# the output file name carries the number of inactive genes in the input set
features_out_path = out_dir_path.joinpath(f"inactive_gene_features_{len(inactive_gene_id_set)}.csv")
gene_features_merged_adj_colnames_df.to_csv(features_out_path, index=False)