### PhyloP mean conservation score over gene body

In [1]:
import pyBigWig
import numpy as np
import pandas as pd 
from pathlib import Path 

In [2]:
# Project root; every input and output below is resolved relative to it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/"
wkdir_path = Path(wkdir)

# Inputs: BED file of inactive gene intervals and the phyloP 100-way BigWig track.
inactive_genes_path = wkdir_path / '3_misexp_genes/bed_files/inactive_genes_phylop.bed'
phylop_scores_bw = wkdir_path / "reference/conservation/phylop/hg38.phyloP100way.bw"

# Output directory (created up front, including any missing parents).
out_dir = wkdir_path / "3_misexp_genes/phylop"
out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the headerless, tab-separated BED file of inactive gene intervals.
inactive_genes_df = pd.read_csv(
    inactive_genes_path,
    sep='\t',
    header=None,
    names=['chrom', 'start', 'end', "gene_id"],
)
# Distinct gene IDs, reported as a quick sanity check on the input.
inactive_genes = inactive_genes_df["gene_id"].unique()
print(f"Number of inactive genes: {len(inactive_genes)}")

Number of inactive genes: 8650


In [4]:
# Compute the mean phyloP score over each gene-body interval.
#
# Result: phylop_consv_results maps the DataFrame row index to
# [chrom, start, end, gene_id, mean_value], where mean_value is the NaN-ignoring
# mean of the per-base scores, or NaN when the interval has no scored bases.
phylop_bw = pyBigWig.open(str(phylop_scores_bw))
phylop_consv_results = {}
try:
    # itertuples avoids building a Series per row (as iterrows does).
    for row in inactive_genes_df.itertuples():
        index = row.Index
        chrom = row.chrom
        start = row.start
        end = row.end
        gene_id = row.gene_id

        # query the BigWig file for the given interval
        # (one value per base is expected, hence the length check below)
        values = phylop_bw.values(chrom, start, end)
        if len(values) != end - start:
            # flag genes whose returned score vector does not span the interval
            print(gene_id)

        # calculate mean, ignoring NaNs (bases without a score).
        # np.nanmean warns on empty / all-NaN input, so that case is handled
        # explicitly; the stored result (NaN) is the same, just without the warning.
        # NOTE: the previous `values is not None` guard was unreachable, because
        # len(values) above would already have raised on None.
        scores = np.asarray(values, dtype=float)
        if scores.size == 0 or np.isnan(scores).all():
            mean_value = np.nan
        else:
            mean_value = np.nanmean(scores)

        # add metrics to dictionary
        phylop_consv_results[index] = [chrom, start, end, gene_id, mean_value]
finally:
    # close the BigWig file even if a query fails part-way through
    phylop_bw.close()

  from ipykernel import kernelapp as app


In [5]:
# Column order matches the per-gene lists stored in phylop_consv_results.
pylop_results_columns = ["chrom", "start", "end", "gene_id", "phylop_mean"]
# One row per dictionary entry; the dictionary keys become the DataFrame index.
phylop_consv_results_df = pd.DataFrame.from_dict(
    phylop_consv_results,
    orient="index",
    columns=pylop_results_columns,
)

In [6]:
# Save the per-gene mean phyloP scores as a tab-separated file (index column omitted).
phylop_consv_results_path = out_dir / "inactive_gene_phylop_gene_body.tsv"
phylop_consv_results_df.to_csv(
    phylop_consv_results_path,
    sep="\t",
    index=False,
)