#  EMSv2: Expression Modifier Score v2


This notebook provides examples of running the EMSv2 model using the results from the Enformer.
Enformer is a model that integrates long-range interactions for high-accuracy gene expression prediction, as described in the paper:

**"Effective gene expression prediction from sequence by integrating long-range interactions"**  
Žiga Avsec et al.

The Enformer code is available on GitHub: [Enformer GitHub Repository](https://github.com/google-deepmind/deepmind-research/blob/master/enformer/)


## Setup

In [32]:
import pandas as pd
import os
import glob
import numpy as np
from typing import Dict, Text
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
import tensorflow as tf
import keras
from keras.models import load_model
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Dropout, Activation, Flatten
import keras.backend as K

CPU times: user 6.72 ms, sys: 0 ns, total: 6.72 ms
Wall time: 10.2 ms


In [33]:
!wget https://github.com/ytakahashi-statgen/expression_modifier_score_v2/raw/refs/heads/main/enformer_output.csv | -O /root/data/enformer_output.csv
df_enf = pd.read_csv(f"./enformer_output.csv", delimiter=",",index_col=0)
df_enf

/bin/bash: line 1: -O: command not found
--2025-02-27 05:32:27--  https://github.com/ytakahashi-statgen/expression_modifier_score_v2/raw/refs/heads/main/enformer_output.csv
Resolving github.com (github.com)... 140.82.116.4
Connecting to github.com (github.com)|140.82.116.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ytakahashi-statgen/expression_modifier_score_v2/refs/heads/main/enformer_output.csv [following]
--2025-02-27 05:32:27--  https://raw.githubusercontent.com/ytakahashi-statgen/expression_modifier_score_v2/refs/heads/main/enformer_output.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5889 (5.8K) [text/plain]
Saving to: ‘enformer_output.csv’


2025-02-27 05:32:27 (64.3 MB/s) - 

Unnamed: 0,chrom,pos,id,ref,alt,PC0,PC1,PC2,PC3,PC4,...,PC90,PC91,PC92,PC93,PC94,PC95,PC96,PC97,PC98,PC99
0,chr1,66926,3385321,AG,A,-836.31354,-115.440155,-105.89859,363.2731,3.020063,...,-34.88388,-12.332027,-31.913256,-17.358685,-11.733356,-9.297302,-131.23456,-44.78366,24.886904,-37.1836
1,chr1,69134,2205837,A,G,-7.43252,2.266495,-4.216263,7.759078,1.582735,...,-1.328722,-1.11257,0.719973,-2.699473,-1.532886,0.802354,-1.897553,1.226293,1.11976,0.277107
2,chr1,69314,3205580,T,G,-4.573217,7.087116,-8.082282,10.568202,2.177188,...,-2.290263,-1.098893,-3.132065,-2.266849,-1.732193,3.386024,-3.429431,3.390162,1.662983,-1.75327
3,chr1,69423,3205581,G,A,-2.699423,-1.337945,0.44314,0.211538,-0.564606,...,0.238912,0.45854,-0.027998,0.700069,0.093475,-0.015051,0.401435,-0.407526,0.363673,0.55399
4,chr1,69581,2252161,C,G,-2.234682,4.260884,-6.350684,2.498135,-2.610524,...,-4.241713,-0.514985,1.946547,-2.963999,-3.155039,-0.040446,-0.319353,1.259229,-2.399467,-1.180112


## Calculate Distance to TSS for variant-gene pairs

In [34]:
!wget https://storage.googleapis.com/adult-gtex/references/v8/reference-tables/gencode.v26.GRCh38.genes.gtf | -O /root/data/gencode.v26.GRCh38.genes.gtf

/bin/bash: line 1: -O: command not found
--2025-02-27 05:32:53--  https://storage.googleapis.com/adult-gtex/references/v8/reference-tables/gencode.v26.GRCh38.genes.gtf
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.195.207, 172.253.117.207, 142.250.99.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.195.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 134502408 (128M) [application/octet-stream]
Saving to: ‘gencode.v26.GRCh38.genes.gtf’


2025-02-27 05:32:54 (88.9 MB/s) - ‘gencode.v26.GRCh38.genes.gtf’ saved [134502408/134502408]



In [35]:
#check the gtf file:
!head -n 7 ./gencode.v26.GRCh38.genes.gtf

##description: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88)
##provider: GENCODE
##contact: gencode-help@sanger.ac.uk
##format: gtf
##date: 2017-03-14
##collapsed version generated by GTEx LDACC
chr1	HAVANA	gene	11869	14403	.	+	.	gene_id "ENSG00000223972.5"; transcript_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2";


In [36]:
hdr = ["chr","source","type","gene_left","gene_right", ".", "strand","..", "metadata"] #just the header to read gtf
gtf = pd.read_csv("./gencode.v26.GRCh38.genes.gtf", sep='\t', skiprows=6, header=None)#gtf downloaded from GTEx portal
gtf.columns = hdr
gtf = gtf[gtf.type=="gene"] #50K genes
gtf.head() #check

Unnamed: 0,chr,source,type,gene_left,gene_right,.,strand,..,metadata
0,chr1,HAVANA,gene,11869,14403,.,+,.,"gene_id ""ENSG00000223972.5""; transcript_id ""EN..."
6,chr1,HAVANA,gene,14410,29553,.,-,.,"gene_id ""ENSG00000227232.5""; transcript_id ""EN..."
19,chr1,ENSEMBL,gene,17369,17436,.,-,.,"gene_id ""ENSG00000278267.1""; transcript_id ""EN..."
22,chr1,HAVANA,gene,29571,31109,.,+,.,"gene_id ""ENSG00000243485.5""; transcript_id ""EN..."
28,chr1,HAVANA,gene,34554,36081,.,-,.,"gene_id ""ENSG00000237613.2""; transcript_id ""EN..."


In [37]:
#check this metadata column:
gtf.metadata.values[0]

'gene_id "ENSG00000223972.5"; transcript_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2";'

In [38]:
#annotate tss, which is the left for plus strand and right for minus strand
gtf["tss_position"] = -1 #dummy
gtf.loc[gtf.strand=="-", "tss_position"] = gtf.gene_right
gtf.loc[gtf.strand=="+", "tss_position"] = gtf.gene_left
#annotate ENSG ID
gtf["ensg_id"] = gtf.metadata.str.split(";").str[0].str.split('"').str[1]
gtf["gene_name"] = gtf.metadata.str.split(";").str[3].str.split('"').str[1]

#write the file:
gtf[["chr", "tss_position", "ensg_id", "gene_name"]].to_csv("./gencode.v26.GRCh38.genes.tssposition.tsv", sep='\t', index=False)

In [39]:
#checking the tss position file:

#function to get ENSG ID list:
df = pd.read_csv("./gencode.v26.GRCh38.genes.tssposition.tsv", sep='\t')
df

Unnamed: 0,chr,tss_position,ensg_id,gene_name
0,chr1,11869,ENSG00000223972.5,DDX11L1
1,chr1,29553,ENSG00000227232.5,WASH7P
2,chr1,17436,ENSG00000278267.1,MIR6859-1
3,chr1,29571,ENSG00000243485.5,MIR1302-2HG
4,chr1,36081,ENSG00000237613.2,FAM138A
...,...,...,...,...
56195,chrM,14673,ENSG00000198695.2,MT-ND6
56196,chrM,14742,ENSG00000210194.1,MT-TE
56197,chrM,14747,ENSG00000198727.2,MT-CYB
56198,chrM,15888,ENSG00000210195.2,MT-TT


In [40]:
#split per chr to make it faster when annotating.
tss_dict = {}
for chr in df.chr.unique():
    tss_dict[chr] = df[df.chr==chr]
tss_dict

{'chr1':        chr  tss_position             ensg_id    gene_name
 0     chr1         11869   ENSG00000223972.5      DDX11L1
 1     chr1         29553   ENSG00000227232.5       WASH7P
 2     chr1         17436   ENSG00000278267.1    MIR6859-1
 3     chr1         29571   ENSG00000243485.5  MIR1302-2HG
 4     chr1         36081   ENSG00000237613.2      FAM138A
 ...    ...           ...                 ...          ...
 5051  chr1     248859085  ENSG00000171163.15       ZNF692
 5052  chr1     248859164   ENSG00000227237.1   AL672294.1
 5053  chr1     248906196  ENSG00000185220.11        PGBD2
 5054  chr1     248912795   ENSG00000200495.1   RNU6-1205P
 5055  chr1     248936581   ENSG00000233084.2    RPL23AP25
 
 [5056 rows x 4 columns],
 'chr2':        chr  tss_position             ensg_id   gene_name
 5056  chr2         46870   ENSG00000184731.5     FAM110C
 5057  chr2        197569   ENSG00000227061.1  AC079779.7
 5058  chr2        266398  ENSG00000035115.21      SH3YL1
 5059  chr2     

In [41]:
df_enf['id'] = df_enf['chrom'] + '_' + df_enf['pos'].astype(str) + '_' + df_enf['ref'] + '_' + df_enf['alt']
df_enf

Unnamed: 0,chrom,pos,id,ref,alt,PC0,PC1,PC2,PC3,PC4,...,PC90,PC91,PC92,PC93,PC94,PC95,PC96,PC97,PC98,PC99
0,chr1,66926,chr1_66926_AG_A,AG,A,-836.31354,-115.440155,-105.89859,363.2731,3.020063,...,-34.88388,-12.332027,-31.913256,-17.358685,-11.733356,-9.297302,-131.23456,-44.78366,24.886904,-37.1836
1,chr1,69134,chr1_69134_A_G,A,G,-7.43252,2.266495,-4.216263,7.759078,1.582735,...,-1.328722,-1.11257,0.719973,-2.699473,-1.532886,0.802354,-1.897553,1.226293,1.11976,0.277107
2,chr1,69314,chr1_69314_T_G,T,G,-4.573217,7.087116,-8.082282,10.568202,2.177188,...,-2.290263,-1.098893,-3.132065,-2.266849,-1.732193,3.386024,-3.429431,3.390162,1.662983,-1.75327
3,chr1,69423,chr1_69423_G_A,G,A,-2.699423,-1.337945,0.44314,0.211538,-0.564606,...,0.238912,0.45854,-0.027998,0.700069,0.093475,-0.015051,0.401435,-0.407526,0.363673,0.55399
4,chr1,69581,chr1_69581_C_G,C,G,-2.234682,4.260884,-6.350684,2.498135,-2.610524,...,-4.241713,-0.514985,1.946547,-2.963999,-3.155039,-0.040446,-0.319353,1.259229,-2.399467,-1.180112


In [42]:
df_vcf = df_enf.iloc[:,:5]
df_vcf.columns=['chr', 'pos','id','ref','alt']
df_vcf

Unnamed: 0,chr,pos,id,ref,alt
0,chr1,66926,chr1_66926_AG_A,AG,A
1,chr1,69134,chr1_69134_A_G,A,G
2,chr1,69314,chr1_69314_T_G,T,G
3,chr1,69423,chr1_69423_G_A,G,A
4,chr1,69581,chr1_69581_C_G,C,G


In [43]:
%%time
#definee the functino that
def return_vg(chr, pos): #input: chr and pos, output: dataframe of chr, pos, tss_distance, ensg_id, gene_name
    st = tss_dict[chr]
    st["tss_distance"] = pos - st.tss_position #negative = the variant upstream in the genome 逆かも？need to check
    st = st[abs(st.tss_distance)<10**6]
    st["pos"] = pos
    cols = ["chr","pos","tss_distance","ensg_id","gene_name"]
    return (st[cols])
def convert_to_vg(vcf): #input: vcf-like tsv, output: expanded tsv where row = variant-gene pair
    out = []
    for i in range(vcf.shape[0]):
        tmp = return_vg(vcf.chr[i], vcf.pos[i])
        tmp[vcf.columns] = vcf.iloc[i,:]
        out.append(tmp)
    return (pd.concat(out))

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 11 µs


In [44]:
%%time
#check that the output is good:
df_tss = convert_to_vg(df_vcf)
df_tss["ensg_id"] = df_tss["ensg_id"].str.split('.', expand=True)[0]
df_tss

CPU times: user 18.7 ms, sys: 1.07 ms, total: 19.7 ms
Wall time: 18.8 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,chr,pos,tss_distance,ensg_id,gene_name,id,ref,alt
0,chr1,66926,55057,ENSG00000223972,DDX11L1,chr1_66926_AG_A,AG,A
1,chr1,66926,37373,ENSG00000227232,WASH7P,chr1_66926_AG_A,AG,A
2,chr1,66926,49490,ENSG00000278267,MIR6859-1,chr1_66926_AG_A,AG,A
3,chr1,66926,37355,ENSG00000243485,MIR1302-2HG,chr1_66926_AG_A,AG,A
4,chr1,66926,30845,ENSG00000237613,FAM138A,chr1_66926_AG_A,AG,A
...,...,...,...,...,...,...,...,...
65,chr1,69581,-938612,ENSG00000231702,RP11-54O7.10,chr1_69581_C_G,C,G
66,chr1,69581,-943612,ENSG00000224969,RP11-54O7.11,chr1_69581_C_G,C,G
67,chr1,69581,-950542,ENSG00000188157,AGRN,chr1_69581_C_G,C,G
68,chr1,69581,-990153,ENSG00000217801,RP11-465B22.3,chr1_69581_C_G,C,G


In [46]:
df_tss.drop(df_tss.columns.difference(['id', 'ensg_id', 'tss_distance']), axis=1, inplace=True)
df_tss

Unnamed: 0,tss_distance,ensg_id,id
0,55057,ENSG00000223972,chr1_66926_AG_A
1,37373,ENSG00000227232,chr1_66926_AG_A
2,49490,ENSG00000278267,chr1_66926_AG_A
3,37355,ENSG00000243485,chr1_66926_AG_A
4,30845,ENSG00000237613,chr1_66926_AG_A
...,...,...,...
65,-938612,ENSG00000231702,chr1_69581_C_G
66,-943612,ENSG00000224969,chr1_69581_C_G
67,-950542,ENSG00000188157,chr1_69581_C_G
68,-990153,ENSG00000217801,chr1_69581_C_G


In [47]:
X_data = pd.merge(df_enf, df_tss, how="left",on="id")
X_data

Unnamed: 0,chrom,pos,id,ref,alt,PC0,PC1,PC2,PC3,PC4,...,PC92,PC93,PC94,PC95,PC96,PC97,PC98,PC99,tss_distance,ensg_id
0,chr1,66926,chr1_66926_AG_A,AG,A,-836.313540,-115.440155,-105.898590,363.273100,3.020063,...,-31.913256,-17.358685,-11.733356,-9.297302,-131.234560,-44.783660,24.886904,-37.183600,55057,ENSG00000223972
1,chr1,66926,chr1_66926_AG_A,AG,A,-836.313540,-115.440155,-105.898590,363.273100,3.020063,...,-31.913256,-17.358685,-11.733356,-9.297302,-131.234560,-44.783660,24.886904,-37.183600,37373,ENSG00000227232
2,chr1,66926,chr1_66926_AG_A,AG,A,-836.313540,-115.440155,-105.898590,363.273100,3.020063,...,-31.913256,-17.358685,-11.733356,-9.297302,-131.234560,-44.783660,24.886904,-37.183600,49490,ENSG00000278267
3,chr1,66926,chr1_66926_AG_A,AG,A,-836.313540,-115.440155,-105.898590,363.273100,3.020063,...,-31.913256,-17.358685,-11.733356,-9.297302,-131.234560,-44.783660,24.886904,-37.183600,37355,ENSG00000243485
4,chr1,66926,chr1_66926_AG_A,AG,A,-836.313540,-115.440155,-105.898590,363.273100,3.020063,...,-31.913256,-17.358685,-11.733356,-9.297302,-131.234560,-44.783660,24.886904,-37.183600,30845,ENSG00000237613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,chr1,69581,chr1_69581_C_G,C,G,-2.234682,4.260884,-6.350684,2.498135,-2.610524,...,1.946547,-2.963999,-3.155039,-0.040446,-0.319353,1.259229,-2.399467,-1.180112,-938612,ENSG00000231702
346,chr1,69581,chr1_69581_C_G,C,G,-2.234682,4.260884,-6.350684,2.498135,-2.610524,...,1.946547,-2.963999,-3.155039,-0.040446,-0.319353,1.259229,-2.399467,-1.180112,-943612,ENSG00000224969
347,chr1,69581,chr1_69581_C_G,C,G,-2.234682,4.260884,-6.350684,2.498135,-2.610524,...,1.946547,-2.963999,-3.155039,-0.040446,-0.319353,1.259229,-2.399467,-1.180112,-950542,ENSG00000188157
348,chr1,69581,chr1_69581_C_G,C,G,-2.234682,4.260884,-6.350684,2.498135,-2.610524,...,1.946547,-2.963999,-3.155039,-0.040446,-0.319353,1.259229,-2.399467,-1.180112,-990153,ENSG00000217801


In [48]:
col = ['id', 'ensg_id'] + [f'PC{i}' for i in range(100)] + ['tss_distance']
X_data = X_data[col]
X_data

Unnamed: 0,id,ensg_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,...,PC91,PC92,PC93,PC94,PC95,PC96,PC97,PC98,PC99,tss_distance
0,chr1_66926_AG_A,ENSG00000223972,-836.313540,-115.440155,-105.898590,363.273100,3.020063,-88.814600,18.764778,-147.31728,...,-12.332027,-31.913256,-17.358685,-11.733356,-9.297302,-131.234560,-44.783660,24.886904,-37.183600,55057
1,chr1_66926_AG_A,ENSG00000227232,-836.313540,-115.440155,-105.898590,363.273100,3.020063,-88.814600,18.764778,-147.31728,...,-12.332027,-31.913256,-17.358685,-11.733356,-9.297302,-131.234560,-44.783660,24.886904,-37.183600,37373
2,chr1_66926_AG_A,ENSG00000278267,-836.313540,-115.440155,-105.898590,363.273100,3.020063,-88.814600,18.764778,-147.31728,...,-12.332027,-31.913256,-17.358685,-11.733356,-9.297302,-131.234560,-44.783660,24.886904,-37.183600,49490
3,chr1_66926_AG_A,ENSG00000243485,-836.313540,-115.440155,-105.898590,363.273100,3.020063,-88.814600,18.764778,-147.31728,...,-12.332027,-31.913256,-17.358685,-11.733356,-9.297302,-131.234560,-44.783660,24.886904,-37.183600,37355
4,chr1_66926_AG_A,ENSG00000237613,-836.313540,-115.440155,-105.898590,363.273100,3.020063,-88.814600,18.764778,-147.31728,...,-12.332027,-31.913256,-17.358685,-11.733356,-9.297302,-131.234560,-44.783660,24.886904,-37.183600,30845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,chr1_69581_C_G,ENSG00000231702,-2.234682,4.260884,-6.350684,2.498135,-2.610524,1.800957,-1.117282,-5.00100,...,-0.514985,1.946547,-2.963999,-3.155039,-0.040446,-0.319353,1.259229,-2.399467,-1.180112,-938612
346,chr1_69581_C_G,ENSG00000224969,-2.234682,4.260884,-6.350684,2.498135,-2.610524,1.800957,-1.117282,-5.00100,...,-0.514985,1.946547,-2.963999,-3.155039,-0.040446,-0.319353,1.259229,-2.399467,-1.180112,-943612
347,chr1_69581_C_G,ENSG00000188157,-2.234682,4.260884,-6.350684,2.498135,-2.610524,1.800957,-1.117282,-5.00100,...,-0.514985,1.946547,-2.963999,-3.155039,-0.040446,-0.319353,1.259229,-2.399467,-1.180112,-950542
348,chr1_69581_C_G,ENSG00000217801,-2.234682,4.260884,-6.350684,2.498135,-2.610524,1.800957,-1.117282,-5.00100,...,-0.514985,1.946547,-2.963999,-3.155039,-0.040446,-0.319353,1.259229,-2.399467,-1.180112,-990153


In [49]:
sign = np.sign(X_data.iloc[:, 2:102].values)
X_data.iloc[:,2:] = X_data.iloc[:,2:].apply(lambda x: np.log(abs(x)+1))
X_data.iloc[:,2:102] = X_data.iloc[:,2:102] * sign
X_data

1      10.528731
2      10.809546
3      10.528249
4      10.336762
         ...    
345    13.752159
346    13.757471
347    13.764789
348    13.805616
349    13.809199
Name: tss_distance, Length: 350, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_data.iloc[:,2:] = X_data.iloc[:,2:].apply(lambda x: np.log(abs(x)+1))


Unnamed: 0,id,ensg_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,...,PC91,PC92,PC93,PC94,PC95,PC96,PC97,PC98,PC99,tss_distance
0,chr1_66926_AG_A,ENSG00000223972,-6.730199,-4.757377,-4.671881,5.897904,1.391298,-4.497748,2.983901,-4.999354,...,-2.590169,-3.493875,-2.910103,-2.544225,-2.331882,-4.884577,-3.823927,3.253737,-3.642406,10.916142
1,chr1_66926_AG_A,ENSG00000227232,-6.730199,-4.757377,-4.671881,5.897904,1.391298,-4.497748,2.983901,-4.999354,...,-2.590169,-3.493875,-2.910103,-2.544225,-2.331882,-4.884577,-3.823927,3.253737,-3.642406,10.528731
2,chr1_66926_AG_A,ENSG00000278267,-6.730199,-4.757377,-4.671881,5.897904,1.391298,-4.497748,2.983901,-4.999354,...,-2.590169,-3.493875,-2.910103,-2.544225,-2.331882,-4.884577,-3.823927,3.253737,-3.642406,10.809546
3,chr1_66926_AG_A,ENSG00000243485,-6.730199,-4.757377,-4.671881,5.897904,1.391298,-4.497748,2.983901,-4.999354,...,-2.590169,-3.493875,-2.910103,-2.544225,-2.331882,-4.884577,-3.823927,3.253737,-3.642406,10.528249
4,chr1_66926_AG_A,ENSG00000237613,-6.730199,-4.757377,-4.671881,5.897904,1.391298,-4.497748,2.983901,-4.999354,...,-2.590169,-3.493875,-2.910103,-2.544225,-2.331882,-4.884577,-3.823927,3.253737,-3.642406,10.336762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,chr1_69581_C_G,ENSG00000231702,-1.173931,1.660299,-1.994793,1.252230,-1.283853,1.029961,-0.750133,-1.791926,...,-0.415405,1.080634,-1.377253,-1.424322,-0.039649,-0.277142,0.815024,-1.223619,-0.779376,13.752159
346,chr1_69581_C_G,ENSG00000224969,-1.173931,1.660299,-1.994793,1.252230,-1.283853,1.029961,-0.750133,-1.791926,...,-0.415405,1.080634,-1.377253,-1.424322,-0.039649,-0.277142,0.815024,-1.223619,-0.779376,13.757471
347,chr1_69581_C_G,ENSG00000188157,-1.173931,1.660299,-1.994793,1.252230,-1.283853,1.029961,-0.750133,-1.791926,...,-0.415405,1.080634,-1.377253,-1.424322,-0.039649,-0.277142,0.815024,-1.223619,-0.779376,13.764789
348,chr1_69581_C_G,ENSG00000217801,-1.173931,1.660299,-1.994793,1.252230,-1.283853,1.029961,-0.750133,-1.791926,...,-0.415405,1.080634,-1.377253,-1.424322,-0.039649,-0.277142,0.815024,-1.223619,-0.779376,13.805616


In [50]:
# Custom loss function for EMS model training and prediction
def custom_canonical_mse(y_true_pip, y_pred, mask_val=-2): #input y_true_pip is now a vector of length N and is pip, for pos and neg and is masked otherwise
    mask = K.cast(K.not_equal(y_true_pip, mask_val), K.floatx()) #this is the mask
    #first, create binary label:
    y_bin = tf.where(tf.greater(y_true_pip, thres),1.0, y_true_pip)
    y_bin = tf.where(tf.less(y_bin, 0.0001),0.0, y_bin) #the mask -2 will be gone, but this should be fine if we mask first.
    #and the weight
    sumpos = tf.math.reduce_sum(tf.where(tf.greater(y_true_pip, thres),y_true_pip, 0), axis=0) #this gives the sum(PIP) for pos.
    nneg = tf.math.reduce_sum(tf.cast(tf.less(y_true_pip, 0.0001), tf.float32)*mask, axis=0) #this gives the num. neg
    neg_weight = sumpos/nneg
    weight = tf.where(tf.greater(y_true_pip, thres), y_true_pip, neg_weight) #positive weight = PIP itself, negative weight = sum(PIP in pos)/nneg
    loss = K.square(y_pred - y_bin) #loss of the binary prediction
    loss = K.sum(K.sum(loss*weight*mask, axis=1)) #loss is a scaler. sum over all tissues
    return loss

In [51]:
# Load the pretrained EMSv2 model
!wget https://github.com/ytakahashi-statgen/expression_modifier_score_v2/raw/refs/heads/main/ems_model.h5 | -O /root/data/ems_model.h5
model = load_model('./ems_model.h5', custom_objects={"custom_canonical_mse":custom_canonical_mse})

tissues = ['Whole_Blood', 'Muscle_Skeletal', 'Liver', 'Brain_Cerebellum','Prostate', 'Spleen', 'Skin_Sun_Exposed_Lower_leg', 'Artery_Coronary',
       'Esophagus_Muscularis', 'Esophagus_Gastroesophageal_Junction','Artery_Tibial', 'Heart_Atrial_Appendage', 'Nerve_Tibial',
       'Heart_Left_Ventricle', 'Adrenal_Gland', 'Adipose_Visceral_Omentum','Pancreas', 'Lung', 'Pituitary',
       'Brain_Nucleus_accumbens_basal_ganglia', 'Colon_Transverse','Adipose_Subcutaneous', 'Esophagus_Mucosa', 'Brain_Cortex', 'Thyroid',
       'Stomach', 'Breast_Mammary_Tissue', 'Colon_Sigmoid','Skin_Not_Sun_Exposed_Suprapubic', 'Testis', 'Artery_Aorta',
       'Brain_Amygdala', 'Brain_Anterior_cingulate_cortex_BA24','Brain_Caudate_basal_ganglia', 'Brain_Cerebellar_Hemisphere',
       'Brain_Frontal_Cortex_BA9', 'Brain_Hippocampus', 'Brain_Hypothalamus','Brain_Putamen_basal_ganglia', 'Brain_Spinal_cord_cervical_c-1',
       'Brain_Substantia_nigra', 'Cells_Cultured_fibroblasts','Cells_EBV-transformed_lymphocytes', 'Kidney_Cortex',
       'Minor_Salivary_Gland', 'Ovary', 'Small_Intestine_Terminal_Ileum','Uterus', 'Vagina']

/bin/bash: line 1: -O: command not found
--2025-02-27 05:34:53--  https://github.com/ytakahashi-statgen/expression_modifier_score_v2/raw/refs/heads/main/ems_model.h5
Resolving github.com (github.com)... 140.82.116.4
Connecting to github.com (github.com)|140.82.116.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ytakahashi-statgen/expression_modifier_score_v2/refs/heads/main/ems_model.h5 [following]
--2025-02-27 05:34:53--  https://raw.githubusercontent.com/ytakahashi-statgen/expression_modifier_score_v2/refs/heads/main/ems_model.h5
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4587704 (4.4M) [application/octet-stream]
Saving to: ‘ems_model.h5’


2025-02-27 05:34:54 (70.7 MB/s) - ‘ems_model.



In [52]:
# Perform prediction and format as DataFrame
y_prob = model.predict(X_data.iloc[:, 2:])
y_prob = pd.DataFrame(y_prob,columns=tissues,index=X_data.index)
y_prob = pd.concat([X_data.iloc[:, :2], y_prob], axis=1)
y_prob

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step


Unnamed: 0,id,ensg_id,Whole_Blood,Muscle_Skeletal,Liver,Brain_Cerebellum,Prostate,Spleen,Skin_Sun_Exposed_Lower_leg,Artery_Coronary,...,Brain_Spinal_cord_cervical_c-1,Brain_Substantia_nigra,Cells_Cultured_fibroblasts,Cells_EBV-transformed_lymphocytes,Kidney_Cortex,Minor_Salivary_Gland,Ovary,Small_Intestine_Terminal_Ileum,Uterus,Vagina
0,chr1_66926_AG_A,ENSG00000223972,0.821231,0.808179,0.813768,0.804463,0.824214,0.832166,0.804939,0.782668,...,0.791819,0.781386,0.814719,0.829434,0.661146,0.796537,0.793140,0.799075,0.790446,0.811692
1,chr1_66926_AG_A,ENSG00000227232,0.841479,0.833510,0.833669,0.820334,0.849234,0.852683,0.826032,0.816357,...,0.816119,0.806160,0.833882,0.850838,0.680601,0.826553,0.818866,0.825196,0.819595,0.828879
2,chr1_66926_AG_A,ENSG00000278267,0.826998,0.815914,0.819892,0.808811,0.832286,0.838257,0.811459,0.793182,...,0.799788,0.788703,0.820218,0.836151,0.666671,0.805481,0.800567,0.807056,0.799084,0.816747
3,chr1_66926_AG_A,ENSG00000243485,0.841503,0.833537,0.833690,0.820354,0.849259,0.852705,0.826055,0.816392,...,0.816143,0.806187,0.833905,0.850860,0.680624,0.826586,0.818895,0.825224,0.819628,0.828898
4,chr1_66926_AG_A,ENSG00000237613,0.850645,0.843715,0.841529,0.827815,0.858007,0.861252,0.834382,0.828975,...,0.824337,0.816039,0.842428,0.859096,0.689061,0.838849,0.829948,0.835384,0.831562,0.836131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,chr1_69581_C_G,ENSG00000231702,0.056119,0.050546,0.139672,0.113500,0.040264,0.064704,0.053469,0.037276,...,0.080197,0.132446,0.053008,0.181868,0.288398,0.055740,0.085497,0.104940,0.075178,0.051968
346,chr1_69581_C_G,ENSG00000224969,0.055210,0.049720,0.137921,0.112033,0.039507,0.063768,0.052632,0.036576,...,0.079011,0.130904,0.052162,0.180014,0.286677,0.054778,0.084191,0.103522,0.073914,0.050914
347,chr1_69581_C_G,ENSG00000188157,0.053978,0.048602,0.135536,0.110038,0.038486,0.062500,0.051499,0.035633,...,0.077401,0.128804,0.051016,0.177482,0.284315,0.053479,0.082420,0.101596,0.072202,0.049494
348,chr1_69581_C_G,ENSG00000217801,0.047560,0.042777,0.122799,0.099446,0.033225,0.055836,0.045593,0.030776,...,0.068930,0.117598,0.045040,0.163815,0.271323,0.046738,0.073136,0.091401,0.063266,0.042196


In [53]:
!wget https://github.com/ytakahashi-statgen/expression_modifier_score_v2/raw/refs/heads/main/ems_scaling_scores.csv | -O /root/data/ems_scaling_scores.csv

df_ems_scale = pd.read_csv(f"./ems_scaling_scores.csv", delimiter=",",index_col=0)
df_ems_scale

/bin/bash: line 1: -O: command not found
--2025-02-27 05:35:10--  https://github.com/ytakahashi-statgen/expression_modifier_score_v2/raw/refs/heads/main/ems_scaling_scores.csv
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ytakahashi-statgen/expression_modifier_score_v2/refs/heads/main/ems_scaling_scores.csv [following]
--2025-02-27 05:35:11--  https://raw.githubusercontent.com/ytakahashi-statgen/expression_modifier_score_v2/refs/heads/main/ems_scaling_scores.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1051223 (1.0M) [text/plain]
Saving to: ‘ems_scaling_scores.csv’


2025-02-27 05:35:11

Unnamed: 0,EMS_raw_score,Whole_Blood_EMSv2,Muscle_Skeletal_EMSv2,Liver_EMSv2,Brain_Cerebellum_EMSv2,Prostate_EMSv2,Spleen_EMSv2,Skin_Sun_Exposed_Lower_leg_EMSv2,Artery_Coronary_EMSv2,Esophagus_Muscularis_EMSv2,...,Brain_Spinal_cord_cervical_c-1_EMSv2,Brain_Substantia_nigra_EMSv2,Cells_Cultured_fibroblasts_EMSv2,Cells_EBV-transformed_lymphocytes_EMSv2,Kidney_Cortex_EMSv2,Minor_Salivary_Gland_EMSv2,Ovary_EMSv2,Small_Intestine_Terminal_Ileum_EMSv2,Uterus_EMSv2,Vagina_EMSv2
0,0.000,2.556027e-07,1.162706e-07,3.487263e-08,1.668640e-07,1.025745e-07,8.954207e-08,1.312655e-07,5.288330e-08,1.331129e-07,...,3.955548e-08,2.706677e-08,1.433407e-07,4.508196e-08,0.0,1.154782e-07,1.863451e-08,1.474391e-07,4.961639e-09,4.723821e-08
1,0.001,2.597645e-07,1.112291e-07,3.410651e-08,1.656341e-07,1.018860e-07,9.139348e-08,1.356897e-07,5.238486e-08,1.322462e-07,...,3.873323e-08,2.600449e-08,1.420262e-07,4.325035e-08,0.0,1.119999e-07,1.830622e-08,1.502013e-07,4.822321e-09,4.608864e-08
2,0.002,2.664786e-07,1.082820e-07,3.343280e-08,1.649444e-07,9.911833e-08,9.054958e-08,1.393615e-07,5.209269e-08,1.360159e-07,...,3.802814e-08,3.306689e-08,1.395010e-07,4.505086e-08,0.0,1.090032e-07,1.802021e-08,1.463158e-07,4.702461e-09,4.735540e-08
3,0.003,2.659103e-07,1.055412e-07,3.282376e-08,1.665127e-07,9.653016e-08,9.131524e-08,1.421061e-07,5.346654e-08,1.372970e-07,...,3.741678e-08,3.183584e-08,1.388541e-07,4.343469e-08,0.0,1.010502e-07,1.776513e-08,1.451037e-07,4.598581e-09,4.846618e-08
4,0.004,2.612606e-07,1.013437e-07,3.443550e-08,1.681012e-07,9.019851e-08,9.364118e-08,1.413469e-07,5.490442e-08,1.416084e-07,...,3.687059e-08,3.068532e-08,1.367327e-07,4.505016e-08,0.0,9.883611e-08,1.754499e-08,1.441393e-07,4.501694e-09,4.771880e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
946,0.946,6.203206e-04,1.023816e-03,3.404440e-04,9.473456e-04,3.618193e-04,4.034268e-04,1.608929e-03,2.395134e-04,9.802826e-04,...,2.636820e-04,2.190630e-04,8.740784e-04,1.214582e-04,0.0,1.008104e-04,2.826612e-04,2.147195e-04,1.428170e-04,1.152042e-04
947,0.947,6.492004e-04,1.072760e-03,3.525540e-04,9.786403e-04,3.802350e-04,4.559099e-04,1.718465e-03,2.563426e-04,1.059873e-03,...,2.650370e-04,2.178908e-04,9.271816e-04,1.221262e-04,0.0,1.057784e-04,2.961843e-04,2.179266e-04,1.478784e-04,1.153829e-04
948,0.948,7.465729e-04,1.206495e-03,3.774965e-04,1.002989e-03,3.787895e-04,4.894102e-04,1.871915e-03,2.792082e-04,1.167385e-03,...,2.652379e-04,2.234679e-04,1.037698e-03,1.369627e-04,0.0,1.038644e-04,3.214386e-04,2.271043e-04,1.575682e-04,1.188993e-04
949,0.949,9.142184e-04,1.356952e-03,3.595121e-04,9.802139e-04,4.051968e-04,5.304951e-04,1.981703e-03,2.917439e-04,1.274816e-03,...,2.720298e-04,2.174416e-04,1.088183e-03,1.558802e-04,0.0,1.044820e-04,3.336575e-04,2.292792e-04,1.675573e-04,1.225575e-04


In [56]:
# Convert raw scores to scaled scores
df_scaled_scores = pd.DataFrame()
df_raw_scores = df_ems_scale["EMS_raw_score"].values

for tissue in tqdm(tissues):
    df_tissue_scaled_indices = []
    for predicted_score in y_prob[tissue].values:
        # Find the index of the closest raw score to the predicted score
        closest_idx = np.abs(np.asarray(df_raw_scores) - predicted_score).argmin()
        df_tissue_scaled_indices.append(closest_idx)

    # Get the corresponding scaled scores for the tissue
    df_tissue_scaled_scores = pd.DataFrame(df_ems_scale.loc[df_tissue_scaled_indices, tissue + "_EMSv2"])  # tmp1 -> df_tissue_scaled_scores
    df_tissue_scaled_scores.reset_index(drop=True, inplace=True)
    df_scaled_scores = pd.concat([df_scaled_scores, df_tissue_scaled_scores], axis=1)

df_ems = pd.concat([y_prob[["id", "ensg_id"]], df_scaled_scores], axis=1)
df_ems


  0%|          | 0/49 [00:00<?, ?it/s]

Unnamed: 0,id,ensg_id,Whole_Blood_EMSv2,Muscle_Skeletal_EMSv2,Liver_EMSv2,Brain_Cerebellum_EMSv2,Prostate_EMSv2,Spleen_EMSv2,Skin_Sun_Exposed_Lower_leg_EMSv2,Artery_Coronary_EMSv2,...,Brain_Spinal_cord_cervical_c-1_EMSv2,Brain_Substantia_nigra_EMSv2,Cells_Cultured_fibroblasts_EMSv2,Cells_EBV-transformed_lymphocytes_EMSv2,Kidney_Cortex_EMSv2,Minor_Salivary_Gland_EMSv2,Ovary_EMSv2,Small_Intestine_Terminal_Ileum_EMSv2,Uterus_EMSv2,Vagina_EMSv2
0,chr1_66926_AG_A,ENSG00000223972,1.694360e-04,1.239245e-04,2.927574e-05,4.790525e-05,2.683441e-05,6.545753e-05,1.342508e-04,1.670276e-05,...,9.624206e-06,6.097183e-06,1.474230e-04,1.925296e-05,9.240801e-07,1.074668e-05,1.551993e-05,1.649627e-05,8.135046e-06,9.062786e-06
1,chr1_66926_AG_A,ENSG00000227232,1.891989e-04,1.687067e-04,3.549655e-05,6.321735e-05,3.921043e-05,8.421587e-05,1.792206e-04,2.352192e-05,...,1.277194e-05,1.022922e-05,1.982944e-04,2.525737e-05,1.285376e-06,1.507973e-05,2.110975e-05,2.185489e-05,1.245800e-05,1.126047e-05
2,chr1_66926_AG_A,ENSG00000278267,1.749497e-04,1.344264e-04,3.114067e-05,5.234555e-05,2.978663e-05,7.107353e-05,1.441768e-04,1.854874e-05,...,1.026529e-05,7.192332e-06,1.587017e-04,2.074170e-05,1.041998e-06,1.194395e-05,1.705932e-05,1.777832e-05,8.780966e-06,9.510002e-06
3,chr1_66926_AG_A,ENSG00000243485,1.899681e-04,1.687067e-04,3.549655e-05,6.321735e-05,3.921043e-05,8.421587e-05,1.792206e-04,2.352192e-05,...,1.277194e-05,1.022922e-05,1.982944e-04,2.525737e-05,1.285376e-06,1.507973e-05,2.110975e-05,2.185489e-05,1.245800e-05,1.126047e-05
4,chr1_66926_AG_A,ENSG00000237613,2.035464e-04,1.916620e-04,4.084001e-05,7.325072e-05,4.602754e-05,9.367617e-05,2.001398e-04,2.781162e-05,...,1.348491e-05,1.290267e-05,2.202121e-04,2.898897e-05,1.547903e-06,1.881849e-05,2.673169e-05,2.504046e-05,1.519391e-05,1.320404e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,chr1_69581_C_G,ENSG00000231702,9.363435e-07,4.968435e-07,3.195669e-07,4.323856e-07,1.072410e-07,2.460259e-07,5.798123e-07,7.982564e-08,...,6.067702e-08,3.637884e-08,3.273333e-07,1.135538e-07,1.039653e-08,5.840634e-08,3.922441e-08,1.302183e-07,1.184188e-08,5.524586e-08
346,chr1_69581_C_G,ENSG00000224969,9.142409e-07,4.925065e-07,3.249271e-07,4.357188e-07,1.072410e-07,2.418307e-07,5.798123e-07,7.982564e-08,...,7.500762e-08,3.599509e-08,3.256966e-07,1.236247e-07,1.546703e-08,5.487289e-08,3.872488e-08,1.285143e-07,1.165869e-08,5.685695e-08
347,chr1_69581_C_G,ENSG00000188157,8.854347e-07,4.929942e-07,3.200217e-07,4.347839e-07,1.076417e-07,2.156402e-07,5.757448e-07,7.788930e-08,...,7.283573e-08,4.512367e-08,3.059000e-07,1.215442e-07,1.517815e-08,5.315936e-08,2.675518e-08,1.255515e-07,1.129707e-08,5.487152e-08
348,chr1_69581_C_G,ENSG00000217801,8.431398e-07,3.552068e-07,3.058207e-07,4.261557e-07,1.090410e-07,1.968539e-07,5.275335e-07,7.555706e-08,...,8.163165e-08,4.830927e-08,2.707753e-07,6.170836e-08,1.869601e-08,3.714060e-08,1.415494e-08,1.037684e-07,3.094948e-09,4.678982e-08
