This part was not included in the manuscript.
Use this notebook to build logistic regression (LR) models for the hub essentiality classifier.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, scipy
import networkx as nx
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.metrics import (
    balanced_accuracy_score,
    average_precision_score,
    confusion_matrix,
    f1_score,
    classification_report,
    recall_score,
    roc_auc_score,
    auc,
    precision_score,
    precision_recall_curve,
    )



In [None]:
# Load the hub table and keep only the hubs whose GC value is at least 0.4
# (the filter is inclusive: GC == 0.4 is retained).
dt = pd.read_csv('resources/all_hubs.for_LR.txt', sep='\t')
gc_pass = dt['GC'] >= 0.4
dt = dt[gc_pass]

# Class sizes after filtering; the original author recorded 91 ('Ess')
# and 548 ('Noness') for these two printouts.
for ess_label in ('Ess', 'Noness'):
    print((dt['Ess'] == ess_label).sum())


In [None]:
# Attach the sequence PCs (PC1..PC30) to every hub. Each screening round
# ('1st' / '2nd' in the `round` column) has its own PC table, so the hub table
# is split by round, merged with the matching PC table on `str`, and re-stacked.
PC_list = list(map('PC{}'.format, range(1, 31)))  # top 30 PCs
pc_table_cols = ['str'] + PC_list

dt_pc_1 = pd.read_csv('resources/ext_0.rep_2/hub_PC30.1st.withGC.1.txt', sep='\t')[pc_table_cols].copy()
dt_pc_2 = pd.read_csv('resources/ext_0.rep_2/hub_PC30.2nd.0.txt', sep='\t')[pc_table_cols].copy()

in_first_round = dt['round'] == '1st'
in_second_round = dt['round'] == '2nd'
dt_1 = dt[in_first_round].copy().reset_index(drop=True)
dt_2 = dt[in_second_round].copy().reset_index(drop=True)

# `merge` defaults to an inner join, so hubs without a row in the PC table
# are dropped here.
first_with_pcs = dt_1.merge(dt_pc_1, on='str')
second_with_pcs = dt_2.merge(dt_pc_2, on='str')
dt = pd.concat((first_with_pcs, second_with_pcs), ignore_index=True)

# Column count before any PR / peak features are appended; a later cell uses it
# to address only the newly added columns.
N_original = len(dt.columns)

str_of_tested_hubs = list(dt['str'])

In [None]:
# all features (peak + PR)
# Names of the per-hub "peak" feature columns, one per line for easy diffing.
feature_cols = [
    # CTCF / RAD21 / SMC3 tracks
    'CTCF.narrow.rep-1',
    'CTCF.narrow.rep-2',
    'RAD21.narrow.rep-1',
    'RAD21.narrow.rep-2',
    'SMC3.narrow.rep-1',
    # histone-mark tracks
    'H3K27ac.narrow.rep-1',
    'H3K27me3.narrow.rep-1',
    'H3K27me3.broad.rep-2',
    'H3K36me3.narrow.rep-1',
    'H3K36me3.narrow.rep-2',
    'H3K36me3.broad.rep-3',
    'H3K4me1.narrow.rep-1',
    'H3K4me1.narrow.rep-2',
    'H3K4me2.narrow.rep-1',
    'H3K4me3.narrow.rep-1',
    'H3K4me3.narrow.rep-2',
    'H3K9ac.narrow.rep-1',
    'H3K9ac.narrow.rep-2',
    'H3K9me3.narrow.rep-1',
    'H3K9me3.broad.rep-2',
    'H4K20me1.narrow.rep-1',
    # accessibility
    'ATAC',
    # gene-level annotations
    'ess_gene.Morgens',
    'ess_gene.Wang',
    'lncRNA.Liu',
    'all_gene',
]

# Matching PR column for every peak feature, plus the default PR column.
feature_cols_for_PR = ['pr_mean_{}.scaled'.format(col) for col in feature_cols]
feature_cols_for_PR.append('pr_mean_default.scaled')

# Chromosome ids used to locate the per-chromosome input files:
# chr1-chr8, chr10-chr21, chrX and one extra id (chr9 and chr22 are not listed).
# NOTE(review): 'der9' + 'phil22' is string concatenation and produces the single
# id 'der9phil22'. If two separate ids ('der9', 'phil22') were intended, this needs
# a comma instead of '+' -- confirm against the file names on disk.
chrid_list = ['chr{}'.format(n) for n in range(1, 22) if n != 9]
chrid_list += ['chrX', 'der9' + 'phil22']

In [None]:
# add "scaled PR info" and "peak info" to dt

# Read one PR table and one peak table per chromosome id. The frames are
# collected in lists and concatenated once afterwards; concatenating inside the
# loop would re-copy the accumulated frame on every iteration.
pr_tables = []
pk_tables = []
for chrid in chrid_list:
    pr_result_new = pd.read_csv('PR-LR/PR_scores/proc/result.{}.1.txt'.format(chrid), sep='\t')
    pr_tables.append(pr_result_new[['str'] + feature_cols_for_PR])

    pk_result = pd.read_csv('PR-LR/node_meta.1/{}.txt'.format(chrid), sep='\t')
    pk_tables.append(pk_result[['str'] + feature_cols])

pr_all = pd.concat(pr_tables, ignore_index=True)
pk_all = pd.concat(pk_tables, ignore_index=True)

# Left joins keep every hub already in dt; hubs without a match get NaN in the
# new columns.
# NOTE(review): if `str` is not unique in pr_all / pk_all these merges will
# duplicate hub rows -- confirm the keys are unique across chromosomes.
dt = dt.merge(pr_all, how='left', on='str')
dt = dt.merge(pk_all, how='left', on='str')

# fill na using mean
# Only the columns appended after N_original (the PR and peak features) are
# imputed, each with its own column mean.
dt.iloc[:, N_original:] = dt.iloc[:, N_original:].fillna(dt.iloc[:, N_original:].mean())

    

In [None]:
# t-SNE visualization of the full feature set (GC + sequence PCs + peak + PR),
# coloured by essentiality label.
r__ = dt[
    ['Ess', 'GC'] +
    ['PC{}'.format(i) for i in range(1,31)] +
    feature_cols +
    feature_cols_for_PR].copy()

# t-SNE is stochastic; fix the seed so the figure is reproducible across runs.
tsne = TSNE(random_state=0)

# Column 0 is the 'Ess' label, so only columns 1.. are embedded.
X_embedded = tsne.fit_transform(r__.iloc[:,1:])
tsneDf = pd.DataFrame(data = X_embedded, columns = ['t-SNE1', 't-SNE2'])
# Rows of X_embedded are in the same order as r__, so attach the labels by
# position rather than relying on index alignment.
tsneDf['Ess'] = r__['Ess'].values
# print(tsneDf)
# plot if these information is enough to determine hubs
# NOTE(review): with a list palette the colours follow the order in which the
# labels first appear in the data -- confirm that 'red' lands on 'Ess'.
sns.jointplot(data=tsneDf, x="t-SNE1", y="t-SNE2", hue="Ess", palette=['#416fec30', 'red'])
plt.legend(prop={'size':13})
plt.xlabel('t-SNE1', size=17)
plt.ylabel('t-SNE2', size=17)
plt.show()

# ess and noness hubs are not distinguishable on the t-SNE embedding

In [None]:
############
# these dictionaries save the features to use, iterate thru them
# Each dictionary maps a short label (used to build the experiment name) to the
# list of columns that option contributes; an empty list means "leave it out".

# Which sequence PCs to include, given as PC numbers (1-based).
PCs_to_use_dict = {
    '1,2':[1,2],
    '2,3':[2,3],
    '1,2,3':[1,2,3],
    '2,3,4':[2,3,4],
    'top4':[1,2,3,4],
    'top5':[1,2,3,4,5],
    'top10':list(range(1,11)),
    'top20':list(range(1,21)),
    'top30':list(range(1,31)),
    'no':[]
}

# Whether to include the GC column.
GC_to_use_dict = {
    'GC':['GC'],
    'noGC':[]
}

# Chromatin feature sets (histone marks / CTCF-RAD21-SMC3 tracks).
c_features_to_use_dict = {
    'core histones':['H3K4me3.narrow.rep-1', 'H3K4me1.narrow.rep-1', 'H3K9me3.broad.rep-2', 'H3K27me3.broad.rep-2', 'H3K36me3.broad.rep-3', 'H3K27ac.narrow.rep-1'],
    'all histones':['H3K4me3.narrow.rep-1', 'H3K4me1.narrow.rep-1', 'H3K9me3.broad.rep-2', 'H3K27me3.broad.rep-2', 'H3K36me3.broad.rep-3', 'H3K27ac.narrow.rep-1',
                    'H3K9ac.narrow.rep-1','H4K20me1.narrow.rep-1','H3K4me2.narrow.rep-1'],
    'TFs':['CTCF.narrow.rep-1','RAD21.narrow.rep-1','SMC3.narrow.rep-1'],
    'no histone':[]
}

# Gene-annotation feature sets.
g_feature_to_use_dict = {
    'gene annotations':['all_gene'],
    'lncRNAs':['lncRNA.Liu'],
    'essential genes':['ess_gene.Morgens', 'ess_gene.Wang'],
    'all genes':['ess_gene.Morgens', 'ess_gene.Wang', 'lncRNA.Liu', 'all_gene'],
    'no gene':[]
}

# Whether to include the ATAC column. The value must be the column *name*
# (a string); the bare identifier ATAC is undefined and raised NameError.
atac_to_use_dict = {
    'ATAC':['ATAC'],
    'no ATAC':[]
}


def _pr_cols(peak_cols):
    """Return the PR column names for `peak_cols`, followed by the default PR column.

    The default column ('pr_mean_default.scaled') is always appended, even when
    `peak_cols` is empty.
    """
    return ['pr_mean_{}.scaled'.format(i) for i in peak_cols]+['pr_mean_default.scaled']


# How the selected peak columns are turned into model inputs: the raw peak
# columns, their PR counterparts, or both (PR columns first).
pr_pk_to_use_dict = {
    'use_pk': lambda x:x,
    'use_pr': lambda x:_pr_cols(x),
    'use_pr_and_pk': lambda x:_pr_cols(x)+x
}
# end of feature dicts
############


# Result accumulators: one entry per evaluated feature combination.
# NOTE: despite the "ROC" in the names (kept so that existing references keep
# working), the stored score is the *balanced accuracy* of the hard predictions,
# not an ROC AUC.
expr_hist = []
ROC_AUC_hist = []       # mean score over the repeats
ROC_AUC_err_hist = []   # standard deviation of the score over the repeats
rep_num = 10            # number of random train/test splits per combination

i = 0
# iterate thru combinations of features
for GC_name,GC_cols in GC_to_use_dict.items():
    for PC_name,PC_cols in PCs_to_use_dict.items():
        for histone_name, histone_cols in c_features_to_use_dict.items():
            for g_name, g_cols in g_feature_to_use_dict.items():
                for atac_name, atac_cols in atac_to_use_dict.items():
                    for p_name, p_function in pr_pk_to_use_dict.items():
                        # Build the label from the option *names*; joining the
                        # dictionaries themselves raises TypeError.
                        expr_name = '+'.join(
                            [GC_name, PC_name+'PC', histone_name, g_name,
                            atac_name, p_name]
                            )
                        expr_cols = GC_cols + ['PC{}'.format(i_) for i_ in PC_cols]
                        expr_cols = expr_cols + (
                            p_function(histone_cols+g_cols+atac_cols)
                        )
                        # Nothing selected at all: no model to fit.
                        if not expr_cols:
                            continue
                        i+=1


                        print('------------')
                        print(expr_name, i)
                        r__ = dt[['Ess']+expr_cols].copy()

                        expr_hist.append(expr_name)

                        ess_part = r__[r__['Ess']=='Ess'].copy().reset_index(drop=True)
                        noness_part = r__[r__['Ess']=='Noness'].copy().reset_index(drop=True)

                        # Positive class (1) = 'Ess', negative class (0) = 'Noness'.
                        y_ess = np.ones((len(ess_part),))
                        X_ess = ess_part[expr_cols].values
                        y_noness = np.zeros((len(noness_part),))
                        X_noness = noness_part[expr_cols].values

                        roc_hist_cur = []

                        for rep_id in range(rep_num):
                            # split 0.3:0.7 for ess and noness individually
                            # Seeds depend on rep_id (not the constant rep_num) so
                            # that every repeat draws a different split; otherwise
                            # all repeats are identical and the std is always 0.
                            X_train_ess, X_test_ess, y_train_ess, y_test_ess = train_test_split(
                                X_ess, y_ess, test_size=0.3,
                                random_state=0+20*rep_id,
                                )
                            X_train_noness, X_test_noness, y_train_noness, y_test_noness = train_test_split(
                                X_noness, y_noness, test_size=0.3,
                                random_state=0+1+20*rep_id,
                                )

                            X_train = np.vstack((X_train_ess, X_train_noness))
                            X_test = np.vstack((X_test_ess, X_test_noness))
                            y_train = np.concatenate((y_train_ess, y_train_noness))
                            y_test = np.concatenate((y_test_ess, y_test_noness))

                            # Fit the scaler on the training part only and apply
                            # it to both parts.
                            scaler = preprocessing.StandardScaler().fit(X_train)
                            X_train = scaler.transform(X_train)
                            X_test = scaler.transform(X_test)

                            lg = LogisticRegression(class_weight = 'balanced', max_iter=1000,
                                                    random_state=0+2+20*rep_id,
                                                    # solver='liblinear',
                                                    )
                            lg.fit(X_train, y_train)

                            # y_test_pred_prob = lg.predict_proba(X_test)[:,1] # proba of being predicted as ess
                            y_test_pred = lg.predict(X_test)

                            # Score this repeat with balanced accuracy.
                            bac_this_model = balanced_accuracy_score(y_test, y_test_pred)
                            roc_hist_cur.append(bac_this_model)

                        ROC_AUC_hist.append(np.mean(roc_hist_cur))
                        ROC_AUC_err_hist.append(np.std(roc_hist_cur))


# Summary table: one row per feature combination. The 'roc' / 'roc_err' columns
# hold the mean / std of the balanced accuracy over the repeats.
tmp_dt = pd.DataFrame({
    'expr':expr_hist,
    'roc':ROC_AUC_hist,
    'roc_err':ROC_AUC_err_hist,
})

tmp_dt
# the expectation is that the score (balanced accuracy, stored in 'roc') is < 0.7,
# suggesting models trained on 1D epi-features cannot distinguish ess and noness hubs
