In [1]:
import pandas as pd
import pickle
from tqdm import tqdm
import numpy as np
from scipy.stats import hypergeom
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# read datasets
kinase_info = pd.read_table('PCorr_DGI_datasets/kinase_info.txt')
# Keep only rows that have both a kinase group and an HGNC gene name.
kinase_info = kinase_info[kinase_info['Group'].notna() & kinase_info['HGNC Name'].notna()]
# Sorted so that the group order (and therefore any per-group plot built from
# it) is identical on every run; plain set iteration order is hash-dependent.
groups = sorted(set(kinase_info['Group']))

# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
# The context manager closes the handle that a bare open() would leak.
with open('PCorr_DGI_datasets/dgs_posi.pkl', 'rb') as fh:
    dgs_posi = pickle.load(fh)

In [3]:
def calculate_hypergeometric_pvalue(N, K, n, k):
    """Return the upper-tail hypergeometric probability P(X >= k).

    The arguments are passed to ``scipy.stats.hypergeom`` in the order
    (N, K, n), i.e. in SciPy's parameterisation:

    N -- total number of objects in the population
    K -- number of objects in the population that count as a success
    n -- number of objects drawn
    k -- observed number of successes among the drawn objects
    """
    # The survival function is P(X > x), so evaluating it at k - 1 gives P(X >= k).
    return hypergeom.sf(k - 1, N, K, n)

def pvalue_to_stars(p_value):
    """Translate a p-value into a significance-star label.

    Returns '****' for p < 0.0001, '***' for p < 0.001, '**' for p < 0.01,
    '*' for p < 0.05 and an empty string for anything else.
    """
    # Ordered from strictest to loosest so the first matching bound wins.
    star_levels = (
        (0.0001, '****'),
        (0.001, '***'),
        (0.01, '**'),
        (0.05, '*'),
    )
    for upper_bound, label in star_levels:
        if p_value < upper_bound:
            return label
    return ''

##### 1. Identifying kinases' inhibitors

In [None]:
# For each gene-effect screen (CRISPR, RNAi): per kinase group, compute the share
# of high-scoring drug-gene pairs that are known positives at several score
# cutoffs, test the enrichment with a hypergeometric test, and draw a heatmap.

# Index the known positive pairs by gene once. This does not depend on the
# screen, and replaces a scan of every pair for every gene of every group.
# assumes the gene symbol is the third '_'-separated token of a joined key — TODO confirm
pairs_posi = ['_'.join(list(i)) for i in dgs_posi]
posi_pairs_by_gene = {}
for pair in pairs_posi:
    posi_pairs_by_gene.setdefault(pair.split('_')[2], set()).add(pair)

scores_cut = [0.5, 0.6, 0.7, 0.8, 0.9]

for ge_key in ['CRISPR', 'RNAi']:
    # NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
    with open('PCorr_DGI_datasets/r_{}.pkl'.format(ge_key), 'rb') as fh:
        r = pickle.load(fh)
    # Final score = mean of the two probability columns.
    r['avg'] = np.mean(r[['y_probs', 'y_probs_corum']], axis=1)
    r.index = ['_'.join(list(i)) for i in r.index]
    r['gene'] = [i.split('_')[2] for i in r.index]
    genes_all = set(r['gene'])

    # Filter into a per-screen frame instead of overwriting the global
    # `kinase_info`. Overwriting made the RNAi pass see only kinases that had
    # survived the CRISPR filter, and made the cell non-idempotent on re-run.
    kinase_info_ge = kinase_info[kinase_info['HGNC Name'].isin(genes_all)]
    group_gene = {}
    gene_group = {}
    for group in groups:
        genes = set(kinase_info_ge[kinase_info_ge['Group'] == group]['HGNC Name'])
        group_gene[group] = genes
        for gene in genes:
            gene_group[gene] = group

    # Known positive pairs whose gene belongs to each group.
    group_posipair = {}
    for group in groups:
        group_posipair[group] = set()
        for gene in group_gene[group]:
            group_posipair[group] |= posi_pairs_by_gene.get(gene, set())

    # All scored pairs whose gene belongs to each group.
    group_pair = {}
    for group in groups:
        group_pair[group] = set(r[r['gene'].isin(group_gene[group])].index)

    # Pairs above each cutoff are the same for every group, so build them once.
    high_pairs_by_cut = {cut: set(r[r['avg'] > cut].index) for cut in scores_cut}

    percent_plot = pd.DataFrame()
    p_plot = pd.DataFrame()
    for group in tqdm(groups):
        print(group)
        N = len(group_pair[group])                          # scored pairs in the group
        K = len(group_pair[group] & group_posipair[group])  # of which known positives
        # Groups with fewer than 10 known positive pairs are not evaluated.
        if len(group_posipair[group]) < 10:
            continue
        for cut in scores_cut:
            high_pairs = high_pairs_by_cut[cut]
            n = len(group_pair[group] & high_pairs)       # group pairs above the cutoff
            k = len(group_posipair[group] & high_pairs)   # of which known positives
            p = calculate_hypergeometric_pvalue(N, K, n, k)
            print(N, K, n, k, p)
            if n != 0 and k != 0:
                percent_plot.loc[group, cut] = k / n * 100
                p_plot.loc[group, cut] = p

    # Keep only groups that have a value at every recorded cutoff.
    percent_plot = percent_plot.dropna()
    p_plot = p_plot.dropna()

    # sns.heatmap raises on an empty frame; report and move on to the next screen.
    if percent_plot.empty:
        print('{}: no kinase group left to plot'.format(ge_key))
        continue

    percent_plot.columns = ['> {}'.format(str(i)) for i in percent_plot.columns]
    p_plot.columns = ['> {}'.format(str(i)) for i in p_plot.columns]

    fig, ax = plt.subplots(figsize=(6, 6))
    # First pass draws the coloured cells and the colorbar.
    sns.heatmap(data=percent_plot, cmap='Reds', annot=False,
                linewidths=1, linecolor='white',
                cbar_kws={'shrink': 0.5}, ax=ax)
    # Second pass is fully transparent and only contributes the cell annotations.
    sns.heatmap(data=percent_plot, cmap='Reds', annot=True, alpha=0, cbar=False,
                linewidths=1, linecolor='white',
                annot_kws={"fontsize": 18, "color": "black"}, ax=ax)
    # The colorbar belongs to the first (visible) mesh.
    cbar = ax.collections[0].colorbar
    cbar.ax.tick_params(labelsize=18)
    cbar.set_label('Percent of known inhibitors (%)', fontsize=26, labelpad=20)
    plt.xticks(rotation=30, fontsize=22)
    plt.yticks(rotation=0, fontsize=22)
    plt.title(ge_key, fontsize=30, pad=15)
    plt.ylabel('Kinase group', fontsize=23, labelpad=20)
    plt.xlabel('PCorr-DGI score', fontsize=23, labelpad=20)
    plt.show()

##### 2. Identifying Staurosporine's targets

In [None]:
# Reference tables: a kinase list and the proteins listed in the
# GO:0005524 (ATP binding) file.
data_kinase = pd.read_csv('PCorr_DGI_datasets/kinase_list.csv')
data_ATP = pd.read_table('PCorr_DGI_datasets/GO_0005524.ATP.binding.all.proteins.txt')
# NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
# The context manager closes the handle that a bare open() would leak.
with open('PCorr_DGI_datasets/drug_zid.pkl', 'rb') as fh:
    drug_zid = pickle.load(fh)

data_kinase = data_kinase[data_kinase['Gene names'].notna()]
# 'Gene names' may hold several space-separated names; only the first is kept.
# Iterating over the column values (rather than .loc per index label) also
# works when the frame's index contains duplicate labels.
kinases = {names.split(' ')[0] for names in data_kinase['Gene names']}
atps = set(data_ATP['SYMBOL'])
# Union of the kinase names and the ATP-binding symbols.
ka = kinases | atps

zid = drug_zid['staurosporine']

scores_cut = [0.5, 0.6, 0.7, 0.8, 0.9]

def _staurosporine_enrichment(ge_key, zid, known_targets, cutoffs,
                              data_dir='PCorr_DGI_datasets/'):
    """Compute known-target enrichment among one drug's high-scoring pairs.

    Loads ``r_<ge_key>.pkl`` from ``data_dir``, keeps the rows whose joined
    index key contains ``zid``, and for every score cutoff returns the
    percentage of predicted genes that are in ``known_targets`` together with
    the hypergeometric p-value of that overlap.

    Parameters
    ----------
    ge_key : str
        Screen name used in the pickle file name (e.g. 'CRISPR', 'RNAi').
    zid : str
        Drug identifier searched for in the joined index keys.
    known_targets : set
        Gene names counted as known targets.
    cutoffs : iterable of float
        Score thresholds; a pair counts as predicted when its score is above one.
    data_dir : str
        Directory prefix of the pickle file.

    Returns
    -------
    (list of float, list of float)
        Percentages and p-values, one entry per cutoff, in cutoff order.
    """
    # NOTE(review): pickle can execute arbitrary code on load; only read trusted files.
    with open(data_dir + 'r_{}.pkl'.format(ge_key), 'rb') as fh:
        r = pickle.load(fh)
    # Final score = mean of the two probability columns.
    r['avg'] = np.mean(r[['y_probs', 'y_probs_corum']], axis=1)
    r.index = ['_'.join(list(i)) for i in r.index]
    # regex=False: the identifier is matched literally, not as a pattern.
    # NOTE(review): this is still a substring match, so an ID that is a prefix
    # of another ID would also match — confirm the IDs cannot overlap.
    r_stp = r[r.index.str.contains(zid, regex=False)]

    # assumes the gene symbol is the third '_'-separated token of a key — TODO confirm
    all_genes = [i.split('_')[2] for i in r_stp.index]
    # NOTE(review): N counts rows while K counts distinct genes; the two only
    # agree if each gene occurs once per drug — verify against the data.
    N = len(all_genes)
    K = len(set(all_genes) & known_targets)

    percentages = []
    p_values = []
    for cut in cutoffs:
        r_high_stp = r_stp[r_stp['avg'] > cut]
        high_genes = [i.split('_')[2] for i in r_high_stp.index]
        n = len(high_genes)
        k = len(set(high_genes) & known_targets)
        # With nothing above the cutoff the percentage is undefined; report 0
        # instead of raising ZeroDivisionError.
        percentages.append(k / n * 100 if n else 0.0)
        p_values.append(calculate_hypergeometric_pvalue(N, K, n, k))
    return percentages, p_values


# CRISPR
y_list_c, p_list_c = _staurosporine_enrichment('CRISPR', zid, ka, scores_cut)

# RNAi (read from the same dataset directory as CRISPR; the previous code
# referenced an undefined `file_dir` here)
y_list_r, p_list_r = _staurosporine_enrichment('RNAi', zid, ka, scores_cut)

# Grouped bar chart: percentage of known targets per score cutoff, CRISPR next
# to RNAi, with significance stars above each bar.

# Tick labels such as '> 0.5', one per score cutoff.
scores_info = ['> {}'.format(cutoff) for cutoff in scores_cut]

# One slot of width 2 per cutoff; the two bars sit left and right of its centre.
x = np.arange(0, 2 * len(scores_cut), 2)
width = 0.7
x1 = x - width / 2
x2 = x + width / 2

fig, ax = plt.subplots(figsize=(6, 6))
ax.bar(x1, y_list_c, facecolor='#eb6841', lw=1, width=width, label='CRISPR', edgecolor='black')
ax.bar(x2, y_list_r, facecolor='#00a0b0', lw=1, width=width, label='RNAi', edgecolor='black')

# Significance stars, written vertically just above every bar
# (CRISPR bars first, then RNAi bars).
for positions, heights, pvals in ((x1, y_list_c, p_list_c), (x2, y_list_r, p_list_r)):
    for xpos, height, pval in zip(positions, heights, pvals):
        ax.text(xpos - 0.15, height + 1, pvalue_to_stars(pval),
                rotation=90, color='red', fontsize=20, va='bottom', ha='left')

plt.xticks(x, scores_info, fontsize=22, rotation=30)
plt.yticks(fontsize=22)
ax.set_ylabel('Percentage of known targets (%)', fontsize=26, labelpad=20)
ax.set_xlabel('PCorr-DGI score', fontsize=26, labelpad=20)
ax.set_xlim(-1, len(scores_cut) * 2 - 1)
# Legend is anchored outside the axes, to the right of the plot.
ax.legend(loc=2, bbox_to_anchor=(1.05, 1.0), fontsize=17, markerfirst=False,
          markerscale=1, scatterpoints=1, ncol=1, title_fontsize=17)
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
plt.show()