In [1]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
from scipy.spatial.distance import pdist, squareform

### 1. Read in the model fit results

In [2]:
# --- Run identification: random seed and number of training epochs ---------
random_seed = 12
num_epochs = 100
# True  -> use the predicted single perturbation
# False -> use the true (ground-truth) single perturbation
use_pred_single = False
# When True, the '_filtered' variants of the result files are read/written.
filter_zeros = False

# --- Model selection --------------------------------------------------------
# One of: 'Gears', 'morph', 'Control' (baseline) or 'Truth'.
model_type = 'morph'
if model_type == 'morph':
    # These settings are only defined for the morph model; they are used to
    # build the results directory path further down.
    representation_type, model_name = 'DepMap_GeneEffect', 'best_model'
    recon_loss, null_label = 'mmd', 'zeros'
    mxAlpha, tolerance_epochs = 2.0, 20
elif model_type == 'Gears':
    model_name = 'model.pt'
elif model_type in ('Control', 'Truth'):
    # Baseline and ground truth have no model file.
    model_name = None

# --- Size settings -----------------------------------------------------------
num_gene = 2500
# n_subsamples value of the Theil-Sen fit (original note: TheilSenRegressor);
# it is embedded in the names of the result files handled below.
n_subsamples = 1000

In [3]:
# Full dataset identifier; a '_hvg' marker in the name is stripped to get the
# base dataset name.
dataset_name = 'norman_k562_hvg'
dataset = ''.join(dataset_name.split('_hvg'))
# Kept as the *string* 'True' / 'False' (not a bool), as in the original.
use_hvg = str('hvg' in dataset_name)

In [None]:
# Root directory holding the prediction outputs for the selected dataset.
# NOTE(review): this is a machine-specific absolute path; adjust it when the
# notebook is run elsewhere.
parent_dir = f'/home/che/perturb-project/git/gene_ptb_prediction/gene_interaction_prediction/data/{dataset_name}/predict'
# Shared prefix of every "ground-truth single perturbation" results directory.
gt_single_dir = f'{parent_dir}/use_gt_single/num_gene_{num_gene}'

# Resolve the directory containing the model-fit results for `model_type`.
if model_type == 'Gears':
    if use_pred_single:
        raise ValueError('Gears model does not have predicted single perturbation')
    data_path = f'{gt_single_dir}/Gears/seed_{random_seed}'
elif model_type == 'morph':
    if use_pred_single:
        data_path = f'{parent_dir}/use_pred_single/{representation_type}_{model_type}/epochs_{num_epochs}_seed_{random_seed}'
    else:
        data_path = f'{gt_single_dir}/{representation_type}_{model_type}/recon_loss_{recon_loss}/null_label_{null_label}/epochs_{num_epochs}/tolerance_epochs_{tolerance_epochs}/mxAlpha_{mxAlpha}/seed_{random_seed}/{model_name}'
elif model_type == 'Control':
    data_path = f'{gt_single_dir}/Control/seed_{random_seed}'
elif model_type == 'Truth':
    data_path = f'{gt_single_dir}/Truth'
else:
    # Fail here with a clear message instead of a NameError on `data_path`
    # in a later cell when `model_type` is mistyped.
    raise ValueError(
        f"Unknown model_type {model_type!r}; expected 'Gears', 'morph', 'Control' or 'Truth'"
    )

# Pick the Theil-Sen results file that matches the configuration.
if n_subsamples is not None:
    filtered_suffix = '_filtered' if filter_zeros else ''
    model_fit_results_path = f'{data_path}/theilsen_results_n_subsamples_{n_subsamples}{filtered_suffix}.pkl'
else:
    # Previously this case left `model_fit_results_path` undefined and the
    # print below raised NameError.
    # NOTE(review): the un-suffixed file name mirrors the `gi_scores.pkl`
    # naming used when saving with n_subsamples=None -- confirm it matches
    # what the upstream fitting step actually writes.
    model_fit_results_path = f'{data_path}/theilsen_results.pkl'

# SECURITY: pickle.load can execute arbitrary code; only load files produced
# by this project's own pipeline.
with open(model_fit_results_path, 'rb') as f:
    model_fit_results = pickle.load(f)

print('Loaded model fit results from', model_fit_results_path)

In [None]:
# Show the fit results for the MAPK1+PRTG combination.
model_fit_results.loc[model_fit_results['combination'].eq('MAPK1+PRTG')]

In [None]:
# Show the fit results for the FOXL2+HOXB9 combination.
model_fit_results.loc[model_fit_results['combination'].eq('FOXL2+HOXB9')]

### 2. Calculate GI scores

In [7]:
# adapted from: https://gist.github.com/satra/aa3d19a12b74e9ab7941
def distcorr(X, Y):
    """Compute the distance correlation between two samples.

    Parameters
    ----------
    X, Y : array-like
        Samples with the same number of observations along the first axis.
        One-dimensional input is treated as a single feature column.

    Returns
    -------
    float
        The distance correlation. Returns 0.0 when either sample has zero
        distance variance (e.g. all observations identical), where the plain
        ratio would be 0/0.

    Raises
    ------
    ValueError
        If X and Y do not have the same number of observations.

    >>> a = [1,2,3,4,5]
    >>> b = np.array([1,2,9,4,4])
    >>> distcorr(a, b)
    0.762676242417
    """
    X = np.atleast_1d(X)
    Y = np.atleast_1d(Y)
    # Promote flat vectors to a single-column matrix (n_samples, 1).
    if np.prod(X.shape) == len(X):
        X = X[:, None]
    if np.prod(Y.shape) == len(Y):
        Y = Y[:, None]
    X = np.atleast_2d(X)
    Y = np.atleast_2d(Y)
    n = X.shape[0]
    if Y.shape[0] != n:
        raise ValueError('Number of samples must match')

    # Pairwise Euclidean distance matrices.
    a = squareform(pdist(X))
    b = squareform(pdist(Y))
    # Double centering: subtract row and column means, add back the grand mean.
    A = a - a.mean(axis=0)[None, :] - a.mean(axis=1)[:, None] + a.mean()
    B = b - b.mean(axis=0)[None, :] - b.mean(axis=1)[:, None] + b.mean()

    n_sq = float(n * n)
    # The squared distance covariance is non-negative in exact arithmetic;
    # clip tiny negative rounding errors so the square root cannot yield NaN.
    # (max(nan, 0.0) is still nan, so NaN inputs keep propagating.)
    dcov2_xy = max((A * B).sum() / n_sq, 0.0)
    dcov2_xx = (A * A).sum() / n_sq
    dcov2_yy = (B * B).sum() / n_sq

    denom = np.sqrt(np.sqrt(dcov2_xx) * np.sqrt(dcov2_yy))
    if denom == 0:
        # Degenerate sample: distance correlation is defined as 0 here.
        # A NumPy float is returned to keep the return type consistent.
        return np.float64(0.0)
    dcor = np.sqrt(dcov2_xy) / denom
    return dcor

In [None]:
# Work on a (deep) copy so the loaded results stay unchanged while score
# columns are added below.
df = model_fit_results.copy(deep=True)
df.head(5)

In [None]:
# Add the 'model_fit' and 'magnitude' score columns.
def _row_model_fit(row):
    """Distance correlation between a row's g_ab and combined_effect."""
    return distcorr(row['g_ab'], row['combined_effect'])


df['model_fit'] = df.apply(_row_model_fit, axis=1)
print('Finished model_fit')

# Euclidean norm of the two coefficients (c_a, c_b).
squared_coef_sum = df['c_a'].pow(2) + df['c_b'].pow(2)
df['magnitude'] = np.sqrt(squared_coef_sum)
print('Finished magnitude')

In [10]:
# Quick checks: recompute the scores for the first row by hand and compare
# them with the stored columns.
row_temp = df.iloc[0]
model_fit = distcorr(row_temp['g_ab'], row_temp['combined_effect'])
# assert_allclose (equal_nan=True) treats NaN as matching NaN, so a row whose
# score is legitimately NaN no longer fails the check the way `==` did.
np.testing.assert_allclose(row_temp['model_fit'], model_fit, rtol=1e-12, equal_nan=True)
magnitude = np.sqrt(row_temp['c_a']**2 + row_temp['c_b']**2)
np.testing.assert_allclose(row_temp['magnitude'], magnitude, rtol=1e-12, equal_nan=True)

In [None]:
def _row_similarity(row):
    """Distance correlation between the stacked (g_a, g_b) pair and g_ab."""
    singles = np.stack([row['g_a'], row['g_b']], axis=1)
    return distcorr(singles, row['g_ab'])


df['similarity'] = df.apply(_row_similarity, axis=1)
print('Finished similarity')

# Calculate equality of contribution
def equality_of_contribution(row):
    """Ratio of the smaller to the larger single-vs-double distance correlation.

    Computes distcorr(g_a, g_ab) and distcorr(g_b, g_ab) for the given row
    and returns min / max of the two values.
    """
    dcor_pair = (
        distcorr(row['g_a'], row['g_ab']),
        distcorr(row['g_b'], row['g_ab']),
    )
    return min(dcor_pair) / max(dcor_pair)

# Score every row, then attach the result as a new column.
equality_scores = df.apply(equality_of_contribution, axis=1)
df['equality_of_contribution'] = equality_scores
print('Finished equality_of_contribution')

In [12]:
# Quick checks: recompute the scores for one row by hand and compare them
# with the stored columns.
row_temp = df.iloc[10]
dcor_a = distcorr(row_temp['g_a'], row_temp['g_ab'])
dcor_b = distcorr(row_temp['g_b'], row_temp['g_ab'])
min_dcor = min(dcor_a, dcor_b)
max_dcor = max(dcor_a, dcor_b)
# Deliberately NOT named `equality_of_contribution`: rebinding that name to a
# float would shadow the function and break re-running the cell that applies
# it to `df`.
expected_equality = min_dcor / max_dcor
# assert_allclose (equal_nan=True) treats NaN as matching NaN, unlike `==`.
np.testing.assert_allclose(
    row_temp['equality_of_contribution'], expected_equality, rtol=1e-12, equal_nan=True
)

similarity = distcorr(np.stack([row_temp['g_a'], row_temp['g_b']], axis=1), row_temp['g_ab'])
np.testing.assert_allclose(row_temp['similarity'], similarity, rtol=1e-12, equal_nan=True)

In [13]:
# Dominance score: |log10(|c_a| / |c_b|)|
coef_ratio = df['c_a'].abs() / df['c_b'].abs()
df['dominance'] = np.log10(coef_ratio).abs()

In [None]:
# Display the table with the score columns added in the cells above.
df

In [None]:
# Save the GI scores as a pickle file under `data_path`. The file name
# depends on n_subsamples and, only when n_subsamples is set, on filter_zeros.
if n_subsamples is None:
    gi_scores_path = f'{data_path}/gi_scores.pkl'
elif filter_zeros:
    gi_scores_path = f'{data_path}/gi_scores_n_subsamples_{n_subsamples}_filtered.pkl'
else:
    gi_scores_path = f'{data_path}/gi_scores_n_subsamples_{n_subsamples}.pkl'

with open(gi_scores_path, 'wb') as f:
    pickle.dump(df, f)
print(f'Saved gi scores into {gi_scores_path}')