# GI_pert_seen2

In [1]:
import os
import sys
import pickle
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, TheilSenRegressor
from dcor import distance_correlation, partial_distance_correlation

In [2]:
# Reference table of genetic-interaction (GI) categories.
# Keys are upper-case category names; values are lists of gene pairs written as
# 'GENE_A+GENE_B'. get_GI_type() matches pair strings exactly against these lists
# (so the order of the two genes matters) and returns the key lower-cased.
# NOTE(review): the recorded run below skipped 'RHOXF2+SET' while this table lists
# 'RHOXF2BB+SET' — confirm whether these name the same gene under different symbols.
GIs = {
    'NEOMORPHIC': ['CBL+TGFBR2',
                  'KLF1+TGFBR2',
                  'MAP2K6+SPI1',
                  'SAMD1+TGFBR2',
                  'TGFBR2+C19orf26',
                  'TGFBR2+ETS2',
                  'CBL+UBASH3A',
                  'CEBPE+KLF1',
                  'DUSP9+MAPK1',
                  'FOSB+PTPN12',
                  'PLK4+STIL',
                  'PTPN12+OSR2',
                  'ZC3HAV1+CEBPE'],
    'ADDITIVE': ['BPGM+SAMD1',
                'CEBPB+MAPK1',
                'CEBPB+OSR2',
                'DUSP9+PRTG',
                'FOSB+OSR2',
                'IRF1+SET',
                'MAP2K3+ELMSAN1',
                'MAP2K6+ELMSAN1',
                'POU3F2+FOXL2',
                'RHOXF2BB+SET',
                'SAMD1+PTPN12',
                'SAMD1+UBASH3B',
                'SAMD1+ZBTB1',
                'SGK1+TBX2',
                'TBX3+TBX2',
                'ZBTB10+SNAI1'],
    'EPISTASIS': ['AHR+KLF1',
                 'MAPK1+TGFBR2',
                 'TGFBR2+IGDCC3',
                 'TGFBR2+PRTG',
                 'UBASH3B+OSR2',
                 'DUSP9+ETS2',
                 'KLF1+CEBPA',
                 'MAP2K6+IKZF3',
                 'ZC3HAV1+CEBPA'],
    'REDUNDANT': ['CDKN1C+CDKN1A',
                 'MAP2K3+MAP2K6',
                 'CEBPB+CEBPA',
                 'CEBPE+CEBPA',
                 'CEBPE+SPI1',
                 'ETS2+MAPK1',
                 'FOSB+CEBPE',
                 'FOXA3+FOXA1'],
    'POTENTIATION': ['CNN1+UBASH3A',
                    'ETS2+MAP7D1',
                    'FEV+CBFA2T3',
                    'FEV+ISL2',
                    'FEV+MAP7D1',
                    'PTPN12+UBASH3A'],
    'SYNERGY_SIMILAR_PHENO':['CBL+CNN1',
                            'CBL+PTPN12',
                            'CBL+PTPN9',
                            'CBL+UBASH3B',
                            'FOXA3+FOXL2',
                            'FOXA3+HOXB9',
                            'FOXL2+HOXB9',
                            'UBASH3B+CNN1',
                            'UBASH3B+PTPN12',
                            'UBASH3B+PTPN9',
                            'UBASH3B+ZBTB25'],
    'SYNERGY_DISSIMILAR_PHENO': ['AHR+FEV',
                                'DUSP9+SNAI1',
                                'FOXA1+FOXF1',
                                'FOXA1+FOXL2',
                                'FOXA1+HOXB9',
                                'FOXF1+FOXL2',
                                'FOXF1+HOXB9',
                                'FOXL2+MEIS1',
                                'IGDCC3+ZBTB25',
                                'POU3F2+CBFA2T3',
                                'PTPN12+ZBTB25',
                                'SNAI1+DLX2',
                                'SNAI1+UBASH3B'],
    'SUPPRESSOR': ['CEBPB+PTPN12',
                  'CEBPE+CNN1',
                  'CEBPE+PTPN12',
                  'CNN1+MAPK1',
                  'ETS2+CNN1',
                  'ETS2+IGDCC3',
                  'ETS2+PRTG',
                  'FOSB+UBASH3B',
                  'IGDCC3+MAPK1',
                  'LYL1+CEBPB',
                  'MAPK1+PRTG',
                  'PTPN12+SNAI1']
}


In [3]:
def get_coeffs(singles_expr, first_expr, second_expr, double_expr):
    """Regress the double-perturbation profile on the two single profiles and score the fit.

    A Theil-Sen regressor without intercept is fitted with ``singles_expr`` as the
    design matrix and the flattened ``double_expr`` as the target; its first two
    coefficients and several (distance-)correlation summaries are returned.

    Parameters
    ----------
    singles_expr : array with (at least) two feature columns, one per single
        perturbation, as built by ``get_GI_params``.
    first_expr, second_expr : profiles of the first / second single perturbation.
    double_expr : numpy array holding the double-perturbation profile.

    Returns
    -------
    dict
        Keys, in insertion order: ``ts`` (the fitted regressor), ``c1``, ``c2``
        (its first two coefficients), ``mag`` (sqrt(c1**2 + c2**2)), ``dcor``,
        ``dcor_singles``, ``dcor_first``, ``dcor_second`` (distance correlations),
        ``corr_fit`` (Pearson correlation of fitted vs. observed double profile),
        ``dominance`` (|log10(c1/c2)|) and ``eq_contr`` (min/max of
        ``dcor_first`` and ``dcor_second``).
    """
    regressor = TheilSenRegressor(fit_intercept=False,
                                  max_subpopulation=1e5,
                                  max_iter=1000,
                                  random_state=1000)
    regressor.fit(singles_expr, double_expr.ravel())
    fitted_double = regressor.predict(singles_expr)

    coef_first = regressor.coef_[0]
    coef_second = regressor.coef_[1]

    dcor_with_first = distance_correlation(first_expr, double_expr)
    dcor_with_second = distance_correlation(second_expr, double_expr)
    dcor_pair = [dcor_with_first, dcor_with_second]

    # Key order is kept stable because downstream code turns this dict into
    # DataFrame rows.
    return {
        'ts': regressor,
        'c1': coef_first,
        'c2': coef_second,
        'mag': np.sqrt((coef_first**2 + coef_second**2)),
        'dcor': distance_correlation(singles_expr, double_expr),
        'dcor_singles': distance_correlation(first_expr, second_expr),
        'dcor_first': dcor_with_first,
        'dcor_second': dcor_with_second,
        'corr_fit': np.corrcoef(fitted_double.flatten(), double_expr.flatten())[0, 1],
        # log10 of a zero or negative ratio gives inf/NaN and a NumPy
        # RuntimeWarning; the value is stored as-is.
        'dominance': np.abs(np.log10(coef_first / coef_second)),
        'eq_contr': np.min(dcor_pair) / np.max(dcor_pair),
    }
    
from scipy.sparse import csr_matrix
from sklearn.utils import resample

def calculate_p_ctrl_vec(adata, num_samples=300, num_reps=100):
    """Estimate the mean control expression profile by repeated resampling.

    Cells whose ``adata.obs['condition']`` equals ``'ctrl'`` are selected. For each
    of ``num_reps`` repetitions, ``num_samples`` rows are drawn with
    ``sklearn.utils.resample`` and averaged per column; the per-repetition means
    are then averaged again.

    Parameters
    ----------
    adata : AnnData-like object supporting boolean-mask indexing, with ``.obs``
        and ``.X``.
    num_samples : int
        Number of rows drawn per repetition (passed as ``n_samples``).
    num_reps : int
        Number of repetitions.

    Returns
    -------
    numpy.ndarray
        ``np.array(per_repetition_means).mean(0)``. For dense (or CSR, which is
        densified) input this is one value per column of ``X``.
        NOTE(review): any other ``X`` type is used as-is, so the result shape
        then depends on what ``np.mean`` returns for that type — confirm.

    Raises
    ------
    ValueError
        If no control cells are present.

    Notes
    -----
    Reseeds NumPy's *global* RNG with 42 on every call so the per-repetition
    seeds, and therefore the result, are reproducible.
    """
    np.random.seed(42)
    ctrl_adata = adata[adata.obs['condition'] == 'ctrl']

    # Read X once; use the documented toarray() rather than the `.A` shorthand.
    ctrl_data = ctrl_adata.X
    if isinstance(ctrl_data, csr_matrix):
        ctrl_data_dense = ctrl_data.toarray()
    else:
        ctrl_data_dense = ctrl_data

    if ctrl_data_dense.shape[0] == 0:
        raise ValueError("adata contains no cells with obs['condition'] == 'ctrl'")

    p_ctrl_list = []
    for _ in range(num_reps):
        # Each repetition gets its own seed drawn from the reseeded global RNG.
        sample = resample(ctrl_data_dense, n_samples=num_samples,
                          random_state=np.random.randint(0, 1000))
        p_ctrl_list.append(np.mean(sample, axis=0))

    p_ctrl = np.array(p_ctrl_list)
    p_ctrl_vec = p_ctrl.mean(0)
    return p_ctrl_vec

    
def get_GI_params(preds, combo):
    """Assemble the single/double profiles of ``combo`` and pass them to ``get_coeffs``.

    ``preds`` must contain the keys ``'<combo[0]>+ctrl'``, ``'<combo[1]>+ctrl'`` and
    ``'<combo[0]>+<combo[1]>'``; a missing key raises ``KeyError``. Returns the
    dictionary produced by ``get_coeffs``.
    """
    first_gene, second_gene = combo[0], combo[1]

    first_profile = preds[f'{first_gene}+ctrl']
    second_profile = preds[f'{second_gene}+ctrl']
    double_profile = preds[first_gene+'+'+second_gene]

    # Stack the two single profiles as rows, then transpose so each single
    # perturbation becomes one column of the design matrix.
    singles_matrix = np.array([first_profile, second_profile]).T

    return get_coeffs(
        singles_matrix,
        np.array(first_profile).T,
        np.array(second_profile).T,
        np.array(double_profile).T,
    )

def get_GI_type(gi_name):
    """Return the lower-cased GI category whose pair list contains ``gi_name``.

    The lookup is an exact string match against the lists in ``GIs``. If several
    categories list the pair, the first one in ``GIs`` order is returned. A name
    found in no category raises ``IndexError``.
    """
    matching_categories = []
    for category_name, pairs in GIs.items():
        if gi_name in pairs:
            matching_categories.append(category_name)
    return matching_categories[0].lower()

def get_p_delta_exp(subset_preds, adata, p_ctrl=None):
    """Subtract the mean control profile from every prediction in ``subset_preds``.

    Parameters
    ----------
    subset_preds : dict
        Maps a perturbation key to a sequence of per-gene values. The dict is
        updated in place and also returned.
    adata : object passed to ``calculate_p_ctrl_vec`` when ``p_ctrl`` is None.
    p_ctrl : array-like, optional
        Precomputed control profile. Supplying it avoids recomputing the
        baseline on every call. When omitted it is computed from ``adata``.

    Returns
    -------
    dict
        ``subset_preds`` with each value replaced by a list of
        ``prediction - control`` differences.

    Raises
    ------
    ValueError
        If a prediction's length differs from the control profile's length.
    """
    if p_ctrl is None:
        p_ctrl = calculate_p_ctrl_vec(adata)

    # The control profile may arrive as a flat vector or as a single-row 2-D
    # array; flatten it so both work. Indexing with [0] would turn a flat vector
    # into a scalar and break the element-wise subtraction below.
    p_ctrl_list = np.asarray(p_ctrl).ravel().tolist()

    # Validate everything first so a bad entry cannot leave the dict half-updated,
    # and so zip() cannot silently truncate mismatched profiles.
    for key, gene_expression in subset_preds.items():
        if len(gene_expression) != len(p_ctrl_list):
            raise ValueError(
                f"prediction for {key!r} has {len(gene_expression)} values but the "
                f"control profile has {len(p_ctrl_list)}"
            )

    for key in subset_preds:
        gene_expression = subset_preds[key]
        subset_preds[key] = [gene - ctrl for gene, ctrl in zip(gene_expression, p_ctrl_list)]

    return subset_preds

In [4]:
def calculate_average_predictions(res):
    """Average the predicted profiles per perturbation category.

    ``res['pert_cat'][i]`` labels the prediction ``res['pred'][i]``. Predictions
    sharing a label are summed element-wise and divided by their count.

    Returns a dict mapping each label (in order of first appearance) to the
    averaged profile converted with ``.tolist()``.
    """
    totals = {}
    counts = {}

    for idx, cell in enumerate(res['pert_cat']):
        prediction = np.array(res['pred'][idx])
        if cell in totals:
            totals[cell] += prediction
            counts[cell] += 1
        else:
            # First occurrence: the freshly built array becomes the running sum.
            totals[cell] = prediction
            counts[cell] = 1

    return {cell: (totals[cell] / counts[cell]).tolist() for cell in totals}

def split_gene_combinations(all_combos):
    """Split each ``'GENE_A+GENE_B'`` string on ``'+'``.

    Returns a list with one list of gene names per input string, in input order.
    """
    return [combo.split('+') for combo in all_combos]

### data

In [31]:
# Load the pickled subgroup definition of simulation split 1.
# NOTE(review): `split1_subgroup` is not referenced again in the visible part of
# this notebook — confirm it is still needed.
# pickle.load can execute arbitrary code; only open files from a trusted source.
data_path = '/home/share/huadjyin/home/zhoumin3/zhoumin/benchmark_data/01A_total_re/'
with open(f'{data_path}03final/normanweissman2019/splits/normanweissman2019_simulation_1_0.75_subgroup.pkl', 'rb') as f:
     split1_subgroup = pickle.load(f)

In [32]:
# Load the pickled test results of split 1. Later cells read the 'pert_cat' and
# 'pred' entries of `res`.
# pickle.load can execute arbitrary code; only open files from a trusted source.
res_path = '/home/share/huadjyin/home/zhoumin3/zhoumin/model_benchmark/01_A_results/'
with open(f'{res_path}NormanWeissman2019/gears/split1/NormanWeissman2019_split1_test_res.pkl', 'rb') as f:
     res = pickle.load(f)

In [33]:
# Read the processed dataset; calculate_p_ctrl_vec() uses its obs['condition']
# column and X matrix to derive the control baseline.
import scanpy as sc
adata = sc.read_h5ad('/home/share/huadjyin/home/zhoumin3/zhoumin/benchmark_data/01A_total_re/03final/normanweissman2019/perturb_processed.h5ad')

#### pred_res

In [34]:
# Average the predicted profiles per perturbation category. This uses the helper
# defined above instead of an inline copy of the same loop, so the single-combo
# walkthrough and the all-combos run share one implementation.
preds_avg = calculate_average_predictions(res)

In [35]:
# Pick one example pair and keep only its two single-gene predictions and the
# double prediction.
# Keys absent from preds_avg are dropped silently here; get_GI_params() will then
# raise KeyError when it looks them up.
combo = ['CBL', 'PTPN12']
keys_to_keep = {f'{combo[0]}+ctrl', f'{combo[1]}+ctrl', f'{combo[0]}+{combo[1]}'}
subset_preds = {key: preds_avg[key] for key in keys_to_keep if key in preds_avg}

In [36]:
# Convert the predictions to differences from the mean control profile.
# get_p_delta_exp() updates `subset_preds` in place, so re-running this cell
# without re-running the previous one subtracts the control profile again.
subset_preds = get_p_delta_exp(subset_preds, adata)

In [37]:
# Fit the double profile from the two singles and collect the GI metrics.
results = get_GI_params(subset_preds, combo)

In [39]:
# Display the metric dictionary for the example pair ('ts' is the fitted regressor).
results

{'ts': TheilSenRegressor(fit_intercept=False, max_iter=1000,
                   max_subpopulation=100000.0, random_state=1000),
 'c1': 0.36723808882218545,
 'c2': 1.6805584749260956,
 'mag': 1.720215277669541,
 'dcor': 0.882962990282585,
 'dcor_singles': 0.7984883353325617,
 'dcor_first': 0.8317510361023311,
 'dcor_second': 0.8470858525590247,
 'corr_fit': 0.8964962838109543,
 'dominance': 0.660505909744072,
 'eq_contr': 0.981896974892961}

In [15]:
# Look up the (lower-cased) GI category of the example pair.
category =  get_GI_type(f'{combo[0]}+{combo[1]}')

#### metrics

In [16]:
# Metric names produced by get_coeffs() (without the fitted regressor 'ts') and the
# lower-cased GI category names returned by get_GI_type().
metrics = ['c1', 'c2', 'mag', 'dcor', 'dcor_singles', 'dcor_first', 'dcor_second', 'corr_fit', 'dominance', 'eq_contr']
categories = ['synergy_similar_pheno', 'synergy_dissimilar_pheno', 'potentiation', 'additive', 'suppressor', 'neomorphic', 'redundant', 'epistasis']

# dict_[metric][category] -> list of values, one per scored pair.
# Built with a comprehension on purpose: a `for category in categories:` loop
# would leave the module-level name `category` set to the last entry
# ('epistasis') and overwrite the category computed for the current pair.
dict_ = {metric_name: {category_name: [] for category_name in categories} for metric_name in metrics}

In [18]:
# File every metric of the example pair under its GI category; the fitted
# regressor stored under 'ts' is not a metric and is skipped.
# `category` must still hold the GI type of `combo` when this cell runs.
for key, value in results.items():
    if key == 'ts':
        continue
    dict_[key][category].append(value)

# all_combos

In [26]:
# Re-reads the same processed dataset that was loaded earlier in this notebook.
import scanpy as sc
adata = sc.read_h5ad('/home/share/huadjyin/home/zhoumin3/zhoumin/benchmark_data/01A_total_re/03final/normanweissman2019/perturb_processed.h5ad')

In [27]:
# Load the per-split combo lists; the loop below indexes the result with
# 'split1' ... 'split5'.
# allow_pickle=True unpickles the stored object, which can execute arbitrary
# code; only load trusted files.
import numpy as np
all_combos = np.load('norman_combo_seen0_unique.npy', allow_pickle=True).item()

In [28]:
# Root directory of the per-split result pickles (same value as set earlier).
res_path = '/home/share/huadjyin/home/zhoumin3/zhoumin/model_benchmark/01_A_results/'

In [29]:
# Reset the accumulator for the all-combos run:
# dict_[metric][category] -> list of values, one per scored pair.
metrics = ['c1', 'c2', 'mag', 'dcor', 'dcor_singles', 'dcor_first', 'dcor_second', 'corr_fit', 'dominance', 'eq_contr']
categories = ['synergy_similar_pheno', 'synergy_dissimilar_pheno', 'potentiation', 'additive', 'suppressor', 'neomorphic', 'redundant', 'epistasis']
dict_ = {metric: {category: [] for category in categories} for metric in metrics}

In [30]:
# Destination of the per-pair metric table and the (initially empty) frame that
# the loop below fills with one column per scored pair.
csv_output_path = "./GI_out/norman_seen0_GI_metric.csv"
csv_df = pd.DataFrame()

In [31]:
# Score every known GI pair of every split.
# For each split: average the predictions per perturbation, then for each pair
# listed in `GIs` subtract the control baseline, compute the GI metrics, file
# them under the pair's category in `dict_`, and add one column to `csv_df`.

# Flatten the known pairs once; the membership test is an exact string match,
# so 'A+B' and 'B+A' are different pairs.
known_gi_pairs = {pair for pairs in GIs.values() for pair in pairs}

# The control baseline depends only on `adata` (calculate_p_ctrl_vec reseeds its
# RNG on every call), so compute it once instead of once per pair. ravel()
# accepts both a flat vector and a single-row 2-D result.
ctrl_mean = np.asarray(calculate_p_ctrl_vec(adata)).ravel()

combo_frames = []  # one single-column frame per scored pair, concatenated once below

for i in range(1, 6):
    # pickle.load can execute arbitrary code; only open trusted result files.
    with open(f'{res_path}NormanWeissman2019/gears/split{i}/NormanWeissman2019_split{i}_test_res.pkl', 'rb') as f:
        res = pickle.load(f)
    preds_avg = calculate_average_predictions(res)

    split_combos = all_combos[f'split{i}']
    split_combos = split_gene_combinations(split_combos)
    for combo in split_combos:
        combo_str = f'{combo[0]}+{combo[1]}'
        if combo_str not in known_gi_pairs:
            print(f"---{combo[0]}+{combo[1]}---pass")
            continue

        keys_to_keep = {f'{combo[0]}+ctrl', f'{combo[1]}+ctrl', f'{combo[0]}+{combo[1]}'}
        missing_keys = sorted(k for k in keys_to_keep if k not in preds_avg)
        if missing_keys:
            # Without all three profiles the regression cannot be set up; skip
            # the pair instead of failing later with a KeyError.
            print(f"---{combo_str}---missing predictions: {missing_keys}")
            continue

        # Differences from the control baseline, as plain lists.
        subset_preds = {k: (np.asarray(preds_avg[k]) - ctrl_mean).tolist() for k in keys_to_keep}
        results = get_GI_params(subset_preds, combo)
        category = get_GI_type(combo_str)

        for key, value in results.items():
            if key not in ['ts']:
                dict_[key][category].append(value)

        # One column named after the pair: metric rows plus a 'category' row.
        temp_df = pd.DataFrame(results, index=[combo_str]).T
        temp_df.columns = [combo_str]
        temp_df = temp_df.drop(index='ts', errors='ignore')
        temp_df.loc['category', combo_str] = category
        combo_frames.append(temp_df)

        print(f"---{combo[0]}+{combo[1]}---")

# Append the new columns to whatever csv_df already holds, in one concat.
if combo_frames:
    existing_frames = [] if csv_df.empty else [csv_df]
    csv_df = pd.concat(existing_frames + combo_frames, axis=1)

---CBL+PTPN12---
---CDKN1C+CDKN1A---
---C3orf72+FOXL2---pass
---CDKN1C+CDKN1B---pass
---CEBPB+PTPN12---
---ZBTB10+PTPN12---pass
---RHOXF2+SET---pass
---CDKN1B+CDKN1A---pass
---POU3F2+FOXL2---
---ETS2+CNN1---
---ETS2+IGDCC3---
---SGK1+S1PR2---pass
---CNN1+UBASH3A---
---FOXA1+FOXL2---
---JUN+CEBPA---pass
---UBASH3B+CNN1---
---POU3F2+CBFA2T3---
---KLF1+CLDN6---pass
---KLF1+TGFBR2---
---AHR+KLF1---
---FOXF1+HOXB9---
---SAMD1+TGFBR2---
---IGDCC3+MAPK1---
---UBASH3B+UBASH3A---pass
---PTPN12+PTPN9---pass
---CEBPB+CEBPA---


  results['dominance'] = np.abs(np.log10(results['c1']/results['c2']))


---MAP2K6+ELMSAN1---
---CEBPB+MAPK1---
---PTPN12+UBASH3A---
---UBASH3B+PTPN9---
---UBASH3B+PTPN12---


In [32]:
# Persist the per-metric / per-category value lists.
# Create the output directory first so a missing folder does not discard the
# results of the run above with a FileNotFoundError.
os.makedirs('./GI_out', exist_ok=True)
with open('./GI_out/norman_combo_seen0_GI_metric.pkl', 'wb') as f:
    pickle.dump(dict_, f)

In [33]:
# Write the per-pair metric table (one column per pair).
# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(csv_output_path) or '.', exist_ok=True)
csv_df.to_csv(csv_output_path)