In [None]:
# Load the autoreload extension and enable mode 2, so that imported modules
# are re-imported before each cell executes and local edits are picked up.
%load_ext autoreload
%autoreload 2
# Request 'retina' output from the inline matplotlib backend.
%config InlineBackend.figure_format='retina'

In [None]:
import os
import json

from tqdm import tqdm
import numpy as np
import scipy.spatial
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import anndata
import scanpy as sc

In [None]:
import scipy.stats
import statsmodels.stats.multitest

def adjust_p_value_matrix_by_BH(p_val_mtx):
    '''Adjust a matrix of p-values with the Benjamini/Hochberg FDR procedure.

    Every non-NaN entry of the input is treated as one test of a single
    family, so the correction is applied jointly over the whole matrix.
    Symmetry of the matrix is neither checked nor exploited: mirrored
    entries are counted as separate tests.

    Parameters
    ----------
    p_val_mtx : array-like
        P-values of any shape (NumPy array, DataFrame, nested lists, ...).
        NaN marks entries for which no test was performed.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as the input holding the adjusted
        p-values; positions that were NaN in the input stay NaN.
    '''
    p_vals = np.asarray(p_val_mtx, dtype=float)
    adjusted = np.full(p_vals.shape, np.nan)

    # Correct only the entries that carry a p-value. NaNs must not reach
    # multipletests: they would be counted as tests and can propagate into
    # the adjusted values of every other entry.
    tested = ~np.isnan(p_vals)
    if tested.any():
        adjusted[tested] = statsmodels.stats.multitest.multipletests(
            p_vals[tested], method='fdr_bh')[1]

    return adjusted

In [None]:
# Read the per-cell observation table (the first CSV column becomes the
# index) and report how many cells it holds before any filtering.
obs_df = pd.read_csv('adata_obs_l2.csv', index_col=0)
print(len(obs_df))

# Retain only the cells whose UMI count exceeds 20.
obs_df = obs_df.loc[obs_df['num_umis'].gt(20)]
obs_df

In [None]:
# Restrict the table to perturbations that are observed in more than 50 cells.
pg_counts = obs_df['perturbed_gene'].value_counts()
obs_df = obs_df.loc[
    obs_df['perturbed_gene'].isin(pg_counts.loc[pg_counts.gt(50)].index)
]
obs_df

In [None]:
# Chi-square enrichment test of every perturbed gene in every cluster.
# For each (cluster, perturbed gene) pair a 2x2 contingency table is formed
# (in cluster or not) x (carries the perturbation or not), and three values
# are recorded: the number of cells in both groups, the log2 ratio of that
# count to its expected frequency, and the chi-square p-value.
output_path = 'enrichment_results'
os.makedirs(output_path, exist_ok=True)

clusters = np.unique(obs_df['cluster'])
perturbed_genes = np.unique(obs_df['perturbed_gene'])

# Tabulate the data once. All four cells of any 2x2 table follow from the
# joint count and the two marginal totals, so the full cell table does not
# have to be re-scanned for every pair.
n_cells = len(obs_df)
cluster_sizes = (obs_df['cluster'].value_counts()
                 .reindex(clusters, fill_value=0).to_numpy())
pg_sizes = (obs_df['perturbed_gene'].value_counts()
            .reindex(perturbed_genes, fill_value=0).to_numpy())
joint_counts = (pd.crosstab(obs_df['cluster'], obs_df['perturbed_gene'])
                .reindex(index=clusters, columns=perturbed_genes, fill_value=0)
                .to_numpy())

# Results start as NaN so that untestable pairs stay marked as missing.
# Float storage is required for all three tables because an integer frame
# cannot hold the NaN placeholder.
result_shape = (len(clusters), len(perturbed_genes))
positive_counts = np.full(result_shape, np.nan)
log2fcs = np.full(result_shape, np.nan)
pvals = np.full(result_shape, np.nan)
n_skipped = 0

for i in tqdm(range(len(clusters)), desc='clusters'):
    n_in_cluster = int(cluster_sizes[i])

    for j in range(len(perturbed_genes)):
        n_with_pg = int(pg_sizes[j])

        # A group that contains no cells or all cells does not give a 2x2
        # table, so the pair cannot be tested and is left as NaN.
        if not (0 < n_in_cluster < n_cells and 0 < n_with_pg < n_cells):
            n_skipped += 1
            continue

        n_both = int(joint_counts[i, j])
        # Rows: outside / inside the cluster.
        # Columns: without / with the perturbation.
        contingency_table = np.array([
            [n_cells - n_in_cluster - n_with_pg + n_both, n_with_pg - n_both],
            [n_in_cluster - n_both, n_both],
        ])

        results = scipy.stats.chi2_contingency(contingency_table)
        pvals[i, j] = results.pvalue
        positive_counts[i, j] = n_both
        # The small constant keeps log2 finite when the observed count is 0.
        log2fcs[i, j] = np.log2(n_both / results.expected_freq[1, 1] + 1e-6)

if n_skipped:
    print(f'{n_skipped} cluster/perturbation pairs were skipped (no 2x2 table)')

positive_count_df = pd.DataFrame(positive_counts, index=clusters, columns=perturbed_genes)
log2fc_df = pd.DataFrame(log2fcs, index=clusters, columns=perturbed_genes)
pval_df = pd.DataFrame(pvals, index=clusters, columns=perturbed_genes)

positive_count_df.to_parquet(os.path.join(output_path, 'l2_c_enrich_gene_positive_count.parquet'))
log2fc_df.to_parquet(os.path.join(output_path, 'l2_c_enrich_gene_log2fc.parquet'))
pval_df.to_parquet(os.path.join(output_path, 'l2_c_enrich_gene_pval.parquet'))

In [None]:
# Flatten the three cluster x perturbation tables into one long table with a
# row per (cluster, perturbed gene) pair, ordered cluster by cluster.
cep_clusters = positive_count_df.index
cep_pgs = positive_count_df.columns

cep_dict = {
    # Row-major flattening: each cluster label is repeated for all genes,
    # and the gene labels cycle once per cluster.
    'cluster': np.repeat(cep_clusters.to_numpy(), len(cep_pgs)),
    'perturbed_gene': np.tile(cep_pgs.to_numpy(), len(cep_clusters)),
    'positive_count': positive_count_df.to_numpy().reshape(-1),
    # Select by label so the values line up with the labels above.
    'log2fc': log2fc_df.loc[cep_clusters, cep_pgs].to_numpy().reshape(-1),
    'pval': pval_df.loc[cep_clusters, cep_pgs].to_numpy().reshape(-1),
}

cep_df = pd.DataFrame(cep_dict)

# Benjamini/Hochberg correction over the pairs that were actually tested.
# Pairs without a p-value (NaN) are excluded from the family of tests and
# keep a NaN adjusted value; passing NaNs to multipletests would count them
# as tests and can turn the other adjusted values into NaN.
cep_pvals = cep_df['pval'].to_numpy(dtype=float)
cep_tested = ~np.isnan(cep_pvals)
cep_pval_adj = np.full(cep_pvals.shape, np.nan)
if cep_tested.any():
    cep_pval_adj[cep_tested] = statsmodels.stats.multitest.multipletests(
        cep_pvals[cep_tested], method='fdr_bh')[1]
cep_df['pval_adj'] = cep_pval_adj

cep_df.to_parquet(os.path.join(output_path, 'l2_c_enrich_gene.parquet'))

In [None]:
# Count, per cluster, the (cluster, perturbed gene) pairs that pass all three
# enrichment filters: log2 fold change above 1, BH-adjusted p-value below
# 0.001, and more than 5 cells observed in the pair.
is_enriched = (
    (cep_df['log2fc'] > 1)
    & (cep_df['pval_adj'] < 0.001)
    & (cep_df['positive_count'] > 5)
)
enriched_gene_counts = cep_df.loc[is_enriched, 'cluster'].value_counts()

# Explicit figure/axes with labels so the chart can be read on its own.
fig, ax = plt.subplots()
ax.bar(enriched_gene_counts.index, enriched_gene_counts.values)
ax.set_xlabel('cluster')
ax.set_ylabel('number of enriched perturbed genes')
ax.set_title('Enriched perturbations per cluster')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Display the per-cluster counts of (cluster, perturbed gene) pairs that
# passed the enrichment filters.
enriched_gene_counts

In [None]:
# The 20 pairs with the smallest adjusted p-value among those whose log2
# fold change is above 1.
cep_df.loc[cep_df['log2fc'].gt(1)].sort_values('pval_adj').head(20)

In [None]:
# Enriched perturbations of cluster '16_3', most significant first; ties in
# the adjusted p-value are ordered by log2 fold change.
(
    cep_df
    .loc[
        cep_df['log2fc'].gt(1)
        & cep_df['pval_adj'].lt(0.001)
        & cep_df['positive_count'].gt(5)
        & cep_df['cluster'].isin(['16_3'])
    ]
    .sort_values(['pval_adj', 'log2fc'])
)

In [None]:
# Sorted, de-duplicated names of the perturbed genes enriched in cluster '16_2'.
np.unique(
    cep_df.loc[
        cep_df['log2fc'].gt(1)
        & cep_df['pval_adj'].lt(0.001)
        & cep_df['positive_count'].gt(5)
        & cep_df['cluster'].isin(['16_2']),
        'perturbed_gene',
    ]
)

In [None]:
# Rows of the 'non-targeting' perturbation sorted by ascending log2 fold
# change; show the last 60 rows of that ordering.
cep_df.loc[cep_df['perturbed_gene'].eq('non-targeting')].sort_values('log2fc').tail(60)