# Correlate TF (Tfap2b) Expression with Target Genes

The Links object gives us **GRN edges** (which genes Tfap2b regulates per cluster), but to see how expression actually correlates, we need the gene expression matrix from AnnData/Oracle.

This notebook:
1. Extracts target genes from filtered Links
2. Computes Spearman/Pearson correlations between Tfap2b and each target
3. Compares GRN coefficients vs. expression correlations
4. Visualizes results with heatmaps, scatter plots, and bar charts

In [1]:
import celloracle as co
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, pearsonr
from scipy.sparse import issparse
import os
import warnings
warnings.filterwarnings("ignore")

  from pkg_resources import get_distribution, DistributionNotFound


## 1. Configuration

**Update the paths below to match your files.**

In [2]:
LINKS_PATH = "celloracle_results/per_celltype/Epi_Kit+Elf5+_filtered.celloracle.links"

# oracle object stores the expression values
ORACLE_PATH = "celloracle_results/per_celltype/Epi_Kit+Elf5+.celloracle.oracle"

TF = "Tfap2b"
CLUSTER_COL = "sample"

OUTPUT_DIR = "tf_correlation_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(f"{OUTPUT_DIR}/{TF}", exist_ok=True)

## 2. Load Links & Extract Target Genes

In [3]:
links = co.load_hdf5(file_path=LINKS_PATH)
print(f"Clusters in Links: {links.cluster}")

cluster1 = links.cluster[0]
cluster2 = links.cluster[1]
print(f"Cluster 1: {cluster1}")
print(f"Cluster 2: {cluster2}")

Clusters in Links: ['KO_DM', 'WT_DM']
Cluster 1: KO_DM
Cluster 2: WT_DM


In [4]:
df1 = links.filtered_links[cluster1]
df2 = links.filtered_links[cluster2]

targets1 = set(df1[df1["source"] == TF]["target"])
targets2 = set(df2[df2["source"] == TF]["target"])
all_targets = sorted(targets1 | targets2)

print(f"{cluster1}: {len(targets1)} targets")
print(f"{cluster2}: {len(targets2)} targets")
print(f"Union: {len(all_targets)} unique targets")

KO_DM: 210 targets
WT_DM: 123 targets
Union: 234 unique targets


In [5]:
# Build lookup of GRN coefficients per cluster
edges1 = df1[df1["source"] == TF].set_index("target")[["coef_mean"]].rename(
    columns={"coef_mean": f"GRN_coef_{cluster1}"}
)
edges2 = df2[df2["source"] == TF].set_index("target")[["coef_mean"]].rename(
    columns={"coef_mean": f"GRN_coef_{cluster2}"}
)
grn_coefs = edges1.join(edges2, how="outer")
grn_coefs.head()

Unnamed: 0_level_0,GRN_coef_KO_DM,GRN_coef_WT_DM
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0610040J01Rik,-0.108382,
1110019D14Rik,,0.05803
1700025G04Rik,-0.083259,
2610307P16Rik,,0.069129
4931406C07Rik,0.070238,0.070396


## 3. Load Expression Data

Choose **one** of the loading strategies below and comment out the other.

In [6]:
oracle = co.load_hdf5(file_path=ORACLE_PATH)
adata = oracle.adata.copy()

print(f"AnnData shape: {adata.shape}")
print(f"Layers: {list(adata.layers.keys())}")
print(f"Obs columns: {list(adata.obs.columns)}")

AnnData shape: (1811, 3000)
Layers: ['counts', 'raw_count', 'normalized_count', 'imputed_count']
Obs columns: ['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'RNA_snn_res.0.5', 'seurat_clusters', 'RNA_snn_res.0.1', 'RNA_snn_res.1', 'RNA_snn_res.0.2', 'cluster_annot']


In [None]:
# Extract expression matrix (adata.X should be normalized, log-transformed)
X = adata.X.toarray() if issparse(adata.X) else np.array(adata.X)
expr_df = pd.DataFrame(X, index=adata.obs_names, columns=adata.var_names)

assert TF in expr_df.columns, f"{TF} not found in expression matrix!"

available_targets = [g for g in all_targets if g in expr_df.columns]m 
missing_targets = [g for g in all_targets if g not in expr_df.columns]
if missing_targets:
    print(f"Warning: {len(missing_targets)} targets not in expression matrix: {missing_targets}")
print(f"Analyzing {len(available_targets)} target genes")

Analyzing 234 target genes


## 4. Compute Correlations across ALL cells regardless of condition

In [None]:
# get expression vector for all cells of TF
tf_expr = expr_df[TF]
corr_results = []

# loop over every target gene predicted for TF
for gene in available_targets:
    # getting target gene expression vector across all cells 
    gene_expr = expr_df[gene]
    # compute correlation metrics between the TF expression vector and target gene expression vector across all cells
    r_spearman, p_spearman = spearmanr(tf_expr, gene_expr)
    r_pearson, p_pearson = pearsonr(tf_expr, gene_expr)
    corr_results.append({
        "target": gene,
        "spearman_r": r_spearman,
        "spearman_p": p_spearman,
        "pearson_r": r_pearson,
        "pearson_p": p_pearson,
        "in_cluster1": gene in targets1,
        "in_cluster2": gene in targets2,
    })

corr_df = pd.DataFrame(corr_results)
# join correlation results with the GRN regression coefficients 
# so each row now has both the raw expression correlation and CellOracle's inferred regulatory weight 
corr_df = corr_df.set_index("target").join(grn_coefs).reset_index()
corr_df = corr_df.sort_values("spearman_r", ascending=False)

print("--- Top positively correlated targets ---")
display(corr_df.head(10))
print("\n--- Top negatively correlated targets ---")
display(corr_df.tail(10))

--- Top positively correlated targets ---


Unnamed: 0,target,spearman_r,spearman_p,pearson_r,pearson_p,in_cluster1,in_cluster2,GRN_coef_KO_DM,GRN_coef_WT_DM
141,Papln,0.220862,1.903841e-21,0.214222,3.046495e-20,True,True,0.301066,0.143662
204,Slc5a7,0.199236,1.140839e-17,0.193283,1.058617e-16,True,True,0.405066,0.362612
230,Xkr6,0.188788,5.431494e-16,0.185216,1.935645e-15,True,False,0.19299,
92,Igfbp5,0.177484,2.778415e-14,0.171316,2.139082e-13,True,True,0.612643,0.23375
148,Pde7b,0.175484,5.43017e-14,0.175486,5.425277e-14,True,True,0.340694,0.156269
46,Ctnnd2,0.163174,2.82664e-12,0.155993,2.476568e-11,True,True,0.265526,0.148911
60,Erbb4,0.153119,5.743001e-11,0.140011,2.18161e-09,True,True,0.303254,0.181537
80,Gpc6,0.151959,8.029332e-11,0.145861,4.48004e-10,True,True,0.606107,0.308217
121,Mtmr9,0.149971,1.417203e-10,0.140633,1.849503e-09,True,True,0.309587,0.143602
178,Rora,0.149396,1.668082e-10,0.141216,1.583035e-09,True,True,0.165195,0.085683



--- Top negatively correlated targets ---


Unnamed: 0,target,spearman_r,spearman_p,pearson_r,pearson_p,in_cluster1,in_cluster2,GRN_coef_KO_DM,GRN_coef_WT_DM
193,Rps8,-0.085515,0.000269144,-0.084257,0.0003312801,True,True,-0.068325,-0.129022
143,Parp14,-0.091861,9.047288e-05,-0.098194,2.839791e-05,True,False,-0.096531,
140,Pak3,-0.098854,2.506133e-05,-0.101044,1.646795e-05,True,False,-0.190655,
195,Rspo1,-0.102715,1.188557e-05,-0.102824,1.163329e-05,True,True,-0.309986,-0.082756
203,Slc2a9,-0.115828,7.716491e-07,-0.115626,8.068231e-07,True,True,-0.150036,-0.059389
163,Prkg1,-0.134794,8.480217e-09,-0.131225,2.085179e-08,True,True,-0.42756,-0.125533
86,H2-Q7,-0.147198,3.091705e-10,-0.148302,2.270419e-10,True,True,-0.236059,-0.07156
85,H2-Q6,-0.147269,3.030621e-10,-0.149262,1.732081e-10,True,False,-0.235866,
24,B2m,-0.149096,1.815177e-10,-0.149274,1.726558e-10,True,False,-0.146411,
84,H2-K1,-0.18838,6.287321e-16,-0.170613,2.687201e-13,True,True,-0.292115,-0.158729


In [9]:
corr_df.to_csv(f"{OUTPUT_DIR}/{TF}/{TF}_target_correlations_all_cells.csv", index=False)
print("Saved all-cell correlations.")

Saved all-cell correlations.


## 5. Compute Correlations Per Cluster
## isolates each condition

In [None]:
cluster_corr_results = []
for cluster_name in [cluster1, cluster2]:
    # create a boolean array (True/False for each cell) identifying which of the cells belong to the current condition
    mask = adata.obs[CLUSTER_COL] == cluster_name
    # subset the TF expression vector to only cells in the current condition
    tf_expr_cl = expr_df.loc[mask, TF]
    # select the appropriate target gene set 
    cluster_targets = targets1 if cluster_name == cluster1 else targets2
    
    # loop over each target gene in this condition
    for gene in cluster_targets:
        if gene not in expr_df.columns:
            continue
        # pull target gene expression for only cells in the condition (WT or KO)
        gene_expr_cl = expr_df.loc[mask, gene]
        # cakculate spearman corr between TF and target using cells only from the condition
        r_sp, p_sp = spearmanr(tf_expr_cl, gene_expr_cl)
        cluster_corr_results.append({
            "cluster": cluster_name,
            "target": gene,
            "spearman_r": r_sp,
            "spearman_p": p_sp,
            "n_cells": mask.sum(),
        })

cluster_corr_df = pd.DataFrame(cluster_corr_results)
cluster_corr_df.to_csv(f"{OUTPUT_DIR}/{TF}/{TF}_target_correlations_per_cluster.csv", index=False)
display(cluster_corr_df.head(10))

Unnamed: 0,cluster,target,spearman_r,spearman_p,n_cells
0,KO_DM,Dpyd,0.022798,0.500392,876
1,KO_DM,Sh3rf1,-0.027697,0.41293,876
2,KO_DM,Fam168a,-0.046587,0.168319,876
3,KO_DM,Ano4,-0.011555,0.732717,876
4,KO_DM,Itm2b,0.028717,0.395928,876
5,KO_DM,Stat1,-0.027051,0.423921,876
6,KO_DM,Cdc14a,0.054137,0.10933,876
7,KO_DM,Pdzrn3,-0.009666,0.775127,876
8,KO_DM,Slc12a2,0.028192,0.404625,876
9,KO_DM,Nrg2,0.094546,0.005101,876


## 6. Visualizations

### 6A. Bar Plot — Spearman Correlations (All Cells)

In [11]:
fig, ax = plt.subplots(figsize=(12, max(6, len(available_targets) * 0.3)))
colors = corr_df["spearman_r"].apply(lambda x: "#c0392b" if x < 0 else "#2980b9")
ax.barh(corr_df["target"], corr_df["spearman_r"], color=colors)
ax.set_xlabel(f"Spearman correlation with {TF}")
ax.set_title(f"Expression correlation: {TF} vs. target genes (all cells)")
ax.axvline(0, color="black", linewidth=0.5)
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/{TF}/{TF}_correlation_barplot.png", dpi=300, bbox_inches="tight")
plt.show()

### 6B. GRN Coefficient vs. Expression Correlation

- Top-right (positive coef, positive correlation): CellOracle says TF-of-interest activates this gene, and cells with more TF-of-interest do express more of it. This is consistent evidence of activation.
- Bottom-left (negative coef, negative correlation): CellOracle says TF-of-interest represses this gene, and the expression data agrees. Consistent repression.
- Top-left or bottom-right (sign disagreement): The GRN model and raw expression disagree on direction. These are targets where the regulatory inference might be unreliable, or where confounding factors (like other co-regulators) are at play.

In [None]:
# every dot is one TF target gene, plotted with its CellOracle GRN coefficient on the x-axis and its Spearman expression correlation on the y-axis

for cluster_name, coef_col in [(cluster1, f"GRN_coef_{cluster1}"), (cluster2, f"GRN_coef_{cluster2}")]:
    subset = corr_df.dropna(subset=[coef_col])
    if subset.empty:
        continue
    fig, ax = plt.subplots(figsize=(7, 6))
    ax.scatter(subset[coef_col], subset["spearman_r"], alpha=0.7, edgecolors="k", linewidths=0.3)
    for _, row in subset.iterrows():
        ax.annotate(row["target"], (row[coef_col], row["spearman_r"]),
                     fontsize=6, alpha=0.7)
    ax.set_xlabel(f"GRN coefficient ({cluster_name})")
    ax.set_ylabel(f"Spearman correlation with {TF}")
    ax.set_title(f"GRN coef vs expression correlation\n{TF} targets in {cluster_name}")
    ax.axhline(0, color="gray", linestyle="--", linewidth=0.5)
    ax.axvline(0, color="gray", linestyle="--", linewidth=0.5)
    plt.tight_layout()
    plt.savefig(f"{OUTPUT_DIR}/{TF}/{TF}_coef_vs_corr_{cluster_name}.png", dpi=300, bbox_inches="tight")
    plt.show()

### 6C. Scatter Plots — Top Correlated Targets

In [None]:
# This code creates a 2×3 grid of scatter plots showing the raw expression relationship between TF and its 6 most extreme target genes

top_n = 6
top_genes = corr_df.nlargest(top_n // 2, "spearman_r")["target"].tolist() + \
            corr_df.nsmallest(top_n // 2, "spearman_r")["target"].tolist()

fig, axes = plt.subplots(2, top_n // 2, figsize=(5 * (top_n // 2), 9))
axes = axes.flatten()

# TF expression on the x-axis and the target gene's expression on the y-axis, with every cell as a single point
for i, gene in enumerate(top_genes):
    ax = axes[i]
    ax.scatter(expr_df[TF], expr_df[gene], alpha=0.1, s=5, c="steelblue")
    r_val = corr_df.loc[corr_df["target"] == gene, "spearman_r"].values[0]
    ax.set_xlabel(f"{TF} expression")
    ax.set_ylabel(f"{gene} expression")
    ax.set_title(f"{gene}\nSpearman r = {r_val:.3f}", fontsize=10)

plt.suptitle(f"Top correlated {TF} targets", fontsize=14, fontweight="bold", y=1.01)
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/{TF}/{TF}_top_target_scatterplots.png", dpi=300, bbox_inches="tight")
plt.show()