# Extract Target Genes for a TF of Interest from CellOracle Links

In [1]:
from pathlib import Path

import celloracle as co
import pandas as pd
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
import numpy as np

  from pkg_resources import get_distribution, DistributionNotFound


## 1. Load filtered Links object

In [2]:
# Update path to your filtered links file
links_file = "celloracle_results/per_celltype/Epi_Kit+Elf5+_filtered.celloracle.links"

# Read the stored object back with CellOracle and list the clusters it holds.
links = co.load_hdf5(file_path=links_file)
print(f"Clusters: {links.cluster}")

Clusters: ['KO_DM', 'WT_DM']


In [3]:
# ---- Set your TF here; the two clusters are taken by position from links.cluster ----
tf_of_interest = "Tfap2b"
cluster1, cluster2 = links.cluster[0], links.cluster[1]

# Echo the selection so the comparison direction is visible in the output.
for position, cluster_name in enumerate((cluster1, cluster2), start=1):
    print(f"Cluster {position}: {cluster_name}")

Cluster 1: KO_DM
Cluster 2: WT_DM


## 2. Get target genes per cluster

In [4]:
# Filtered edge table of each cluster, looked up by cluster name.
df1 = links.filtered_links[cluster1]
df2 = links.filtered_links[cluster2]

# Unique "target" values of the rows whose "source" is the TF of interest.
targets1 = set(df1.loc[df1["source"] == tf_of_interest, "target"])
targets2 = set(df2.loc[df2["source"] == tf_of_interest, "target"])

for cluster_name, target_set in ((cluster1, targets1), (cluster2, targets2)):
    print(f"{cluster_name}: {len(target_set)} targets")

KO_DM: 210 targets
WT_DM: 123 targets


## 3. Set comparisons

In [5]:
# Partition the two target sets into exclusive and shared genes (sorted lists).
only_cluster1 = sorted(targets1.difference(targets2))
only_cluster2 = sorted(targets2.difference(targets1))
in_both = sorted(targets1.intersection(targets2))

# Each entry is (heading, genes); a leading "\n" in a heading puts a blank
# line between consecutive sections of the printed report.
report_sections = (
    (f"Only in {cluster1}", only_cluster1),
    (f"\nOnly in {cluster2}", only_cluster2),
    ("\nIn both clusters", in_both),
)
for heading, gene_names in report_sections:
    print(f"{heading}: {len(gene_names)}")
    print(gene_names)

Only in KO_DM: 111
['0610040J01Rik', '1700025G04Rik', '9530026P05Rik', 'Abca1', 'Afap1l2', 'Airn', 'Alcam', 'Aldh1a3', 'Ank3', 'Ano3', 'Arhgap26', 'Arhgap6', 'Auts2', 'B2m', 'Bcl11a', 'Cachd1', 'Camk1d', 'Cdc14a', 'Cenpp', 'Commd10', 'Csmd1', 'D17H6S56E-5', 'Dapk1', 'Dlg2', 'Ehbp1', 'Esr1', 'Esrrg', 'Etv6', 'Fam13b', 'Fam168a', 'Foxp1', 'Fry', 'Garnl3', 'Glis3', 'Gm16599', 'Grik3', 'Grip1', 'H2-Q6', 'Hells', 'Hlf', 'Hp', 'Igf1r', 'Itm2b', 'Kcnd2', 'Khdrbs3', 'Kitl', 'Lama3', 'Lars2', 'Lbp', 'Lpgat1', 'Lpl', 'Lrrc7', 'Mapk4', 'Met', 'Mfge8', 'Mphosph8', 'Mrps6', 'Nav2', 'Neb', 'Nfib', 'Nrg2', 'Nrxn3', 'Ntn1', 'Ntn4', 'Nudt4', 'Nxn', 'Pak3', 'Parp14', 'Parp8', 'Patl2', 'Pcsk6', 'Pdzd2', 'Pdzrn3', 'Pfkfb3', 'Pik3r1', 'Pip5k1b', 'Pkp4', 'Pla2g4a', 'Ptn', 'Pvt1', 'Rabep2', 'Rcbtb2', 'Rhoj', 'Rnf213', 'Rps2', 'Rps20', 'Rps7', 'Rpsa', 'Runx1', 'Sgms1', 'Sh3rf1', 'Slc12a2', 'Slmap', 'Sox6', 'St6gal1', 'Stat1', 'Tanc2', 'Tgfb3', 'Thbs1', 'Tmsb4x', 'Tmtc2', 'Trf', 'Tshz2', 'Vegfc', 'Vgll4', 'Wfd

## 4. Summary DataFrame with coefficients from both clusters

In [6]:
# Build a merged table with coefs from both clusters
stat_columns = ["coef_mean", "coef_abs", "p"]

# Keep only the TF's edges and suffix every statistic column with the cluster
# name so the two tables can sit side by side after the merge.
edges1 = df1.loc[df1["source"] == tf_of_interest, ["target"] + stat_columns].rename(
    columns={name: f"{name}_{cluster1}" for name in stat_columns}
)
edges2 = df2.loc[df2["source"] == tf_of_interest, ["target"] + stat_columns].rename(
    columns={name: f"{name}_{cluster2}" for name in stat_columns}
)

# Outer join on the target gene: a gene present in only one cluster keeps its
# row, with NaN in the other cluster's columns.
merged = edges1.merge(edges2, on="target", how="outer")

# Label each gene
def label_membership(row):
    """Classify one row of the merged table by which cluster(s) have a coefficient.

    Reads the module-level ``cluster1`` / ``cluster2`` names to find the two
    ``coef_mean_<cluster>`` entries in ``row``.

    Returns:
        "both" when both values are present, "<cluster1>_only" when only the
        first is present, and "<cluster2>_only" otherwise (including the case
        where both values are missing).
    """
    has_first = pd.notna(row[f"coef_mean_{cluster1}"])
    has_second = pd.notna(row[f"coef_mean_{cluster2}"])

    if not has_first:
        return f"{cluster2}_only"
    if not has_second:
        return f"{cluster1}_only"
    return "both"

# Tag every row with its membership label, then order the table by that label.
membership_labels = merged.apply(label_membership, axis=1)
merged = merged.assign(membership=membership_labels).sort_values("membership")
merged

Unnamed: 0,target,coef_mean_KO_DM,coef_abs_KO_DM,p_KO_DM,coef_mean_WT_DM,coef_abs_WT_DM,p_WT_DM,membership
116,Gm16599,-0.094161,0.094161,2.139341e-15,,,,KO_DM_only
96,Ehbp1,0.106536,0.106536,1.495721e-07,,,,KO_DM_only
146,Khdrbs3,-0.082384,0.082384,1.278374e-13,,,,KO_DM_only
94,0610040J01Rik,-0.108382,0.108382,7.052552e-13,,,,KO_DM_only
148,Wls,-0.081796,0.081796,5.026482e-08,,,,KO_DM_only
...,...,...,...,...,...,...,...,...
163,Rbm47,-0.075893,0.075893,2.847455e-07,0.070760,0.070760,1.125508e-08,both
77,Nr3c2,0.124198,0.124198,7.394234e-09,0.098574,0.098574,3.584222e-13,both
161,Tmprss13,0.076783,0.076783,5.751296e-15,0.066816,0.066816,1.923514e-16,both
159,Rpl17,-0.077913,0.077913,3.314497e-10,-0.059436,0.059436,1.329241e-11,both


## 5. Exports

In [7]:
# Write the merged comparison table to CSV, without the row index.
output_dir = Path("extract_tf_genes_results")
# to_csv does not create missing folders, so make sure the results directory
# exists; otherwise this cell fails on a fresh checkout.
output_dir.mkdir(parents=True, exist_ok=True)

csv_path = output_dir / f"{tf_of_interest}_targets_cluster_comparison.csv"
merged.to_csv(csv_path, index=False)
# The message reports the file name only; the file itself lives in output_dir.
print(f"Saved to {csv_path.name}")

Saved to Tfap2b_targets_cluster_comparison.csv


In [8]:
# Single-axes figure that the two-set Venn diagram is drawn on.
fig, ax = plt.subplots(figsize=(12, 9))

# Display options for the diagram: one label and one colour per cluster.
venn_style = {
    "set_labels": (cluster1, cluster2),
    "set_colors": ("#4C72B0", "#DD8452"),
    "alpha": 0.7,
    "ax": ax,
}
v = venn2([targets1, targets2], **venn_style)

# Helper: format gene list with line breaks
def format_genes(genes, per_line=3):
    """Lay out gene names as sorted, comma-separated rows.

    Args:
        genes: Iterable of gene-name strings.
        per_line: Maximum number of names placed on each row.

    Returns:
        A single string with rows joined by newlines; empty input gives "".
    """
    ordered = sorted(genes)
    rows = (
        ", ".join(ordered[start:start + per_line])
        for start in range(0, len(ordered), per_line)
    )
    return "\n".join(rows)

# Replace subset labels with gene names
# "10" = left only, "01" = right only, "11" = intersection
region_map = {
    "10": targets1.difference(targets2),
    "01": targets2.difference(targets1),
    "11": targets1.intersection(targets2),
}

for region_id, genes in region_map.items():
    label = v.get_label_by_id(region_id)
    if not label:
        # No label object for this region, so there is nothing to rewrite.
        continue
    if not genes:
        label.set_text("(0)")
        continue
    # Show the gene count on the first line, then the names three per row.
    count = len(genes)
    gene_text = format_genes(genes, per_line=3)
    label.set_text(f"({count})\n{gene_text}")
    label.set_fontsize(7)

# Style set labels
for text in v.set_labels:
    # Skip entries that are missing (falsy) instead of styling them.
    if text:
        text.set_fontsize(14)
        text.set_fontweight("bold")

ax.set_title(f"{tf_of_interest} Target Genes by Cluster", fontsize=16, fontweight="bold")
plt.tight_layout()

# savefig does not create missing folders, so make sure the results directory
# exists; otherwise this cell fails on a fresh checkout.
figure_dir = Path("extract_tf_genes_results")
figure_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_dir / f"{tf_of_interest}_venn_diagram_with_genes.png", dpi=300, bbox_inches="tight")
plt.show()