In [1]:
from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/GitHub/Biological-Foundation-Model/Notebooks')

!pip install -r ../requirements.txt

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
# Add the helper directory to the module search path. The entry is relative, so it is
# resolved against the current working directory (set with os.chdir in the first cell).
sys.path.append("structural_probe_utils")
import numpy as np
import pandas as pd

import torch
import anndata as ad

from structural_probe_utils.structural_probe_utils import build_G_from_edges, train_A

In [None]:
# Prepare gene embeddings.
# Loading code adapted from linear_perturbation_prediction-Paper.
demo_adata = ad.read_h5ad("../models/scFoundation/demo.h5ad")
singlecell_model_path = "../models/scFoundation/models.ckpt"

# map_location="cpu" lets a checkpoint that was saved from GPU tensors load on a
# CPU-only runtime; the rest of this notebook runs on CPU anyway.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
ckp = torch.load(singlecell_model_path, map_location="cpu")
gene_pos_emb = ckp['gene']['state_dict']['model.pos_emb.weight'].cpu().numpy()

# The embedding table has three extra rows after the genes listed in the demo AnnData;
# name them so that row i of gene_pos_emb corresponds to gene_names[i].
gene_names = demo_adata.var.gene_name.tolist()
gene_names = gene_names + ["log10TotalCount1", "log10TotalCount2", "<pad>"]

# The filtering below selects embedding rows by position in gene_names, so a length
# mismatch would silently pair genes with the wrong vectors.
assert gene_pos_emb.shape[0] == len(gene_names), (
    f"embedding rows ({gene_pos_emb.shape[0]}) and gene names ({len(gene_names)}) are misaligned"
)
print(gene_pos_emb.shape)
print(len(gene_names))

# Restrict to the L1000 landmark genes.
# (The MSigDB hallmark collection was tried first but keeps ~4,000 genes, still too many.)
landmark_df = pd.read_csv("../data/structural_probe_geneformer/L1000_landmark_genes.txt", sep="\t")

# Extract gene symbols (strip spaces just in case)
landmark_genes = set(landmark_df["Symbol"].str.strip())
indices = [i for i, g in enumerate(gene_names) if g in landmark_genes]
if not indices:
    raise ValueError("No L1000 landmark gene symbol matches the embedding gene names.")

# Keep the embedding rows and the names in the same (original) order.
gene_pos_emb = gene_pos_emb[indices, :]
gene_names = [gene_names[i] for i in indices]
print(gene_pos_emb.shape)
print(len(gene_names))

(19267, 768)
19267


In [4]:
# Build the reference gene-gene network from the STRING v12 files (taxon 9606).

# --- protein ID -> gene symbol lookup ------------------------------------------------
STRING_homosapien_alias = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Structural_Probe_Gene/data/9606.protein.aliases.v12.0.txt",
    sep="\t",
    header=None,
)
# Keep only the rows whose source column (2) starts with "Ensembl_HGNC_symbol", and
# only the protein ID (0) and alias (1) columns.
STRING_homosapien_alias = STRING_homosapien_alias.loc[
    STRING_homosapien_alias[2].str.startswith("Ensembl_HGNC_symbol", na=False),
    [0, 1],
].drop_duplicates()
# Remove the "9606." taxon prefix from the protein IDs.
STRING_homosapien_alias[0] = STRING_homosapien_alias[0].str.replace("9606.", "", regex=False)
# If a protein ID appears more than once, the last alias wins.
ENSP_to_GeneSymbol = {
    ensp: symbol
    for ensp, symbol in zip(STRING_homosapien_alias[0], STRING_homosapien_alias[1])
}

# --- protein-protein links -----------------------------------------------------------
STRING_homosapien = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Structural_Probe_Gene/data/9606.protein.links.v12.0.txt",
    sep=" ",
)
# Same prefix removal, so the IDs match the keys of ENSP_to_GeneSymbol.
STRING_homosapien["protein1"] = STRING_homosapien["protein1"].str.replace("9606.", "", regex=False)
STRING_homosapien["protein2"] = STRING_homosapien["protein2"].str.replace("9606.", "", regex=False)

# Translate both endpoints to gene symbols; proteins without a symbol become NaN.
STRING_homosapien["gene1"] = STRING_homosapien["protein1"].map(ENSP_to_GeneSymbol)
STRING_homosapien["gene2"] = STRING_homosapien["protein2"].map(ENSP_to_GeneSymbol)

# Drop edges with an unmapped endpoint or a missing score, then keep scores above 700.
STRING_gene_interaction = STRING_homosapien.dropna(subset=["gene1", "gene2", "combined_score"])
STRING_gene_interaction_high_conf = STRING_gene_interaction.loc[
    STRING_gene_interaction["combined_score"].gt(700)
]

In [5]:
# Graph over the selected genes, built from the high-confidence STRING edges.
# NOTE(review): presumably a dense n x n adjacency indexed in gene_names order -
# confirm against build_G_from_edges in structural_probe_utils.
G_full = build_G_from_edges(
    STRING_gene_interaction_high_conf,
    gene_names,
    make_dense=True,
)

In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score

n_splits = 10
random_state = 42
device = "cpu"

# train_A is defined elsewhere; seeding torch here keeps any torch-based randomness it
# uses repeatable from run to run.
torch.manual_seed(random_state)

# G_full was built in the previous cell. The per-fold sub-matrices below are taken with
# np.ix_ (fancy indexing, i.e. copies), so G_full is never modified and is not rebuilt here.
genes = np.array(gene_names)
X_full = np.asarray(gene_pos_emb)  # (n,d)
n = len(genes)

kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)


def _predict_edges(X, G, A, w, device="cpu"):
    """Predict an edge / no-edge label for every unordered node pair (i < j).

    A pair is scored with sigmoid(w - ||(x_i - x_j) @ A||^2) and predicted as an
    edge when that probability exceeds 0.5.

    Parameters
    ----------
    X : array-like
        Node embeddings, one row per node.
    G : array-like
        Square adjacency over the same nodes; entries > 0 count as edges. Only the
        strict upper triangle is read, so the diagonal is irrelevant.
    A, w :
        Projection and offset returned by train_A.
    device : str
        Torch device used for the computation.

    Returns
    -------
    (y_true, y_pred) : tuple of 1-D integer numpy arrays, one entry per pair.
    """
    X_t = torch.as_tensor(X, dtype=torch.float32, device=device)
    n_nodes = X_t.shape[0]
    I, J = torch.triu_indices(n_nodes, n_nodes, offset=1, device=device)

    with torch.no_grad():
        # (x_i - x_j) @ A == x_i @ A - x_j @ A: project the nodes once instead of
        # materialising an (n_pairs, d) difference tensor and projecting every pair.
        Z = X_t @ A
        diff = Z[I] - Z[J]
        logits = w - (diff * diff).sum(dim=1)
        prob = torch.sigmoid(logits).cpu().numpy()

    y_pred = (prob > 0.5).astype(int)
    # Keep labels in numpy so masks and predictions are the same array type.
    y_true = (np.asarray(G) > 0)[I.cpu().numpy(), J.cpu().numpy()].astype(int)
    return y_true, y_pred


def _class_accuracy(y_true, y_pred, label):
    """Accuracy restricted to pairs whose true label is `label`; NaN if there are none."""
    mask = y_true == label
    if not mask.any():
        return float("nan")
    return accuracy_score(y_true[mask], y_pred[mask])


# Training-set metrics are not computed; the lists are kept so the names stay defined.
acc_tr_0 = []
acc_tr_1 = []
f1_tr = []
# Validation metrics, one entry per fold.
acc_va_0 = []  # accuracy on true non-edges
acc_va_1 = []  # accuracy on true edges
f1_va = []

for fold_id, (train_idx, val_idx) in enumerate(kf.split(np.arange(n)), start=1):
    # Induced subgraphs and embeddings: edges between a training gene and a
    # validation gene belong to neither split.
    X_tr = X_full[train_idx]
    X_va = X_full[val_idx]
    G_tr = G_full[np.ix_(train_idx, train_idx)]
    G_va = G_full[np.ix_(val_idx, val_idx)]

    # Train
    # NOTE(review): train_A also receives the validation split - confirm it is used
    # for monitoring only and not for model selection.
    A, w, tr_loss, va_loss = train_A(
        X_tr, G_tr, X_va, G_va, steps=200
    )

    # Evaluate on the held-out genes.
    yv, vpred = _predict_edges(X_va, G_va, A, w, device=device)
    acc_va_0.append(_class_accuracy(yv, vpred, 0))
    acc_va_1.append(_class_accuracy(yv, vpred, 1))
    f1_va.append(f1_score(yv, vpred))


In [None]:
# Per-fold validation metrics, one list per line:
# accuracy on true non-edges, accuracy on true edges, F1.
print(acc_va_0, acc_va_1, f1_va, sep="\n")

[0.8846944831083633, 0.8850037722082691, 0.8942139967120261, 0.8706229244618775, 0.8799980146323751, 0.873070387910777, 0.8867804002453719, 0.8737032510071797, 0.8809259318263684, 0.8627073398915793]
[0.11456859971711457, 0.10130841121495326, 0.10526315789473684, 0.14210061782877317, 0.12710280373831775, 0.14747307373653687, 0.11243144424131626, 0.13157894736842105, 0.11542192046556742, 0.13417105796513984]
[0.002249042074671899, 0.002508829506056833, 0.002417088817942856, 0.0026569848997442034, 0.0024194660274057898, 0.0029915086531068415, 0.0023176062707265604, 0.0025797611394931825, 0.002136531547504163, 0.0025761862325805835]


There are too few training samples. By comparison, when training with cells we have 10,000 cells, and each cell contributes hundreds of gene embeddings.