# Factor Analysis - Geneformer

## Loading

In [3]:
from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/GitHub/Biological-Foundation-Model/Notebooks')

!pip install -r ../requirements.txt

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!cd "/content/drive/My Drive/Colab Notebooks/huggingface_models"
!pip freeze > "/content/drive/My Drive/Colab Notebooks/huggingface_models/requirements.txt"
%cd "/content/drive/My Drive/Colab Notebooks/huggingface_models/Geneformer"
!pip install .

# downgrade because transformer no longer has "AdamW": https://github.com/huggingface/transformers/issues/36954
!pip install transformers==4.40

# https://github.com/huggingface/peft/issues/2292
!pip install peft==0.10.0

# convert gene name (like AATF as in TRRUST) to ensembl id (ENS0000... as the token for Geneformer)
!pip install mygene

!pip install omnipath

/content/drive/My Drive/Colab Notebooks/huggingface_models/Geneformer
Processing /content/drive/My Drive/Colab Notebooks/huggingface_models/Geneformer
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting anndata (from geneformer==0.1.0)
  Downloading anndata-0.12.2-py3-none-any.whl.metadata (9.6 kB)
Collecting loompy (from geneformer==0.1.0)
  Downloading loompy-3.0.8.tar.gz (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting optuna (from geneformer==0.1.0)
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting optuna-integration (from geneformer==0.1.0)
  Downloading optuna_integration-4.5.0-py3-none-any.whl.metadata (12 kB)
Collecting ray (from geneformer==0.1.0)
  Downloading ray-2.49.1-cp312-cp312-manylinux2014_x86_64.whl.metadata (21 kB)
Collecting scanpy (from geneformer==0.1.0)
  Downloading scanpy-1.11.4-py

In [2]:
## prepare data
from datasets import load_dataset, load_from_disk
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import numpy as np

from geneformer import TOKEN_DICTIONARY_FILE
from geneformer import perturber_utils as pu
from geneformer import TranscriptomeTokenizer

# Shared root of the local Geneformer checkout on Drive; the token dictionary
# and the pretrained checkpoint both live underneath it.
GENEFORMER_DIR = "/content/drive/My Drive/Colab Notebooks/huggingface_models/Geneformer"
DATASET_PATH = "/content/drive/My Drive/Colab Notebooks/datasets/geneformer/cell_type_train_data.dataset"
TOKEN_DICT_PATH = f"{GENEFORMER_DIR}/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl"
MODEL_DIR = f"{GENEFORMER_DIR}/gf-12L-30M-i2048"

# load the training data and the tokenizer from Geneformer
dataset = load_from_disk(DATASET_PATH) # 249556 data points

# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
with open(TOKEN_DICT_PATH, "rb") as f:
    token_dict = pickle.load(f)


# load model
# NOTE(review): this loads the 12-layer checkpoint, while the features read
# later in this notebook come from a "30M_6L_features" directory - confirm the
# intended model/feature pairing.
model = pu.load_model("Pretrained", num_classes = 0, model_directory = MODEL_DIR, mode="eval")

ModuleNotFoundError: No module named 'geneformer'

## Factor Analysis

In [None]:
import random

# Load the precomputed feature list from Drive.
# presumably one feature array per sampled cell, in sampling order - TODO confirm
features_path = "/content/drive/MyDrive/Colab Notebooks/Structural_Probe_Gene/data/30M_6L_features/H_list_l6_STRING_10000.pkl"
with open(features_path, "rb") as features_file:
    H_list = pickle.load(features_file)

# Draw a fixed, seeded sample of 10000 dataset positions.
random.seed(0)
all_positions = list(range(len(dataset)))
cell_indices = random.sample(all_positions, 10000)

# Collect the token-id sequence of every sampled cell, in sampling order.
input_ids_list = []
for cell_idx in cell_indices:
    input_ids_list.append(dataset[cell_idx]["input_ids"])

In [None]:
# create the averaged feature matrix.
# Every occurrence of a token in the sampled cells contributes its feature row;
# rows are then averaged per token id. Tokens that never occur keep a zero row.

vocab_size = len(token_dict)
feature_dim = 256

# zip() would silently drop trailing cells if the two lists were misaligned.
if len(H_list) != len(input_ids_list):
    raise ValueError(
        f"H_list has {len(H_list)} entries but input_ids_list has {len(input_ids_list)}"
    )

count_vector = np.zeros(vocab_size)
sum_matrix = np.zeros((vocab_size, feature_dim))

for H, input_ids in zip(H_list, input_ids_list):
    token_ids = np.asarray(input_ids, dtype=np.intp)
    if token_ids.size == 0:
        continue  # nothing to accumulate for an empty cell

    # Only the first len(input_ids) feature rows are paired with tokens.
    features = np.asarray(H)[: token_ids.size]
    if features.shape != (token_ids.size, feature_dim):
        raise ValueError(
            f"feature block of shape {features.shape} does not match "
            f"{token_ids.size} tokens x {feature_dim} dims"
        )

    # np.add.at performs unbuffered accumulation, so a token id that repeats
    # within one cell is counted every time (plain fancy-index += would not).
    np.add.at(sum_matrix, token_ids, features)
    np.add.at(count_vector, token_ids, 1)

# Compute averages (avoid divide by zero)
avg_matrix = np.zeros_like(sum_matrix)
nonzero_indices = count_vector > 0
avg_matrix[nonzero_indices] = sum_matrix[nonzero_indices] / count_vector[nonzero_indices, np.newaxis]

### Factor analysis

In [None]:
from sklearn.decomposition import FactorAnalysis

n_factors = 10
fa = FactorAnalysis(n_components=n_factors, random_state=0)

# Tokens that never occurred in the sampled cells only have an all-zero
# placeholder row in avg_matrix. Those rows are not data, so they are excluded
# from the fit to keep them from distorting the estimated mean and loadings.
observed_tokens = count_vector > 0
fa.fit(avg_matrix[observed_tokens])

# X_fa keeps one row per token id (same shape as before). Rows of unobserved
# tokens are just the scores of a zero vector; mask with `observed_tokens`
# before interpreting them.
X_fa = fa.transform(avg_matrix)

In [None]:
# Display the dimensions of the averaged feature matrix (rows x columns).
avg_matrix.shape

(25426, 256)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# X_fa is the factor-score matrix computed above: one row per token, one
# column per factor.
n_factors = X_fa.shape[1]
n_tokens = X_fa.shape[0]

# Plot the distribution of each factor's scores over all tokens.
# In a histogram the x-axis is the score and the y-axis is how many tokens fall
# into each bin, so the labels say exactly that.
for i in range(n_factors):
    fig, ax = plt.subplots(figsize=(8, 3))
    ax.hist(X_fa[:, i], bins=50)
    ax.set_xlabel("Factor score")
    ax.set_ylabel("Number of tokens")
    ax.set_title(f"Factor {i + 1}")
    fig.tight_layout()
    plt.show()
    # Release the figure so that looping over many factors does not pile up
    # open figures in memory.
    plt.close(fig)

NameError: name 'X_fa' is not defined