# BRCA1 Expression Prediction using Enformer

In [None]:

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from Bio import SeqIO
import matplotlib.pyplot as plt


## Load Enformer Model

In [None]:

# Load the Enformer module from TF Hub and expose its `model` attribute,
# which is the object the prediction cell calls.
print("Loading Enformer...")
ENFORMER_HUB_URL = "https://tfhub.dev/deepmind/enformer/1"
enformer = hub.load(ENFORMER_HUB_URL)
model = enformer.model  # callable used later via predict_on_batch


## Load and Preprocess DNA Sequence

In [None]:

# Fixed input length (in bases) that the rest of the notebook assumes.
SEQUENCE_LENGTH = 393216

# Read the single FASTA record and normalise it to upper case so that the
# one-hot encoder (which only recognises "A", "C", "G", "T") sees every base.
sequence = str(SeqIO.read("brca1_sequence.fa", "fasta").seq).upper()

# Keep at most SEQUENCE_LENGTH bases from the start of the record, then
# right-pad with "N" if the record was shorter.
# NOTE(review): the window is taken from the start of the record, not
# centred on it -- confirm that the region of interest lines up with the
# `center_bin` used when the prediction is extracted.
sequence = sequence[:SEQUENCE_LENGTH]
sequence += "N" * (SEQUENCE_LENGTH - len(sequence))

def one_hot_encode(seq):
    """One-hot encode a DNA sequence into a float32 array.

    Columns are ordered A, C, G, T.  Any other character (for example the
    "N" padding, or lower-case letters) becomes an all-zero row.

    The output has one row per input character, so the function no longer
    depends on a module-level length constant and cannot overflow when the
    input is longer than that constant.

    Args:
        seq: DNA sequence as a ``str`` (or any iterable of single-character
            strings).

    Returns:
        ``np.ndarray`` of shape ``(len(seq), 4)`` and dtype ``float32``.
    """
    text = seq if isinstance(seq, str) else "".join(seq)

    # Row b of the table is the one-hot vector for byte value b; every byte
    # other than the four upper-case bases keeps its all-zero row.
    lookup = np.zeros((256, 4), dtype=np.float32)
    for column, base in enumerate("ACGT"):
        lookup[ord(base), column] = 1.0

    # "replace" turns each non-ASCII character into a single "?" byte, so the
    # byte array stays aligned one-to-one with the input characters.
    codes = np.frombuffer(text.encode("ascii", "replace"), dtype=np.uint8)

    # Fancy indexing does the whole encoding in one vectorized step and
    # returns a fresh, writable array.
    return lookup[codes]

# Encode the padded sequence and prepend a batch axis of size 1 before
# wrapping it as a TensorFlow constant.
inputs = tf.constant(np.expand_dims(one_hot_encode(sequence), axis=0))  # Shape: (1, 393216, 4)


## Run Prediction

In [None]:

# Run the model once on the single-sequence batch and report what came back.
print("Running prediction...")
outputs = model.predict_on_batch(inputs)
print("Available outputs:", outputs.keys())

# `outputs` is keyed by output head; the human head is the one used below.
human_predictions = outputs['human']
print("Output shape (human):", human_predictions.shape)


## Extract Ovary Prediction

In [None]:

# Hand-maintained (partial) list of track labels; the position of a label in
# this list is used below as the track index into the model output.
# NOTE(review): this only selects the intended track if the list order
# matches the model's real output-track order -- verify against the
# official target table published for the model before trusting the value.
target_names = [  # Partial list
    "CAGE:adipose_subcutaneous", "CAGE:adipose_visceral_omentum", "CAGE:adrenal_gland",
    "CAGE:artery_aorta", "CAGE:artery_coronary", "CAGE:artery_tibial", "CAGE:bladder",
    "CAGE:brain_amygdala", "CAGE:brain_anterior_cingulate_cortex_ba24", "CAGE:brain_caudate_basal_ganglia",
    "CAGE:brain_cerebellar_hemisphere", "CAGE:brain_cerebellum", "CAGE:brain_cortex",
    "CAGE:brain_frontal_cortex_ba9", "CAGE:brain_hippocampus", "CAGE:brain_hypothalamus",
    "CAGE:brain_nucleus_accumbens_basal_ganglia", "CAGE:brain_putamen_basal_ganglia",
    "CAGE:brain_spinal_cord_cervical_c-1", "CAGE:brain_substantia_nigra", "CAGE:breast_mammary_tissue",
    "CAGE:cells_cultured_fibroblasts", "CAGE:cells_ebv-transformed_lymphocytes", "CAGE:colon_sigmoid",
    "CAGE:colon_transverse", "CAGE:esophagus_gastroesophageal_junction", "CAGE:esophagus_mucosa",
    "CAGE:esophagus_muscularis", "CAGE:fallopian_tube", "CAGE:heart_atrial_appendage",
    "CAGE:heart_left_ventricle", "CAGE:kidney_cortex", "CAGE:liver", "CAGE:lung",
    "CAGE:minor_salivary_gland", "CAGE:muscle_skeletal", "CAGE:nerve_tibial", "CAGE:ovary",
    "CAGE:pancreas", "CAGE:pituitary", "CAGE:prostate", "CAGE:skin_not_sun_exposed_suprapubic",
    "CAGE:skin_sun_exposed_lower_leg", "CAGE:small_intestine_terminal_ileum", "CAGE:spleen",
    "CAGE:stomach", "CAGE:testis", "CAGE:thyroid", "CAGE:uterus", "CAGE:vagina", "CAGE:whole_blood"
]

# Track selection: list position of the ovary label, kept under both names.
ovary_index = target_names.index("CAGE:ovary")
ovary_track = ovary_index

# Bin selection: `window` bins on each side of `center_bin`, inclusive
# (2 * window + 1 bins in total).
center_bin = 448
window = 5
window_bins = slice(center_bin - window, center_bin + window + 1)

# Average the chosen track over the bin window for the first (only) batch item.
windowed_signal = outputs['human'][0, window_bins, ovary_track]
ovary_prediction = tf.reduce_mean(windowed_signal)
print("Predicted BRCA1 expression in ovary (arbitrary units):", ovary_prediction.numpy())


## Visualization: Enformer vs GTEx

In [None]:

# Side-by-side bar chart of the model prediction and a reference value.
tissues = ['Ovary']

# Plot the value actually computed by the prediction cell (`ovary_prediction`)
# rather than a pasted constant, so the figure always reflects the current run.
enformer_pred = [float(ovary_prediction.numpy())]

# Reference value hard-coded in this notebook (labelled GTEx median TPM below).
gtex_tpm = [2.0]

x = range(len(tissues))
width = 0.4  # bar width; also the horizontal offset of the second bar series

fig, ax = plt.subplots()
ax.bar(x, enformer_pred, width=width, label='Enformer Prediction (a.u.)', color='skyblue')
ax.bar([i + width for i in x], gtex_tpm, width=width, label='GTEx Median TPM', color='salmon')

# NOTE(review): the two series are in different units (arbitrary units vs
# TPM) but share one y-axis, so only a qualitative comparison is meaningful.
ax.set_ylabel('Expression Level')
ax.set_title('BRCA1 Expression in Ovary\n(Enformer Prediction vs GTEx TPM)')

# Centre each tick between the paired bars of its tissue.
ax.set_xticks([i + width/2 for i in x])
ax.set_xticklabels(tissues)
ax.legend()

plt.show()
