In [1]:
import os
from pathlib import Path
import sys

# Resolve the project root only once per kernel session. Without this guard,
# re-running the cell after the os.chdir() below would compute the root from
# the already-changed working directory and walk two levels further up.
if "__project_dir__" not in globals():
    __project_dir__ = Path.cwd().parents[1].resolve()

# sys.path entries must be strings: the import system ignores entries of any
# other type, so appending the Path object itself would silently fail to make
# the project importable. The membership check keeps re-runs from adding
# duplicate entries.
if str(__project_dir__) not in sys.path:
    sys.path.append(str(__project_dir__))
os.chdir(__project_dir__)

### Load data

In [2]:
import pandas as pd

In [3]:
# Location of the preprocessed Tanno test split that the pGens are computed for.
TEST_CSV_PATH = "/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/preprocessed/tanno/test.csv"
df = pd.read_csv(TEST_CSV_PATH)

In [4]:
# Preview the loaded table (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,clone_count
0,TRAV1-1,CAAAPGFQKLVF,TRAJ8,TRBV7-2,CASSPTVSQETQYF,TRBJ2-5,,,,1
1,TRAV1-1,CAAFAGYQKVTF,TRAJ13,TRBV7-2,CASSLGSRQGRGRRGYTF,TRBJ1-2,,,,1
2,TRAV1-1,CAAFANQAGTALIF,TRAJ15,TRBV7-6,CASSFNQDSRYEQYF,TRBJ2-7,,,,1
3,TRAV1-1,CAAKAAGNKLTF,TRAJ17,TRBV25-1,CASSEWGKGEAFF,TRBJ1-1,,,,1
4,TRAV1-1,CAALNQAGTALIF,TRAJ15,TRBV6-6,CASSPWDRARDEKLFF,TRBJ1-4,,,,1
...,...,...,...,...,...,...,...,...,...,...
84128,,CAVRVNSGNTPLVF,TRAJ29,TRBV30,CAWSSMAGVAQETQYF,TRBJ2-5,,,,1
84129,,CAVSDWAGKSTF,TRAJ27,TRBV24-1,CATSDPIGGPNTGELFF,TRBJ2-2,,,,1
84130,,CAVTPSNTGKLIF,TRAJ37,TRBV12-3,CASSLGAGDVGELFF,TRBJ2-2,,,,1
84131,,CGTNNNAGNMLTF,TRAJ39,TRBV30,CAWSVPGLGNEKLFF,TRBJ1-4,,,,1


### OLGA setup

In [5]:
from olga import load_model
import olga.generation_probability as pgen
import numpy as np

# Locate OLGA's bundled default models relative to the installed package that
# this kernel actually imported, instead of a hard-coded virtualenv layout
# (".venv/lib64/python3.11/site-packages/olga"), which breaks as soon as the
# Python version, platform, or environment location differs.
OLGA_PATH = Path(load_model.__file__).resolve().parent
DEFAULT_HUMAN_T_BETA_PATH = OLGA_PATH / "default_models" / "human_T_beta"
DEFAULT_HUMAN_T_ALPHA_PATH = OLGA_PATH / "default_models" / "human_T_alpha"

In [6]:
def get_alpha_pgen_model():
    """Build a generation-probability model from OLGA's human T alpha files.

    Reads the model parameters, V/J CDR3 anchor tables and model marginals
    found under ``DEFAULT_HUMAN_T_ALPHA_PATH`` and wires them together using
    OLGA's VJ classes.

    Returns:
        A ``pgen.GenerationProbabilityVJ`` constructed from the loaded
        generative model and genomic data.
    """
    model_dir = DEFAULT_HUMAN_T_ALPHA_PATH

    # Genomic data: parameters file plus the V and J anchor position tables.
    vj_genomic_data = load_model.GenomicDataVJ()
    vj_genomic_data.load_igor_genomic_data(
        model_dir / "model_params.txt",
        model_dir / "V_gene_CDR3_anchors.csv",
        model_dir / "J_gene_CDR3_anchors.csv",
    )

    # Generative model: loaded from the marginals file.
    vj_generative_model = load_model.GenerativeModelVJ()
    vj_generative_model.load_and_process_igor_model(model_dir / "model_marginals.txt")

    return pgen.GenerationProbabilityVJ(vj_generative_model, vj_genomic_data)

In [7]:
def get_beta_pgen_model():
    """Build a generation-probability model from OLGA's human T beta files.

    Reads the model parameters, V/J CDR3 anchor tables and model marginals
    found under ``DEFAULT_HUMAN_T_BETA_PATH`` and wires them together using
    OLGA's VDJ classes.

    Returns:
        A ``pgen.GenerationProbabilityVDJ`` constructed from the loaded
        generative model and genomic data.
    """
    model_dir = DEFAULT_HUMAN_T_BETA_PATH

    # Genomic data: parameters file plus the V and J anchor position tables.
    vdj_genomic_data = load_model.GenomicDataVDJ()
    vdj_genomic_data.load_igor_genomic_data(
        model_dir / "model_params.txt",
        model_dir / "V_gene_CDR3_anchors.csv",
        model_dir / "J_gene_CDR3_anchors.csv",
    )

    # Generative model: loaded from the marginals file.
    vdj_generative_model = load_model.GenerativeModelVDJ()
    vdj_generative_model.load_and_process_igor_model(model_dir / "model_marginals.txt")

    return pgen.GenerationProbabilityVDJ(vdj_generative_model, vdj_genomic_data)

In [8]:
def compute_pgen(model, cdr3, v, j) -> "float | None":
    """Compute the generation probability of one amino-acid CDR3 sequence.

    Args:
        model: Object exposing ``compute_aa_CDR3_pgen(cdr3, v, j)``.
        cdr3: CDR3 amino-acid sequence, or a missing value (NaN / None).
        v: V gene name, or a missing value.
        j: J gene name, or a missing value.

    Returns:
        The value returned by ``model.compute_aa_CDR3_pgen``, or ``None`` when
        ``cdr3`` is missing (there is no sequence to evaluate).
    """
    if pd.isna(cdr3):
        return None

    # Missing gene calls arrive from pandas as NaN; hand them to the model as
    # None instead (presumably OLGA then leaves that gene unconstrained --
    # confirm against the OLGA documentation).
    v_gene = None if pd.isna(v) else v
    j_gene = None if pd.isna(j) else j

    return model.compute_aa_CDR3_pgen(cdr3, v_gene, j_gene)

### Compute pGens

In [9]:
alpha_model = get_alpha_pgen_model()


def _alpha_pgen_for_row(row):
    """Return the alpha-chain pGen for one row of the table."""
    return compute_pgen(alpha_model, row["CDR3A"], row["TRAV"], row["TRAJ"])


# Row-wise evaluation: one model call per row, stored as a new column.
df["alpha_pgen"] = df.apply(_alpha_pgen_for_row, axis=1)

In [10]:
beta_model = get_beta_pgen_model()


def _beta_pgen_for_row(row):
    """Return the beta-chain pGen for one row of the table."""
    return compute_pgen(beta_model, row["CDR3B"], row["TRBV"], row["TRBJ"])


# Row-wise evaluation: one model call per row, stored as a new column.
df["beta_pgen"] = df.apply(_beta_pgen_for_row, axis=1)

In [11]:
# Inspect the table again now that the alpha_pgen and beta_pgen columns have been added.
df

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,clone_count,alpha_pgen,beta_pgen
0,TRAV1-1,CAAAPGFQKLVF,TRAJ8,TRBV7-2,CASSPTVSQETQYF,TRBJ2-5,,,,1,3.862730e-10,2.374134e-10
1,TRAV1-1,CAAFAGYQKVTF,TRAJ13,TRBV7-2,CASSLGSRQGRGRRGYTF,TRBJ1-2,,,,1,1.875100e-10,3.013049e-14
2,TRAV1-1,CAAFANQAGTALIF,TRAJ15,TRBV7-6,CASSFNQDSRYEQYF,TRBJ2-7,,,,1,2.317307e-10,1.199900e-12
3,TRAV1-1,CAAKAAGNKLTF,TRAJ17,TRBV25-1,CASSEWGKGEAFF,TRBJ1-1,,,,1,2.131252e-07,1.664200e-10
4,TRAV1-1,CAALNQAGTALIF,TRAJ15,TRBV6-6,CASSPWDRARDEKLFF,TRBJ1-4,,,,1,2.143031e-08,8.744084e-13
...,...,...,...,...,...,...,...,...,...,...,...,...
84128,,CAVRVNSGNTPLVF,TRAJ29,TRBV30,CAWSSMAGVAQETQYF,TRBJ2-5,,,,1,5.401808e-07,1.619482e-13
84129,,CAVSDWAGKSTF,TRAJ27,TRBV24-1,CATSDPIGGPNTGELFF,TRBJ2-2,,,,1,2.175962e-09,7.669239e-11
84130,,CAVTPSNTGKLIF,TRAJ37,TRBV12-3,CASSLGAGDVGELFF,TRBJ2-2,,,,1,7.706339e-07,6.594617e-12
84131,,CGTNNNAGNMLTF,TRAJ39,TRBV30,CAWSVPGLGNEKLFF,TRBJ1-4,,,,1,1.443354e-07,7.180668e-10


In [12]:
# Export only the two computed pGen columns. The index is not written, so the
# rows of the output file correspond to the input rows by position alone.
PGEN_COLUMNS = ["alpha_pgen", "beta_pgen"]
pgen_df = df[PGEN_COLUMNS]

pgen_df.to_csv(
    "/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/preprocessed/tanno/test_pgens.csv",
    index=False,
)