In [None]:
# Auto-reload imported modules so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
import os

# Sanity check: the path of the standard-library `os` module shows which
# Python installation this kernel is running from.
os.__file__

In [None]:
from patch_gnn.data import load_ghesquire

# Load the dataset through the project loader; later cells use the result
# as a pandas DataFrame.
data = load_ghesquire()

In [None]:
from functools import partial, singledispatch
from multipledispatch import dispatch

@dispatch(float)
def _split_delimiter(x, delimiter=";"):
    """
    Float overload of the delimiter-splitting helper.

    A float has nothing to split, so the value is handed back unchanged.

    :param x: Float value to pass through.
    :param delimiter: Accepted for signature parity with the string
        overload; not used here.
    :returns: ``x``, unmodified.
    """
    return x

@dispatch(str)
def _split_delimiter(x, delimiter=";"):
    """
    String overload of the delimiter-splitting helper.

    :param x: String to break apart.
    :param delimiter: Separator to split on (``";"`` by default).
    :returns: List of the substrings of ``x`` between occurrences of
        ``delimiter``.
    """
    pieces = x.split(delimiter)
    return pieces

In [None]:
from patch_gnn.seqops import met_position

In [None]:
import janitor
# Clean the raw table:
#   1. drop rows without an accession;
#   2. split the ";"-delimited `isoforms` field and give each entry its own row;
#   3. cut each entry at " (" and keep only the part before it, trimming spaces
#      (non-string values are passed through untouched);
#   4. keep one row per unique `sequence`;
#   5. add a `met_position` column computed by `met_position` via `join_apply`.
#
# The splitting helper defined in this notebook is `_split_delimiter` (with a
# leading underscore). The un-prefixed name is not defined anywhere, so using
# it raised NameError on a fresh kernel.
processed_data = (
    data.dropna(subset=["accession"])
    .transform_column("isoforms", _split_delimiter)
    .explode("isoforms")
    .transform_column("isoforms", partial(_split_delimiter, delimiter=" ("))
    .transform_column("isoforms", lambda x: x[0] if isinstance(x, list) else x)
    .transform_column(
        "isoforms", lambda x: x.strip(" ") if isinstance(x, str) else x
    )
    .drop_duplicates("sequence")
    .join_apply(met_position, "met_position")
)


## Prototype for using 3D models

In [None]:
from pyprojroot import here
import os

# Directory (resolved from the project root) holding the model files,
# and the raw listing of its contents.
models_path = here().joinpath("data/ghesquire_2011/models")
protein_models = os.listdir(models_path)

In [None]:
import os

# Model names are the file names in `models_path` minus the ".pdb" extension.
#
# `str.strip(".pdb")` is NOT a suffix remover: it strips any run of the
# characters '.', 'p', 'd', 'b' from BOTH ends, so a name that starts or ends
# with one of those characters would be silently truncated. `os.path.splitext`
# removes exactly the extension. Entries that are not ".pdb" files are skipped,
# since they can never be loaded as "<name>.pdb" anyway.
built_models = [
    stem
    for stem, ext in map(os.path.splitext, os.listdir(models_path))
    if ext == ".pdb"
]
built_models

In [None]:
# Rows of the processed table whose accession has a built model.
processed_data.loc[processed_data["accession"].isin(built_models)]

In [None]:
import networkx as nx

def get_node(G: "nx.Graph", pos: int):
    """
    Return the single node of ``G`` whose ``residue_number`` equals ``pos``.

    :param G: Graph whose nodes carry a ``residue_number`` data attribute.
    :param pos: Residue number to look up.
    :returns: Identifier of the unique matching node.
    :raises ValueError: If no node matches ``pos``, or if more than one does.
        ``ValueError`` is an ``Exception`` subclass, so callers that catch
        ``Exception`` keep working.
    """
    # Nodes lacking the attribute are skipped instead of aborting the whole
    # lookup with a KeyError.
    matches = [
        n
        for n, d in G.nodes(data=True)
        if "residue_number" in d and d["residue_number"] == pos
    ]
    if len(matches) == 1:
        return matches[0]
    if not matches:
        raise ValueError(
            f"Node not found! No node has residue_number == {pos!r}."
        )
    # More than one hit is a different problem from "nothing found"; say so.
    raise ValueError(
        f"Node not found! {len(matches)} nodes share "
        f"residue_number == {pos!r}: {matches!r}"
    )

In [None]:
from proteingraph import read_pdb

In [None]:
from tqdm.auto import tqdm

In [None]:
def load_model(model: str):
    """
    Read the PDB file belonging to ``model`` from ``models_path``.

    :param model: Model name; the file read is ``<models_path>/<model>.pdb``.
    :returns: ``(model, parsed)`` where ``parsed`` is whatever ``read_pdb``
        returns. If anything raises while reading, the exception is printed
        and ``None`` is returned instead.
    """
    try:
        parsed = read_pdb(models_path / f"{model}.pdb")
    except Exception as err:
        # Best-effort loading: report the problem and let the caller filter
        # out the resulting None.
        print(err)
        return None
    else:
        return model, parsed
    
        
        

In [None]:
from joblib import Parallel, delayed

In [None]:
# Load every built model in parallel (n_jobs=-1). Each entry is what
# `load_model` returned for that name.
results = Parallel(n_jobs=-1)(
    delayed(load_model)(model_name) for model_name in built_models
)

In [None]:
# Discard the None entries, then display what is left as a mapping
# built from the remaining pairs.
results = list(filter(lambda loaded: loaded is not None, results))
dict(results)

In [None]:
# Spot check: the `met_position` value recorded for one example accession.
processed_data.set_index("accession").loc['P15374', "met_position"]

In [None]:
from patch_gnn.graph import extract_neighborhood

# For every loaded model, find the node at the recorded `met_position` and
# keep the subgraph that `extract_neighborhood` returns around it
# (third argument 1, presumably the neighbourhood radius: confirm against
# patch_gnn.graph).
met_graphs = dict()

# Index the table by accession once. Rebuilding the index inside the loop
# repeats identical work for every model.
met_positions = processed_data.set_index("accession")["met_position"]

for accession, g in tqdm(results):
    try:
        # The lookup lives inside the try so that a model with no row in the
        # table is reported and skipped like any other per-model failure,
        # instead of aborting the whole loop.
        pos = met_positions.loc[accession]
        metnode = get_node(g, pos)
        met_g = extract_neighborhood(g, metnode, 1)
        met_graphs[accession] = met_g
    except Exception as e:
        # Name the model so the failure can be traced.
        print(f"{accession}: {e!r}")

In [None]:
# Number of models for which a neighbourhood graph was stored
# (models that raised during extraction have no entry).
len(met_graphs)

## Calculate the degree of the Met node

In [None]:
# Degree of the target node inside each extracted neighbourhood graph,
# keyed by accession.
met_dcs = dict()

# Index the table by accession once. Rebuilding the index inside the loop
# repeats identical work for every graph.
met_positions = processed_data.set_index("accession")["met_position"]

for acc, g in tqdm(met_graphs.items()):
    pos = met_positions.loc[acc]
    metnode = get_node(g, pos)
    met_dcs[acc] = nx.degree(g)[metnode]

In [None]:
# Inspect the column names of the processed table.
processed_data.columns

In [None]:
import pandas as pd  # `pd` is used below but was never imported in this notebook

# Attach each accession's node degree to the table (joined on the accession
# index), keep only rows that have a degree, and scatter-plot the degree
# against `ox_fwd_logit`.
(
    processed_data.set_index("accession")
    .join(pd.Series(met_dcs, name="met_degree"))
    .dropna(subset=["met_degree"])
    .plot(kind="scatter", x="met_degree", y="ox_fwd_logit")
)