In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch_geometric.utils as geo_utils
from torch_geometric.loader import DataLoader


from utils import *
import graph_learning

In [2]:
df_syn = load_synergyage()
df_syn.head(1)

Unnamed: 0.1,Unnamed: 0,Pubmed ID,Temperature,Diet,Details,Wild type lifespan (days),Gene 1,Intervention on gene 1,Gene 1 single mutant lifespan (days),"Gene(s) 2(,3)",...,"Double (triple) mutant (genes 1,2,(3)) lifespan (days)",Phenotype description,Organism,genes,genes_str,num_genes,FC,LOGFC,PERCENT_CHANGE,LIFESPAN_CLASS
0,0,19461873,25,OP50,�Late L4 larvae growing at 20�C were transferr...,13.6,hif-1,RNAi,16.1,daf-16,...,12.4,Lifespan extension by�hif-1�did not require DA...,6239,"('daf-16', 'hif-1')",daf-16;hif-1,2,0.911765,-0.133267,-0.088235,NS


In [3]:
df_biogrid = load_biogrid()
df_biogrid.head(1)

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types,Organism Name Interactor A,Organism Name Interactor B
0,63578,177286,179791,42412,44810,CELE_AC7.2,CELE_W07G4.5,soc-2,W07G4.5,-,...,Q1HB02,NP_506261,-,-,-,-,-,-,Caenorhabditis elegans,Caenorhabditis elegans


In [4]:
def encode_intervention_class(intervention_class: str) -> torch.Tensor:        
    if intervention_class == "NS":
        code = [0, 1, 0]
    elif intervention_class == "PRO":
        code = [0, 0, 1]
    elif intervention_class == "ANTI":
        code = [1, 0 ,0]
    else:
        raise ValueError(f"Intervention class {intervention_class} not recognized.")
    return torch.as_tensor(code).type(torch.LongTensor)


# 0) Clean df_synergy (current setup):
#    a) Remove genes not in biogrid
biogrid_genes = get_biogrid_gene_names(df_biogrid)
# synergyage_genes = get_synergyage_genes(df_syn)
selected = []
for i, row in df_syn.iterrows():
    genes = row["genes_str"].split(";")
    row_has_gene_not_in_biogrid = False
    for gene in genes:
        if gene not in biogrid_genes:
            row_has_gene_not_in_biogrid = True
            break
    if row_has_gene_not_in_biogrid:
        continue
    else:
        selected.append(i)
df_syn = df_syn.iloc[selected, :].copy()
#    b) remove duplicated interventions (later)

# 1) Shuffle df_synergy
df_syn = df_syn.iloc[np.random.permutation(len(df_syn)).tolist()].copy()

In [5]:
# 2) Create array of graphs and target values
SAMPLE_SIZE = 10  #df_syn.shape[0]
graph_list = []
target_list = []
for i, row in df_syn.iloc[1:SAMPLE_SIZE].iterrows():
    intervention_genes = row["genes_str"].split(";")
    # For removing the nodes that are KO
    # biogrid_mask = ( (~df_biogrid[GENE_NAME_A_COL].isin(intervention_genes)) & (~df_biogrid[GENE_NAME_B_COL].isin(intervention_genes)))
    # df_biogrid_trimmed = df_biogrid.loc[biogrid_mask].copy()

    G = nx.from_pandas_edgelist(
        df_biogrid[[GENE_NAME_A_COL, GENE_NAME_B_COL]],
        source=GENE_NAME_A_COL,
        target=GENE_NAME_B_COL,
    )
    for node in G.nodes:
        if node in intervention_genes:
            G.nodes[node]["HAS_INTERVENTION"] = 1
        else:
            G.nodes[node]["HAS_INTERVENTION"] = 0
    graph_list.append(G)

    class_encoded: torch.Tensor = encode_intervention_class(row["LIFESPAN_CLASS"])
    target_list.append(class_encoded)


In [6]:
# 3) Create train and test datasets
data_list = []
for graph, target in zip(graph_list, target_list):
    data = geo_utils.from_networkx(graph, group_node_attrs=["HAS_INTERVENTION"])
    data.x = data.x.type(torch.FloatTensor)
    data.y = target
    assert data.y.type() == 'torch.LongTensor'
    data_list.append(data)

In [7]:
NUM_EPOCHS = 30
BATCH_SIZE = 16

loader: DataLoader = graph_learning.loader_from_datalist(data_list, batch_size=BATCH_SIZE)

In [8]:
model = graph_learning.create_GNN_model(1, 10, 3)
graph_learning.train_graph_classification(model, loader, loader, num_epochs = NUM_EPOCHS)

TypeError: unsupported format string passed to ModelMetrics.__format__

In [None]:
for data in loader:
    d = data
    break

In [None]:
np.argmax(d.y.reshape(9, 3), axis=1)

tensor([2, 2, 2, 1, 0, 2, 1, 0, 0])

In [None]:
d.y.reshape(9, 3)

tensor([[0, 0, 1],
        [0, 0, 1],
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
        [1, 0, 0]])