In [1]:
import torch 
import pandas as pd
emb_1 = torch.load(r"..\artifacts\train_embedding\BRCA\1_embedding.pt")

In [2]:
# load knowlege link and information 

df_protein = pd.read_csv(r"..\PPI\9606.protein.info.v12.0.txt", sep="\t")

df_protein.head()

Unnamed: 0,#string_protein_id,preferred_name,protein_size,annotation
0,9606.ENSP00000000233,ARF5,180,ADP-ribosylation factor 5; GTP-binding protein...
1,9606.ENSP00000000412,M6PR,277,Cation-dependent mannose-6-phosphate receptor;...
2,9606.ENSP00000001008,FKBP4,459,"Peptidyl-prolyl cis-trans isomerase FKBP4, N-t..."
3,9606.ENSP00000001146,CYP26B1,512,Cytochrome P450 26B1; Involved in the metaboli...
4,9606.ENSP00000002125,NDUFAF7,441,"Protein arginine methyltransferase NDUFAF7, mi..."


In [3]:
df_protein_link = pd.read_csv(r"..\PPI\9606.protein.links.v12.0.txt", sep="\s")
print(df_protein_link.shape)
df_protein_link.head()

  df_protein_link = pd.read_csv(r"..\PPI\9606.protein.links.v12.0.txt", sep="\s")


(13715404, 3)


Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000356607,173
1,9606.ENSP00000000233,9606.ENSP00000427567,154
2,9606.ENSP00000000233,9606.ENSP00000253413,151
3,9606.ENSP00000000233,9606.ENSP00000493357,471
4,9606.ENSP00000000233,9606.ENSP00000324127,201


In [4]:
df_protein_merged = pd.merge(df_protein_link, df_protein[['#string_protein_id','preferred_name']], left_on="protein1", right_on="#string_protein_id")
df_protein_merged.rename(columns={"preferred_name":"protein1_name"}, inplace=True)

df_protein_merged = pd.merge(df_protein_merged, df_protein[['#string_protein_id','preferred_name']], left_on="protein2", right_on="#string_protein_id")
df_protein_merged.rename(columns={"preferred_name":"protein2_name"}, inplace=True)

# drop columns
df_protein_merged.drop(columns=["#string_protein_id_x", "#string_protein_id_y", "protein1" , "protein2"], inplace=True)
df_protein_merged.head()


Unnamed: 0,combined_score,protein1_name,protein2_name
0,173,ARF5,RALGPS2
1,154,ARF5,FHDC1
2,151,ARF5,ATP6V1E1
3,471,ARF5,CYTH2
4,201,ARF5,PSD3


In [5]:
# load omic data header 1
df_omic_1 = pd.read_csv(r"..\artifacts\data_preprocessing\BRCA\1_featname.csv",header=None)
df_omic_1['gene_name'] = df_omic_1[0].apply(lambda x: x.split("|")[0])
df_omic_1 = df_omic_1.index.to_frame(name="gene_idx").join(df_omic_1)

print(df_omic_1.shape)
df_omic_1.head()

(1000, 3)


Unnamed: 0,gene_idx,0,gene_name
0,0,ABAT|18,ABAT
1,1,ABCF1|23,ABCF1
2,2,ABCF2|10061,ABCF2
3,3,ACOT4|122970,ACOT4
4,4,ACOT9|23597,ACOT9


In [6]:
# load omic data header 2
df_protein_merged = df_protein_merged.merge(df_omic_1[['gene_idx' , 'gene_name']] , left_on="protein1_name", right_on="gene_name" , how="left")
df_protein_merged.rename(columns={"gene_idx":"gene1_idx"}, inplace=True)

df_protein_merged = df_protein_merged.merge(df_omic_1[['gene_idx' , 'gene_name']] , left_on="protein2_name", right_on="gene_name" , how="left")
df_protein_merged.rename(columns={"gene_idx":"gene2_idx"}, inplace=True)

df_protein_merged.drop(columns=["gene_name_x", "gene_name_y"], inplace=True)

df_protein_merged.head()

Unnamed: 0,combined_score,protein1_name,protein2_name,gene1_idx,gene2_idx
0,173,ARF5,RALGPS2,,723.0
1,154,ARF5,FHDC1,,
2,151,ARF5,ATP6V1E1,,
3,471,ARF5,CYTH2,,
4,201,ARF5,PSD3,,


In [7]:
# filter rows with only gene1_idx and gene2_idx
df_filter_protein = df_protein_merged[df_protein_merged['gene1_idx'].notnull()][df_protein_merged['gene2_idx'].notnull()]

  df_filter_protein = df_protein_merged[df_protein_merged['gene1_idx'].notnull()][df_protein_merged['gene2_idx'].notnull()]


In [8]:
from tqdm import tqdm
knowledge_tensor = torch.zeros(1000 , 1000)
with tqdm(total=df_filter_protein.shape[0]) as pbar:
    for idx , row in df_filter_protein.iterrows():
        knowledge_tensor[int(row['gene1_idx']) , int(row['gene2_idx'])] += 1
        #knowledge_tensor[int(row['gene2_idx']) , int(row['gene1_idx'])] += 1
        pbar.update(1)
    

  0%|          | 0/90654 [00:00<?, ?it/s]

100%|██████████| 90654/90654 [00:11<00:00, 7663.99it/s] 


In [9]:
from amogel.utils.common import symmetric_matrix_to_coo , coo_to_pyg_data
coo_matrix = symmetric_matrix_to_coo(knowledge_tensor.numpy() , 1)
graph = coo_to_pyg_data(coo_matrix=coo_matrix , node_features=emb_1)
print(graph)

Data(x=[1000, 128], edge_index=[2, 90654], edge_attr=[90654, 1], num_nodes=1000)


In [10]:
# load the sample 
df_label_train  = pd.read_csv(r"..\artifacts\data_preprocessing\BRCA\labels_tr.csv", header=None)
df_label_test   = pd.read_csv(r"..\artifacts\data_preprocessing\BRCA\labels_te.csv", header=None)

df = pd.read_csv(r"../artifacts/data_preprocessing/BRCA/1_tr.csv", header=None)
graphs_training = []
with tqdm(total=df.shape[0]) as pbar:
    for idx , sample in df.iterrows():
        torch_sample = torch.tensor(sample.values, dtype=torch.float32).unsqueeze(-1)
        node_embedding = torch.concat([torch_sample , emb_1] , dim=-1)
        graph = coo_to_pyg_data(coo_matrix=coo_matrix , node_features=node_embedding , y = torch.tensor(df_label_train.iloc[idx].values , dtype=torch.long) )
        graphs_training.append(graph)
        pbar.update(1)
        

graphs_testing = []
df = pd.read_csv(r"../artifacts/data_preprocessing/BRCA/1_te.csv", header=None)
with tqdm(total=df.shape[0]) as pbar:
    for idx , sample in df.iterrows():
        torch_sample = torch.tensor(sample.values, dtype=torch.float32).unsqueeze(-1)
        node_embedding = torch.concat([torch_sample , emb_1] , dim=-1)
        graph = coo_to_pyg_data(coo_matrix=coo_matrix , node_features=node_embedding , y = torch.tensor(df_label_test.iloc[idx].values , dtype=torch.long))
        graphs_testing.append(graph)
        pbar.update(1)        

100%|██████████| 615/615 [00:03<00:00, 183.82it/s]
100%|██████████| 154/154 [00:00<00:00, 199.76it/s]


In [11]:
graphs_training[0]

Data(x=[1000, 129], edge_index=[2, 90654], edge_attr=[90654, 1], y=[1], num_nodes=1000)

In [55]:
# build simple GCN model for graph classification 
from torch_geometric.loader import DataLoader   
import pytorch_lightning as pl
from torchmetrics.classification import Accuracy , Precision , Recall , AUROC , ConfusionMatrix

train_loader = DataLoader(graphs_training, batch_size=32, shuffle=True)
test_loader = DataLoader(graphs_testing, batch_size=32, shuffle=False)

from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv , BatchNorm 
from torch_geometric.nn import global_mean_pool

class GCN(pl.LightningModule):
    def __init__(self, num_features ,  hidden_channels , output_class):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, output_class)
        self.criterion = torch.nn.CrossEntropyLoss()
        self.accuracy = Accuracy(task="multiclass", num_classes=output_class)
        self.precision = Precision(task="multiclass" , num_classes=output_class)
        self.recall = Recall(task="multiclass" , num_classes=output_class)
        self.auroc = AUROC(task="multiclass" ,num_classes=output_class)
        self.cfm_training = ConfusionMatrix(task="multiclass", num_classes=output_class)
        self.cfm_testing = ConfusionMatrix(task="multiclass", num_classes=output_class)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        
        return x

    def training_step(self, batch, batch_idx):
        x , edge_index , batch , y = batch.x , batch.edge_index , batch.batch , batch.y
        out = self(x, edge_index, batch)
        loss = self.criterion(out, y)
        acc = self.accuracy(out, y)
        self.cfm_training(out , y)
        
        self.log('train_loss' , loss , prog_bar=True)
        self.log('train_acc' , acc , prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x , edge_index , batch , y = batch.x , batch.edge_index , batch.batch , batch.y
        out = self(x, edge_index, batch)
        loss = self.criterion(out, y)
        acc = self.accuracy(out, y)
        preci = self.precision(out, y)
        rec = self.recall(out, y)
        auroc = self.auroc(out, y)
        cfm = self.cfm_testing(out, y)  
        
        self.log('val_loss' , loss , prog_bar=True, on_epoch=True)
        self.log('val_acc' , acc , prog_bar=True, on_epoch=True)
        self.log('val_preci' , preci , prog_bar=True, on_epoch=True)
        self.log('val_rec' , rec , prog_bar=True, on_epoch=True)
        self.log('val_auroc' , auroc , prog_bar=True , on_epoch=True)
    
    def on_train_epoch_end(self) -> None:
        
        cfm = self.cfm_training.compute().cpu().numpy()
        print("")
        print("-------- Confusion Matrix [Training] --------")
        print(cfm)
        
        self.cfm_training.reset()
        
    def on_validation_epoch_end(self):
        
        cfm = self.cfm_testing.compute().cpu().numpy()
        print("")
        print("-------- Confusion Matrix [Testing] --------")
        print(cfm)
        
        self.cfm_testing.reset()
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.0001)


model = GCN(num_features=129 , hidden_channels=64 , output_class=5)
trainer = pl.Trainer(max_epochs=100)
trainer.fit(model, train_loader, test_loader)

[2024-04-18 22:19:01,313: INFO: setup: GPU available: False, used: False]
[2024-04-18 22:19:01,314: INFO: setup: TPU available: False, using: 0 TPU cores]
[2024-04-18 22:19:01,315: INFO: setup: IPU available: False, using: 0 IPUs]
[2024-04-18 22:19:01,315: INFO: setup: HPU available: False, using: 0 HPUs]
[2024-04-18 22:19:01,321: INFO: model_summary: 
   | Name      | Type                      | Params
---------------------------------------------------------
0  | conv1     | GCNConv                   | 8.3 K 
1  | bn1       | BatchNorm                 | 128   
2  | conv2     | GCNConv                   | 4.2 K 
3  | bn2       | BatchNorm                 | 128   
4  | conv3     | GCNConv                   | 4.2 K 
5  | lin       | Linear                    | 325   
6  | criterion | CrossEntropyLoss          | 0     
7  | accuracy  | MulticlassAccuracy        | 0     
8  | precision | MulticlassPrecision       | 0     
9  | recall    | MulticlassRecall          | 0     
10 | auroc     

c:\Users\tchia\miniconda3\envs\gnn\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  0.52it/s]



Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:03<00:00,  0.51it/s]
-------- Confusion Matrix [Testing] --------
[[ 0  0  0  0  8]
 [ 0  0  0  0  2]
 [ 0  0  0  0 38]
 [ 0  0  0  0 15]
 [ 0  0  0  0  1]]
                                                                           

c:\Users\tchia\miniconda3\envs\gnn\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
c:\Users\tchia\miniconda3\envs\gnn\lib\site-packages\pytorch_lightning\loops\fit_loop.py:298: The number of training batches (20) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 0: 100%|██████████| 20/20 [01:04<00:00,  0.31it/s, v_num=19, train_loss=1.550, train_acc=0.429] 
-------- Confusion Matrix [Testing] --------
[[  0   2  57   3  68]
 [  0   0  21   1  24]
 [  6   4 193  20 196]
 [  2   1  77   4  56]
 [  0   0  20   0  14]]
Epoch 0: 100%|██████████| 20/20 [01:15<00:00,  0.26it/s, v_num=19, train_loss=1.550, train_acc=0.429, val_loss=1.550, val_acc=0.545, val_preci=0.545, val_rec=0.545, val_auroc=0.486]
-------- Confusion Matrix [Training] --------
[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
Epoch 1:   0%|          | 0/20 [00:00<?, ?it/s, v_num=19, train_loss=1.550, train_acc=0.429, val_loss=1.550, val_acc=0.545, val_preci=0.545, val_rec=0.545, val_auroc=0.486]         



Epoch 1: 100%|██████████| 20/20 [01:15<00:00,  0.27it/s, v_num=19, train_loss=1.480, train_acc=0.571, val_loss=1.550, val_acc=0.545, val_preci=0.545, val_rec=0.545, val_auroc=0.486]
-------- Confusion Matrix [Testing] --------
[[  1   0  98   4  27]
 [  0   0  32   1  13]
 [  2   0 324  16  77]
 [  1   1 108   6  24]
 [  0   0  24   1   9]]
Epoch 1: 100%|██████████| 20/20 [01:25<00:00,  0.23it/s, v_num=19, train_loss=1.480, train_acc=0.571, val_loss=1.530, val_acc=0.545, val_preci=0.545, val_rec=0.545, val_auroc=0.531]
-------- Confusion Matrix [Training] --------
[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
Epoch 2:  65%|██████▌   | 13/20 [00:57<00:30,  0.23it/s, v_num=19, train_loss=1.540, train_acc=0.469, val_loss=1.530, val_acc=0.545, val_preci=0.545, val_rec=0.545, val_auroc=0.531]