In [1]:
!apt-get install -y python-rdkit librdkit1 rdkit-data
!pip install rdkit

Reading package lists... Done
Building dependency tree       
Reading state information... Done
E: Unable to locate package python-rdkit
Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [2]:
!pip install ogb

Collecting ogb
  Downloading ogb-1.3.6-py3-none-any.whl.metadata (6.2 kB)
Collecting outdated>=0.2.0 (from ogb)
  Downloading outdated-0.2.2-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting littleutils (from outdated>=0.2.0->ogb)
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading ogb-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading outdated-0.2.2-py2.py3-none-any.whl (7.5 kB)
Building wheels for collected packages: littleutils
  Building wheel for littleutils (setup.py) ... [?25ldone
[?25h  Created wheel for littleutils: filename=littleutils-0.2.2-py3-none-any.whl size=7026 sha256=6984561169f03c214b088a463dec429693f03970eb0463c9cb9a0d7ebefbfde2
  Stored in directory: /root/.cache/pip/wheels/3d/fe/b0/27a9892da57472e538c7452a721a9cf463cc03cf7379889266
Successfully built littleutils
Installing collected packages: l

In [3]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.5.2-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.5.2-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.5.2


In [4]:
from ogb.lsc import PygPCQM4Mv2Dataset, PCQM4Mv2Evaluator
from ogb.graphproppred.mol_encoder import AtomEncoder,BondEncoder
import torch
import torch.nn.functional as F
from torch_geometric.nn import GINConv, GCNConv
from torch_geometric.nn.pool import global_add_pool
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch_geometric.loader import DataLoader
import os
import time
import random
import numpy as np
from tqdm.auto import tqdm
from torch_geometric.datasets import PCQM4Mv2

In [5]:
# Seed every random number generator the notebook touches so reruns are
# reproducible.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
# Covers every visible GPU; silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(SEED)
random.seed(SEED)

# Use the GPU when one is present, otherwise fall back to the CPU so that the
# later `.to(device)` calls do not fail on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
class GNN_graph(torch.nn.Module):
    """Graph-level regressor built from stacked GIN or GCN layers.

    Pipeline: atom embedding -> ``num_layers`` x (conv, batch norm, [ReLU],
    dropout) -> ``global_add_pool`` over each graph -> linear head producing
    one value per graph. Bond (edge) features are not used.

    Args:
        num_layers: Number of message-passing layers; must be at least 2.
        emb_dim: Width of the atom embedding and of every hidden layer.
        drop_ratio: Dropout probability applied after every layer.
        gnn_type: ``'GIN'`` or ``'GCN'``.

    Raises:
        ValueError: If ``num_layers`` is below 2 or ``gnn_type`` is unknown.
    """

    def __init__(self, num_layers=5, emb_dim=100, drop_ratio=0.5, gnn_type='GIN'):
        super().__init__()

        # Validate the configuration before building any sub-module so that a
        # bad argument fails immediately instead of producing a broken model.
        if num_layers < 2:
            raise ValueError("Number of layers must be more than 1")
        if gnn_type not in ('GIN', 'GCN'):
            raise ValueError("Invalid GNN type called")

        self.num_layers = num_layers
        self.drop_ratio = drop_ratio
        self.atom_encoder = AtomEncoder(emb_dim)
        # NOTE(review): this single MLP instance is handed to every GINConv
        # below, so the GIN layers appear to share one set of MLP weights --
        # confirm this is intended. It is also created (but unused) for 'GCN'.
        # Left unchanged so that the module layout stays the same.
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(emb_dim, emb_dim),
            torch.nn.BatchNorm1d(emb_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(emb_dim, emb_dim),
        )

        self.graph_pool = global_add_pool
        self.linear_pred = torch.nn.Linear(emb_dim, 1)

        self.convs = torch.nn.ModuleList()
        self.norms = torch.nn.ModuleList()

        for _ in range(num_layers):
            if gnn_type == 'GIN':
                self.convs.append(GINConv(self.mlp))
            else:  # 'GCN' -- the only other value accepted above
                self.convs.append(GCNConv(emb_dim, emb_dim, normalize=False))

            self.norms.append(torch.nn.BatchNorm1d(emb_dim))

    def forward(self, batched_data):
        """Return one prediction per graph in ``batched_data``.

        Args:
            batched_data: Batch object exposing ``x``, ``edge_index`` and
                ``batch`` attributes.

        Returns:
            The output of the final linear layer applied to the pooled graph
            features (one row per graph).
        """
        x, edge_index, batch = batched_data.x, batched_data.edge_index, batched_data.batch

        h = self.atom_encoder(x)
        last_layer = self.num_layers - 1
        for layer in range(self.num_layers):
            h = self.convs[layer](h, edge_index)
            h = self.norms[layer](h)

            # No ReLU after the last layer; dropout is applied after every layer.
            if layer != last_layer:
                h = F.relu(h)
            h = F.dropout(h, self.drop_ratio, training=self.training)

        graph_feat = self.graph_pool(h, batch)
        return self.linear_pred(graph_feat)

In [7]:
def test(model, device, loader):
    """Run ``model`` over ``loader`` in eval mode and collect its predictions.

    Args:
        model: Module called as ``model(batch)``; its output is flattened
            with ``view(-1)``.
        device: Device every batch is moved to before the forward pass.
        loader: Iterable of batches that support ``.to(device)``.

    Returns:
        A list with one 0-dim CPU tensor per prediction, each divided by 2.0.
        An empty list is returned when ``loader`` yields no batches.
    """
    model.eval()

    y_pred = []
    # Inference only: a single no_grad context around the whole loop.
    with torch.no_grad():
        for batch in tqdm(loader, desc="Iteration"):
            batch = batch.to(device)
            y_pred.append(model(batch).view(-1).cpu())

    if not y_pred:
        # torch.cat raises on an empty list, so an empty loader is handled here.
        return []

    y_pred = torch.cat(y_pred, dim=0)

    # NOTE(review): every prediction is halved before being returned; the
    # reason is not visible in this notebook -- confirm it is intended.
    return [pred / 2.0 for pred in y_pred]

In [8]:
import pandas as pd
from ogb.utils import smiles2graph
from torch_geometric.data import Data

In [9]:
class OnTheFlyPCQMDataset(object):
    """Map-style dataset that turns SMILES strings into ``Data`` graphs on access.

    Nothing is pre-processed or cached: each ``__getitem__`` call runs
    ``smiles2graph`` on the requested molecule.

    Args:
        smiles_list: Indexable collection of rows. A row is either a bare
            SMILES string or a sequence whose first entry is the SMILES
            string (any further entries, e.g. a label, are ignored).
        smiles2graph: Callable mapping a SMILES string to a dict with the keys
            ``'num_nodes'``, ``'edge_index'``, ``'edge_feat'`` and
            ``'node_feat'`` (the last three being NumPy arrays).
    """

    def __init__(self, smiles_list, smiles2graph=smiles2graph):
        super().__init__()
        self.smiles_list = smiles_list
        self.smiles2graph = smiles2graph

    def __getitem__(self, idx):
        """Build and return the graph for the molecule at position ``idx``."""
        row = self.smiles_list[idx]
        # Accept rows with any number of columns (or a bare string) instead of
        # requiring exactly ``(smiles, label)``; the label is not used here.
        smiles = row if isinstance(row, str) else row[0]
        graph = self.smiles2graph(smiles)

        data = Data()
        data.__num_nodes__ = int(graph['num_nodes'])
        data.edge_index = torch.from_numpy(graph['edge_index']).to(torch.int64)
        data.edge_attr = torch.from_numpy(graph['edge_feat']).to(torch.int64)
        data.x = torch.from_numpy(graph['node_feat']).to(torch.int64)

        return data

    def __len__(self):
        """Return the number of rows in ``smiles_list``."""
        return len(self.smiles_list)

In [11]:
# Read the molecules to score and wrap them in a loader for inference.
CSV_PATH = "/kaggle/input/btp-data/Btp - Sheet1.csv"
df = pd.read_csv(CSV_PATH)
# One Python list per CSV row; the dataset converts each row to a graph lazily.
test_data = df.values.tolist()
onthefly_data = OnTheFlyPCQMDataset(smiles_list=test_data)
loader = DataLoader(dataset=onthefly_data, batch_size=10, shuffle=False)

In [12]:
# Rebuild both architectures and restore their weights from the saved
# checkpoints (the weights live under the 'model_state_dict' key).
# map_location=device lets a checkpoint saved on one device be restored on
# whatever device this session uses.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
gcn = GNN_graph(num_layers=5, emb_dim=200, drop_ratio=0.5, gnn_type='GCN')
gcn_checkpoint = torch.load('/kaggle/input/checkpoint-gcn/checkpoint (1).pt', map_location=device)
gcn.load_state_dict(gcn_checkpoint['model_state_dict'])
gcn.to(device)

gin = GNN_graph(num_layers=5, emb_dim=200, drop_ratio=0.5, gnn_type='GIN')
gin_checkpoint = torch.load('/kaggle/input/checkpoint-gin/checkpoint (2).pt', map_location=device)
gin.load_state_dict(gin_checkpoint['model_state_dict'])
# Last expression of the cell: moves the model and displays its structure.
gin.to(device)

GNN_graph(
  (atom_encoder): AtomEncoder(
    (atom_embedding_list): ModuleList(
      (0): Embedding(119, 200)
      (1): Embedding(5, 200)
      (2-3): 2 x Embedding(12, 200)
      (4): Embedding(10, 200)
      (5-6): 2 x Embedding(6, 200)
      (7-8): 2 x Embedding(2, 200)
    )
  )
  (mlp): Sequential(
    (0): Linear(in_features=200, out_features=200, bias=True)
    (1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Linear(in_features=200, out_features=200, bias=True)
  )
  (linear_pred): Linear(in_features=200, out_features=1, bias=True)
  (convs): ModuleList(
    (0-4): 5 x GINConv(nn=Sequential(
      (0): Linear(in_features=200, out_features=200, bias=True)
      (1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Linear(in_features=200, out_features=200, bias=True)
    ))
  )
  (norms): ModuleList(
    (0-4): 5 x BatchNorm1d(200, eps=1e-05, momentum=0.1, 

In [13]:
# Score the same loader with both restored models; each call returns a list
# of per-molecule predictions (see `test`).
print("Predicting on test data...")
gin_pred = test(gin, device, loader)
gcn_pred = test(gcn, device, loader)

Predicting on test data...


Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
# Write the GIN predictions to gin_predictions.csv, omitting the index column.
df_gin = pd.DataFrame(gin_pred)
df_gin.to_csv('gin_predictions.csv', index = False)

In [15]:
# Write the GCN predictions to gcn_predictions.csv, omitting the index column.
df_gcn = pd.DataFrame(gcn_pred)
df_gcn.to_csv('gcn_predictions.csv', index = False)