In [2]:
import os
import sys
import torch
import pickle
import random
import argparse
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from rdkit.Chem import AllChem
from rdkit import Chem

from torch_geometric.data import DataLoader,Data
from torch.utils.data import SubsetRandomSampler
import torch.nn as nn
from sklearn.metrics import accuracy_score
import json
from sklearn.model_selection import train_test_split
import datetime

In [3]:
def adj_list_to_adj_matrix(adj_list, num_nodes=None):
    """Build a dense, symmetric 0/1 adjacency matrix from an adjacency list.

    Args:
        adj_list: Mapping from a node index to an iterable of neighbour
            indices. Repeated neighbours are allowed; they collapse into a
            single 1.0 entry.
        num_nodes: Optional side length of the matrix. When omitted it is
            inferred as ``highest referenced index + 1``. Pass it explicitly
            when trailing nodes have no edges and therefore never appear in
            ``adj_list``.

    Returns:
        A ``(num_nodes, num_nodes)`` float tensor with 1.0 at ``[a, b]`` and
        ``[b, a]`` for every listed edge and 0.0 elsewhere.

    Raises:
        ValueError: if ``num_nodes`` is smaller than the highest index that
            ``adj_list`` references.
    """
    # Size by the largest index seen rather than by len(adj_list): a node
    # without edges may be absent from the mapping, so counting keys can
    # produce a matrix too small for the indices that are present.
    highest_index = -1
    for node, neighbors in adj_list.items():
        highest_index = max(highest_index, node, *neighbors)
    required_size = highest_index + 1

    if num_nodes is None:
        num_nodes = required_size
    elif num_nodes < required_size:
        raise ValueError(
            f"num_nodes={num_nodes} is too small for node index {highest_index}"
        )

    adj_matrix = torch.zeros((num_nodes, num_nodes), dtype=torch.float)
    for node, neighbors in adj_list.items():
        for neighbor in neighbors:
            # Write both directions so the matrix is symmetric even if the
            # input lists an edge only once.
            adj_matrix[node, neighbor] = 1.0
            adj_matrix[neighbor, node] = 1.0
    return adj_matrix

def pad_sequence_to_length(sequence, length):
    """Force ``sequence`` to exactly ``length`` entries along its first dimension.

    A tensor that is already long enough is cut down to its first ``length``
    entries. A shorter tensor is extended with zeros that share its dtype,
    device and trailing dimensions.
    """
    current = len(sequence)
    if current >= length:
        return sequence[:length]

    # Zero block covering the missing leading-dimension entries.
    trailing_dims = sequence.size()[1:]
    filler = torch.zeros(
        length - current,
        *trailing_dims,
        dtype=sequence.dtype,
        device=sequence.device,
    )
    return torch.cat((sequence, filler), dim=0)
    

In [16]:
from torch.utils.data import Dataset
class SMILESDataset(Dataset):
    """Dataset that pairs each entry of ``data`` with the label at the same index."""

    def __init__(self, data, labels):
        # Both containers are stored as given; no copy or validation is done.
        self.data, self.labels = data, labels

    def __len__(self):
        # The size is taken from the samples container only.
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at position ``idx``."""
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target
    
def _read_molecule_records(data_file):
    """Parse a tab-separated dataset file into ``(mol, label)`` pairs.

    The SMILES string is taken from the second tab-separated column and the
    label from the third. Blank lines are skipped.

    Raises:
        ValueError: if a line has fewer than three columns or its SMILES
            string cannot be parsed.
    """
    records = []
    # `with` guarantees the handle is closed even if parsing raises.
    with open(data_file, "r") as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            # Strip only the line terminator; blindly dropping the last
            # character would truncate the label on a final line that has
            # no trailing newline.
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue
            columns = line.split("\t")
            if len(columns) < 3:
                raise ValueError(
                    f"{data_file}:{line_no}: expected at least 3 tab-separated "
                    f"columns, got {len(columns)}"
                )
            smiles, label = columns[1], columns[2]
            mol = AllChem.MolFromSmiles(smiles)
            if mol is None:
                # Fail with the offending location instead of an opaque
                # AttributeError further down.
                raise ValueError(
                    f"{data_file}:{line_no}: could not parse SMILES {smiles!r}"
                )
            records.append((mol, label))
    return records


def _bond_multiplicity(bond_type):
    """Return how many times a bond is written into the adjacency lists."""
    if bond_type == Chem.rdchem.BondType.DOUBLE:
        return 2
    if bond_type == Chem.rdchem.BondType.TRIPLE:
        return 3
    # Every other bond type is recorded once.
    return 1


def _encode_records(records, node2index, label2index, labels, device):
    """Turn parsed molecules into adjacency lists, one-hot features and sequences.

    ``labels`` is a pre-allocated container (tensor or array) that is filled
    in place, one entry per record, with ``int(label2index[label])``.

    Returns:
        A dict with the keys ``'adj_lists'``, ``'features'`` and ``'sequence'``,
        each holding one entry per record.
    """
    adj_lists, features, sequences = [], [], []
    for row, (mol, label) in enumerate(records):
        feature = torch.zeros(len(mol.GetAtoms()), len(node2index))
        atom_seq = []
        adj_list = defaultdict(list)
        for pos, atom in enumerate(mol.GetAtoms()):
            type_idx = node2index[atom.GetAtomicNum()]
            feature[pos, type_idx] = 1  # one-hot atom type
            atom_seq.append(type_idx)
            # Register every atom, including ones without bonds, so that
            # len(adj_list) equals the number of atoms in the molecule.
            adj_list[atom.GetIdx()] = []

        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            # Double/triple bonds are listed two/three times in each direction.
            for _ in range(_bond_multiplicity(bond.GetBondType())):
                adj_list[i].append(j)
                adj_list[j].append(i)

        labels[row] = int(label2index[label])
        adj_lists.append(adj_list)
        features.append(feature.to(device))
        sequences.append(torch.tensor(atom_seq))

    return {
        'adj_lists': adj_lists,
        'features': features,
        'sequence': sequences,
    }


def load_data(dataset, device):
    """Load the train and test splits of ``dataset`` as graph/sequence inputs.

    Reads ``./original_datasets/{dataset}/{dataset}_train`` and the matching
    ``_test`` file. Both are tab-separated, with the SMILES string in the
    second column and the label in the third.

    Args:
        dataset: Dataset name used to build both file paths.
        device: Torch device the per-molecule feature tensors are moved to.

    Returns:
        ``(train_data, train_labels, test_data, test_labels)`` where each
        ``*_data`` is a dict with the keys ``'adj_lists'``, ``'features'`` and
        ``'sequence'``; ``train_labels`` is a torch tensor and ``test_labels``
        is a numpy array.

    Raises:
        ValueError: if a line is malformed or contains an unparseable SMILES.

    Side effects:
        Prints the number of train and test records, then the atom-type and
        label index mappings.
    """
    train_file = f"./original_datasets/{dataset}/{dataset}_train"
    test_file = f"./original_datasets/{dataset}/{dataset}_test"

    # Each file is read and each SMILES parsed exactly once; the parsed
    # molecules are reused for both vocabulary building and encoding.
    train_records = _read_molecule_records(train_file)
    test_records = _read_molecule_records(test_file)

    node_types = set()
    label_types = set()
    for mol, label in train_records + test_records:
        node_types |= {atom.GetAtomicNum() for atom in mol.GetAtoms()}
        label_types.add(label)

    tr_len = len(train_records)
    te_len = len(test_records)
    print(tr_len)
    print(te_len)

    # Atom-type indices follow the iteration order of the set.
    node2index = {n: i for i, n in enumerate(node_types)}
    # Labels map to themselves; they are converted with int() when stored,
    # so every label string must be integer-like.
    label2index = {i: i for i in label_types}

    print(node2index)
    print(label2index)

    # The two label containers intentionally keep their original types
    # (torch tensor for train, numpy array for test).
    train_labels = torch.zeros(tr_len)
    test_labels = np.zeros(te_len)

    train_data = _encode_records(train_records, node2index, label2index, train_labels, device)
    test_data = _encode_records(test_records, node2index, label2index, test_labels, device)

    return train_data, train_labels, test_data, test_labels

In [19]:
class CustomDataset(Dataset):
    """Dataset yielding ``(graph, sequence, label)`` triples aligned by index."""

    def __init__(self, graph_data, sequence_data, labels):
        # The three containers are stored as given and indexed in parallel.
        self.graph_data, self.sequence_data, self.labels = (
            graph_data,
            sequence_data,
            labels,
        )

    def __len__(self):
        # The label container defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the graph, sequence and label stored at position ``idx``."""
        graph = self.graph_data[idx]
        sequence = self.sequence_data[idx]
        label = self.labels[idx]
        return graph, sequence, label

In [23]:
# --- Run configuration ---
batch_size = 1
d_name = "logp"
learning_rate = 0.001
MAX_SEQ_LEN = 100  # every atom sequence is padded or truncated to this length

# Use the first GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


def _float_copy(value):
    """Return an independent float tensor holding ``value``.

    Existing tensors are copied with ``detach().clone()``; calling
    ``torch.tensor()`` on a tensor makes the same copy but emits a
    copy-construct UserWarning for every element.
    """
    if torch.is_tensor(value):
        return value.detach().clone().to(torch.float)
    return torch.tensor(value, dtype=torch.float)


def _build_graph_list(features_list, adj_matrices, labels):
    """Create one ``Data`` object per (features, adjacency matrix, label) triple."""
    return [
        Data(
            x=_float_copy(features),
            # Non-zero adjacency entries become the (2, num_edges) edge index.
            edge_index=torch.nonzero(adj_matrix, as_tuple=False).t().contiguous(),
            y=_float_copy(label),
        )
        for features, adj_matrix, label in zip(features_list, adj_matrices, labels)
    ]


# Load the dataset named in the configuration above.
train_data, train_labels, test_data, test_labels = load_data(d_name, device=device)

# Move the sequence tensors to the selected device.
train_data['sequence'] = [torch.Tensor(seq).to(device) for seq in train_data['sequence']]
test_data['sequence'] = [torch.Tensor(seq).to(device) for seq in test_data['sequence']]

padded_train_sequence = [pad_sequence_to_length(tensor, length=MAX_SEQ_LEN) for tensor in train_data['sequence']]
padded_test_sequence = [pad_sequence_to_length(tensor, length=MAX_SEQ_LEN) for tensor in test_data['sequence']]

# Width of the per-atom feature vectors.
input_dim_train = train_data['features'][0].size(-1)
input_dim_test = test_data['features'][0].size(-1)

adj_matrices_train = [adj_list_to_adj_matrix(adj_list) for adj_list in train_data['adj_lists']]
adj_matrices_test = [adj_list_to_adj_matrix(adj_list) for adj_list in test_data['adj_lists']]

data_list_train = _build_graph_list(train_data['features'], adj_matrices_train, train_labels)
data_list_test = _build_graph_list(test_data['features'], adj_matrices_test, test_labels)

# Combine both graph and sequence data into a single dataset.
# From here on `train_data` / `test_data` are CustomDataset objects and no
# longer the dicts returned by load_data.
train_data = CustomDataset(data_list_train, padded_train_sequence, train_labels)
test_data = CustomDataset(data_list_test, padded_test_sequence, test_labels)

# Create a single loader for the combined dataset
train_dataloader = DataLoader(train_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Sanity check: show the first batch only.
for graph_data, seq_data, labels in train_dataloader:
    print('Graph data:', graph_data.x.shape, graph_data.edge_index.shape,graph_data.x)
    print('Sequence data:', seq_data[0].shape, seq_data[0])
    print('Labels:', labels)
    break




8000
1000
{5: 0, 6: 1, 7: 2, 8: 3, 9: 4, 14: 5, 15: 6, 16: 7, 17: 8, 35: 9, 50: 10, 53: 11}
{'0': '0', '1': '1'}


  x=torch.tensor(features, dtype=torch.float),
  y=torch.tensor(label, dtype=torch.float)
  x=torch.tensor(features, dtype=torch.float),


Graph data: torch.Size([21, 12]) torch.Size([2, 44]) tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0.

In [None]:
import torch

# Report CUDA availability and select the matching device.
_cuda_ready = torch.cuda.is_available()
print("CUDA is available!" if _cuda_ready else "CUDA is not available.")
device = torch.device("cuda:0" if _cuda_ready else "cpu")

print("Using device:", device)

CUDA is available!
Using device: cuda:0
