In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import torch
import numpy as np
from rdkit.Chem import AllChem
from rdkit import Chem
from collections import defaultdict



In [13]:
train_sequence = []
def load_data_long(dataset, device):
    mole_dict = {1: "H", 2: "He", 3: "Li", 4: "Be", 5: "B", 6: "C", 7: "N", 8: "O", 9: "F", 10: " Ne",
                11: "Na", 12:"Mg", 13: "Al", 14:"Si", 15:"P", 16: "S", 17: "Cl", 18:"Ar", 19:"K", 20:"Ca", 22:"Ti", 24:"Cr", 26:"Fe", 28:"Ni",
                29:"Cu", 31:"Ga", 32:"Ge", 34:"Se", 35:"Br", 40:"Zr", 44:"Ru", 45:"Rh", 46:"Pd", 47:"Ag", 50:"Sn", 51:"Sb", 52:"Te", 53: "I", 65:"Tb", 75:"Re", 77:"Ir", 78:"Pt", 79:"Au", 80:"Hg",
                81:"Tl", 82:"Pb", 83:"Bi"}

    pair_list = ["Br", "Cl", "Si", "Na", "Ca", "Ge", "Cu", "Au", "Sn", "Tb", "Pt", "Re", "Ru", "Bi", "Li", "Fe", "Sb", "Hg","Pb", "Se", "Ag","Cr","Pd","Ga","Mg","Ni","Ir","Rh","Te","Ti","Al","Zr","Tl"]

    data_file = f"./original_datasets/{dataset}/{dataset}_train"
    file = open(data_file, "r")
    node_types = set()
    label_types = set()
    tr_len = 0
    for line in file:
        tr_len += 1
        smiles = line.split("\t")[1]
        label = line.split("\t")[2].strip()
        if label != '':
          label_types.add(label)

        i = 0
        s = []
        while i < len(smiles):
            if i < len(smiles)-1 and (smiles[i] + smiles[i+1]) in pair_list:
                s.append(smiles[i] + smiles[i+1])
                i += 2
            else:
                s.append(smiles[i].upper())
                i += 1
        node_types |= set(s)

    file.close()


    print(tr_len)

    node2index = {n: i for i, n in enumerate(node_types)}
    label2index = {l: i for i, l in enumerate(label_types)}

    print(node2index)
    print(label2index)

    data_file = f"./original_datasets/{dataset}/{dataset}_train"
    file = open(data_file, "r")
    train_adjlists = []
    train_features = []
    train_sequence = []
    train_labels = torch.zeros(tr_len)
    for line in file:
        smiles = line.split("\t")[1]
        label = line.split("\t")[2].strip()
        if label != '':
          label_types.add(label)

        mol = AllChem.MolFromSmiles(smiles)
        graph_nodes = []
        for atom in mol.GetAtoms():
            graph_nodes.append(mole_dict[atom.GetAtomicNum()])
        # print(graph_nodes)
        i = 0
        s = 0
        while i < len(smiles):
            if i < len(smiles)-1 and (smiles[i] + smiles[i+1]) in pair_list:
                i += 2
            else:
                i += 1
            s += 1

        feature = torch.zeros(s, len(node_types))

        map = {}
        se_num = 0
        gr_num = 0
        i = 0
        smiles_seq = []
        while i < len(smiles):
            this_str = smiles[i]
            if i < len(smiles)-1 and (smiles[i] + smiles[i+1]) in pair_list:
                this_str = smiles[i] + smiles[i+1]
                i += 2
            else:
                this_str = this_str.upper()
                i += 1
            smiles_seq.append(node2index[this_str])
            if this_str in graph_nodes and this_str == mole_dict[mol.GetAtoms()[gr_num].GetAtomicNum()]:
                map[gr_num] = se_num
                gr_num += 1
            feature[se_num, node2index[this_str]] = 1
            se_num += 1

        adj_list = defaultdict(list)
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            # print(i,j)
            typ = bond.GetBondType()
            adj_list[map[i]].append(map[j])
            adj_list[map[j]].append(map[i])
            if typ == Chem.rdchem.BondType.DOUBLE:
                adj_list[map[i]].append(map[j])
                adj_list[map[j]].append(map[i])
            elif typ == Chem.rdchem.BondType.TRIPLE:
                adj_list[map[i]].append(map[j])
                adj_list[map[j]].append(map[i])
                adj_list[map[i]].append(map[j])
                adj_list[map[j]].append(map[i])

        train_labels[len(train_adjlists)]= int(label2index[label])
        #train_labels[len(train_adjlists)]= int(label)
        train_adjlists.append(adj_list)
        train_features.append(torch.FloatTensor(feature).to(device))
        train_sequence.append(torch.tensor(smiles_seq))
    file.close()


    train_data = {}
    train_data['sequence'] = train_sequence
    return train_data, train_labels

In [14]:
train_data, train_labels=load_data_long("logp", device="cuda" if torch.cuda.is_available() else "cpu")

8000
{'(': 0, 'P': 1, 'Sn': 2, 'Br': 3, '6': 4, '5': 5, 'I': 6, 'N': 7, 'B': 8, 'F': 9, '=': 10, '-': 11, '#': 12, '[': 13, 'S': 14, '2': 15, '1': 16, '@': 17, '\\': 18, 'C': 19, '3': 20, 'Si': 21, 'Cl': 22, 'H': 23, '/': 24, '+': 25, '4': 26, ')': 27, 'O': 28, ']': 29}
{'1': 0, '0': 1}


In [15]:
def pad_sequence_to_length(sequence, length):
    if len(sequence) < length:
        pad_size = length - len(sequence)
        padding = torch.zeros(pad_size, *sequence.size()[1:], dtype=sequence.dtype, device=sequence.device)
        return torch.cat((sequence, padding), dim=0)
    else:
        return sequence[:length]
train_data['sequence'] = [torch.Tensor(seq) for seq in train_data['sequence']]
padded_train_sequence = [pad_sequence_to_length(tensor, length=100) for tensor in train_data['sequence']]


In [16]:
padded_train_sequence

[tensor([19, 28, 19, 16, 19, 19,  0, 24, 19, 10,  7, 24,  7, 19,  0, 10, 28, 27,
         19, 19, 15, 19, 19, 19, 14, 15, 27, 19, 19,  0,  3, 27, 19, 16, 28,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 tensor([19, 28, 19,  0, 10, 28, 27, 19, 16, 19, 19, 19,  0, 19, 27, 19,  0, 11,
          7, 15, 19,  0, 19, 27, 19, 19,  0, 24, 19, 10, 19,  0, 24, 19, 12,  7,
         27, 19,  0, 10, 28, 27,  7, 19, 19, 20, 19, 19, 19,  0, 28, 19, 27, 19,
         19, 20, 27, 19, 15, 19, 27, 19, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 tensor([19,  7, 16, 19, 19, 19, 19, 16, 19, 19,  0, 10, 28, 27,  7, 24,  7, 10,
       

In [17]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
class SMILESDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

batch_size=1

# Assuming you have train_data, train_labels, val_data, and val_labels
train_data, val_data, train_labels, val_labels = train_test_split(padded_train_sequence, train_labels, test_size=0.2)

# Create datasets and data loaders for training and validation
train_dataset = SMILESDataset(train_data, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)

val_dataset = SMILESDataset(val_data, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [18]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, dim_feedforward, max_length=100):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer_encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward), num_layers=num_encoder_layers)
        self.fc = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.transpose(x, 0, 1)
        x = self.transformer_encoder(x)
        x = torch.mean(x, dim=0)
        x = self.fc(x)
        #print(x.shape,x)
        return x


# Define hyperparameters
vocab_size = 100
d_model = 128
nhead = 4
num_encoder_layers = 3
dim_feedforward = 512
max_length = 100
batch_size = 2
num_epochs = 100

# Initialize model, loss, and optimizer
model = TransformerModel(vocab_size, d_model, nhead, num_encoder_layers, dim_feedforward, max_length)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)




In [25]:
train_accuracy=[]
predictions=[]
true_labels=[]
from sklearn.metrics import accuracy_score
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        #print(outputs)
        #print(targets)
        #print(outputs.squeeze())
        print(outputs,targets)
        loss = criterion(outputs[0], targets)
        loss.backward()
        optimizer.step()
        #break


    # Evaluate accuracy on training set
    model.eval()
    with torch.no_grad():
        predictions = []
        true_labels = []
        for inputs, targets in val_dataloader:
                  output = model(inputs)
                  #print(output)
                  #print(torch.sigmoid(output).cpu().numpy().flatten())
                  predictions.extend(torch.sigmoid(output).cpu().numpy().flatten())
                  true_labels.extend(targets.cpu().numpy())
                  #break

    true_labels = np.array(true_labels)  # Corrected indentation


    epoch_accuracy = accuracy_score(np.round(predictions), true_labels)
    train_accuracy.append(epoch_accuracy)
    print(f'Epoch {epoch + 1}/{num_epochs}, Accuracy: {epoch_accuracy:.4f}')

tensor([[0.1174]], grad_fn=<AddmmBackward0>) tensor([0.])
tensor([[-2.8708]], grad_fn=<AddmmBackward0>) tensor([0.])
tensor([[-4.5839]], grad_fn=<AddmmBackward0>) tensor([1.])
tensor([[-3.9217]], grad_fn=<AddmmBackward0>) tensor([1.])
tensor([[-2.8962]], grad_fn=<AddmmBackward0>) tensor([1.])
tensor([[-1.7931]], grad_fn=<AddmmBackward0>) tensor([0.])
tensor([[-0.9732]], grad_fn=<AddmmBackward0>) tensor([1.])
tensor([[-0.0639]], grad_fn=<AddmmBackward0>) tensor([1.])
tensor([[0.8608]], grad_fn=<AddmmBackward0>) tensor([0.])
tensor([[1.2099]], grad_fn=<AddmmBackward0>) tensor([1.])
tensor([[1.5151]], grad_fn=<AddmmBackward0>) tensor([1.])
tensor([[1.8464]], grad_fn=<AddmmBackward0>) tensor([0.])
tensor([[1.7934]], grad_fn=<AddmmBackward0>) tensor([0.])
tensor([[1.5738]], grad_fn=<AddmmBackward0>) tensor([0.])
tensor([[1.1684]], grad_fn=<AddmmBackward0>) tensor([0.])
tensor([[0.7601]], grad_fn=<AddmmBackward0>) tensor([1.])
tensor([[0.4450]], grad_fn=<AddmmBackward0>) tensor([1.])
tensor(

In [None]:
    # with open(file_name, 'a') as file:
    #     for epoch in range(num_epochs):
    #         losses = 0.0
    #         correct_predictions = 0
    #         total_predictions = 0
            
    #         for graph_data, seq_data,labels in train_dataloader:
    #             #print('Graph data:',graph_data.x.shape,graph_data.edge_index.shape)
    #             loss, output = model.train(graph_data, seq_data, labels)
    #             losses += loss.item()
                
    #             # Convert model output to predicted labels
    #             predicted_labels = torch.round(output).detach().to(device)  # Move to device
    #             target = graph_data.y.double().to(device)
                
    #             file.write(f'output:{output}\n')
    #             #print('Prediction:',predicted_labels,'Target',target)
    #             file.write(f'Prediction:{predicted_labels}, Target:{target}\n')
                
    #             # Compare with actual labels
    #             correct_predictions += (predicted_labels == target).sum().item()
    #             total_predictions += graph_data.y.size(0)
            
    #         train_accuracy = correct_predictions / total_predictions
    #         print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {losses:.4f}, Train Accuracy: {train_accuracy:.4f}')
    #         file.write(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {losses:.4f}, Train Accuracy: {train_accuracy:.4f}\n')
        
    # # Evaluation on test set
    # test_correct = 0
    # test_total = 0
    # output_dir = 'output'
    # os.makedirs(output_dir, exist_ok=True)
    # current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # file_name = f'{output_dir}/test_accuracy_details_{current_time}.txt'

    # with open(file_name, 'a') as file:
    #     for graph_data, seq_data,label in test_dataloader:
    #         with torch.no_grad():
    #             output = model.test(graph_data, seq_data, epoch)
    #             predicted_labels = torch.round(output).detach()
    #             target = graph_data.y.double()
    #             test_correct += (predicted_labels == target).sum().item()
    #             test_total += graph_data.y.size(0)
            
    #         test_accuracy = test_correct / test_total
    #         print(f'Epoch {epoch + 1}/{num_epochs}, Test Accuracy: {test_accuracy:.4f}')
    #         file.write(f'Epoch {epoch + 1}/{num_epochs}, Test Accuracy: {test_accuracy:.4f}\n')    



In [None]:
    # # Evaluation on test set
    # test_correct = 0
    # test_total = 0
    # output_dir = 'output'
    # os.makedirs(output_dir, exist_ok=True)
    # current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    # file_name = f'{output_dir}/test_accuracy_details_{current_time}.txt'

    # with open(file_name, 'a') as file:
    #     for graph_data, seq_data,label in test_dataloader:
    #         with torch.no_grad():
    #             output = model.test(graph_data, seq_data, epoch)
    #             predicted_labels = torch.round(output).detach()
    #             target = graph_data.y.double()
    #             test_correct += (predicted_labels == target).sum().item()
    #             test_total += graph_data.y.size(0)
            
    #         test_accuracy = test_correct / test_total
    #         print(f'Epoch {epoch + 1}/{num_epochs}, Test Accuracy: {test_accuracy:.4f}')
    #         file.write(f'Epoch {epoch + 1}/{num_epochs}, Test Accuracy: {test_accuracy:.4f}\n')   