In [1]:
import torch
from datetime import datetime
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import torch
import numpy as np
from rdkit.Chem import AllChem
from rdkit import Chem
from collections import defaultdict
import os,time
from torch_geometric.data import Data
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

In [2]:
def load_data_long(dataset, device):
    mole_dict = {1: "H", 2: "He", 3: "Li", 4: "Be", 5: "B", 6: "C", 7: "N", 8: "O", 9: "F", 10: " Ne",
                11: "Na", 12:"Mg", 13: "Al", 14:"Si", 15:"P", 16: "S", 17: "Cl", 18:"Ar", 19:"K", 20:"Ca", 22:"Ti", 24:"Cr", 26:"Fe", 28:"Ni",
                29:"Cu", 31:"Ga", 32:"Ge", 34:"Se", 35:"Br", 40:"Zr", 44:"Ru", 45:"Rh", 46:"Pd", 47:"Ag", 50:"Sn", 51:"Sb", 52:"Te", 53: "I", 65:"Tb", 75:"Re", 77:"Ir", 78:"Pt", 79:"Au", 80:"Hg",
                81:"Tl", 82:"Pb", 83:"Bi"}

    pair_list = ["Br", "Cl", "Si", "Na", "Ca", "Ge", "Cu", "Au", "Sn", "Tb", "Pt", "Re", "Ru", "Bi", "Li", "Fe", "Sb", "Hg","Pb", "Se", "Ag","Cr","Pd","Ga","Mg","Ni","Ir","Rh","Te","Ti","Al","Zr","Tl"]

    data_file = f"./original_datasets/{dataset}/{dataset}_train"
    file = open(data_file, "r")
    node_types = set()
    label_types = set()
    tr_len = 0
    for line in file:
        tr_len += 1
        smiles = line.split("\t")[1]
        label = line.split("\t")[2][:-1]
        i = 0
        s = []
        while i < len(smiles):
            if i < len(smiles)-1 and (smiles[i] + smiles[i+1]) in pair_list:
                s.append(smiles[i] + smiles[i+1])
                i += 2
            else:
                s.append(smiles[i].upper())
                i += 1
        node_types |= set(s)
        label_types.add(label)
    file.close()

    te_len = 0
    data_file = f"./original_datasets/{dataset}/{dataset}_test"
    file = open(data_file, "r")
    for line in file:
        te_len += 1
        smiles = line.split("\t")[1]
        label = line.split("\t")[2][:-1]
        i = 0
        s = []
        while i < len(smiles):
            if i < len(smiles)-1 and (smiles[i] + smiles[i+1]) in pair_list:
                s.append(smiles[i] + smiles[i+1])
                i += 2
            else:
                s.append(smiles[i].upper())
                i += 1
        node_types |= set(s)
        label_types.add(label)
    file.close()

    #print(tr_len)
    #print(te_len)

    node2index = {n: i for i, n in enumerate(node_types)}
    label2index = {l: i for i, l in enumerate(label_types)}

    #print(node2index)
    #print(label2index)

    data_file = f"./original_datasets/{dataset}/{dataset}_train"
    file = open(data_file, "r")
    train_adjlists = []
    train_features = []
    train_sequence = []
    train_labels = torch.zeros(tr_len)
    for line in file:
        smiles = line.split("\t")[1]
        label = line.split("\t")[2][:-1]
        mol = AllChem.MolFromSmiles(smiles)
        graph_nodes = []
        for atom in mol.GetAtoms():
            graph_nodes.append(mole_dict[atom.GetAtomicNum()])
        # print(graph_nodes)
        i = 0
        s = 0
        while i < len(smiles):
            if i < len(smiles)-1 and (smiles[i] + smiles[i+1]) in pair_list:
                i += 2
            else:
                i += 1
            s += 1

        feature = torch.zeros(s, len(node_types))

        map = {}
        se_num = 0
        gr_num = 0
        i = 0
        smiles_seq = []
        while i < len(smiles):
            this_str = smiles[i]
            if i < len(smiles)-1 and (smiles[i] + smiles[i+1]) in pair_list:
                this_str = smiles[i] + smiles[i+1]
                i += 2
            else:
                this_str = this_str.upper()
                i += 1
            smiles_seq.append(node2index[this_str])
            if this_str in graph_nodes and this_str == mole_dict[mol.GetAtoms()[gr_num].GetAtomicNum()]:
                map[gr_num] = se_num
                gr_num += 1
            feature[se_num, node2index[this_str]] = 1
            se_num += 1

        adj_list = defaultdict(list)
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            # print(i,j)
            typ = bond.GetBondType()
            adj_list[map[i]].append(map[j])
            adj_list[map[j]].append(map[i])
            if typ == Chem.rdchem.BondType.DOUBLE:
                adj_list[map[i]].append(map[j])
                adj_list[map[j]].append(map[i])
            elif typ == Chem.rdchem.BondType.TRIPLE:
                adj_list[map[i]].append(map[j])
                adj_list[map[j]].append(map[i])
                adj_list[map[i]].append(map[j])
                adj_list[map[j]].append(map[i])

        # train_labels[len(train_adjlists)]= int(label2index[label])
        train_labels[len(train_adjlists)]= int(label)
        train_adjlists.append(adj_list)
        train_features.append(torch.FloatTensor(feature).to(device))
        train_sequence.append(torch.tensor(smiles_seq))
    file.close()

    data_file = f"./original_datasets/{dataset}/{dataset}_test"
    file = open(data_file, "r")
    test_adjlists = []
    test_features = []
    test_sequence = []
    test_labels = np.zeros(te_len)
    for line in file:
        smiles = line.split("\t")[1]
        # print(smiles)
        label = line.split("\t")[2][:-1]
        mol = AllChem.MolFromSmiles(smiles)
        graph_nodes = []
        for atom in mol.GetAtoms():
            graph_nodes.append(mole_dict[atom.GetAtomicNum()])
        # print(graph_nodes)
        i = 0
        s = 0
        while i < len(smiles):
            if i < len(smiles)-1 and (smiles[i] + smiles[i+1]) in pair_list:
                i += 2
            else:
                i += 1
            s += 1

        feature = torch.zeros(s, len(node_types))

        map = {}
        se_num = 0
        gr_num = 0
        i = 0
        smiles_seq = []
        while i < len(smiles):
            this_str = smiles[i]
            if i < len(smiles)-1 and (smiles[i] + smiles[i+1]) in pair_list:
                this_str = smiles[i] + smiles[i+1]
                i += 2
            else:
                this_str = this_str.upper()
                i += 1
            smiles_seq.append(node2index[this_str])
            if this_str in graph_nodes and this_str == mole_dict[mol.GetAtoms()[gr_num].GetAtomicNum()]:
                map[gr_num] = se_num
                gr_num += 1
            feature[se_num, node2index[this_str]] = 1
            se_num += 1

        adj_list = defaultdict(list)
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            # print(i,j)
            typ = bond.GetBondType()
            adj_list[map[i]].append(map[j])
            adj_list[map[j]].append(map[i])
            if typ == Chem.rdchem.BondType.DOUBLE:
                adj_list[map[i]].append(map[j])
                adj_list[map[j]].append(map[i])
            elif typ == Chem.rdchem.BondType.TRIPLE:
                adj_list[map[i]].append(map[j])
                adj_list[map[j]].append(map[i])
                adj_list[map[i]].append(map[j])
                adj_list[map[j]].append(map[i])

        # test_labels[len(test_adjlists)] = int(label2index[label])
        test_labels[len(test_adjlists)] = int(label)
        test_adjlists.append(adj_list)
        test_features.append(torch.FloatTensor(feature).to(device))
        test_sequence.append(torch.tensor(smiles_seq))
    file.close()

    train_data = {}
    train_data['adj_lists'] = train_adjlists
    train_data['features'] = train_features
     # Pad train_sequence to length 100
    padded_train_sequence = []
    for seq in train_sequence:
      padded_seq = torch.nn.functional.pad(seq, (0, 100 - len(seq)), 'constant', 0)
      padded_train_sequence.append(padded_seq)
      train_data['sequence'] = padded_train_sequence

    test_data = {}
    test_data['adj_lists'] = test_adjlists
    test_data['features'] = test_features
    padded_test_sequence = []
    for seq in test_sequence:
      padded_seq = torch.nn.functional.pad(seq, (0, 100 - len(seq)), 'constant', 0)
      padded_test_sequence.append(padded_seq)
      test_data['sequence'] = padded_test_sequence

    return train_data, train_labels, test_data, test_labels


class CustomDataset(Dataset):
    def __init__(self, data_list, sequence_list):
        self.data_list = data_list
        self.sequence_list = sequence_list

    def __getitem__(self, index):
        data = self.data_list[index]
        sequence = self.sequence_list[index]
        return data, sequence

    def __len__(self):
        return len(self.data_list)
    

def adj_list_to_adj_matrix(adj_list):
    num_nodes = max(adj_list.keys()) + 1
    adj_matrix = torch.zeros((num_nodes, num_nodes), dtype=torch.float)
    for node, neighbors in adj_list.items():
        for neighbor in neighbors:
            adj_matrix[node][neighbor] = 1.0
            adj_matrix[neighbor][node] = 1.0
    return adj_matrix




In [3]:
train_data, train_labels, test_data, test_labels=load_data_long("logp", device="cuda:0" if torch.cuda.is_available() else "cpu")
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
input_dim_train = train_data['features'][0].size(-1)
input_dim_test = test_data['features'][0].size(-1)


adj_matrices_train = [adj_list_to_adj_matrix(adj_list) for adj_list in train_data['adj_lists']]
adj_matrices_test = [adj_list_to_adj_matrix(adj_list) for adj_list in test_data['adj_lists']]



data_list_train = [Data(x=torch.tensor(features, dtype=torch.float),
                              edge_index=torch.nonzero(adj_matrix, as_tuple=False).t().contiguous(),
                              y=torch.tensor(label, dtype=torch.float))
                         for features, adj_matrix, label in zip(train_data['features'], adj_matrices_train, train_labels)]
data_list_test = [Data(x=torch.tensor(features, dtype=torch.float),
                                edge_index=torch.nonzero(adj_matrix, as_tuple=False).t().contiguous(),
                                y=torch.tensor(label, dtype=torch.float))
                            for features, adj_matrix, label in zip(test_data['features'], adj_matrices_test, test_labels)]

train_dataset = CustomDataset(data_list_train, train_data['sequence'])
test_dataset = CustomDataset(data_list_test, test_data['sequence'])

  data_list_train = [Data(x=torch.tensor(features, dtype=torch.float),
  y=torch.tensor(label, dtype=torch.float))
  data_list_test = [Data(x=torch.tensor(features, dtype=torch.float),


In [5]:
import torch
import torch.nn as nn

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, dim_feedforward, max_length=100):
        super(TransformerModel, self).__init__()
        self.device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer_encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward), num_layers=num_encoder_layers)
        self.fc = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.embedding(x).to(self.device)
        x = torch.transpose(x, 0, 1).to(self.device)
        x = self.transformer_encoder(x).to(self.device)
        x = torch.mean(x, dim=0).to(self.device)
        x = self.fc(x).to(self.device)
        x = x.mean(dim=0, keepdim=True).to(self.device)
        return x

# Define hyperparameters
vocab_size = 100
d_model = 100
nhead = 4
num_encoder_layers = 3
dim_feedforward = 512
max_length = 100
batch_size = 1
num_epochs = 100

# Initialize model, loss, and optimizer
model = TransformerModel(vocab_size, d_model, nhead, num_encoder_layers, dim_feedforward, max_length).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 204800 bytes.

4 HEADS 3 LAYERS

In [None]:
import torch.nn as nn
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime('%Y-%m-%d_%H-%M-%S')
data_name = "logp"  # Replace with the actual data name
type = "TE"  # Replace with the actual type

session_name = f'{data_name}_{formatted_datetime}/{type}'
folder_path = os.path.join('saved_models', session_name)
os.makedirs(folder_path, exist_ok=True)

output_dir_train = f'output/{data_name}/train/{type}'
os.makedirs(output_dir_train, exist_ok=True)
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
file_name_train = f'{output_dir_train}/train_accuracy_details_{current_time}.txt'

output_dir_test = f'output/{data_name}/test'
os.makedirs(output_dir_test, exist_ok=True)
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
file_name_test = f'{output_dir_test}/test_accuracy_details_{current_time}.txt'

best_train_accuracy = 0.0
best_test_accuracy = 0.0

# Training loop
reconstruction_weight = 0.1  # Weight for the reconstruction loss
with open(file_name_train, 'a') as file_train, open(file_name_test, 'a') as file_test:
    for epoch in range(50):
        total_correct = 0
        total_samples = 0
        true_labels_train = []
        pred_probs_train = []
        losses = 0.0

        for data_batch in train_dataset:
            graph_data_batch = data_batch[0]
            sequence_inputs = data_batch[1].to(device)
            sequence_targets = graph_data_batch.y

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            output= model(sequence_inputs)

            # Compute binary predictions
            binary_predictions = (output >= 0.5).float()

            # Compute batch accuracy
            batch_correct = (binary_predictions == sequence_targets).sum().item()
            total_correct += batch_correct
            total_samples += 1

            output = output.to(device)
            sequence_targets = sequence_targets.to(device)

            true_labels_train.append(sequence_targets.cpu().numpy().reshape(-1))
            pred_probs_train.append(output.detach().cpu().numpy())

            # Cast sequence_inputs to float
            sequence_inputs = sequence_inputs.float()


            # Compute loss
            loss = criterion(output, sequence_targets.view(-1))
            losses += loss.item()

            # Backward pass
            loss.backward()

            # Update weights
            optimizer.step()

        # Compute epoch accuracy
        epoch_train_accuracy = (total_correct / total_samples) * 100
        print(f"Epoch {epoch + 1}/{100}, Epoch Accuracy: {epoch_train_accuracy:.4f}")

        if epoch_train_accuracy >= best_train_accuracy:
            best_train_accuracy = epoch_train_accuracy
            model_path = os.path.join(folder_path, f'train_best_model_{best_train_accuracy:.3f}.pth')
            torch.save(model.state_dict(), model_path)
            print("Saved model with accuracy train model with accuracy{:.2f}% to {}".format(best_train_accuracy,
                                                                                             model_path))

        true_labels_train = np.concatenate(true_labels_train)
        pred_probs_train = np.concatenate(pred_probs_train)

        precision_train = precision_score(true_labels_train, (pred_probs_train >= 0.5).astype(int))
        recall_train = recall_score(true_labels_train, (pred_probs_train >= 0.5).astype(int))
        auc_roc_train = roc_auc_score(true_labels_train, pred_probs_train)
        f1_train = f1_score(true_labels_train, (pred_probs_train >= 0.5).astype(int))
        print(
            f"Train AUC-ROC: {auc_roc_train:.4f}, Train F1 Score: {f1_train:.4f} , Train Precision: {precision_train:.4f}, Train Recall: {recall_train:.4f}\n")
        file_train.write(
            f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {losses:.4f}, Train Accuracy: {epoch_train_accuracy:.4f}, Train AUC-ROC: {auc_roc_train:.4f}, Train F1 Score: {f1_train:.4f} , Train Precision: {precision_train:.4f}, Train Recall: {recall_train:.4f}\n')

        total_correct = 0
        total_samples = 0
        true_labels_test = []
        pred_probs_test = []

        for data_batch in test_dataset:
            graph_data_batch = data_batch[0]
            sequence_inputs = data_batch[1].to(device)                                                      
            sequence_targets = graph_data_batch.y

            output= model(sequence_inputs)
            binary_predictions = (output >= 0.5).float()

            batch_correct = (binary_predictions == sequence_targets).sum().item()
            total_correct += batch_correct
            total_samples += 1

            true_labels_test.append(sequence_targets.cpu().numpy().reshape(-1))
            pred_probs_test.append(output.detach().cpu().numpy())

        epoch_test_accuracy = (total_correct / total_samples) * 100
        print(f"Epoch Testing Accuracy : {epoch_test_accuracy:.4f}")

        if epoch_test_accuracy >= best_test_accuracy:
            best_test_accuracy = epoch_test_accuracy
            model_path = os.path.join(folder_path, f'test_best_model_{best_test_accuracy:.3f}.pth')
            torch.save(model.state_dict(), model_path)
            print("Saved model with Test Model with accuracy {:.2f}% to {}".format(best_test_accuracy, model_path))

        true_labels_test = np.concatenate(true_labels_test)
        pred_probs_test = np.concatenate(pred_probs_test)
        #pred_probs_test.append(output.detach().cpu().numpy().reshape(-1))
        
        #print(true_labels_test, pred_probs_test)
        precision_test = precision_score(true_labels_test, (pred_probs_test >= 0.5).astype(int))
        recall_test = recall_score(true_labels_test, (pred_probs_test >= 0.5).astype(int))
        auc_roc_test = roc_auc_score(true_labels_test, pred_probs_test)
        f1_test = f1_score(true_labels_test, (pred_probs_test >= 0.5).astype(int))
        print(
            f"Test AUC-ROC: {auc_roc_test:.4f}, Test F1 Score: {f1_test:.4f}, Test Precision: {precision_test:.4f}, Test Recall: {recall_test:.4f}\n")
        file_test.write(
            f'Epoch {epoch + 1}/{num_epochs}, Test Accuracy: {epoch_test_accuracy:.4f},Test AUC-ROC: {auc_roc_test:.4f}, Test F1 Score: {f1_test:.4f}, Test Precision: {precision_test:.4f}, Test Recall: {recall_test:.4f} \n')
file_test.close()
file_train.close()


  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Epoch 1/100, Epoch Accuracy: 75.2625
Saved model with accuracy train model with accuracy75.26% to saved_models\logp_2024-05-01_18-57-57/TE\train_best_model_75.263.pth
Train AUC-ROC: 0.8311, Train F1 Score: 0.7261 , Train Precision: 0.8164, Train Recall: 0.6538

Epoch Testing Accuracy : 80.6000
Saved model with Test Model with accuracy 80.60% to saved_models\logp_2024-05-01_18-57-57/TE\test_best_model_80.600.pth
Test AUC-ROC: 0.9098, Test F1 Score: 0.7651, Test Precision: 0.9433, Test Recall: 0.6436

Epoch 2/100, Epoch Accuracy: 81.2250
Saved model with accuracy train model with accuracy81.23% to saved_models\logp_2024-05-01_18-57-57/TE\train_best_model_81.225.pth
Train AUC-ROC: 0.8920, Train F1 Score: 0.7964 , Train Precision: 0.8731, Train Recall: 0.7321

Epoch Testing Accuracy : 82.8000
Saved model with Test Model with accuracy 82.80% to saved_models\logp_2024-05-01_18-57-57/TE\test_best_model_82.800.pth
Test AUC-ROC: 0.9129, Test F1 Score: 0.8130, Test Precision: 0.8718, Test Recall