In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import dgl
from dgl.nn import GraphConv
from bayes_opt import BayesianOptimization
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from tqdm import tqdm
import random
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kendalltau
import datetime
import random
import string
import plotly.express as px
import MDAnalysis as mda
import scipy.io
from skopt import BayesSearchCV
from skopt.space import Real, Integer

In [2]:
# Choose the compute device once: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [4]:
# Load the model inputs (`inp`) and the regression targets (`s`) from pickle files.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# open files that come from a trusted source.
# The shapes printed after the split suggest inp is (samples, 3072, 3) and
# s is (samples, 1) — TODO confirm against the code that produced the pickles.
with open('../final_coordinates.pkl','rb') as file:
    inp = pickle.load(file)
with open('s_list.pkl','rb') as file:
    s = pickle.load(file)

In [5]:
def pairwise_distances(x):
    """Return the Euclidean distance between every pair of rows of ``x``.

    The squared distances are computed with the expansion
    ``|a - b|^2 = |a|^2 + |b|^2 - 2 a.b`` so the whole matrix comes from a
    single matrix product.

    Args:
        x: tensor of shape ``(N, D)`` holding N points with D coordinates.
            Leading batch dimensions ``(..., N, D)`` are also accepted.

    Returns:
        Tensor of shape ``(N, N)`` (or ``(..., N, N)``) whose ``[i, j]`` entry
        is the distance between point ``i`` and point ``j``.
    """
    square = torch.sum(x ** 2, dim=-1, keepdim=True)
    x_t = torch.transpose(x, -2, -1)
    squared_distances = square + torch.transpose(square, -2, -1) - 2 * torch.matmul(x, x_t)
    # Floating-point round-off can leave entries that should be exactly zero
    # (e.g. the diagonal) slightly negative; sqrt would turn those into NaN,
    # and NaN fails every distance cutoff comparison downstream.
    squared_distances = torch.clamp(squared_distances, min=0)
    return torch.sqrt(squared_distances)

In [6]:
def getDataLoader(x, y, batch_size):
    """Wrap inputs and targets in a ``DataLoader`` that yields ``(x, y)`` batches.

    Args:
        x: array-like of inputs; converted with ``torch.Tensor``.
        y: array-like of targets with the same first dimension as ``x``;
            converted with ``torch.Tensor``.
        batch_size: number of samples per batch.

    Returns:
        A ``DataLoader`` over the paired tensors, iterating in the given order.
    """
    features, targets = torch.Tensor(x), torch.Tensor(y)
    return DataLoader(TensorDataset(features, targets), batch_size)

In [7]:
# Two-stage split with a fixed seed so the partitions are reproducible:
#   1) hold out 20% of all samples as the validation set;
#   2) hold out 20% of what remains as the test set.
# Overall this is roughly 64% train / 20% validation / 16% test.
inp_train, inp_val, s_train, s_val = train_test_split(inp, s, test_size=0.2, random_state=69)
inp_train, inp_test, s_train, s_test = train_test_split(inp_train, s_train, test_size=0.2, random_state=69)

In [8]:
# Sanity check: the first dimension (sample count) of each input split should
# match the first dimension of the corresponding target split.
print(inp_train.shape, inp_val.shape, inp_test.shape)
print(s_train.shape, s_val.shape, s_test.shape)

(25782, 3072, 3) (8058, 3072, 3) (6446, 3072, 3)
(25782, 1) (8058, 1) (6446, 1)


In [9]:
class GNN(torch.nn.Module):
    """Single-round message-passing network that maps a point cloud to one scalar.

    Pipeline: per-atom embedding of the 3 coordinates -> one aggregation step
    over the adjacency matrix -> per-atom scalar -> linear readout over all
    atoms -> sigmoid.

    Args:
        hidden_dim: width of the per-atom embedding.
        num_atoms: number of atoms (rows) in every input sample. The readout
            layer has one weight per atom, so inputs must have exactly this
            many atoms. Defaults to 3072, the value previously hard-coded.
    """

    def __init__(self, hidden_dim, num_atoms=3072):
        super().__init__()
        self.fc1 = nn.Linear(3, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.fc3 = nn.Linear(num_atoms, 1)

    def forward(self, x, adjacency_matrix):
        """Compute the prediction for a batch.

        Args:
            x: coordinates, shape ``(batch, num_atoms, 3)``.
            adjacency_matrix: shape ``(batch, num_atoms, num_atoms)``; entry
                ``[b, i, j]`` weights the message from atom ``j`` to atom ``i``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        x = torch.relu(self.fc1(x))
        # Message passing: each atom sums the embeddings of its neighbours.
        # The sum is not degree-normalised, so its scale grows with the
        # number of neighbours.
        x = torch.matmul(adjacency_matrix, x)
        x = torch.relu(self.fc2(x))
        # Drop the trailing singleton so the readout sees one value per atom.
        x = x.view(x.size(0), -1)
        # The sigmoid bounds predictions to [0, 1]; this presumes the targets
        # lie in that range — TODO confirm.
        return torch.sigmoid(self.fc3(x))

In [10]:
def train(
    model,
    dataset,
    optimizer,
    loss,
    length_of_set,
    is_dev=False
):
    """Run one pass over ``dataset``; update ``model`` unless ``is_dev`` is set.

    Args:
        model: network called as ``model(x, adjacency_matrix)``.
        dataset: iterable yielding ``(x, y)`` batches, where ``x`` is indexed
            as ``(batch, atoms, coordinates)``.
        optimizer: optimizer for ``model``'s parameters; not used when
            ``is_dev`` is True.
        loss: callable ``loss(pred, y)`` returning a scalar tensor.
        length_of_set: total number of samples in ``dataset``. Each batch loss
            is weighted by ``len(y) / length_of_set``, so for a mean-reduced
            loss the returned total is the mean over the whole set.
        is_dev: when True, only evaluate: the model is put in eval mode, no
            gradients are tracked and no parameters are updated.

    Returns:
        Tuple ``(total_loss, R_square)``: the weighted loss over the pass and
        the R^2 score between all targets and all predictions.
    """
    total_loss = 0
    y_pred, y_actual = [], []

    # Evaluation must not run in training mode, and it needs no autograd graph.
    if is_dev:
        model.eval()
    else:
        model.train()

    with torch.set_grad_enabled(not is_dev):
        for x, y in tqdm(dataset):

            if not is_dev:
                optimizer.zero_grad()

            x = x.to(device)
            y = y.to(device)

            batch_size, num_atoms = x.shape[0], x.shape[1]
            # Allocate directly on the target device instead of building the
            # buffer on the CPU and copying it over.
            adjacency_matrix = torch.zeros(batch_size, num_atoms, num_atoms, device=device)

            for i in range(batch_size):
                # Atoms closer than the cutoff of 7 (same units as the
                # coordinates) are connected.
                distances = pairwise_distances(x[i])
                adjacency_matrix[i] = (distances < 7).float()

            pred = model(x, adjacency_matrix)
            curr_loss = loss(pred, y)
            total_loss += curr_loss.item() * len(y) / length_of_set

            if not is_dev:
                curr_loss.backward()
                optimizer.step()

            y_pred += pred.detach().flatten().tolist()
            y_actual += y.flatten().tolist()

    R_square = r2_score(y_actual, y_pred)
    return total_loss, R_square

In [10]:
# train_dataloader = getDataLoader(inp_train, s_train)
# val_dataloader = getDataLoader(inp_val, s_val)
# test_dataloader = getDataLoader(inp_test, s_test)

TypeError: getDataLoader() missing 1 required positional argument: 'batch_size'

In [11]:
def objective_function(learning_rate, num_epochs, hidden_dim, batch_size):
    """Train a fresh GNN with the given hyperparameters and score it.

    This is the target for Bayesian optimisation. The optimiser proposes
    continuous values, so the integer-valued hyperparameters are truncated
    with ``int`` before use.

    Args:
        learning_rate: Adam learning rate.
        num_epochs: number of training epochs (truncated to int).
        hidden_dim: width of the GNN's per-atom embedding (truncated to int).
        batch_size: mini-batch size for both loaders (truncated to int).

    Returns:
        Validation R^2 after the final epoch (higher is better).

    Raises:
        ValueError: if ``num_epochs`` truncates to less than 1, since no
            validation score would exist.

    Side effects:
        Prints the hyperparameters and score, and appends the same report to
        ``bayesian_optimization_results.txt``.
    """
    hidden_dim = int(hidden_dim)
    batch_size = int(batch_size)
    num_epochs = int(num_epochs)
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    model = GNN(hidden_dim).to(device)
    loss = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    train_dataloader = getDataLoader(inp_train, s_train, batch_size)
    val_dataloader = getDataLoader(inp_val, s_val, batch_size)

    # Train for exactly num_epochs epochs. The loop used to be duplicated, so
    # the model trained for twice the number of epochs that was reported.
    for _ in range(num_epochs):
        train(model, train_dataloader, optimizer, loss, inp_train.shape[0])
        _, val_acc = train(model, val_dataloader, optimizer, loss, inp_val.shape[0], True)

    # Build the report once so the console and the log file cannot drift apart.
    report = (
        f"Learning Rate: {learning_rate}\n"
        f"Num Epochs: {num_epochs}\n"
        f"Hidden Dim: {hidden_dim}\n"
        f"Batch Size: {batch_size}\n"
        f"Objective Value: {val_acc}\n"
    )
    print(report)

    # Append so that results from every trial accumulate in one file.
    with open('bayesian_optimization_results.txt', 'a') as file:
        file.write(report + "\n")
    return val_acc

In [14]:
# def train_and_evaluate(model, criterion, optimizer, dataloader, num_epochs):
#     for epoch in range(int(num_epochs)):
#         model.train()
#         for batch in train_dataloader:
#             inputs, targets = batch
#             optimizer.zero_grad()
#             outputs = model(inputs)
#             loss = criterion(outputs, targets)
#             loss.backward()
#             optimizer.step()

#         # Evaluate the model on the validation set and report metrics
#         # You can calculate metrics like Mean Squared Error (MSE) or others
#         validation_loss = evaluate(model, criterion, val_dataloader)
#         print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {validation_loss:.4f}')
#     return validation_loss


In [12]:
# Search space as (lower, upper) bounds per hyperparameter. Values are proposed
# as floats; objective_function truncates num_epochs, hidden_dim and batch_size
# with int() before using them.
pbounds = {
    'learning_rate': (0.0001 ,0.001),
    'num_epochs': (10, 20),
    'hidden_dim': (32, 128),
    'batch_size': (32,128)
}

# NOTE: this module-level `optimizer` is the Bayesian optimiser, not the torch
# optimizer that objective_function creates locally under the same name.
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=pbounds,
    verbose=10,  # Verbosity level — NOTE(review): confirm which values bayes_opt accepts
    random_state=1,
)

# Run the optimization: init_points=10 initial evaluations followed by n_iter=10
# further iterations. Each evaluation trains a model from scratch, so this call
# is very long-running.
optimizer.maximize(init_points=10, n_iter=10)

|   iter    |  target   | batch_... | hidden... | learni... | num_ep... |
-------------------------------------------------------------------------


 11%|████████████▍                                                                                                         | 38/359 [15:52<2:14:06, 25.07s/it]


KeyboardInterrupt: 