In [None]:
from google.colab import drive
import os
!pip install torch torchvision

drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/ubuntu/nmr_ML/ML'
os.chdir(folder_path)

print("Current working directory after changing:", os.getcwd())


In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# List of standard three-letter amino acid codes
amino_acids = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY',
               'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER',
               'THR', 'TRP', 'TYR', 'VAL']

# Function to convert one-hot encoded vector back to amino acid code
def onehot_to_amino_acid(onehot_vector):
    index = np.argmax(onehot_vector)
    return amino_acids[index]

# Function to convert sine and cosine back to angle in radians
def features_to_angle(sin_value, cos_value):
    angle = np.arctan2(sin_value, cos_value)
    return angle

# Function to convert a single row of features back to original data
def convert_to_raw(features):
    current_residue_onehot = features[0:20]
    preceding_residue_onehot = features[20:40]
    following_residue_onehot = features[40:60]

    current_psi_sin, current_psi_cos = features[60], features[61]
    current_phi_sin, current_phi_cos = features[62], features[63]
    preceding_psi_sin, preceding_psi_cos = features[64], features[65]
    preceding_phi_sin, preceding_phi_cos = features[66], features[67]
    following_psi_sin, following_psi_cos = features[68], features[69]
    following_phi_sin, following_phi_cos = features[70], features[71]

    current_residue = onehot_to_amino_acid(current_residue_onehot)
    preceding_residue = onehot_to_amino_acid(preceding_residue_onehot)
    following_residue = onehot_to_amino_acid(following_residue_onehot)

    current_psi = features_to_angle(current_psi_sin, current_psi_cos)
    current_phi = features_to_angle(current_phi_sin, current_phi_cos)
    preceding_psi = features_to_angle(preceding_psi_sin, preceding_psi_cos)
    preceding_phi = features_to_angle(preceding_phi_sin, preceding_phi_cos)
    following_psi = features_to_angle(following_psi_sin, following_psi_cos)
    following_phi = features_to_angle(following_phi_sin, following_phi_cos)

    original_data = {
        'current_residue_type': current_residue,
        'preceding_residue_type': preceding_residue,
        'following_residue_type': following_residue,
        'current_psi': current_psi,
        'current_phi': current_phi,
        'preceding_psi': preceding_psi,
        'preceding_phi': preceding_phi,
        'following_psi': following_psi,
        'following_phi': following_phi
    }

    return original_data





# Load data

In [26]:

data = np.load('data.npz')
X_data = data['X_data']
y_data = data['y_data']

print(f"The length of targets (y) is {y_data.shape[0]}.")
print(f"The length of features (X) is {X_data.shape[0]}.")

print(f"Each target vector has {y_data.shape[1]} elements.")
print(f"Each feature vector has {X_data.shape[1]} elements.")

The length of targets (y) is 236263.
The length of features (X) is 236263.
Each target vector has 4 elements.
Each feature vector has 72 elements.


# Evaluate the data

In [27]:
convert_to_raw(X_data[1])

{'current_residue_type': 'LYS',
 'preceding_residue_type': 'HIS',
 'following_residue_type': 'GLU',
 'current_psi': 2.335979009857667,
 'current_phi': -2.397794347843768,
 'preceding_psi': -2.9750281952083197,
 'preceding_phi': -2.1131378107335728,
 'following_psi': 2.6625422691195926,
 'following_phi': -1.6817917076502755}

# Split the data into training and testing sets

In [13]:


# Set random seed for reproducibility
random_seed = 127
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# Setup the code for device-agnostic operation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Convert your data into PyTorch tensors and move to the appropriate device
X_tensor = torch.tensor(X_data, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y_data, dtype=torch.float32).to(device)

# Create a TensorDataset
dataset = TensorDataset(X_tensor, y_tensor)

# Shuffle the dataset with the specified seed
shuffle_indices = torch.randperm(len(dataset))
shuffled_dataset = [dataset[i] for i in shuffle_indices]

# Split the dataset into training and test sets
train_size = int(0.8 * len(shuffled_dataset))
test_size = len(shuffled_dataset) - train_size
train_dataset, test_dataset = random_split(shuffled_dataset, [train_size, test_size], generator=torch.Generator().manual_seed(random_seed))

# Create data loaders
batch_size = 10240
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Form the model

In [14]:

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(72, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.4)

        self.layer2 = nn.Linear(512, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.dropout2 = nn.Dropout(0.4)

        self.layer3 = nn.Linear(512, 512)
        self.bn3 = nn.BatchNorm1d(512)
        self.dropout3 = nn.Dropout(0.4)

        self.layer4 = nn.Linear(512, 256)
        self.bn4 = nn.BatchNorm1d(256)
        self.dropout4 = nn.Dropout(0.4)

        self.layer5 = nn.Linear(256, 256)
        self.bn5 = nn.BatchNorm1d(256)
        self.dropout5 = nn.Dropout(0.4)

        self.layer6 = nn.Linear(256, 128)
        self.bn6 = nn.BatchNorm1d(128)
        self.dropout6 = nn.Dropout(0.4)

        self.layer7 = nn.Linear(128, 128)
        self.bn7 = nn.BatchNorm1d(128)
        self.dropout7 = nn.Dropout(0.4)

        self.layer8 = nn.Linear(128, 64)
        self.bn8 = nn.BatchNorm1d(64)
        self.dropout8 = nn.Dropout(0.4)

        self.layer9 = nn.Linear(64, 64)
        self.bn9 = nn.BatchNorm1d(64)
        self.dropout9 = nn.Dropout(0.4)

        self.layer10 = nn.Linear(64, 32)
        self.bn10 = nn.BatchNorm1d(32)
        self.dropout10 = nn.Dropout(0.4)

        self.layer11 = nn.Linear(32, 32)
        self.bn11 = nn.BatchNorm1d(32)
        self.dropout11 = nn.Dropout(0.4)

        self.layer12 = nn.Linear(32, 16)
        self.bn12 = nn.BatchNorm1d(16)
        self.dropout12 = nn.Dropout(0.4)

        self.layer13 = nn.Linear(16, 16)
        self.bn13 = nn.BatchNorm1d(16)
        self.dropout13 = nn.Dropout(0.4)

        self.layer14 = nn.Linear(16, 8)
        self.bn14 = nn.BatchNorm1d(8)
        self.dropout14 = nn.Dropout(0.4)

        self.layer15 = nn.Linear(8, 8)
        self.bn15 = nn.BatchNorm1d(8)
        self.dropout15 = nn.Dropout(0.4)

        self.layer16 = nn.Linear(8, 8)
        self.bn16 = nn.BatchNorm1d(8)
        self.dropout16 = nn.Dropout(0.4)

        self.layer17 = nn.Linear(8, 8)
        self.bn17 = nn.BatchNorm1d(8)
        self.dropout17 = nn.Dropout(0.4)

        self.layer18 = nn.Linear(8, 8)
        self.bn18 = nn.BatchNorm1d(8)
        self.dropout18 = nn.Dropout(0.4)

        self.layer19 = nn.Linear(8, 8)
        self.bn19 = nn.BatchNorm1d(8)
        self.dropout19 = nn.Dropout(0.4)

        self.layer20 = nn.Linear(8, 8)
        self.bn20 = nn.BatchNorm1d(8)
        self.dropout20 = nn.Dropout(0.4)

        self.output_layer = nn.Linear(8, 4)

    def forward(self, x):
        x = torch.relu(self.bn1(self.layer1(x)))
        x = self.dropout1(x)

        x = torch.relu(self.bn2(self.layer2(x)))
        x = self.dropout2(x)

        x = torch.relu(self.bn3(self.layer3(x)))
        x = self.dropout3(x)

        x = torch.relu(self.bn4(self.layer4(x)))
        x = self.dropout4(x)

        x = torch.relu(self.bn5(self.layer5(x)))
        x = self.dropout5(x)

        x = torch.relu(self.bn6(self.layer6(x)))
        x = self.dropout6(x)

        x = torch.relu(self.bn7(self.layer7(x)))
        x = self.dropout7(x)

        x = torch.relu(self.bn8(self.layer8(x)))
        x = self.dropout8(x)

        x = torch.relu(self.bn9(self.layer9(x)))
        x = self.dropout9(x)

        x = torch.relu(self.bn10(self.layer10(x)))
        x = self.dropout10(x)

        x = torch.relu(self.bn11(self.layer11(x)))
        x = self.dropout11(x)

        x = torch.relu(self.bn12(self.layer12(x)))
        x = self.dropout12(x)

        x = torch.relu(self.bn13(self.layer13(x)))
        x = self.dropout13(x)

        x = torch.relu(self.bn14(self.layer14(x)))
        x = self.dropout14(x)

        x = torch.relu(self.bn15(self.layer15(x)))
        x = self.dropout15(x)

        x = torch.relu(self.bn16(self.layer16(x)))
        x = self.dropout16(x)

        x = torch.relu(self.bn17(self.layer17(x)))
        x = self.dropout17(x)

        x = torch.relu(self.bn18(self.layer18(x)))
        x = self.dropout18(x)

        x = torch.relu(self.bn19(self.layer19(x)))
        x = self.dropout19(x)

        x = torch.relu(self.bn20(self.layer20(x)))
        x = self.dropout20(x)

        x = self.output_layer(x)
        return x




# Perform training

## start with initial training

In [15]:
# Set device-agnostic code
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model and move it to the appropriate device
model = NeuralNetwork().to(device)

# Define loss function and optimizer
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.1)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    epoch_loss /= len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

# Save the model
model_path = 'model1.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Epoch 1/10, Loss: 10599.368935032895
Epoch 2/10, Loss: 4227.565031352796
Epoch 3/10, Loss: 1696.3647782175165
Epoch 4/10, Loss: 1283.5053839432567
Epoch 5/10, Loss: 1102.294488204153
Epoch 6/10, Loss: 1049.1299984580592
Epoch 7/10, Loss: 1018.8326448139392
Epoch 8/10, Loss: 1003.9022056178043
Epoch 9/10, Loss: 999.7113133480674
Epoch 10/10, Loss: 996.4855443050986
Model saved to model1.pth


## decrease the learning rate


In [None]:
def continue_training(output_model_path, input_model_path, num_epochs, learning_rate, folder_path):
    # Set device-agnostic code
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Change directory to the folder path
    os.chdir(folder_path)

    # Load the saved model state dictionary
    model.load_state_dict(torch.load(input_model_path))

    # Move the model to the appropriate device
    model.to(device)

    # Set the model to training mode
    model.train()

    # Define loss function and optimizer with the new learning rate
    criterion = nn.MSELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Continue training the model
    for epoch in range(num_epochs):
        epoch_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        epoch_loss /= len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

    # Save the model again if desired
    torch.save(model.state_dict(), output_model_path)
    print(f"Model saved to {output_model_path}")


In [None]:
folder_path = '/content/drive/MyDrive/ubuntu/nmr_ML/ML'
input_model_path = 'model1.pth'
output_model_path = 'model2.pth'
num_epochs = 50
learning_rate = 1

continue_training(output_model_path, input_model_path, num_epochs, learning_rate, folder_path)

Epoch 1/50, Loss: 1541.3535284745067
Epoch 2/50, Loss: 732.6538407175165
Epoch 3/50, Loss: 542.0742396304482
Epoch 4/50, Loss: 431.31489161441203
Epoch 5/50, Loss: 339.6386863306949
Epoch 6/50, Loss: 264.0363906057257
Epoch 7/50, Loss: 204.4819625051398
Epoch 8/50, Loss: 156.5204001978824
Epoch 9/50, Loss: 119.82713679263466
Epoch 10/50, Loss: 91.85629955090974
Epoch 11/50, Loss: 69.93937080784848
Epoch 12/50, Loss: 54.222267753199525
Epoch 13/50, Loss: 42.33594251933851
Epoch 14/50, Loss: 33.84304508410002
Epoch 15/50, Loss: 27.681411743164062
Epoch 16/50, Loss: 23.418004688463714
Epoch 17/50, Loss: 20.26957301089638
Epoch 18/50, Loss: 18.29769606339304
Epoch 19/50, Loss: 16.78705260628148
Epoch 20/50, Loss: 15.90304244192023
Epoch 21/50, Loss: 15.22057392722682
Epoch 22/50, Loss: 14.82616454676578
Epoch 23/50, Loss: 14.546436209427682
Epoch 24/50, Loss: 14.37779722715679
Epoch 25/50, Loss: 14.277581767032022
Epoch 26/50, Loss: 14.219071990565249
Epoch 27/50, Loss: 14.188923735367624


In [None]:
folder_path = '/content/drive/MyDrive/ubuntu/nmr_ML/ML'
input_model_path = 'model2.pth'
output_model_path = 'model3.pth'
num_epochs = 50
learning_rate = 0.01

continue_training(output_model_path, input_model_path, num_epochs, learning_rate, folder_path)

Epoch 1/50, Loss: 14.104311892860814
Epoch 2/50, Loss: 14.105124473571777
Epoch 3/50, Loss: 14.102893227025083
Epoch 4/50, Loss: 14.105619079188296
Epoch 5/50, Loss: 14.116596322310599
Epoch 6/50, Loss: 14.098235732630679
Epoch 7/50, Loss: 14.095079873737536
Epoch 8/50, Loss: 14.095170974731445
Epoch 9/50, Loss: 14.105327857168097
Epoch 10/50, Loss: 14.112155261792635
Epoch 11/50, Loss: 14.104861911974455
Epoch 12/50, Loss: 14.12009711014597
Epoch 13/50, Loss: 14.11834736874229
Epoch 14/50, Loss: 14.126983542191354
Epoch 15/50, Loss: 14.095727017051296
Epoch 16/50, Loss: 14.093542349965949
Epoch 17/50, Loss: 14.112457576550936
Epoch 18/50, Loss: 14.09437064120644
Epoch 19/50, Loss: 14.096977033113179
Epoch 20/50, Loss: 14.117434049907484
Epoch 21/50, Loss: 14.100650486193205
Epoch 22/50, Loss: 14.098786705418638
Epoch 23/50, Loss: 14.106543691534744
Epoch 24/50, Loss: 14.101406247992264
Epoch 25/50, Loss: 14.102708916915091
Epoch 26/50, Loss: 14.109168404027036
Epoch 27/50, Loss: 14.09

In [28]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Function to calculate accuracy (for regression tasks, we use MSE)
def calculate_accuracy(model, test_loader, criterion, device):
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            test_loss += loss.item()

    test_loss /= len(test_loader)
    print(f"Test Loss (MSE): {test_loss}")

# Set device-agnostic code
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the trained model
model_path = 'model3.pth'
model = NeuralNetwork().to(device)
model.load_state_dict(torch.load(model_path))

# Define the loss function (MSE for regression tasks)
criterion = nn.MSELoss().to(device)

# Create DataLoader for the test set
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Calculate accuracy on the test set
calculate_accuracy(model, test_loader, criterion, device)


Test Loss (MSE): 13.927884330284934
