In [None]:
!pip install numpy pandas keras tensorflow biopython biopandas matplotlib spektral


Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopandas
  Downloading biopandas-0.4.1-py2.py3-none-any.whl (878 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m879.0/879.0 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting spektral
  Downloading spektral-1.3.1-py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.1/140.1 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython, biopandas, spektral
Successfully installed biopandas-0.4.1 biopython-1.83 spektral-1.3.1


In [None]:
import os
import numpy as np
import pandas as pd
from biopandas.pdb import PandasPdb
from scipy.sparse import csr_matrix
import zipfile

# Uploaded archive that holds the PDB structures (point this at the actual zip file)
zip_file_path = 'PDB str-20240520T081153Z-001.zip'

# Folder that receives the unpacked archive contents
extraction_path = 'extracted_folder'

# Make sure the target folder exists, then unpack every archive member into it
os.makedirs(extraction_path, exist_ok=True)
with zipfile.ZipFile(zip_file_path) as zip_ref:
    zip_ref.extractall(path=extraction_path)

# Example property dictionary (values are illustrative)
# Maps an element symbol to a list of five numeric node features.
# preprocess_pdb_to_graph looks atoms up here and falls back to five zeros
# for any element that has no entry.
atom_properties = {
    'H': [0.1, 0.2, 0.3, 0.4, 0.5],
    'C': [0.2, 0.3, 0.4, 0.5, 0.6],
    'N': [0.3, 0.4, 0.5, 0.6, 0.7],
    'O': [0.4, 0.5, 0.6, 0.7, 0.8],
    'S': [0.5, 0.6, 0.7, 0.8, 0.9],
    # Add more atom types and their properties as needed
}

def preprocess_pdb_to_graph(pdb_file, threshold=4.0):
    """Turn the ATOM records of one PDB file into node features plus a dense adjacency matrix.

    Args:
        pdb_file: Path of the PDB file to read.
        threshold: Two atoms are connected when their distance is strictly
            below this value (same unit as the PDB coordinates). Defaults to
            the previously hard-coded 4.0.

    Returns:
        dict with
            'node_features': float64 array of shape (n_atoms, 5), one row of
                ``atom_properties`` per atom (zeros for unknown elements);
            'adj_matrix': float32 array of shape (n_atoms, n_atoms) holding
                0/1 entries with a zero diagonal.

    Note:
        The adjacency matrix is dense, so memory grows with n_atoms ** 2.
    """
    from scipy.spatial.distance import pdist, squareform

    ppdb = PandasPdb().read_pdb(pdb_file)
    atoms = ppdb.df['ATOM']

    coords = atoms[['x_coord', 'y_coord', 'z_coord']].values
    atom_types = atoms['element_symbol'].values

    # Fallback feature row for elements missing from atom_properties.
    default_properties = [0.0] * 5

    def lookup(symbol):
        # Try the symbol exactly as stored first, then a whitespace/case
        # normalised form ('CL ' -> 'Cl') so formatting quirks do not silently
        # produce all-zero features.
        if symbol in atom_properties:
            return atom_properties[symbol]
        normalised = str(symbol).strip().capitalize()
        return atom_properties.get(normalised, default_properties)

    # An explicit dtype and reshape keep the result a 2-D float matrix even when
    # the file has no atoms or every atom falls back to the default row.
    node_features = np.array(
        [lookup(symbol) for symbol in atom_types], dtype=np.float64
    ).reshape(-1, len(default_properties))

    n_atoms = len(coords)
    if n_atoms < 2:
        # Nothing to connect; also avoids calling pdist on an empty input.
        adj_matrix = np.zeros((n_atoms, n_atoms), dtype=np.float32)
    else:
        dist_matrix = squareform(pdist(coords))
        adj_matrix = (dist_matrix < threshold).astype(np.float32)
        np.fill_diagonal(adj_matrix, 0)  # Remove self-loops

    return {'node_features': node_features, 'adj_matrix': adj_matrix}

# Sanity check: show what was unpacked, then preprocess a single structure
extracted_files = os.listdir(extraction_path)
print(f'Extracted files: {extracted_files}')

# The archive unpacks into a 'PDB str' sub-folder; keep only its .pdb entries
pdb_dir = os.path.join(extraction_path, 'PDB str')
pdb_files = list(filter(lambda name: name.endswith('.pdb'), os.listdir(pdb_dir)))
print(f'PDB files: {pdb_files}')

if not pdb_files:
    print('No PDB files found in the "PDB str" directory.')
else:
    # Only the first file is processed; this is a smoke test, not the full run
    pdb_file = os.path.join(pdb_dir, pdb_files[0])
    graph = preprocess_pdb_to_graph(pdb_file)
    print(f'Graph node features:\n{graph["node_features"]}')
    print(f'Graph adjacency matrix:\n{graph["adj_matrix"]}')


Extracted files: ['PDB str']
PDB files: ['AF-Q8GR83-F1-model_v4.pdb', 'AF-P63000-F1-model_v4.pdb', 'AF-Q8BIQ5-F1-model_v4.pdb', 'AF-P0A6J3-F1-model_v4.pdb', 'AF-Q6L209-F1-model_v4.pdb', 'AF-Q72LL1-F1-model_v4.pdb', 'AF-P43609-F1-model_v4.pdb', 'AF-Q5SK53-F1-model_v4.pdb', 'AF-Q72JG7-F1-model_v4.pdb', 'AF-O05519-F1-model_v4.pdb', 'AF-Q72LB1-F1-model_v4.pdb', 'AF-Q6L1A4-F1-model_v4.pdb', 'AF-Q9LUH8-F1-model_v4.pdb', 'AF-O87197-F1-model_v4.pdb', 'AF-Q5SJG0-F1-model_v4.pdb', 'AF-Q3UBY5-F1-model_v4.pdb', 'AF-Q72JU8-F1-model_v4.pdb', 'AF-G5EG62-F1-model_v4.pdb', 'AF-Q3TJ39-F1-model_v4.pdb', 'AF-Q6L0H3-F1-model_v4.pdb', 'AF-Q9D938-F1-model_v4.pdb', 'AF-Q6KZ44-F1-model_v4.pdb', 'AF-P0ACY3-F1-model_v4.pdb', 'AF-Q9H0K1-F1-model_v4.pdb', 'AF-Q72K70-F1-model_v4.pdb', 'AF-P56690-F1-model_v4.pdb', 'AF-P56194-F1-model_v4.pdb', 'AF-Q8W4I9-F1-model_v4.pdb', 'AF-Q8N8S7-F1-model_v4.pdb', 'AF-Q5SJN7-F1-model_v4.pdb', 'AF-O01963-F1-model_v4.pdb', 'AF-Q72HB6-F1-model_v4.pdb', 'AF-Q72IH8-F1-model_v4.pdb', 'A

GNN

In [None]:
!pip install biopython numpy networkx




In [None]:
import os
import zipfile

# Uploaded archive that holds the PDB structures (point this at the actual zip file)
zip_file_path = '/content/PDB str-20240520T081153Z-001.zip'

# Folder that receives the unpacked archive contents
extraction_path = '/content/extracted_folder'

# Make sure the target folder exists, then unpack every archive member into it
os.makedirs(extraction_path, exist_ok=True)
with zipfile.ZipFile(zip_file_path) as zip_ref:
    zip_ref.extractall(path=extraction_path)

In [None]:
# Install PyTorch if not already installed
!pip install torch

# Install torch_scatter, torch_sparse, torch_cluster, and torch_spline_conv
# Find the correct versions for your PyTorch version at https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html
# The $(python -c ...) part prints the installed torch version, so each -f URL
# points at the wheel index page named after that exact version.
!pip install torch-scatter -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__)").html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__)").html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__)").html
!pip install torch-spline-conv -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__)").html

# Install torch-geometric
# (installed last, after torch and the extension packages above)
!pip install torch-geometric



Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import os
import numpy as np
import networkx as nx
from Bio.PDB import PDBParser
from sklearn.preprocessing import StandardScaler
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data
import torch

# Mapping from element symbols to atomic numbers
# Only elements 1-20 (H through Ca) are listed; get_atom_features uses 0 for
# any symbol that is not a key here.
# NOTE(review): Bio.PDB appears to report atom.element in upper case (e.g. 'CL'),
# so the mixed-case two-letter keys below only match if the lookup normalises
# case - confirm.
element_to_atomic_number = {
    'H': 1, 'He': 2, 'Li': 3, 'Be': 4, 'B': 5, 'C': 6, 'N': 7, 'O': 8, 'F': 9, 'Ne': 10,
    'Na': 11, 'Mg': 12, 'Al': 13, 'Si': 14, 'P': 15, 'S': 16, 'Cl': 17, 'Ar': 18, 'K': 19, 'Ca': 20,
    # Include all elements as needed
}

def parse_pdb(file_path):
    """Parse one PDB file and return the Bio.PDB structure object.

    Args:
        file_path: Path of the PDB file to read.

    Returns:
        The structure produced by ``PDBParser.get_structure`` (id 'protein').
    """
    # QUIET=True matches the other parse_pdb definitions in this notebook and
    # keeps construction warnings from flooding the output when hundreds of
    # files are parsed in a loop.
    parser = PDBParser(QUIET=True)
    return parser.get_structure('protein', file_path)

def get_atom_features(atom):
    """Build the raw feature vector ``[atomic_number, x, y, z]`` for one atom.

    Args:
        atom: Object exposing ``element`` (element symbol) and ``coord``
            (three coordinates), e.g. a Bio.PDB atom.

    Returns:
        List of four floats. The atomic number is 0.0 when the element is
        missing or not listed in ``element_to_atomic_number``.
    """
    # The table uses conventional capitalisation ('Cl', 'Na'), whereas the
    # parser may hand back upper-case or padded symbols ('CL', ' C').
    # Normalise before the lookup so two-letter elements are not mapped to 0.
    symbol = str(atom.element or '').strip().capitalize()
    atomic_number = element_to_atomic_number.get(symbol, 0)
    x, y, z = atom.coord
    return [float(atomic_number), float(x), float(y), float(z)]

def create_graph_from_structure(structure, cutoff=4.0):
    """Build an undirected atom-contact graph from a parsed structure.

    Every atom becomes a node keyed by its serial number and carrying a
    'features' attribute (see ``get_atom_features``). Two atoms of the same
    model are joined by an edge, weighted with their distance, when that
    distance is strictly below ``cutoff``.

    The previous implementation only compared atoms inside the same residue,
    so peptide bonds and every inter-residue contact were missing and the
    graph fell apart into one component per residue. Contacts are now searched
    across the whole model.

    Args:
        structure: Iterable of models -> chains -> residues -> atoms
            (a Bio.PDB structure).
        cutoff: Distance threshold for an edge; defaults to the previously
            hard-coded 4.0.

    Returns:
        networkx.Graph with one node per atom serial number.

    Note:
        Nodes are keyed by serial number only, so atoms from different models
        that share a serial number end up on the same node (unchanged from the
        original behaviour).
    """
    from scipy.spatial import cKDTree  # local import: this cell does not import scipy

    graph = nx.Graph()
    for model in structure:
        atoms = [atom for chain in model for residue in chain for atom in residue]

        # Add every node first so node order follows atom order in the file.
        for atom in atoms:
            graph.add_node(atom.get_serial_number(), features=get_atom_features(atom))

        if len(atoms) < 2:
            continue

        # The KD-tree yields candidate index pairs (i < j) without the
        # all-pairs Python loop. The radius is padded slightly; the exact
        # "< cutoff" test below decides, using the same atom-atom distance
        # as before.
        tree = cKDTree([atom.coord for atom in atoms])
        for i, j in sorted(tree.query_pairs(cutoff + 1e-3)):
            distance = atoms[i] - atoms[j]
            if distance < cutoff:
                graph.add_edge(
                    atoms[i].get_serial_number(),
                    atoms[j].get_serial_number(),
                    weight=distance,
                )
    return graph

def preprocess_pdb_files(pdb_files):
    """Parse PDB files into graphs and standardise their node features.

    A single ``StandardScaler`` is fitted on the features of all nodes of all
    graphs, then each node's 'features' attribute is replaced by its scaled
    row.

    Args:
        pdb_files: Iterable of PDB file paths.

    Returns:
        List of networkx graphs, one per input file, in input order. The list
        is empty when ``pdb_files`` is empty.

    Note:
        The fitted scaler is not returned, so the same scaling cannot be
        re-applied to files processed in a later call.
    """
    graphs = [create_graph_from_structure(parse_pdb(pdb_file)) for pdb_file in pdb_files]

    # Collect the raw feature rows once per graph, in graph.nodes order.
    rows_per_graph = [
        [attrs['features'] for _, attrs in graph.nodes(data=True)]
        for graph in graphs
    ]
    all_rows = [row for rows in rows_per_graph for row in rows]
    if not all_rows:
        # No files or no atoms: fitting a scaler on nothing would raise.
        return graphs

    scaler = StandardScaler()
    scaler.fit(np.array(all_rows))

    # One transform call per graph instead of one per node; the scaled values
    # are identical because the transform works row by row.
    for graph, rows in zip(graphs, rows_per_graph):
        if not rows:
            continue
        scaled = scaler.transform(np.array(rows))
        for node, scaled_row in zip(graph.nodes, scaled):
            graph.nodes[node]['features'] = scaled_row

    return graphs

def networkx_to_pyg(graph):
    """Convert a networkx graph with 'features' node attributes to a PyG ``Data``.

    Args:
        graph: networkx graph whose nodes all carry a 'features' attribute.

    Returns:
        ``Data`` with
            x: float tensor, one row per node in ``graph.nodes`` order;
            edge_index: long tensor of shape (2, 2 * n_edges) whose entries
                are row positions into ``x``, each undirected edge stored in
                both directions.
    """
    nodes = list(graph.nodes)

    # Extract node features into a tensor
    node_features = np.array([graph.nodes[n]['features'] for n in nodes])
    node_features = torch.tensor(node_features, dtype=torch.float)

    # Node ids are PDB serial numbers (1-based, possibly with gaps), while
    # edge_index must refer to row positions in x. Using the ids directly
    # shifts every edge and can index past the last row.
    position = {node: idx for idx, node in enumerate(nodes)}
    pairs = [(position[u], position[v]) for u, v in graph.edges]

    if pairs:
        one_way = torch.tensor(pairs, dtype=torch.long).t().contiguous()
        # Message passing treats edge_index as directed, so an undirected
        # graph needs every edge listed in both directions.
        edge_index = torch.cat([one_way, one_way.flip(0)], dim=1)
    else:
        # Keep the expected (2, 0) shape when the graph has no edges.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return Data(x=node_features, edge_index=edge_index)

# Folder with the unpacked PDB structures
pdb_directory = '/content/extracted_folder/PDB str'
pdb_files = [
    os.path.join(pdb_directory, name)
    for name in os.listdir(pdb_directory)
    if name.endswith('.pdb')
]

# Parse every file into a networkx graph with standardised node features
graphs = preprocess_pdb_files(pdb_files)

# One PyTorch Geometric Data object per networkx graph
pyg_graphs = list(map(networkx_to_pyg, graphs))

# Inspect the first graph: node features live in `x`, connectivity in `edge_index`
data = pyg_graphs[0]
node_features, edge_index = data.x, data.edge_index

print("Node features:\n", node_features)
print("Edge index:\n", edge_index)


Node features:
 tensor([[ 0.4046, -0.8463,  0.2758, -1.3642],
        [-0.5909, -0.7909,  0.2633, -1.3984],
        [-0.5909, -0.7519,  0.2010, -1.3691],
        ...,
        [-0.5909, -0.3513,  1.7452, -0.2062],
        [ 0.4046, -0.3361,  1.8026, -0.1721],
        [ 0.4046, -0.3895,  1.6903, -0.1893]])
Edge index:
 tensor([[   1,    1,    1,  ..., 4210, 4210, 4208],
        [   2,    3,    4,  ..., 4208, 4209, 4209]])


In [None]:
from torch_geometric.data import InMemoryDataset

class CustomDataset(InMemoryDataset):
    """In-memory dataset built directly from a list of PyG ``Data`` objects.

    Args:
        root: Dataset root directory, forwarded to ``InMemoryDataset``.
        transform: Optional per-access transform, forwarded unchanged.
        pre_transform: Optional pre-processing transform, forwarded unchanged.
        graphs: Iterable of ``Data`` objects to store. Optional only for
            signature compatibility; at least one graph is required.

    Raises:
        ValueError: If no graphs are supplied (an empty list cannot be collated).
    """

    def __init__(self, root, transform=None, pre_transform=None, graphs=None):
        # Stored under a private name that the base class does not use.
        self._source_graphs = list(graphs) if graphs is not None else []
        super(CustomDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = self.load_data()

    def load_data(self):
        """Collate the supplied graphs into the (data, slices) pair the base class indexes."""
        if not self._source_graphs:
            raise ValueError(
                'CustomDataset needs at least one Data object; pass graphs=[...]'
            )
        data, slices = self.collate(self._source_graphs)
        return data, slices

    # len() and get() are intentionally not overridden: the inherited versions
    # already work from self.slices. The previous overrides took the length of
    # self.data.y and indexed the collated Data object with an integer, neither
    # of which returns the idx-th graph.


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNNModel(torch.nn.Module):
    """Two stacked GCNConv layers followed by a row-wise log-softmax.

    Args:
        input_dim: Width of the incoming node features.
        hidden_dim: Width of the first layer's output.
        output_dim: Width of the second layer's output.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        """Return log-probabilities along dim 1 for every row of ``data.x``."""
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        scores = self.conv2(hidden, data.edge_index)
        return F.log_softmax(scores, dim=1)


In [None]:
from torch_geometric.data import InMemoryDataset, Data

class CustomDataset(InMemoryDataset):
    """In-memory dataset that converts and collates a list of graphs.

    Args:
        root: Dataset root directory, forwarded to ``InMemoryDataset``.
        transform: Optional per-access transform, forwarded unchanged.
        pre_transform: Optional pre-processing transform, forwarded unchanged.
        graphs: Iterable of graphs to store. Each item is either a PyG
            ``Data`` object or an object exposing ``node_features`` and
            ``edge_index`` tensors. Optional only for signature compatibility;
            at least one graph is required.

    Raises:
        ValueError: If no graphs are supplied (an empty list cannot be collated).
    """

    def __init__(self, root, transform=None, pre_transform=None, graphs=None):
        # Both attributes must exist before super().__init__(): because
        # process() is overridden, the base-class constructor calls it.
        self._source_graphs = list(graphs) if graphs is not None else []
        self._collated = None
        super(CustomDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = self.process()

    @property
    def raw_file_names(self):
        return []  # nothing is downloaded

    @property
    def processed_file_names(self):
        # Required once process() is overridden; without it the base-class
        # constructor raises NotImplementedError. Nothing is cached on disk,
        # so the list is empty.
        return []

    def process(self):
        """Convert and collate the supplied graphs; the result is computed once and reused."""
        if self._collated is None:
            data_list = [self.data_to_pyg(graph) for graph in self._source_graphs]
            if not data_list:
                raise ValueError(
                    'CustomDataset needs at least one graph; pass graphs=[...]'
                )
            self._collated = self.collate(data_list)
        return self._collated

    def data_to_pyg(self, graph):
        """Return ``graph`` as a PyG ``Data`` object."""
        if isinstance(graph, Data):
            return graph  # already converted
        # Assuming node_features and edge_index are tensors
        return Data(x=graph.node_features, edge_index=graph.edge_index)

    # len() and get() are intentionally not overridden: the inherited versions
    # already work from self.slices. The previous overrides took the length of
    # self.data.y and indexed the collated Data object with an integer, neither
    # of which returns the idx-th graph.



In [None]:
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from scipy.stats import pearsonr

def compute_rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def compute_r2(y_true, y_pred):
    """Return the R^2 score of the predictions, as computed by ``r2_score``."""
    score = r2_score(y_true, y_pred)
    return score

def compute_pearson(y_true, y_pred):
    """Return the Pearson correlation coefficient between targets and predictions.

    Both inputs are flattened first, so column vectors of shape (n, 1) - the
    usual shape of stacked model outputs - are accepted as well as 1-D
    sequences.

    Args:
        y_true: Array-like of target values.
        y_pred: Array-like of predicted values with the same number of elements.

    Returns:
        The correlation coefficient as a scalar (the p-value is discarded).
    """
    true_flat = np.asarray(y_true).ravel()
    pred_flat = np.asarray(y_pred).ravel()
    return pearsonr(true_flat, pred_flat)[0]  # pearsonr returns (coefficient, p-value)


In [None]:
def validate(model, loader):
    """Run the model over a loader in eval mode and score its outputs.

    Args:
        model: Module called as ``model(batch)``.
        loader: Iterable of batches exposing a ``y`` tensor.

    Returns:
        Tuple ``(rmse, r2, pearson)`` computed over all batches.
    """
    model.eval()
    collected_targets, collected_outputs = [], []
    with torch.no_grad():
        for batch in loader:
            batch_output = model(batch)
            collected_targets.extend(batch.y.cpu().numpy())
            collected_outputs.extend(batch_output.cpu().numpy())

    y_true = np.array(collected_targets)
    y_pred = np.array(collected_outputs)

    return (
        compute_rmse(y_true, y_pred),
        compute_r2(y_true, y_pred),
        compute_pearson(y_true, y_pred),
    )


In [None]:
from Bio.PDB import PDBParser

def parse_pdb(file_path):
    """Parse a PDB file and return all of its atoms as one flat list.

    Atoms are listed in traversal order: model, then chain, then residue.
    """
    structure = PDBParser(QUIET=True).get_structure('structure', file_path)
    return [
        atom
        for model in structure
        for chain in model
        for residue in chain
        for atom in residue
    ]


In [None]:
import networkx as nx

def create_graph(atoms, threshold=1.6):
    """Build an undirected graph whose edges join atoms closer than ``threshold``.

    Args:
        atoms: Iterable of atoms exposing ``serial_number``, ``element`` and
            ``coord``, and supporting ``atom_a - atom_b`` as their distance.
        threshold: Two atoms are connected when their distance is strictly
            below this value; defaults to the previously hard-coded 1.6
            (example distance threshold for covalent bonds).

    Returns:
        networkx.Graph with one node per atom serial number (attributes
        'element' and 'coord') and a 'weight' (distance) on every edge.
    """
    from scipy.spatial import cKDTree  # local import: this cell does not import scipy

    atoms = list(atoms)
    G = nx.Graph()
    for atom in atoms:
        G.add_node(atom.serial_number, element=atom.element, coord=atom.coord)

    if len(atoms) < 2:
        return G

    # The KD-tree yields candidate index pairs (i < j) instead of testing every
    # pair in a Python double loop. The radius is padded slightly; the exact
    # "< threshold" test below decides, using the same atom-atom distance as
    # before. Sorting the pairs keeps the original edge insertion order.
    tree = cKDTree([atom.coord for atom in atoms])
    for i, j in sorted(tree.query_pairs(threshold + 1e-3)):
        distance = atoms[i] - atoms[j]
        if distance < threshold:
            G.add_edge(atoms[i].serial_number, atoms[j].serial_number, weight=distance)
    return G


In [None]:
import torch
from torch_geometric.data import Data

def networkx_to_pyg(graph):
    """Convert a networkx graph with 'coord' node attributes to a PyG ``Data``.

    Args:
        graph: networkx graph whose nodes all carry a 3-value 'coord' attribute.

    Returns:
        ``Data`` with
            x: float tensor of shape (n_nodes, 3), rows in ``graph.nodes`` order;
            edge_index: long tensor of shape (2, 2 * n_edges) whose entries
                are row positions into ``x``, each undirected edge stored in
                both directions.
    """
    nodes = list(graph.nodes)
    node_features = torch.tensor(
        [list(graph.nodes[node]['coord']) for node in nodes], dtype=torch.float
    ).reshape(-1, 3)  # stays (0, 3) for a graph without nodes

    # Node ids are PDB serial numbers (1-based, possibly with gaps), while
    # edge_index must refer to row positions in x. Using the ids directly
    # shifts every edge and can index past the last row.
    position = {node: idx for idx, node in enumerate(nodes)}
    pairs = [(position[u], position[v]) for u, v in graph.edges]

    if pairs:
        one_way = torch.tensor(pairs, dtype=torch.long).t().contiguous()
        # Message passing treats edge_index as directed, so an undirected
        # graph needs every edge listed in both directions.
        edge_index = torch.cat([one_way, one_way.flip(0)], dim=1)
    else:
        # Keep the expected (2, 0) shape when the graph has no edges.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return Data(x=node_features, edge_index=edge_index)

# Process multiple PDB files
def preprocess_pdb_files(pdb_files):
    """Convert each PDB file into a PyG graph, preserving input order.

    Every path goes through parse_pdb -> create_graph -> networkx_to_pyg.
    """
    return [
        networkx_to_pyg(create_graph(parse_pdb(path)))
        for path in pdb_files
    ]


In [None]:
import pandas as pd
import os
import torch
from Bio.PDB import PDBParser
import networkx as nx
from torch_geometric.data import Data, InMemoryDataset, DataLoader
from torch_geometric.nn import GCNConv
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr
import numpy as np
import torch.nn.functional as F

# Step 1: Parse PDB Files
def parse_pdb(file_path):
    """Parse a PDB file and return all of its atoms as one flat list.

    Atoms are listed in traversal order: model, then chain, then residue.
    """
    structure = PDBParser(QUIET=True).get_structure('structure', file_path)
    return [
        atom
        for model in structure
        for chain in model
        for residue in chain
        for atom in residue
    ]

# Step 2: Create Graphs
def create_graph(atoms, threshold=1.6):
    """Build an undirected graph whose edges join atoms closer than ``threshold``.

    Args:
        atoms: Iterable of atoms exposing ``serial_number``, ``element`` and
            ``coord``, and supporting ``atom_a - atom_b`` as their distance.
        threshold: Two atoms are connected when their distance is strictly
            below this value; defaults to the previously hard-coded 1.6
            (example distance threshold for covalent bonds).

    Returns:
        networkx.Graph with one node per atom serial number (attributes
        'element' and 'coord') and a 'weight' (distance) on every edge.
    """
    from scipy.spatial import cKDTree  # local import: scipy.spatial is not imported by this cell

    atoms = list(atoms)
    G = nx.Graph()
    for atom in atoms:
        G.add_node(atom.serial_number, element=atom.element, coord=atom.coord)

    if len(atoms) < 2:
        return G

    # The KD-tree yields candidate index pairs (i < j) instead of testing every
    # pair in a Python double loop. The radius is padded slightly; the exact
    # "< threshold" test below decides, using the same atom-atom distance as
    # before. Sorting the pairs keeps the original edge insertion order.
    tree = cKDTree([atom.coord for atom in atoms])
    for i, j in sorted(tree.query_pairs(threshold + 1e-3)):
        distance = atoms[i] - atoms[j]
        if distance < threshold:
            G.add_edge(atoms[i].serial_number, atoms[j].serial_number, weight=distance)
    return G

# Step 3: Convert to PyTorch Geometric Data Objects
def networkx_to_pyg(graph):
    """Convert a networkx graph with 'coord' node attributes to a PyG ``Data``.

    Args:
        graph: networkx graph whose nodes all carry a 3-value 'coord' attribute.

    Returns:
        ``Data`` with
            x: float tensor of shape (n_nodes, 3), rows in ``graph.nodes`` order;
            edge_index: long tensor of shape (2, 2 * n_edges) whose entries
                are row positions into ``x``, each undirected edge stored in
                both directions.
    """
    nodes = list(graph.nodes)
    node_features = torch.tensor(
        [list(graph.nodes[node]['coord']) for node in nodes], dtype=torch.float
    ).reshape(-1, 3)  # stays (0, 3) for a graph without nodes

    # Node ids are PDB serial numbers (1-based, possibly with gaps), while
    # edge_index must refer to row positions in x. Using the ids directly
    # shifts every edge and can index past the last row.
    position = {node: idx for idx, node in enumerate(nodes)}
    pairs = [(position[u], position[v]) for u, v in graph.edges]

    if pairs:
        one_way = torch.tensor(pairs, dtype=torch.long).t().contiguous()
        # Message passing treats edge_index as directed, so an undirected
        # graph needs every edge listed in both directions.
        edge_index = torch.cat([one_way, one_way.flip(0)], dim=1)
    else:
        # Keep the expected (2, 0) shape when the graph has no edges.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return Data(x=node_features, edge_index=edge_index)

# Process multiple PDB files and include melting temperatures
def preprocess_pdb_files(pdb_dir, csv_file):
    """Build one labelled PyG graph per row of the CSV file.

    Args:
        pdb_dir: Folder that contains the PDB files named in the CSV.
        csv_file: CSV with a 'pdb_file' column (file name) and a
            'melting_temp' column (regression target).

    Returns:
        List of ``Data`` objects in CSV row order; each carries the row's
        melting temperature as a one-element float tensor in ``y``.
    """
    records = pd.read_csv(csv_file).to_dict('records')
    pyg_graphs = []

    for record in records:
        structure_path = os.path.join(pdb_dir, record['pdb_file'])
        target = record['melting_temp']
        pyg_graph = networkx_to_pyg(create_graph(parse_pdb(structure_path)))
        pyg_graph.y = torch.tensor([target], dtype=torch.float)  # regression target
        pyg_graphs.append(pyg_graph)

    return pyg_graphs

# Directory and file paths
pdb_dir = '/content/extracted_folder/PDB str'  # Folder containing all PDB files
csv_file = '/content/PDB+TM - Sheet1 (1).csv'  # CSV file with PDB file names and melting temperatures

# Preprocess the data
# preprocess_pdb_files reads the 'pdb_file' and 'melting_temp' columns of the CSV
# and returns one labelled Data object per row.
pyg_graphs = preprocess_pdb_files(pdb_dir, csv_file)

# Save preprocessed data
# The Python list of Data objects is written as-is (it is not collated first).
processed_path = '/content/your_dataset/processed'
os.makedirs(processed_path, exist_ok=True)
torch.save(pyg_graphs, f'{processed_path}/data.pt')

# Step 5: Create a Custom Dataset
class CustomDataset(InMemoryDataset):
    """In-memory dataset backed by ``<root>/processed/data.pt``.

    The file may hold either an already collated ``(data, slices)`` pair or a
    plain list of ``Data`` objects; a list is collated on load.

    Args:
        root: Dataset root directory; the processed file is expected under
            its 'processed' sub-folder.
        transform: Optional per-access transform, forwarded unchanged.
        pre_transform: Optional pre-processing transform, forwarded unchanged.

    Raises:
        ValueError: If the processed file holds an empty list.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super(CustomDataset, self).__init__(root, transform, pre_transform)
        # The file is written by this notebook and stores full Data objects,
        # so it is loaded as a trusted pickle rather than as weights only.
        loaded = torch.load(self.processed_paths[0], weights_only=False)
        if isinstance(loaded, list):
            # A saved list of graphs cannot be unpacked as (data, slices);
            # collate it into that form first.
            if not loaded:
                raise ValueError(f'{self.processed_paths[0]} contains no graphs')
            loaded = self.collate(loaded)
        self.data, self.slices = loaded

    @property
    def raw_file_names(self):
        return []  # List of raw file names (if any)

    @property
    def processed_file_names(self):
        return ['data.pt']

    def download(self):
        pass  # Download to `self.raw_dir`.

    def process(self):
        pass  # This method is called only if the processed data does not exist

dataset_path = '/content/your_dataset'
dataset = CustomDataset(root=dataset_path)

# Step 6: Train Your GNN
from torch_geometric.nn import global_mean_pool  # graph-level readout used by GCN.forward


class GCN(torch.nn.Module):
    """Two-layer GCN for regression, with an optional per-graph readout.

    Args:
        in_channels: Width of the node features. Defaults to the width found
            in the module-level ``dataset``.
        hidden_channels: Width of the hidden layer (16 by default).
    """

    def __init__(self, in_channels=None, hidden_channels=16):
        super(GCN, self).__init__()
        if in_channels is None:
            in_channels = dataset[0].num_node_features
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 1)  # For regression, use 1 output feature

    def forward(self, x, edge_index, batch=None):
        """Run both layers; average node outputs per graph when ``batch`` is given.

        Args:
            x: Node feature tensor.
            edge_index: Edge index tensor.
            batch: Optional tensor assigning every node to a graph of the
                mini-batch. Without it the per-node outputs, shape
                (n_nodes, 1), are returned as before.

        Returns:
            Tensor of shape (n_graphs, 1) when ``batch`` is given, otherwise
            (n_nodes, 1).
        """
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        if batch is None:
            return x
        # The target is one melting temperature per graph, so the node outputs
        # have to be reduced to one value per graph before the loss.
        return global_mean_pool(x, batch)

model = GCN()

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()

# Training loop
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

for epoch in range(100):  # Number of epochs
    model.train()
    for data in train_loader:
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        # Flatten both sides so prediction and target are both (n_graphs,);
        # mismatched shapes would otherwise be broadcast by MSELoss.
        loss = loss_fn(out.view(-1), data.y.view(-1))
        loss.backward()
        optimizer.step()

    # Optionally, evaluate on validation/test set
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for data in train_loader:  # Change to validation_loader if you have one
            out = model(data.x, data.edge_index, data.batch)
            preds.append(out.view(-1).cpu().numpy())
            targets.append(data.y.view(-1).cpu().numpy())

    preds = np.concatenate(preds)
    targets = np.concatenate(targets)

    rmse = np.sqrt(mean_squared_error(targets, preds))
    r2 = r2_score(targets, preds)
    pearson_corr, _ = pearsonr(targets.flatten(), preds.flatten())

    # `loss` is the loss of the last training mini-batch of this epoch
    print(f'Epoch: {epoch}, Loss: {loss.item()}, RMSE: {rmse}, R2: {r2}, Pearson: {pearson_corr}')

# Save the trained model
model_path = 'model.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')
