In [5]:
import pandas as pd
import torch
import numpy as np
import pickle

class ProteinDataSFProcessor:
    """Load the per-protein FoldSeek and PeSTO feature files.

    After :meth:`process_data` has run:

    * ``foldseek`` maps each protein name to a ``float32`` one-hot tensor with
      one row per sequence character and one column per entry of
      ``element_mapping`` (21 columns).
    * ``pesto_protein_dict`` holds the object stored in the PeSTO pickle file.
    """

    def __init__(self, foldseek_file, pesto_pickle_file):
        """Remember the two file paths; nothing is read until ``process_data``."""
        self.foldseek_file = foldseek_file
        self.pesto_pickle_file = pesto_pickle_file

        self.foldseek = {}
        self.pesto_protein_dict = {}
        # Column index of every accepted sequence symbol in the one-hot encoding.
        self.element_mapping = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5,
                                'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11,
                                'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16,
                                'V': 17, 'W': 18, 'Y': 19, 'X': 20}

    def load_foldseek_encodings(self):
        """Read ``name<TAB>sequence`` lines and one-hot encode every sequence.

        Blank lines are skipped. Results are stored in ``self.foldseek``.

        Raises:
            ValueError: a non-blank line has no tab-separated sequence.
            KeyError: a sequence contains a symbol missing from
                ``element_mapping``.
        """
        num_symbols = len(self.element_mapping)
        with open(self.foldseek_file, 'r') as file:
            for line_no, raw_line in enumerate(file, start=1):
                line = raw_line.strip()
                if not line:
                    # Tolerate empty lines (e.g. a trailing newline at end of file).
                    continue
                name, sep, seq = line.partition('\t')
                if not sep:
                    raise ValueError(
                        f"{self.foldseek_file}:{line_no}: expected 'name<TAB>sequence', got {line!r}"
                    )
                try:
                    columns = [self.element_mapping[s] for s in seq]
                except KeyError as exc:
                    raise KeyError(
                        f"{self.foldseek_file}:{line_no}: unknown symbol {exc.args[0]!r} in sequence of {name!r}"
                    ) from exc
                # Fill one (length, 21) array in a single indexed assignment instead
                # of building a Python list of per-residue vectors.
                one_hot = np.zeros((len(columns), num_symbols), dtype=np.float32)
                one_hot[np.arange(len(columns)), np.asarray(columns, dtype=np.intp)] = 1.0
                self.foldseek[name] = torch.from_numpy(one_hot)

    def load_pesto_pickle(self):
        """Loads PeSTO protein data from the pickle file."""
        # NOTE(security): pickle.load executes arbitrary code embedded in the file;
        # only point this at feature files you generated or otherwise trust.
        with open(self.pesto_pickle_file, 'rb') as file:
            self.pesto_protein_dict = pickle.load(file)

    def process_data(self):
        """Executes the full data loading process."""
        self.load_foldseek_encodings()
        self.load_pesto_pickle()
        print("Data loading complete.")


In [None]:
# Paths of the pre-computed FoldSeek and PeSTO feature files
# (relative to the notebook's working directory).
foldseek_file = "./feature/GCN.SF.foldseek.feature.txt"
pesto_pickle_file = "./feature/GCN.SF.PeSTO.feature.pkl"

# Initialize and process the data
processor = ProteinDataSFProcessor(foldseek_file, pesto_pickle_file)
processor.process_data()
# Module-level aliases: the graph-building functions in the next cell read
# `foldseek` and `pesto_protein_dict` as globals.
foldseek = processor.foldseek
pesto_protein_dict = processor.pesto_protein_dict

In [None]:
import os
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
from torch_geometric.data import InMemoryDataset, Data
from tqdm import tqdm
from Bio.PDB import PDBParser
import numpy as np
from multiprocessing import Pool
import scipy.spatial.distance as dist
import biotite.structure.io.pdb as pdb

def get_edge(coords, protein_name, threshold=8):
    """Compute the directed contact edges of one protein.

    Positions ``i`` and ``j`` are connected when their Euclidean distance is
    strictly below ``threshold`` and they are neither the same position nor
    direct sequence neighbours (``abs(i - j) > 1``).

    Args:
        coords: One coordinate vector per node (any input that
            ``scipy.spatial.distance.cdist`` accepts as a 2-D array).
        protein_name: Unused; kept so existing callers keep working.
        threshold: Distance cutoff, in the same unit as ``coords``.

    Returns:
        list[tuple[int, int]]: ``(i, j)`` pairs in row-major order; every
        contact appears in both directions. Empty input yields ``[]``.
    """
    if len(coords) == 0:
        # cdist rejects empty input; a structure without nodes has no edges.
        return []

    dists = dist.cdist(coords, coords)
    positions = np.arange(len(dists))
    # |i - j| for every pair; > 1 excludes self pairs and sequence neighbours.
    separation = np.abs(positions[:, None] - positions[None, :])
    contact_mask = (dists < threshold) & (separation > 1)
    # argwhere walks the mask row by row, i.e. the same order as a nested i/j loop.
    return [tuple(pair) for pair in np.argwhere(contact_mask).tolist()]

def generate_edge_index(esm_file, pdb_file, protein_name, threshold=8):
    """Build the graph object of one protein from its PDB structure.

    Node features come from the global ``pesto_protein_dict``; edges come from
    ``get_edge`` applied to the coordinates of the atoms named ``"CA"`` in
    model 1 of the PDB file.

    Args:
        esm_file: Unused; kept so existing callers keep working.
        pdb_file: Path of the PDB file to read.
        protein_name: Key of the protein in ``pesto_protein_dict``.
        threshold: Distance cutoff forwarded to ``get_edge``.

    Returns:
        torch_geometric.data.Data: ``x`` (float32 features), ``edge_index``
        (int64, shape ``(2, num_edges)``) and ``esm2``.
    """
    with open(pdb_file, 'r') as f:
        model = pdb.PDBFile.read(f).get_structure(model=1)

    features = pesto_protein_dict[protein_name]
    x = torch.tensor(features, dtype=torch.float32)
    coord = [atom.coord for atom in model if atom.atom_name == "CA"]
    # Forward the caller's cutoff; it used to be hard-coded to 8 here.
    edges = get_edge(coord, protein_name, threshold=threshold)
    # The explicit dtype and reshape keep a well-formed (2, 0) int64 index even
    # when the structure has no contacts at all.
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).T

    # NOTE(review): the `esm2` attribute stores the raw PeSTO entry, not an ESM-2
    # embedding as its name suggests; the name is kept for compatibility.
    return Data(x=x, edge_index=edge_index, esm2=features)

# Generate the graph and merge features
def generate_protein_graphs(pdb_list_file, pdb_dir, esm_dir, threshold):
    """Build one graph per listed protein and prepend its FoldSeek features.

    Args:
        pdb_list_file: Text file with one protein name per line; blank lines
            are ignored.
        pdb_dir: Directory containing ``<name>.pdb`` files.
        esm_dir: Directory used to derive ``<name>.pt`` paths that are passed
            on to ``generate_edge_index``.
        threshold: Distance cutoff forwarded to ``generate_edge_index``.

    Returns:
        dict: protein name -> graph whose ``x`` is the column-wise
        concatenation of ``foldseek[name]`` and the graph's original ``x``.
    """
    # Read the names first so the list file is not held open while graphs are built.
    with open(pdb_list_file, 'r') as file:
        f_names = [name for name in (line.rstrip() for line in file) if name]

    pdb_files = [os.path.join(pdb_dir, name + '.pdb') for name in f_names]
    esm_files = [os.path.join(esm_dir, name + '.pt') for name in f_names]

    # NOTE(review): the workers read the global `pesto_protein_dict`; this
    # presumably relies on fork-style process start — confirm on platforms that
    # spawn fresh interpreters.
    with Pool(5) as pool:  # the context manager tears the pool down on errors too
        pending = [
            (name, pool.apply_async(generate_edge_index, (esm_file, pdb_file, name, threshold)))
            for esm_file, pdb_file, name in zip(esm_files, pdb_files, f_names)
        ]
        pool.close()
        pool.join()
        # .get() re-raises any exception that occurred inside a worker.
        protein_dict = {name: result.get() for name, result in pending}

    for name, graph in protein_dict.items():
        graph['x'] = torch.cat((foldseek[name], graph['x']), dim=1)

    return protein_dict

# Inputs for graph construction. `pdb_dir` and `esm_dir` are placeholder paths
# that must be edited to point at local directories before running this cell.
pdb_list_file = './pdb.list.txt'
pdb_dir = '/your/path/PDB/'
esm_dir = '/your/path/isoform_esm2/'
threshold = 8  # distance cutoff handed to generate_protein_graphs
protein_dict = generate_protein_graphs(pdb_list_file, pdb_dir, esm_dir, threshold)

# Print the node-feature shape of every protein graph as a sanity check.
for protein_name, data in protein_dict.items():
    print(f"{protein_name}: {data.x.shape}")


In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn import metrics
import random
import pandas as pd
import scipy.sparse as spp

class ProteinDataLoader:
    """Build the train and test ``DataLoader`` objects of protein-graph pairs.

    Both pair files are tab-separated without a header and must have exactly
    three columns: first protein, second protein, label. Each sample is a
    ``(graph1, graph2, label)`` tuple where the graphs are looked up in
    ``protein_dict`` and the label is a float tensor of shape ``(1,)`` on
    ``self.device``. The training loader is shuffled, the test loader is not.

    Args:
        protein_dict: Mapping from protein name to its graph object.
        train_file: Path of the training pair file.
        test_file: Path of the test pair file.
        batch_size: Number of pairs per batch.
        seed: Seed applied to ``torch``, ``numpy`` and ``random``.
        device: Optional device for the label tensors. When omitted,
            ``'cuda:1'`` is used if a second GPU exists, otherwise
            ``'cuda:0'`` if CUDA is available, otherwise the CPU.
    """

    def __init__(self, protein_dict, train_file, test_file, batch_size=3, seed=2066, device=None):
        if device is None:
            # 'cuda:1' used to be selected whenever *any* GPU was present, which
            # fails on single-GPU machines; only pick it when it really exists.
            if torch.cuda.is_available():
                device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
            else:
                device = 'cpu'
        self.device = torch.device(device)
        self.protein_dict = protein_dict
        self.batch_size = batch_size
        self.seed = seed
        self.setup_seed(self.seed)

        self.train_file = train_file
        self.test_file = test_file
        self.train_df = pd.read_csv(self.train_file, sep="\t", header=None)
        self.test_df = pd.read_csv(self.test_file, sep="\t", header=None)

        self.test_data = self.generate_data(self.test_df)
        self.test_dataset = ProteinDataset(self.test_data)
        self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, collate_fn=self.collate_fn)

        self.raw_train_data = self.generate_data(self.train_df)
        self.raw_train_dataset = ProteinDataset(self.raw_train_data)
        self.train_loader = DataLoader(self.raw_train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=self.collate_fn)

    def setup_seed(self, seed):
        """Seed every random number generator used by the notebook."""
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.backends.cudnn.deterministic = True
        # cuDNN autotuning may pick different kernels between runs; keep it off
        # so the deterministic flag above is effective.
        torch.backends.cudnn.benchmark = False

    def generate_data(self, df):
        """Generate data list from dataframe and protein dictionary.

        Raises:
            KeyError: a protein named in ``df`` is missing from
                ``protein_dict``.
        """
        data_list = []
        # itertuples avoids building one pandas Series per row (df.iloc[i]).
        for protein1, protein2, label in df.itertuples(index=False, name=None):
            try:
                data1 = self.protein_dict[protein1]
                data2 = self.protein_dict[protein2]
            except KeyError as exc:
                raise KeyError(
                    f"protein {exc.args[0]!r} from the pair list has no entry in protein_dict"
                ) from exc
            label_tensor = torch.tensor([label], dtype=torch.float).to(self.device)
            data_list.append((data1, data2, label_tensor))
        return data_list

    def collate_fn(self, batch):
        """Split a batch into two graph lists and a stacked ``(B, 1)`` label tensor.

        The graphs stay in plain Python lists; only the labels are stacked.
        """
        data1 = [item[0] for item in batch]
        data2 = [item[1] for item in batch]
        label = torch.stack([item[2] for item in batch])
        return [data1, data2, label]

class ProteinDataset(Dataset):
    """Thin ``Dataset`` adapter around an already materialised list of samples."""

    def __init__(self, data):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.data = data

    def __len__(self):
        """Number of stored samples."""
        n_samples = len(self.data)
        return n_samples

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        sample = self.data[idx]
        return sample

def weights_init(m):
    """Kaiming-normal initialise the weight of a ``Conv1d`` or ``Linear`` module.

    Meant to be passed to ``model.apply``. Modules of any other type, and the
    bias of matching modules, are left untouched.
    """
    initialised_types = (nn.Conv1d, nn.Linear)
    if not isinstance(m, initialised_types):
        return
    nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu')
       
# Tab-separated pair lists consumed by ProteinDataLoader (read with header=None).
train_file = 'train.txt'
test_file = 'test.txt'
# Global device read by the model and training cells below.
# NOTE(review): 'cuda:1' is selected whenever CUDA is available, which presumes
# a second GPU — confirm on the target machine.
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

data_loader = ProteinDataLoader(protein_dict, train_file, test_file)
test_loader = data_loader.test_loader
raw_train_loader = data_loader.train_loader  # shuffled loader used for training

In [5]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# define model
class GCN(torch.nn.Module):
    """Score a pair of protein graphs with a single logit.

    Both graphs are encoded by the same two ``GCNConv`` layers and mean-pooled
    over their nodes. The two 256-d embeddings are concatenated and passed
    through two ``Conv1d``/``Linear`` stages and four fully connected layers.

    Args:
        num_features: Number of input features per node.
        num_classes: Unused; kept so existing callers keep working.
    """

    def __init__(self, num_features, num_classes):
        super(GCN, self).__init__()
        # Layers are created on the default device. Callers move the whole model
        # with `.to(...)`, so no layer is tied to the global `device` any more.
        self.conv1 = GCNConv(num_features, 512)
        self.conv2 = GCNConv(512, 256)

        self.cnn1 = torch.nn.Conv1d(1, 8, kernel_size=2, stride=1)   # (1, 512) -> (8, 511)
        self.normal_layer3 = torch.nn.Linear(511, 511)

        self.cnn2 = torch.nn.Conv1d(8, 16, kernel_size=2, stride=1)  # (8, 511) -> (16, 510)
        self.normal_layer4 = torch.nn.Linear(510, 510)

        self.fc1 = torch.nn.Linear(16*510, 2560)
        self.fc2 = torch.nn.Linear(2560, 512)
        self.fc3 = torch.nn.Linear(512, 128)
        self.fc4 = torch.nn.Linear(128, 1)
        # fc5, elu and dropout are not used by forward(). fc5 is kept so the
        # state_dict keys still match checkpoints saved with this architecture.
        self.fc5 = torch.nn.Linear(128, 1)
        self.elu = torch.nn.ELU()
        self.dropout = torch.nn.Dropout(0.5)

    def _encode(self, data, dev):
        """Embed one graph as a ``(1, 256)`` vector: two GCN layers, then node mean."""
        x, edge_index = data.x.to(dev), data.edge_index.to(dev)
        x = F.leaky_relu(self.conv1(x, edge_index))
        x = F.leaky_relu(self.conv2(x, edge_index))
        return torch.mean(x, 0, keepdim=True)  # average over all nodes

    def forward(self, data1, data2):
        """Return the raw (pre-sigmoid) score of the pair as a tensor of shape ``(1,)``."""
        # Follow the model's own placement instead of the global `device`, so the
        # model also works after being moved to a different device.
        dev = self.fc4.weight.device

        x1 = self._encode(data1, dev)
        x2 = self._encode(data2, dev)
        out = torch.cat((x1, x2), dim=1)                    # (1, 512)

        # The 2-D tensor is consumed by Conv1d as (channels=1, length=512).
        out = F.leaky_relu(self.cnn1(out))                  # (8, 511)
        out = self.normal_layer3(out)
        out = F.leaky_relu(self.cnn2(out))                  # (16, 510)
        out = self.normal_layer4(out).view(-1)              # (16 * 510,)

        # NOTE(review): fc1..fc4 are chained without a non-linearity or dropout in
        # between; left as is so existing checkpoints reproduce their scores.
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.fc3(out)
        out = self.fc4(out)
        return out

In [None]:
import torch
import numpy as np
import time
from sklearn.metrics import roc_auc_score
from zzd.utils.assess import multi_scores as scores
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau

class EarlyStopping:
    """Signal when a monitored loss has stopped improving.

    Call the instance once per epoch with the current validation loss. A call
    counts as "no improvement" when the loss is greater than
    ``best_loss - delta``; after ``patience`` consecutive such calls
    ``early_stop`` becomes ``True``. Any other call stores the loss as the new
    best and resets the counter.
    """

    def __init__(self, patience=5, delta=0, verbose=False):
        self.patience = patience
        self.delta = delta
        self.verbose = verbose
        # Mutable tracking state.
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        # The very first observation only establishes the baseline.
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        no_improvement = val_loss > self.best_loss - self.delta
        if not no_improvement:
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.verbose:
            print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
        if self.counter >= self.patience:
            self.early_stop = True

def _forward_pairs(model, graphs1, graphs2, device):
    """Return the stacked raw logits of ``model`` for every graph pair of one batch."""
    graphs1 = [g.to(device) for g in graphs1]
    graphs2 = [g.to(device) for g in graphs2]
    return torch.stack([model(g1, g2) for g1, g2 in zip(graphs1, graphs2)])


def train_and_evaluate_model(raw_train_loader, test_loader, model_class, device, num_epochs, lr):
    """Train ``model_class`` five times from scratch and report scores per epoch.

    After every epoch the model is evaluated on ``test_loader``: the loss
    drives the learning-rate scheduler, the ROC AUC decides whether the weights
    are saved to ``DeepISO.SF.GCN.pth``, and ``scores`` prints further metrics.

    Relies on the notebook globals ``weights_init``, ``scores`` and
    ``test_file`` (the pair file behind ``test_loader``).

    NOTE(review): model selection and final reporting use the same
    ``test_loader``, so the reported scores are optimistic. Every run also
    writes to the same checkpoint path and ``best_auc`` restarts per run, so
    the file ends up holding the best epoch of the *last* run. The
    ``EarlyStopping`` class defined in this cell is not wired in.

    Args:
        raw_train_loader: Loader yielding ``(graphs1, graphs2, labels)``.
        test_loader: Unshuffled loader of the held-out pairs.
        model_class: Callable ``model_class(num_features, num_classes)``.
        device: Device the model and the batches are moved to.
        num_epochs: Epochs per run.
        lr: Adam learning rate.
    """
    best_model_path = 'DeepISO.SF.GCN.pth'
    # Pair table of the held-out file, read once instead of once per epoch.
    test_table = np.genfromtxt(test_file, str)

    for kf in range(5):
        print(f"kf:{kf}")
        best_auc = 0
        model = model_class(27, 1).to(device)
        model.apply(weights_init)
        criterion = torch.nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

        for epoch in range(num_epochs):
            start_time = time.time()

            # ---- training pass ----
            model.train()
            total_loss = 0
            for data1, data2, label in raw_train_loader:
                logits = _forward_pairs(model, data1, data2, device)
                loss = criterion(logits, label.to(device))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
            epoch_loss = total_loss / len(raw_train_loader)

            # ---- evaluation pass on test_loader ----
            model.eval()
            val_losses, val_y_trues, val_y_preds = [], [], []
            with torch.no_grad():
                for data1, data2, label in test_loader:
                    logits = _forward_pairs(model, data1, data2, device)
                    # BCEWithLogitsLoss applies the sigmoid itself, so it must see
                    # raw logits; probabilities are only derived for the metrics.
                    val_losses.append(criterion(logits, label.to(device)).item())
                    val_y_trues.extend(label.reshape(-1).cpu().tolist())
                    val_y_preds.extend(torch.sigmoid(logits).reshape(-1).cpu().tolist())
            valid_loss = np.average(val_losses)
            scheduler.step(valid_loss)

            auc = roc_auc_score(val_y_trues, val_y_preds)
            print(f"epoch:{epoch}, train_loss:{epoch_loss:.4f}, val_loss:{valid_loss:.4f}, auc:{auc:.4f}, time:{time.time() - start_time:.1f}s")

            if auc > best_auc:
                best_auc = auc
                torch.save(model.state_dict(), best_model_path)
                print(f"Saved model with AUC: {best_auc:.4f}")

            # Per-epoch score report. The predictions of the pass above are reused:
            # a second pass over the same unshuffled loader in eval mode would
            # produce identical values. Flat per-sample lists also stay valid when
            # the last batch is smaller than the others.
            pred_table = np.hstack((test_table, np.array(val_y_preds).reshape(-1, 1)))
            scores(pred_table[:, -2], pred_table[:, -1], show=True)

        print("\n")

# Launch training with the loaders and the global device built in the earlier cells.
train_and_evaluate_model(raw_train_loader, test_loader, GCN, device, num_epochs=100, lr=0.0001)


In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn import metrics
import random
import pandas as pd
import scipy.sparse as spp

class ProteinDataLoader:
    """Build unshuffled train and test ``DataLoader`` objects for evaluation.

    This redefines the ``ProteinDataLoader`` of the training cell. The
    difference is that the training loader is NOT shuffled, so predictions
    line up row by row with the pair file.

    Both pair files are tab-separated without a header and must have exactly
    three columns: first protein, second protein, label. Each sample is a
    ``(graph1, graph2, label)`` tuple where the graphs are looked up in
    ``protein_dict`` and the label is a float tensor of shape ``(1,)`` on
    ``self.device``.

    Args:
        protein_dict: Mapping from protein name to its graph object.
        train_file: Path of the training pair file.
        test_file: Path of the test pair file.
        batch_size: Number of pairs per batch.
        seed: Seed applied to ``torch``, ``numpy`` and ``random``.
        device: Optional device for the label tensors. When omitted,
            ``'cuda:1'`` is used if a second GPU exists, otherwise
            ``'cuda:0'`` if CUDA is available, otherwise the CPU.
    """

    def __init__(self, protein_dict, train_file, test_file, batch_size=3, seed=2066, device=None):
        if device is None:
            # 'cuda:1' used to be selected whenever *any* GPU was present, which
            # fails on single-GPU machines; only pick it when it really exists.
            if torch.cuda.is_available():
                device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
            else:
                device = 'cpu'
        self.device = torch.device(device)
        self.protein_dict = protein_dict
        self.batch_size = batch_size
        self.seed = seed
        self.setup_seed(self.seed)

        self.train_file = train_file
        self.test_file = test_file
        self.train_df = pd.read_csv(self.train_file, sep="\t", header=None)
        self.test_df = pd.read_csv(self.test_file, sep="\t", header=None)

        self.test_data = self.generate_data(self.test_df)
        self.test_dataset = ProteinDataset(self.test_data)
        self.test_loader = DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, collate_fn=self.collate_fn)

        self.train_data = self.generate_data(self.train_df)
        self.train_dataset = ProteinDataset(self.train_data)
        # shuffle=False keeps predictions aligned with the rows of train_file.
        self.train_loader = DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=False, collate_fn=self.collate_fn)

    def setup_seed(self, seed):
        """Seed every random number generator used by the notebook."""
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.backends.cudnn.deterministic = True
        # cuDNN autotuning may pick different kernels between runs; keep it off
        # so the deterministic flag above is effective.
        torch.backends.cudnn.benchmark = False

    def generate_data(self, df):
        """Generate data list from dataframe and protein dictionary.

        Raises:
            KeyError: a protein named in ``df`` is missing from
                ``protein_dict``.
        """
        data_list = []
        # itertuples avoids building one pandas Series per row (df.iloc[i]).
        for protein1, protein2, label in df.itertuples(index=False, name=None):
            try:
                data1 = self.protein_dict[protein1]
                data2 = self.protein_dict[protein2]
            except KeyError as exc:
                raise KeyError(
                    f"protein {exc.args[0]!r} from the pair list has no entry in protein_dict"
                ) from exc
            label_tensor = torch.tensor([label], dtype=torch.float).to(self.device)
            data_list.append((data1, data2, label_tensor))
        return data_list

    def collate_fn(self, batch):
        """Split a batch into two graph lists and a stacked ``(B, 1)`` label tensor.

        The graphs stay in plain Python lists; only the labels are stacked.
        """
        data1 = [item[0] for item in batch]
        data2 = [item[1] for item in batch]
        label = torch.stack([item[2] for item in batch])
        return [data1, data2, label]

class ProteinDataset(Dataset):
    """Expose a pre-built list of samples through the ``Dataset`` protocol."""

    def __init__(self, data):
        # The sequence is stored by reference, not copied.
        self.data = data

    def __len__(self):
        """Number of stored samples."""
        size = len(self.data)
        return size

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        entry = self.data[idx]
        return entry

def weights_init(m):
    """Apply Kaiming-normal initialisation to ``Conv1d``/``Linear`` weights.

    Intended for ``model.apply``. Other module types and all biases are left
    unchanged.
    """
    is_target = isinstance(m, (nn.Conv1d, nn.Linear))
    if is_target:
        nn.init.kaiming_normal_(
            m.weight,
            mode='fan_in',
            nonlinearity='leaky_relu',
        )
       
# Tab-separated pair lists consumed by ProteinDataLoader (read with header=None).
train_file = 'train.txt'
test_file = 'test.txt'
# NOTE(review): 'cuda:1' is selected whenever CUDA is available, which presumes
# a second GPU — confirm on the target machine.
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

# Rebuild the loaders with the evaluation variant of ProteinDataLoader defined
# in this cell (its train loader is not shuffled).
data_loader = ProteinDataLoader(protein_dict, train_file, test_file)
test_loader = data_loader.test_loader
train_loader = data_loader.train_loader

In [7]:
from zzd.utils.assess import multi_scores as scores
def load_model(model_path, input_dim, output_dim, device):
    """Rebuild a ``GCN`` and load weights saved with ``model.state_dict()``.

    Args:
        model_path: Path of the saved state_dict.
        input_dim: Number of node features (first ``GCN`` argument).
        output_dim: Second ``GCN`` argument.
        device: Device the model is placed on.

    Returns:
        The model in evaluation mode.
    """
    model = GCN(input_dim, output_dim).to(device)
    # map_location remaps tensors that were saved from another device (e.g. a
    # checkpoint written on 'cuda:1') so loading also works on CPU-only or
    # single-GPU machines.
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    return model

def evaluate_model(model, data_loader, device):
    """Run ``model`` over every pair of ``data_loader`` without gradients.

    Args:
        model: Callable ``model(graph1, graph2)`` returning a logit tensor.
        data_loader: Iterable of ``(graphs1, graphs2, labels)`` batches.
        device: Device every graph is moved to before the forward pass.

    Returns:
        tuple: ``(labels, predictions)``. ``labels`` is a list with one nested
        list per batch (``labels.tolist()``). ``predictions`` is a NumPy array
        of sigmoid outputs with one row per sample, in loader order.
    """
    labels_per_batch = []
    probs_per_batch = []

    with torch.no_grad():
        for graphs1, graphs2, label in data_loader:
            graphs1 = [g.to(device) for g in graphs1]
            graphs2 = [g.to(device) for g in graphs2]
            logits = torch.stack(
                [model(graphs1[k], graphs2[k]) for k in range(len(graphs1))]
            )
            probs = torch.sigmoid(logits)
            probs_per_batch.append(probs.to('cpu').detach().tolist())
            labels_per_batch.append(label.to('cpu').tolist())

    # Merge the per-batch prediction lists into one per-sample list.
    flat_probs = []
    for batch_probs in probs_per_batch:
        flat_probs.extend(batch_probs)

    return labels_per_batch, np.array(flat_probs)

def save_predictions_to_file(pred_table, output_file):
    """Write the first two and the last two columns of each row as a TSV line.

    Args:
        pred_table: 2-D table with at least one column per written field.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, "w") as f:
        for row in pred_table:
            first, second = row[0], row[1]
            label, prediction = row[-2], row[-1]
            f.write(f"{first}\t{second}\t{label}\t{prediction}\n")

def run_evaluation(model_path, test_file, train_file, test_loader, train_loader, device):
    """Score a saved model on the test and train pairs and write both tables.

    For each split the predictions are appended as a last column to the pair
    file contents, passed to ``scores`` and written to
    ``DeepISO.SFGCN.test.txt`` / ``DeepISO.SFGCN.train.txt``.

    Errors are reported on stdout (message plus traceback) instead of being
    raised, so the notebook keeps running.

    Args:
        model_path: Path of the saved state_dict.
        test_file: Pair file matching ``test_loader`` row by row.
        train_file: Pair file matching ``train_loader`` row by row.
        test_loader: Unshuffled loader of the test pairs.
        train_loader: Unshuffled loader of the training pairs.
        device: Device used for the model and the graphs.
    """
    import traceback

    # (loader, pair file, output file) per split; test first, then train.
    splits = (
        (test_loader, test_file, "DeepISO.SFGCN.test.txt"),
        (train_loader, train_file, "DeepISO.SFGCN.train.txt"),
    )
    try:
        model = load_model(model_path, 27, 1, device)
        for loader, pair_file, output_file in splits:
            _, y_preds = evaluate_model(model, loader, device)
            pred_table = np.hstack((np.genfromtxt(pair_file, str), y_preds.reshape(-1, 1)))
            scores(pred_table[:, -2], pred_table[:, -1], show=True)  # evaluate the predictions
            save_predictions_to_file(pred_table, output_file)

    except Exception as e:
        print("There is an error when loading or predicting with the model：", str(e))
        # Keep the best-effort behaviour but show where the failure happened.
        traceback.print_exc()


In [None]:
# Evaluate the state_dict checkpoint written by the training cell.
# NOTE(review): device is hard-coded to 'cuda:1' here instead of reusing the
# global `device` — confirm a second GPU is present.
run_evaluation('DeepISO.SF.GCN.pth', 
               test_file='test.txt', 
               train_file='train.txt', 
               test_loader=test_loader, 
               train_loader=train_loader, 
               device='cuda:1')


In [None]:
from zzd.utils.assess import multi_scores as scores
def load_model(model_path, device):
    """Load a whole pickled model object (not a state_dict) onto ``device``.

    This redefinition replaces the earlier
    ``load_model(model_path, input_dim, output_dim, device)`` for every cell
    executed afterwards.

    Args:
        model_path: Path of a file written with ``torch.save(model, ...)``.
        device: Device the model is placed on.

    Returns:
        The model in evaluation mode.
    """
    # NOTE(security): loading a full model unpickles arbitrary objects; only load
    # files you trust.
    # NOTE(review): recent PyTorch releases presumably need weights_only=False
    # for full-model files — confirm against the installed version.
    # map_location remaps tensors saved from another device (e.g. 'cuda:1').
    model = torch.load(model_path, map_location=device).to(device)
    model.eval()

    return model

def evaluate_model(model, data_loader, device):
    """Predict every pair of ``data_loader`` with gradients disabled.

    Args:
        model: Callable ``model(graph1, graph2)`` returning a logit tensor.
        data_loader: Iterable of ``(graphs1, graphs2, labels)`` batches.
        device: Device every graph is moved to before the forward pass.

    Returns:
        tuple: ``(labels, predictions)``. ``labels`` holds one nested list per
        batch (``labels.tolist()``). ``predictions`` is a NumPy array of
        sigmoid outputs with one row per sample, in loader order.
    """
    collected_labels, collected_probs = [], []

    with torch.no_grad():
        for left_graphs, right_graphs, label in data_loader:
            left_graphs = [graph.to(device) for graph in left_graphs]
            right_graphs = [graph.to(device) for graph in right_graphs]

            pair_logits = []
            for pos in range(len(left_graphs)):
                pair_logits.append(model(left_graphs[pos], right_graphs[pos]))

            batch_probs = torch.sigmoid(torch.stack(pair_logits))
            collected_probs.append(batch_probs.to('cpu').detach().tolist())
            collected_labels.append(label.to('cpu').tolist())

    # Flatten one level: list of batches -> list of per-sample predictions.
    per_sample = [prob for batch in collected_probs for prob in batch]
    predictions = np.array(per_sample)

    return collected_labels, predictions

def save_predictions_to_file(pred_table, output_file):
    """Dump columns 0, 1, -2 and -1 of every table row as tab-separated text.

    Args:
        pred_table: 2-D table with at least one column per written field.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, "w") as f:
        for record in pred_table:
            fields = (record[0], record[1], record[-2], record[-1])
            f.write(f"{fields[0]}\t{fields[1]}\t{fields[2]}\t{fields[3]}\n")

def run_evaluation(model_path, test_file, train_file, test_loader, train_loader, device):
    """Score a pickled model on the test and train pairs and write both tables.

    For each split the predictions are appended as a last column to the pair
    file contents, passed to ``scores`` and written to
    ``DeepISO.SFGCN.test.txt`` / ``DeepISO.SFGCN.train.txt``.

    Errors are reported on stdout (message plus traceback) instead of being
    raised, so the notebook keeps running.

    Args:
        model_path: Path of the file handed to ``load_model(model_path, device)``.
        test_file: Pair file matching ``test_loader`` row by row.
        train_file: Pair file matching ``train_loader`` row by row.
        test_loader: Unshuffled loader of the test pairs.
        train_loader: Unshuffled loader of the training pairs.
        device: Device used for the model and the graphs.
    """
    import traceback

    # (loader, pair file, output file) per split; test first, then train.
    splits = (
        (test_loader, test_file, "DeepISO.SFGCN.test.txt"),
        (train_loader, train_file, "DeepISO.SFGCN.train.txt"),
    )
    try:
        model = load_model(model_path, device)
        for loader, pair_file, output_file in splits:
            _, y_preds = evaluate_model(model, loader, device)
            pred_table = np.hstack((np.genfromtxt(pair_file, str), y_preds.reshape(-1, 1)))
            scores(pred_table[:, -2], pred_table[:, -1], show=True)  # evaluate the predictions
            save_predictions_to_file(pred_table, output_file)

    except Exception as e:
        print("There is an error when loading or predicting with the model：", str(e))
        # Keep the best-effort behaviour but show where the failure happened.
        traceback.print_exc()


In [None]:
# Evaluate a fully pickled model with the load_model(model_path, device) variant
# defined in the previous cell.
# NOTE(review): the training cell saves 'DeepISO.SF.GCN.pth' (a state_dict);
# this call expects a '.pt' file holding a whole model object — confirm that
# file exists and how it was produced.
run_evaluation('DeepISO.SF.GCN.pt', 
               test_file='test.txt', 
               train_file='train.txt', 
               test_loader=test_loader, 
               train_loader=train_loader, 
               device='cuda:1')
