In [1]:
import torch
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
import pandas as pd
from torch_geometric.data import Data
from torch_geometric.utils import negative_sampling
edges_df = pd.read_csv('GNN/protein_interactions.csv')

col_name = ['protein']
for i in range(1,769):
  col_name.append('feature'+str(i))
features_df = pd.read_csv('GNN/gene_embedding_GeneLLM_2.csv', header=None, names=col_name)
labels_df = pd.read_csv('GNN/solubility.csv')
edges_df['combined_score'][7] = 594
edges_df_cleaned = edges_df.dropna()
node_id_f = [node_id for node_id in features_df['protein']]
node_id_e1 = [node_id for node_id in edges_df_cleaned['protein1']]
node_id_e2 = [node_id for node_id in edges_df_cleaned['protein2']]
node_id_e = list(set(node_id_e1 + node_id_e2))
print(len(node_id_f))
print(len(node_id_e))
labels_df.rename(columns={'Gene name': 'protein'}, inplace=True)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  edges_df = pd.read_csv('GNN/protein_interactions.csv')
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a

14450
18838


In [2]:

nodes_in_features = set(features_df['protein'])

filtered_edges_df = edges_df[
    edges_df['protein1'].isin(nodes_in_features) & edges_df['protein2'].isin(nodes_in_features)
]
# 过滤特征数据集，保留共同节点的特征
filtered_features_df = features_df

In [3]:
# 将第二列中的两种字符映射为0和1
unique_values = labels_df['Count_Category'].unique()
print(unique_values)
mapping = {unique_values[0]: 0, unique_values[1]: 1}

labels_df['Count_Category'] = labels_df['Count_Category'].map(mapping)
# 显示更新后的DataFrame
print(labels_df)

['High Count' 'Low Count']
       protein Solubility  Label  Word_Count  Count_Category
0        ERAP2   Membrane      0         117               0
1     ADAMTSL5    Soluble      1          28               1
2      TBC1D30   Membrane      0          55               0
3       KCNK18   Membrane      0         184               0
4         NDNF    Soluble      1         129               0
...        ...        ...    ...         ...             ...
1374   TRABD2B   Membrane      0          96               0
1375      RPS9    Soluble      1         205               0
1376  SLC22A16   Membrane      0          93               0
1377      FBN3    Soluble      1          90               0
1378      BDH2    Soluble      1         102               0

[1379 rows x 5 columns]


In [4]:
node_id_to_index = {node_id: i for i, node_id in enumerate(filtered_features_df['protein'])}
# 确保edge_index是按照这个新的索引顺序排列的
source_indices = [node_id_to_index[node_id] for node_id in filtered_edges_df['protein1']]
target_indices = [node_id_to_index[node_id] for node_id in filtered_edges_df['protein2']]
edge_index = torch.tensor([source_indices, target_indices], dtype=torch.long)
filtered_edges_df['combined_score'] = pd.to_numeric(filtered_edges_df['combined_score'], errors='coerce', downcast='float')
edge_weight = torch.tensor(filtered_edges_df['combined_score'].values, dtype=torch.float)

# 重排特征矩阵以匹配edge_index的顺序
features = filtered_features_df.iloc[:, 1:].values
#features = features[[node_id_to_index[node_id] for node_id in filtered_features_df['protein']], :]

# 转换为PyTorch张量
features_tensor = torch.tensor(features, dtype=torch.float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_edges_df['combined_score'] = pd.to_numeric(filtered_edges_df['combined_score'], errors='coerce', downcast='float')


In [5]:
print(len(labels_df))
nodes_in_labels = set(labels_df['protein'])
nodes_in_filter_features = set(filtered_features_df['protein'])
common_nodes_labels = nodes_in_labels.intersection(nodes_in_filter_features)

# 过滤
filtered_labels_df = labels_df[labels_df['protein'].isin(common_nodes_labels)].reset_index(drop=True)
print(len(filtered_labels_df))
print(len(features_tensor))

1379
1379
14450


In [6]:
label_indices = [node_id_to_index[node_id] for node_id in filtered_labels_df['protein']]
num_nodes = 14450
labels = torch.full((num_nodes,), -1, dtype=torch.long)
for i, index in enumerate(filtered_labels_df['Label']):
    labels[label_indices[i]] = index

labels_tensor = torch.tensor(labels, dtype=torch.long)

  labels_tensor = torch.tensor(labels, dtype=torch.long)


In [7]:
filtered_labels_df.to_csv('GNN/new_labels.csv')

In [8]:
filtered_labels_df

Unnamed: 0,protein,Solubility,Label,Word_Count,Count_Category
0,ERAP2,Membrane,0,117,0
1,ADAMTSL5,Soluble,1,28,1
2,TBC1D30,Membrane,0,55,0
3,KCNK18,Membrane,0,184,0
4,NDNF,Soluble,1,129,0
...,...,...,...,...,...
1374,TRABD2B,Membrane,0,96,0
1375,RPS9,Soluble,1,205,0
1376,SLC22A16,Membrane,0,93,0
1377,FBN3,Soluble,1,90,0


In [9]:
highinfo_indices = [
    node_id_to_index[node_id]
    for node_id, weight in zip(filtered_labels_df['protein'], filtered_labels_df['Count_Category'])
    if weight == 0
]
lowinfo_indices = [
    node_id_to_index[node_id]
    for node_id, weight in zip(filtered_labels_df['protein'], filtered_labels_df['Count_Category'])
    if weight == 1
]

In [10]:
data = Data(x=features_tensor, edge_index=edge_index, y=labels_tensor, edge_attr=edge_weight)

print("x:", data.x.shape, data.x.dtype)
print("edge_index:", data.edge_index.shape, data.edge_index.dtype)
print("labels:", data.y.shape, data.y.dtype)
print("edge_weight:", data.edge_attr.shape, data.edge_attr.dtype)

x: torch.Size([14450, 768]) torch.float32
edge_index: torch.Size([2, 9503503]) torch.int64
labels: torch.Size([14450]) torch.int64
edge_weight: torch.Size([9503503]) torch.float32


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv

class GAT(nn.Module):
    def __init__(self, num_layers, in_dim, num_hidden, num_classes, heads, activation, dropout, negative_slope, residual):
        super(GAT, self).__init__()
        self.num_layers = num_layers
        self.dropout = dropout
        self.gat_layers = nn.ModuleList()
        self.activation = activation

        # Input projection (no residual)
        self.gat_layers.append(GATConv(
            in_dim, num_hidden, heads=heads[0],
            dropout=dropout, negative_slope=negative_slope, concat=True, add_self_loops=True))

        # Hidden layers
        for l in range(1, num_layers):
            # Due to multi-head, the in_dim = num_hidden * num_heads
            self.gat_layers.append(GATConv(
                num_hidden * heads[l-1], num_hidden, heads=heads[l],
                dropout=dropout, negative_slope=negative_slope, concat=True, add_self_loops=True))

        # Output projection
        self.gat_layers.append(GATConv(
            num_hidden * heads[-2], num_classes, heads=heads[-1],
            dropout=dropout, negative_slope=negative_slope, concat=False, add_self_loops=True))

    def forward(self, x, edge_index):
        h = x
        for l, layer in enumerate(self.gat_layers[:-1]):
            h = layer(h, edge_index)
            if self.activation:
                h = self.activation(h)
            if l < self.num_layers - 1:
                h = F.dropout(h, p=self.dropout, training=self.training)

        # Output projection
        logits = self.gat_layers[-1](h, edge_index)
        return logits

In [12]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, 128)
        self.conv2 = GCNConv(128, hidden_dim)
        self.fc1 = torch.nn.Linear(hidden_dim, 128)
        self.fc2 = torch.nn.Linear(128, 64)
        self.fc3 = torch.nn.Linear(64, num_classes)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x, edge_index):
        # 保存初始特征
        initial_features = x

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.sigmoid(x)   #应用 Sigmoid 激活函数进行逻辑回归
        return x

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.nn import Linear, ModuleList, Dropout

class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes, num_layers, activation, dropout):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.convs = ModuleList([GCNConv(hidden_dim, hidden_dim) for _ in range(num_layers - 2)])
        self.conv_last = GCNConv(hidden_dim, num_classes)
        self.activation = activation
        self.dropout = Dropout(dropout)
        self.num_layers = num_layers

    def forward(self, x, edge_index):
        # 输入层
        x = self.conv1(x, edge_index)
        x = self.activation(x)
        x = self.dropout(x)
        
        # 隐藏层
        for conv in self.convs:
            x = conv(x, edge_index)
            x = self.activation(x)
            x = self.dropout(x)

        # 输出层
        x = self.conv_last(x, edge_index)
        return x



In [13]:
from __future__ import division

import torch


def accuracy(pred, target):
    r"""Computes the accuracy of correct predictions.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.

    :rtype: int
    """
    return (pred == target).sum().item() / target.numel()



def true_positive(pred, target, num_classes):
    r"""Computes the number of true positive predictions.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`LongTensor`
    """
    out = []
    for i in range(num_classes):
        out.append(((pred == i) & (target == i)).sum())

    return torch.tensor(out)



def true_negative(pred, target, num_classes):
    r"""Computes the number of true negative predictions.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`LongTensor`
    """
    out = []
    for i in range(num_classes):
        out.append(((pred != i) & (target != i)).sum())

    return torch.tensor(out)



def false_positive(pred, target, num_classes):
    r"""Computes the number of false positive predictions.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`LongTensor`
    """
    out = []
    for i in range(num_classes):
        out.append(((pred == i) & (target != i)).sum())

    return torch.tensor(out)



def false_negative(pred, target, num_classes):
    r"""Computes the number of false negative predictions.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`LongTensor`
    """
    out = []
    for i in range(num_classes):
        out.append(((pred != i) & (target == i)).sum())

    return torch.tensor(out)



def precision(pred, target, num_classes):
    r"""Computes the precision:
    :math:`\frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FP}}`.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`Tensor`
    """
    tp = true_positive(pred, target, num_classes).to(torch.float)
    fp = false_positive(pred, target, num_classes).to(torch.float)

    out = tp / (tp + fp)
    out[torch.isnan(out)] = 0

    return out



def recall(pred, target, num_classes):
    r"""Computes the recall:
    :math:`\frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FN}}`.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`Tensor`
    """
    tp = true_positive(pred, target, num_classes).to(torch.float)
    fn = false_negative(pred, target, num_classes).to(torch.float)

    out = tp / (tp + fn)
    out[torch.isnan(out)] = 0

    return out



def f1_score(pred, target, num_classes):
    r"""Computes the :math:`F_1` score:
    :math:`2 \cdot \frac{\mathrm{precision} \cdot \mathrm{recall}}
    {\mathrm{precision}+\mathrm{recall}}`.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`Tensor`
    """
    prec = precision(pred, target, num_classes)
    rec = recall(pred, target, num_classes)

    score = 2 * (prec * rec) / (prec + rec)
    score[torch.isnan(score)] = 0

    return score

In [14]:
from sklearn.preprocessing import label_binarize
def train_model_scheduler(model, masked_features, labels, edge_index, optimizer, criterion, scheduler, train_mask):
    model.train()  # 设置模型为训练模
    optimizer.zero_grad()  # 清空梯度
    out = model(masked_features, edge_index)  # 获取模型输出
    loss = criterion(out[train_mask], data.y[train_mask])  # 计算损失值，只针对训练集的节点
    loss.backward()  # 反向传播
    optimizer.step()  # 更新模型参数
    scheduler.step(loss)
    return loss.item()

def train_model(model, masked_features, labels, edge_index, optimizer, criterion, train_mask):
    model.train()  # 设置模型为训练模式
    optimizer.zero_grad()  # 清空梯度
    out = model(masked_features, edge_index) # 获取模型输出
    loss = criterion(out[train_mask], data.y[train_mask])  # 计算损失值，只针对训练集的节点
    loss.backward()  # 反向传播
    optimizer.step()  # 更新模型参数
    return loss.item()

def evaluate_model(model, features, labels, edge_index, mask):
    model.eval()  # 设置模型为评估模式
    with torch.no_grad():  # 关闭梯度计算
        # 获取模型输出，这里假设输出已经是经过sigmoid的概率
        probabilities = model(features, edge_index)
        if probabilities.shape[1] == 2:  # 假设有两个输出（每个类一个概率）
            positive_probs = probabilities[mask, 1]  # 选择正类概率
        else:
            positive_probs = probabilities[mask]  # 如果只有一个输出，假设已经是正类概率
        val_f1 = torch.mean(f1_score(torch.argmax(probabilities[mask],dim=1), labels[mask], num_classes=2)).cpu().numpy()
        auc_score = roc_auc_score(labels[mask].cpu().numpy(), positive_probs.cpu().numpy())

    return val_f1, auc_score

In [15]:
from sklearn.metrics import roc_auc_score
import numpy as np
# 实例化模型
device = torch.device('cuda:1')
data = data.to(device)
model = GCN(num_features=features.shape[1], hidden_dim=64, num_classes=2, num_layers=1, activation=F.relu, dropout=0.5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)
loss_fn = torch.nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=100, verbose=True)

label_indices = torch.tensor(label_indices, dtype=torch.long)
print(label_indices)
# 随机打乱有标签的节点索引
labeled_indices = label_indices[torch.randperm(label_indices.size(0))]
#print(labeled_indices)
labeled_indices = label_indices

# 定义训练和测试集的大小
num_labeled = labeled_indices.size(0)
num_train = int(num_labeled * 0.8)
num_test = num_labeled - num_train
print(num_test)

# 创建训练和测试掩码
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

train_mask[labeled_indices[:num_train]] = True
test_mask[labeled_indices[num_train:num_train+num_test]] = True
print(test_mask)
num_epochs = 3000
for epoch in range(num_epochs):
    train_loss = train_model_scheduler(model, data.x, data.y, data.edge_index, optimizer, loss_fn, scheduler, train_mask)
    test_f1, test_auc = evaluate_model(model, data.x, data.y, data.edge_index, test_mask)
    
    if (epoch+1) % 100 == 0: 
        print(f'Epoch {epoch}: Train Loss:{train_loss:.4f}, Macro_F1: {test_f1:.4f}, AUC_score: {test_auc:.4f}')

tensor([   67,    81,    93,  ..., 14398, 14407, 14426])
276
tensor([False, False, False,  ..., False, False, False])
Epoch 99: Train Loss:0.6419, Macro_F1: 0.6143, AUC_score: 0.6718
Epoch 199: Train Loss:0.6157, Macro_F1: 0.6577, AUC_score: 0.6993
Epoch 299: Train Loss:0.6156, Macro_F1: 0.6393, AUC_score: 0.7225
Epoch 399: Train Loss:0.5911, Macro_F1: 0.6662, AUC_score: 0.7484
Epoch 499: Train Loss:0.5711, Macro_F1: 0.6975, AUC_score: 0.7679
Epoch 599: Train Loss:0.5778, Macro_F1: 0.6953, AUC_score: 0.7749
Epoch 699: Train Loss:0.5780, Macro_F1: 0.6843, AUC_score: 0.7893
Epoch 799: Train Loss:0.5477, Macro_F1: 0.7321, AUC_score: 0.8005
Epoch 899: Train Loss:0.5434, Macro_F1: 0.7267, AUC_score: 0.8047
Epoch 999: Train Loss:0.5893, Macro_F1: 0.6885, AUC_score: 0.8025
Epoch 1099: Train Loss:0.5423, Macro_F1: 0.7232, AUC_score: 0.8281
Epoch 1199: Train Loss:0.5513, Macro_F1: 0.7423, AUC_score: 0.8339
Epoch 1299: Train Loss:0.5344, Macro_F1: 0.7425, AUC_score: 0.8334
Epoch 1399: Train Loss

In [16]:
num_layers=1
n_heads = 4
heads = ([n_heads] * num_layers) + [1]
heads

[4, 1]

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np
device = torch.device('cuda:1')
data = data.to(device)

model = GAT(num_layers=1, 
            in_dim=features.shape[1], 
            num_hidden=64, 
            num_classes=2, 
            heads = heads, 
            activation=F.elu, dropout=0.6, negative_slope=0.2, residual=True).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)
loss_fn = torch.nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=200, verbose=True)

label_indices = torch.tensor(label_indices, dtype=torch.long)
print(label_indices)
# 随机打乱有标签的节点索引
labeled_indices = label_indices[torch.randperm(label_indices.size(0))]
#print(labeled_indices)
labeled_indices = label_indices

# 定义训练和测试集的大小
num_labeled = labeled_indices.size(0)
num_train = int(num_labeled * 0.8)
num_test = num_labeled - num_train
print(num_test)

# 创建训练和测试掩码
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

train_mask[labeled_indices[:num_train]] = True
test_mask[labeled_indices[num_train:num_train+num_test]] = True
print(test_mask)
num_epochs = 1000
for epoch in range(num_epochs):
    train_loss = train_model_scheduler(model, data.x, data.y, data.edge_index, optimizer, loss_fn, scheduler, train_mask)
    #train_loss = train_model(model, data.x, data.y, data.edge_index, optimizer, loss_fn, train_mask)
    test_acc, test_auc = evaluate_model(model, data.x, data.y, data.edge_index, test_mask)
    
    if (epoch+1) % 50 == 0: 
        print(f'Epoch {epoch}: Train Loss: {train_loss:.4f}, Macro_F1: {test_acc:.4f}, AUC_score: {test_auc:.4f}')

tensor([   67,    81,    93,  ..., 14398, 14407, 14426])
276
tensor([False, False, False,  ..., False, False, False])


  label_indices = torch.tensor(label_indices, dtype=torch.long)


Epoch 49: Train Loss: 0.6736, Macro_F1: 0.5624, AUC_score: 0.6681
Epoch 99: Train Loss: 0.6420, Macro_F1: 0.6173, AUC_score: 0.6781
Epoch 149: Train Loss: 0.5824, Macro_F1: 0.6718, AUC_score: 0.7556
Epoch 199: Train Loss: 0.5505, Macro_F1: 0.5969, AUC_score: 0.7927
Epoch 249: Train Loss: 0.5571, Macro_F1: 0.5955, AUC_score: 0.8173
Epoch 299: Train Loss: 0.4780, Macro_F1: 0.7533, AUC_score: 0.8389
Epoch 349: Train Loss: 0.4594, Macro_F1: 0.8090, AUC_score: 0.8536
Epoch 399: Train Loss: 0.4841, Macro_F1: 0.7825, AUC_score: 0.8562
Epoch 449: Train Loss: 0.4332, Macro_F1: 0.8076, AUC_score: 0.8626
Epoch 499: Train Loss: 0.4207, Macro_F1: 0.7824, AUC_score: 0.8631
Epoch 549: Train Loss: 0.4246, Macro_F1: 0.8029, AUC_score: 0.8649
Epoch 599: Train Loss: 0.4019, Macro_F1: 0.7861, AUC_score: 0.8691
Epoch 649: Train Loss: 0.3975, Macro_F1: 0.7862, AUC_score: 0.8676
Epoch 699: Train Loss: 0.3990, Macro_F1: 0.8110, AUC_score: 0.8746
Epoch 749: Train Loss: 0.3859, Macro_F1: 0.7934, AUC_score: 0.87