In [1]:
import networkx as nx
import obonet
import torch
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
GO_graph = obonet.read_obo("GNN/go-basic.obo")

go_edges = []
for u, v, data in GO_graph.edges(data=True):
    go_edges.append([u, v])
go_edges_df = pd.DataFrame(go_edges, columns=['Source', 'Target']).dropna()
print(go_edges_df.head())
col_name = ['GO']
for i in range(1,769):
  col_name.append('feature'+str(i))
go_features_df = pd.read_csv("GNN/go_terms_embeddings.csv", skiprows=1, names=col_name).dropna()
print(go_features_df.head())

       Source      Target
0  GO:0000001  GO:0048308
1  GO:0000001  GO:0048311
2  GO:0000002  GO:0007005
3  GO:0000003  GO:0008150
4  GO:0000006  GO:0005385
           GO  feature1  feature2  feature3  feature4  feature5  feature6  \
0  GO:0000001 -1.168093 -0.355214  0.265877 -0.710051  0.515028 -0.525165   
1  GO:0000002 -1.185879 -0.098765  0.388240 -0.295556  0.327296 -0.119842   
2  GO:0000003  0.063323 -0.199995  0.151511 -0.942141  0.109313  0.015316   
3  GO:0000005  0.163135  0.301527  0.219680  0.094342 -0.129769  0.225696   
4  GO:0000006 -0.641113 -0.541363  0.413941  0.699345  0.461507 -0.497388   

   feature7  feature8  feature9  ...  feature759  feature760  feature761  \
0 -0.186588 -0.161192  0.186984  ...   -1.350874   -0.991801   -0.648123   
1  0.399882 -0.035890  0.853417  ...   -1.086927   -0.842870   -0.385764   
2  0.633298  0.507875  0.665548  ...    0.174185    0.351648    0.138497   
3  0.357577  0.819992  0.852388  ...   -0.084025   -0.291103   -0.003621   
4

In [3]:
gene_features_df = pd.read_csv('GNN/new_gene_features.csv').dropna()
gene_features_df = gene_features_df.drop(gene_features_df.columns[0], axis=1)
print(gene_features_df)

      protein  feature1  feature2  feature3  feature4  feature5  feature6  \
0         FES  0.339602 -0.030744 -0.901381  0.100888  0.886443  0.383596   
1      HADHA  -0.131799 -0.025745 -0.677301 -0.053545  0.971046  0.180315   
2      SLC7A7  0.385693 -0.070692 -0.847796 -0.022054  0.959772  0.085487   
3        LCK   0.650428  0.014479 -0.866163  0.053508  0.951529  0.269402   
4       HSPA2  0.322262  0.017484 -0.849302  0.046401  0.920429  0.463832   
...       ...       ...       ...       ...       ...       ...       ...   
14445   BPY2C -0.840158 -0.042814 -0.853394 -0.049438  0.943925  0.104337   
14446    CLPS -0.270716 -0.036871 -0.915350 -0.013635  0.972046  0.016017   
14447    DNER  0.228932 -0.033579 -0.907262  0.010446  0.961684  0.524211   
14448    SOX7  0.140491  0.033339 -0.806014 -0.072016  0.938781  0.339959   
14449  CXCL14 -0.570266 -0.011502 -0.741149 -0.096209  0.967244  0.426519   

       feature7  feature8  feature9  ...  feature759  feature760  feature76

In [4]:
col_name = ['Target', 'Source']
go_protein_df = pd.read_csv(
    "GNN/mart_export.txt", 
    skiprows=1, 
    names=col_name, 
    usecols=[1, 2]  # 使用列的索引来指定
).dropna()
print(go_protein_df.head())

    Target      Source
0    MT-TF  GO:0030533
1    MT-TF  GO:0006412
4  MT-RNR2  GO:0003735
5  MT-RNR2  GO:0005840
6   MT-TL1  GO:0030533


In [5]:
go_features_df.set_index('GO', inplace=True)
gene_features_df.set_index('protein', inplace=True)
combined_features = pd.concat([gene_features_df, go_features_df])
print(combined_features)

            feature1  feature2  feature3  feature4  feature5  feature6  \
FES         0.339602 -0.030744 -0.901381  0.100888  0.886443  0.383596   
HADHA      -0.131799 -0.025745 -0.677301 -0.053545  0.971046  0.180315   
SLC7A7      0.385693 -0.070692 -0.847796 -0.022054  0.959772  0.085487   
LCK         0.650428  0.014479 -0.866163  0.053508  0.951529  0.269402   
HSPA2       0.322262  0.017484 -0.849302  0.046401  0.920429  0.463832   
...              ...       ...       ...       ...       ...       ...   
GO:2001313  0.174428  0.194728 -0.284376  0.282102 -0.713190 -0.272055   
GO:2001314  0.025886  0.306214 -0.254303  0.253673 -0.533680 -0.269355   
GO:2001315  0.027134  0.241391 -0.227353  0.317366 -0.726657 -0.197968   
GO:2001316  0.139543  0.028883  0.899480  0.152932  0.576852  0.330342   
GO:2001317  0.083064  0.090899  0.888541  0.309920  0.403966  0.202783   

            feature7  feature8  feature9  feature10  ...  feature759  \
FES        -0.192082 -0.032063 -0.15486

In [7]:
combined_edges = pd.concat([go_edges_df, go_protein_df])
print(combined_edges)

            Source      Target
0       GO:0000001  GO:0048308
1       GO:0000001  GO:0048311
2       GO:0000002  GO:0007005
3       GO:0000003  GO:0008150
4       GO:0000006  GO:0005385
...            ...         ...
456584  GO:0032880     PLEKHM2
456585  GO:0010008     PLEKHM2
456586  GO:0019894     PLEKHM2
456587  GO:0032418     PLEKHM2
456588  GO:0042267     PLEKHM2

[477028 rows x 2 columns]


In [8]:
labels_df = pd.read_csv('GNN/solubility.csv')
labels_df.rename(columns={'Gene name': 'protein'}, inplace=True)
labels_df.set_index('protein', inplace=True)
labels_df.dropna()

Unnamed: 0_level_0,Solubility,Label,Word_Count,Count_Category
protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ERAP2,Membrane,0,117,High Count
ADAMTSL5,Soluble,1,28,Low Count
TBC1D30,Membrane,0,55,High Count
KCNK18,Membrane,0,184,High Count
NDNF,Soluble,1,129,High Count
...,...,...,...,...
TRABD2B,Membrane,0,96,High Count
RPS9,Soluble,1,205,High Count
SLC22A16,Membrane,0,93,High Count
FBN3,Soluble,1,90,High Count


In [9]:
features_df = pd.DataFrame(combined_features, index=combined_features.index)

In [10]:
node_index_map = {node_name: i for i, node_name in enumerate(features_df.index)}


In [11]:
default_feature = [0.0] * 768

# 更新node_index_map和node_features
for node in combined_edges['Source']:
    if node not in node_index_map:
        new_index = len(features_df)
        features_df.loc[node] = default_feature  # 第一列可能是节点名称，如果不是则需要调整
        node_index_map[node] = new_index

for node in combined_edges['Target']:
    if node not in node_index_map:
        new_index = len(features_df)
        features_df.loc[node] = default_feature  # 第一列可能是节点名称，如果不是则需要调整
        node_index_map[node] = new_index


In [12]:
features_df.to_csv('GNN/go_protein_features.csv')


In [13]:
combined_edges.to_csv('GNN/GO_gene_edges.csv')

In [13]:
features_df
#node_features = torch.tensor(features_df.values, dtype=torch.float64)


Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,...,feature759,feature760,feature761,feature762,feature763,feature764,feature765,feature766,feature767,feature768
GO:0000001,-1.168093,-0.355214,0.265877,-0.710051,0.515028,-0.525165,-0.186588,-0.161192,0.186984,-0.663847,...,-1.350874,-0.991801,-0.648123,-0.361629,-0.914965,-0.506993,0.389760,0.207266,0.070705,0.938593
GO:0000002,-1.185879,-0.098765,0.388240,-0.295556,0.327296,-0.119842,0.399882,-0.035890,0.853417,-1.038589,...,-1.086927,-0.842870,-0.385764,0.175797,-1.223772,-0.999628,0.101473,-0.051212,0.048775,0.780470
GO:0000003,0.063323,-0.199995,0.151511,-0.942141,0.109313,0.015316,0.633298,0.507875,0.665548,0.460263,...,0.174185,0.351648,0.138497,0.119273,-0.295167,-0.331179,0.102570,-0.524301,-0.139264,0.761573
GO:0000005,0.163135,0.301527,0.219680,0.094342,-0.129769,0.225696,0.357577,0.819992,0.852388,0.013739,...,-0.084025,-0.291103,-0.003621,0.245929,-0.443244,0.229245,-0.685159,-0.725621,0.285964,0.313211
GO:0000006,-0.641113,-0.541363,0.413941,0.699345,0.461507,-0.497388,-0.044589,-0.655766,-0.596647,0.208076,...,-0.561434,0.246475,-0.029871,-0.212828,-0.985273,0.677472,0.582681,0.299317,-0.131577,0.739702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MIR29B2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PLXNA2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
CELA2A,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
CELA2B,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [10]:
combined_edges.to_csv('GNN/GO_gene_edges.csv', index=False)

In [46]:
node_labels = []

for node in combined_features.index:
    if node in labels_df.index:
        node_labels.append(labels_df.loc[node, 'Label'])
    else:
        node_labels.append(-1)

labels_tensor = torch.tensor(node_labels, dtype=torch.long)
print(node_labels)

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

In [14]:
from torch_geometric.data import Data
node_features = torch.tensor(features_df.values, dtype=torch.float64)
source_indices = [node_index_map[node] for node in combined_edges['Source']]
target_indices = [node_index_map[node] for node in combined_edges['Target']]
edge_index = torch.tensor([source_indices, target_indices], dtype=torch.long)

data = Data(x=node_features, edge_index=edge_index, y = labels_tensor)


In [15]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.nn import Linear, ModuleList, Dropout

class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes, num_layers, activation, dropout):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.convs = ModuleList([GCNConv(hidden_dim, hidden_dim) for _ in range(num_layers - 2)])
        self.conv_last = GCNConv(hidden_dim, num_classes)
        self.activation = activation
        self.dropout = Dropout(dropout)
        self.num_layers = num_layers

    def forward(self, x, edge_index):
        # 输入层
        x = self.conv1(x, edge_index)
        x = self.activation(x)
        x = self.dropout(x)
        
        # 隐藏层
        for conv in self.convs:
            x = conv(x, edge_index)
            x = self.activation(x)
            x = self.dropout(x)

        # 输出层
        x = self.conv_last(x, edge_index)
        return x


In [18]:
from __future__ import division

import torch


def accuracy(pred, target):
    r"""Computes the accuracy of correct predictions.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.

    :rtype: int
    """
    return (pred == target).sum().item() / target.numel()



def true_positive(pred, target, num_classes):
    r"""Computes the number of true positive predictions.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`LongTensor`
    """
    out = []
    for i in range(num_classes):
        out.append(((pred == i) & (target == i)).sum())

    return torch.tensor(out)



def true_negative(pred, target, num_classes):
    r"""Computes the number of true negative predictions.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`LongTensor`
    """
    out = []
    for i in range(num_classes):
        out.append(((pred != i) & (target != i)).sum())

    return torch.tensor(out)



def false_positive(pred, target, num_classes):
    r"""Computes the number of false positive predictions.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`LongTensor`
    """
    out = []
    for i in range(num_classes):
        out.append(((pred == i) & (target != i)).sum())

    return torch.tensor(out)



def false_negative(pred, target, num_classes):
    r"""Computes the number of false negative predictions.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`LongTensor`
    """
    out = []
    for i in range(num_classes):
        out.append(((pred != i) & (target == i)).sum())

    return torch.tensor(out)



def precision(pred, target, num_classes):
    r"""Computes the precision:
    :math:`\frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FP}}`.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`Tensor`
    """
    tp = true_positive(pred, target, num_classes).to(torch.float)
    fp = false_positive(pred, target, num_classes).to(torch.float)

    out = tp / (tp + fp)
    out[torch.isnan(out)] = 0

    return out



def recall(pred, target, num_classes):
    r"""Computes the recall:
    :math:`\frac{\mathrm{TP}}{\mathrm{TP}+\mathrm{FN}}`.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`Tensor`
    """
    tp = true_positive(pred, target, num_classes).to(torch.float)
    fn = false_negative(pred, target, num_classes).to(torch.float)

    out = tp / (tp + fn)
    out[torch.isnan(out)] = 0

    return out



def f1_score(pred, target, num_classes):
    r"""Computes the :math:`F_1` score:
    :math:`2 \cdot \frac{\mathrm{precision} \cdot \mathrm{recall}}
    {\mathrm{precision}+\mathrm{recall}}`.

    Args:
        pred (Tensor): The predictions.
        target (Tensor): The targets.
        num_classes (int): The number of classes.

    :rtype: :class:`Tensor`
    """
    prec = precision(pred, target, num_classes)
    rec = recall(pred, target, num_classes)

    score = 2 * (prec * rec) / (prec + rec)
    score[torch.isnan(score)] = 0

    return score

In [19]:
from sklearn.preprocessing import label_binarize
def train_model_scheduler(model, masked_features, labels, edge_index, optimizer, criterion, scheduler, train_mask):
    model.train()  # 设置模型为训练模
    optimizer.zero_grad()  # 清空梯度
    out = model(masked_features, edge_index)  # 获取模型输出
    loss = criterion(out[train_mask], data.y[train_mask])  # 计算损失值，只针对训练集的节点
    loss.backward()  # 反向传播
    optimizer.step()  # 更新模型参数
    scheduler.step(loss)
    return loss.item()

def train_model(model, masked_features, labels, edge_index, optimizer, criterion, train_mask):
    model.train()  # 设置模型为训练模式
    optimizer.zero_grad()  # 清空梯度
    out = model(masked_features, edge_index) # 获取模型输出
    loss = criterion(out[train_mask], data.y[train_mask])  # 计算损失值，只针对训练集的节点
    loss.backward()  # 反向传播
    optimizer.step()  # 更新模型参数
    return loss.item()

def evaluate_model(model, features, labels, edge_index, mask):
    model.eval()  # 设置模型为评估模式
    with torch.no_grad():  # 关闭梯度计算
        # 获取模型输出，这里假设输出已经是经过sigmoid的概率
        probabilities = model(features, edge_index)
        if probabilities.shape[1] == 2:  # 假设有两个输出（每个类一个概率）
            positive_probs = probabilities[mask, 1]  # 选择正类概率
        else:
            positive_probs = probabilities[mask]  # 如果只有一个输出，假设已经是正类概率
        val_f1 = torch.mean(f1_score(torch.argmax(probabilities[mask],dim=1), labels[mask], num_classes=2)).cpu().numpy()
        auc_score = roc_auc_score(labels[mask].cpu().numpy(), positive_probs.cpu().numpy())

    return val_f1, auc_score

In [28]:
node_labels = np.array(node_labels)
count_label_0 = np.count_nonzero(node_labels == 0)
count_label_1 = np.count_nonzero(node_labels == 1)

print("标签为 0 的节点数量：", count_label_0)
print("标签为 1 的节点数量：", count_label_1)

标签为 0 的节点数量： 670
标签为 1 的节点数量： 709


In [37]:
print(len(node_labels))

62045


In [43]:
from sklearn.metrics import roc_auc_score
import numpy as np
from sklearn.model_selection import train_test_split
# 实例化模型
device = torch.device('cuda:1')
data = data.to(device)
model = GCN(num_features=data.x.shape[1], hidden_dim=64, num_classes=2, num_layers=1, activation=F.relu, dropout=0.5).to(torch.float64)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)
loss_fn = torch.nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=100, verbose=True)


labeled_indices = np.where(node_labels != -1)[0]
print(labeled_indices)
num_labeled = len(labeled_indices)
num_train = int(num_labeled * 0.8)
num_test = num_labeled - num_train
print(num_test)

# 创建训练和测试掩码
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)

train_mask[labeled_indices[:num_train]] = True
test_mask[labeled_indices[num_train:num_train+num_test]] = True
print(test_mask)

[47662 47676 47688 ... 61993 62002 62021]
276
tensor([False, False, False,  ..., False, False, False])


In [44]:
test_dataset = np.array(test_mask)

count_label_0 = np.count_nonzero(test_dataset == 0)
count_label_1 = np.count_nonzero(test_dataset == 1)

print("标签为 0 的节点数量：", count_label_0)
print("标签为 1 的节点数量：", count_label_1)

标签为 0 的节点数量： 69601
标签为 1 的节点数量： 276


In [42]:

print("x:", data.x.shape, data.x.dtype)
print("edge_index:", data.edge_index.shape, data.edge_index.dtype)
print("labels:", data.y.shape, data.y.dtype)

x: torch.Size([69877, 768]) torch.float64
edge_index: torch.Size([2, 477028]) torch.int64
labels: torch.Size([62045]) torch.int64


In [45]:
num_epochs = 3500
for epoch in range(num_epochs):
    train_loss = train_model_scheduler(model, data.x, data.y, data.edge_index, optimizer, loss_fn, scheduler, train_mask)
    test_f1, test_auc = evaluate_model(model, data.x, data.y, data.edge_index, test_mask)
    
    if epoch % 100 == 0: 
        print(f'Epoch {epoch}: Train Loss:{train_loss:.4f}, Macro_F1: {test_f1:.4f}, AUC_score: {test_auc:.4f}')

IndexError: The shape of the mask [69877] at index 0 does not match the shape of the indexed tensor [62045] at index 0

In [None]:
import numpy as np

num_iterations = 20
count = 1 / num_iterations

# 总节点数
num_nodes = features.size(0)
# 所有节点的索引
all_indices = np.arange(num_nodes)
mask_out = torch.ones(num_nodes, dtype=torch.bool)
# 将测试集索引处的掩码设为False
mask_out[test_indices] = False
# 使用掩码获取剩余的索引
remaining_indices = all_indices[mask_out]
remaining_indices

# 从边数据集获取节点
nodes_in_edges = set(edges_df_cleaned['protein1']).union(set(edges_df_cleaned['protein2']))

# 从特征数据集获取节点
nodes_in_features = set(features_df['protein'])

# 找出共同的节点
common_nodes = nodes_in_edges.intersection(nodes_in_features)

# 过滤边数据集，保留只包含共同节点的边
filtered_edges_df = edges_df_cleaned[edges_df_cleaned['protein1'].isin(common_nodes) & edges_df_cleaned['protein2'].isin(common_nodes)].reset_index(drop=True)

# 过滤特征数据集，保留共同节点的特征
filtered_features_df = features_df[features_df['protein'].isin(common_nodes)].reset_index(drop=True)