In [1]:
import torch
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

class SimpleGNN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, out_dim):
        super(SimpleGNN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.bn1 = BatchNorm1d(hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)

    def forward(self, x, edge_index, edge_weight):
        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(self.bn1(x)) 
        x = self.conv2(x, edge_index, edge_weight)
        return x

def triplet_loss(anchor, positive, negative, margin):
    distance_positive = (anchor - positive).pow(2).sum(1)
    distance_negative = (anchor - negative).pow(2).sum(1)
    losses = F.relu(distance_positive - distance_negative + margin)
    return losses.mean()


In [2]:
import torch
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
import pandas as pd
from torch_geometric.data import Data
from torch_geometric.utils import negative_sampling

edges_df = pd.read_csv('GNN/protein_interactions.csv')

col_name = ['protein']
for i in range(1,769):
  col_name.append('feature'+str(i))
features_df = pd.read_csv('GNN/gene_embedding_GeneLLM_2.csv', header=None, names=col_name)

print(features_df)
labels_df = pd.read_csv('GNN/solubility.csv')
print(labels_df)
edges_df['combined_score'][7] = 594
edges_df_cleaned = edges_df.dropna()
print(edges_df_cleaned)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  edges_df = pd.read_csv('GNN/protein_interactions.csv')


      protein  feature1  feature2  feature3  feature4  feature5  feature6  \
0         FES  0.339602 -0.030744 -0.901381  0.100888  0.886443  0.383596   
1      HADHA  -0.131799 -0.025745 -0.677301 -0.053545  0.971046  0.180315   
2      SLC7A7  0.385693 -0.070692 -0.847796 -0.022054  0.959772  0.085487   
3        LCK   0.650428  0.014479 -0.866163  0.053508  0.951529  0.269402   
4       HSPA2  0.322262  0.017484 -0.849302  0.046401  0.920429  0.463832   
...       ...       ...       ...       ...       ...       ...       ...   
14445   BPY2C -0.840158 -0.042814 -0.853394 -0.049438  0.943925  0.104337   
14446    CLPS -0.270716 -0.036871 -0.915350 -0.013635  0.972046  0.016017   
14447    DNER  0.228932 -0.033579 -0.907262  0.010446  0.961684  0.524211   
14448    SOX7  0.140491  0.033339 -0.806014 -0.072016  0.938781  0.339959   
14449  CXCL14 -0.570266 -0.011502 -0.741149 -0.096209  0.967244  0.426519   

       feature7  feature8  feature9  ...  feature759  feature760  feature76

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  edges_df['combined_score'][7] = 594


         protein1  protein2 combined_score
0            ARF5   RALGPS2            173
1            ARF5     FHDC1            154
2            ARF5  ATP6V1E1            151
3            ARF5     CYTH2            471
4            ARF5      PSD3            201
...           ...       ...            ...
13715123     LDB1    SAMD14            260
13715124     LDB1     KDM6B            161
13715125     LDB1      WWP2            229
13715126     LDB1    VPS33B            152
13715127     LDB1     NDST2            440

[13067419 rows x 3 columns]


In [3]:
node_id_f = [node_id for node_id in features_df['protein']]
node_id_e1 = [node_id for node_id in edges_df_cleaned['protein1']]
node_id_e2 = [node_id for node_id in edges_df_cleaned['protein2']]
node_id_e = list(set(node_id_e1 + node_id_e2))
print(len(node_id_f))
print(len(node_id_e))
labels_df.rename(columns={'Gene name': 'protein'}, inplace=True)
labels_df

14450
18838


Unnamed: 0,protein,Solubility,Label,Word_Count,Count_Category
0,ERAP2,Membrane,0,117,High Count
1,ADAMTSL5,Soluble,1,28,Low Count
2,TBC1D30,Membrane,0,55,High Count
3,KCNK18,Membrane,0,184,High Count
4,NDNF,Soluble,1,129,High Count
...,...,...,...,...,...
1374,TRABD2B,Membrane,0,96,High Count
1375,RPS9,Soluble,1,205,High Count
1376,SLC22A16,Membrane,0,93,High Count
1377,FBN3,Soluble,1,90,High Count


In [4]:
nodes_in_edges = set(edges_df['protein1']).union(set(edges_df['protein2']))
nodes_in_features = set(features_df['protein'])
missing_nodes = list(nodes_in_edges - nodes_in_features)
average_features = features_df.iloc[:, 1:].mean()

# 为缺失的节点创建平均特征
missing_features =pd.DataFrame([average_features.values] * len(missing_nodes), columns=average_features.index)
missing_features['protein'] = missing_nodes
missing_features = missing_features[features_df.columns]

# 合并原始特征和缺失节点的特征
new_features_df = pd.concat([features_df, missing_features], ignore_index=True)
new_features_df

Unnamed: 0,protein,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature759,feature760,feature761,feature762,feature763,feature764,feature765,feature766,feature767,feature768
0,FES,0.339602,-0.030744,-0.901381,0.100888,0.886443,0.383596,-0.192082,-0.032063,-0.154869,...,-0.549204,-0.856123,0.714672,-0.046649,-0.894424,-0.001815,0.739485,0.015581,-0.023863,-0.022002
1,HADHA,-0.131799,-0.025745,-0.677301,-0.053545,0.971046,0.180315,-0.028189,-0.077389,-0.095152,...,0.927885,-0.817812,0.809631,-0.005827,-0.848839,0.024516,0.526404,-0.039926,-0.102787,-0.026980
2,SLC7A7,0.385693,-0.070692,-0.847796,-0.022054,0.959772,0.085487,0.076455,-0.003006,-0.032268,...,0.941094,-0.912443,0.789828,0.046979,-0.715636,0.085842,0.150494,0.025392,-0.066035,-0.028283
3,LCK,0.650428,0.014479,-0.866163,0.053508,0.951529,0.269402,-0.214788,0.045179,-0.506429,...,-0.576739,-0.969558,0.916549,-0.080332,-0.927649,-0.047398,0.741663,-0.000096,-0.096318,-0.056501
4,HSPA2,0.322262,0.017484,-0.849302,0.046401,0.920429,0.463832,-0.050414,-0.033398,0.387791,...,0.387301,-0.860696,0.678607,-0.060695,-0.945793,0.040472,0.831079,-0.001711,-0.079842,-0.011189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19196,PRM3,0.166824,-0.022353,-0.818924,-0.015590,0.938668,0.350558,-0.064741,-0.024466,0.048526,...,0.638018,-0.851449,0.644471,-0.036707,-0.842413,0.005465,0.456407,-0.028600,-0.076502,-0.027952
19197,PCDHGA2,0.166824,-0.022353,-0.818924,-0.015590,0.938668,0.350558,-0.064741,-0.024466,0.048526,...,0.638018,-0.851449,0.644471,-0.036707,-0.842413,0.005465,0.456407,-0.028600,-0.076502,-0.027952
19198,GALNT9,0.166824,-0.022353,-0.818924,-0.015590,0.938668,0.350558,-0.064741,-0.024466,0.048526,...,0.638018,-0.851449,0.644471,-0.036707,-0.842413,0.005465,0.456407,-0.028600,-0.076502,-0.027952
19199,MVD,0.166824,-0.022353,-0.818924,-0.015590,0.938668,0.350558,-0.064741,-0.024466,0.048526,...,0.638018,-0.851449,0.644471,-0.036707,-0.842413,0.005465,0.456407,-0.028600,-0.076502,-0.027952


In [5]:
# 从边数据集获取节点
nodes_in_edges = set(edges_df_cleaned['protein1']).union(set(edges_df_cleaned['protein2']))

# 从特征数据集获取节点
nodes_in_features = set(features_df['protein'])

# 找出共同的节点
common_nodes = nodes_in_edges.intersection(nodes_in_features)

# 过滤边数据集，保留只包含共同节点的边
filtered_edges_df = edges_df_cleaned[edges_df_cleaned['protein1'].isin(common_nodes) & edges_df_cleaned['protein2'].isin(common_nodes)].reset_index(drop=True)

# 过滤特征数据集，保留共同节点的特征
filtered_features_df = features_df[features_df['protein'].isin(common_nodes)].reset_index(drop=True)

In [6]:
# 将第二列中的两种字符映射为0和1
unique_values = labels_df['Count_Category'].unique()
print(unique_values)
mapping = {unique_values[0]: 0, unique_values[1]: 1}

labels_df['Count_Category'] = labels_df['Count_Category'].map(mapping)
# 显示更新后的DataFrame
print(labels_df)

['High Count' 'Low Count']
       protein Solubility  Label  Word_Count  Count_Category
0        ERAP2   Membrane      0         117               0
1     ADAMTSL5    Soluble      1          28               1
2      TBC1D30   Membrane      0          55               0
3       KCNK18   Membrane      0         184               0
4         NDNF    Soluble      1         129               0
...        ...        ...    ...         ...             ...
1374   TRABD2B   Membrane      0          96               0
1375      RPS9    Soluble      1         205               0
1376  SLC22A16   Membrane      0          93               0
1377      FBN3    Soluble      1          90               0
1378      BDH2    Soluble      1         102               0

[1379 rows x 5 columns]


In [7]:
node_id_to_index = {node_id: i for i, node_id in enumerate(filtered_features_df['protein'])}
# 确保edge_index是按照这个新的索引顺序排列的
source_indices = [node_id_to_index[node_id] for node_id in filtered_edges_df['protein1']]
target_indices = [node_id_to_index[node_id] for node_id in filtered_edges_df['protein2']]
edge_index = torch.tensor([source_indices, target_indices], dtype=torch.long)
filtered_edges_df['combined_score'] = pd.to_numeric(filtered_edges_df['combined_score'], errors='coerce', downcast='float')
edge_weight = torch.tensor(filtered_edges_df['combined_score'].values, dtype=torch.float)

# 重排特征矩阵以匹配edge_index的顺序
features = filtered_features_df.iloc[:, 1:].values
#features = features[[node_id_to_index[node_id] for node_id in filtered_features_df['protein']], :]

# 转换为PyTorch张量
features_tensor = torch.tensor(features, dtype=torch.float)

In [8]:
print(len(labels_df))
nodes_in_labels = set(labels_df['protein'])
nodes_in_filter_features = set(filtered_features_df['protein'])
common_nodes_labels = nodes_in_labels.intersection(nodes_in_filter_features)

# 过滤
filtered_labels_df = labels_df[labels_df['protein'].isin(common_nodes_labels)].reset_index(drop=True)
print(len(filtered_labels_df))
print(len(features_tensor))

1379
1355
14088


In [9]:
data = Data(x=features_tensor, edge_index=edge_index,edge_attr=edge_weight)

print("x:", data.x.shape, data.x.dtype)
print("edge_index:", data.edge_index.shape, data.edge_index.dtype)
print("edge_weight:", data.edge_attr.shape, data.edge_attr.dtype)

x: torch.Size([14088, 768]) torch.float32
edge_index: torch.Size([2, 9503503]) torch.int64
edge_weight: torch.Size([9503503]) torch.float32


In [10]:

def augment_data(features, noise_level=0.1):
    noise = torch.randn(features.size()) * noise_level
    augmented_features = features + noise
    return augmented_features
positive_features = augment_data(data.x)
positive_features

tensor([[ 0.4707, -0.0877, -0.8592,  ...,  0.1232,  0.0039, -0.0741],
        [ 0.4056,  0.1192, -0.6972,  ...,  0.0166, -0.0215, -0.0956],
        [ 0.4352,  0.0031, -0.9215,  ..., -0.1369, -0.2358,  0.1595],
        ...,
        [ 0.3171, -0.0892, -0.8648,  ..., -0.0840,  0.1003, -0.0184],
        [ 0.1259,  0.0853, -0.8695,  ..., -0.0980, -0.2020, -0.1246],
        [-0.6543,  0.1548, -0.8042,  ..., -0.0503,  0.0058, -0.1092]])

In [11]:
import torch
from torch_geometric.utils import degree

def compute_degree_centralities(edge_index, num_nodes):
    # 计算每个节点的度中心性
    deg = degree(edge_index[0], num_nodes=num_nodes)
    # 归一化度中心性
    deg_centralities = deg / deg.max()
    return deg_centralities

centralities = compute_degree_centralities(data.edge_index, data.num_nodes)

In [12]:
print(len(centralities))

14088


In [13]:
filtered_edges_df

Unnamed: 0,protein1,protein2,combined_score
0,ARF5,RALGPS2,173.0
1,ARF5,FHDC1,154.0
2,ARF5,ATP6V1E1,151.0
3,ARF5,CYTH2,471.0
4,ARF5,PSD3,201.0
...,...,...,...
9503498,LDB1,PGAP6,197.0
9503499,LDB1,KDM6B,161.0
9503500,LDB1,WWP2,229.0
9503501,LDB1,VPS33B,152.0


In [14]:
#形成负样本图
import pandas as pd
import numpy as np

def randomize_edges(edges):
    edges['protein1'] = np.random.permutation(edges['protein1'])
    edges['protein2'] = np.random.permutation(edges['protein2'])
    edges['combined_score'] = np.random.permutation(edges['combined_score'])
    return edges

def remove_self_loops(edges):
    no_self_loops = edges[edges['protein1'] != edges['protein2']].reset_index(drop=True)
    return no_self_loops

def generate_unique_edges(edges):
    original_edge_count = len(edges)
    unique_edges = randomize_edges(edges.copy())
    #print("Randomization is OK")
    
    unique_edges = remove_self_loops(unique_edges)
    #print("Removed self loops")
    
    # 计算需要补充的边数
    missing_edges_count = original_edge_count - len(unique_edges)
    #print(f"Need to add {missing_edges_count} edges to maintain the original count.")
    
    # 获取蛋白质列表
    proteins = np.unique(edges[['protein1', 'protein2']].values)
    
    # 补充缺失的边
    new_edges = []
    while len(new_edges) < missing_edges_count:
        new_protein1, new_protein2 = np.random.choice(proteins, 2, replace=False)
        # 确保不产生自环
        if new_protein1 != new_protein2:
            new_edges.append([new_protein1, new_protein2, np.random.choice(edges['combined_score'])])
    
    # 将新边添加到DataFrame
    new_edges_df = pd.DataFrame(new_edges, columns=['protein1', 'protein2', 'combined_score'])
    unique_edges = pd.concat([unique_edges, new_edges_df], ignore_index=True)
    
    return unique_edges

def negtive_data(edges, features_df,features_tensor):
    unique_edges = generate_unique_edges(edges)
    nodes_in_neg_edges = set(unique_edges['protein1']).union(set(unique_edges['protein2']))
    nodes_in_features = set(features_df['protein'])
    common_nodes = nodes_in_neg_edges.intersection(nodes_in_features)
    
    # 过滤边数据集，保留只包含共同节点的边
    filtered_neg_edges_df = unique_edges[unique_edges['protein1'].isin(common_nodes) & unique_edges['protein2'].isin(common_nodes)].reset_index(drop=True)
    
    
    source_indices_neg = [node_id_to_index[node_id] for node_id in filtered_neg_edges_df['protein1']]
    target_indices_neg = [node_id_to_index[node_id] for node_id in filtered_neg_edges_df['protein2']]
    edge_index_neg = torch.tensor([source_indices_neg, target_indices_neg], dtype=torch.long)
    filtered_neg_edges_df['combined_score'] = pd.to_numeric(filtered_neg_edges_df['combined_score'], errors='coerce', downcast='float')
    edge_weight = torch.tensor(filtered_neg_edges_df['combined_score'].values, dtype=torch.float)
    return edge_index_neg, edge_weight

In [None]:
import torch.optim.lr_scheduler as lr_scheduler
from torch.nn import BatchNorm1d
num_features = data.num_features
hidden_dim = 64
out_dim = 64
model = SimpleGNN(num_features, hidden_dim, out_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
#scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=10)  # Reduce learning rate when a metric has stopped improving
scheduler = lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
margin = 1.0

for epoch in range(200):
    optimizer.zero_grad()
    
    # 假设我们直接使用原始特征作为锚点特征
    anchor_out = model(data.x, data.edge_index, data.edge_weight)
    
    # 生成正样本特征
    positive_features = augment_data(data.x)
    positive_out = model(positive_features, data.edge_index, data.edge_weight)
    
    # 选择负样本特征
    negative_edges, negative_edges_weight = negtive_data(edges_df_cleaned, features_df,features_tensor)
    negative_out = model(data.x, negative_edges, negative_edges_weight)
    
    # 计算Triplet Loss
    loss = triplet_loss(anchor_out, positive_out, negative_out, margin)
    
    loss.backward()
    optimizer.step()
    #scheduler.step(loss)

    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")


Epoch 4, Loss: 0.06553506851196289
Epoch 9, Loss: 0.011904227547347546
Epoch 14, Loss: 0.0053902012296020985
Epoch 19, Loss: 0.002988204127177596
Epoch 24, Loss: 0.001757174963131547
Epoch 29, Loss: 0.001602000673301518
Epoch 34, Loss: 0.001750461058691144
Epoch 39, Loss: 0.0017438260838389397
Epoch 44, Loss: 0.0014201684389263391
Epoch 49, Loss: 0.0010026192758232355
Epoch 54, Loss: 0.001815497176721692
Epoch 59, Loss: 0.0018808568129315972
Epoch 64, Loss: 0.0018815622897818685
Epoch 69, Loss: 0.0017115039518103004
Epoch 74, Loss: 0.002052799565717578
Epoch 79, Loss: 0.002355648437514901
Epoch 84, Loss: 0.0018117381259799004
Epoch 89, Loss: 0.001681266468949616
Epoch 94, Loss: 0.001433409284800291
Epoch 99, Loss: 0.0026615720707923174
Epoch 104, Loss: 0.004988671746104956
Epoch 109, Loss: 0.012942103669047356
Epoch 114, Loss: 0.00706367427483201
Epoch 119, Loss: 0.0026497121434658766
Epoch 124, Loss: 0.0027119426522403955
Epoch 129, Loss: 0.003801556769758463
Epoch 134, Loss: 0.002990

In [None]:
import pandas as pd
import numpy as np

features = model(data.x, data.edge_index, data.edge_weight)

features_np = features.detach().numpy()  # 先分离，再转换为NumPy数组
features_np

In [None]:
# 为特征列生成列名
num_features = features_np.shape[1]  # 特征的数量
feature_columns = ['feature{}'.format(i+1) for i in range(num_features)]

# 创建DataFrame
# 确保filtered_features_df['protein']中的索引与features的行数相匹配
features_df = pd.DataFrame(features_np, index=filtered_features_df['protein'], columns=feature_columns)

# 保存DataFrame到CSV文件
features_df.to_csv('GNN/gcn_triplets_features.csv', index_label='node')

In [None]:

col_name = ['protein']
for i in range(1,64):
  col_name.append('feature'+str(i))
features_df = pd.read_csv('GNN/gcn_triplets_features.csv')
print(features_df)
labels_df = pd.read_csv('GNN/solubility.csv')
labels_df.rename(columns={'Gene name': 'node'}, inplace=True)
result = pd.merge(features_df, labels_df, on='node')
result

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
threshold = 0.5

y_pred_classes = (y_pred > threshold).astype(int)

y_test_classes = (y_test > threshold).astype(int)

accuracy = (y_pred_classes == y_test_classes).mean()
print(f"Accuracy: {accuracy}")