In [10]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import random

# üìå B∆∞·ªõc 1: ƒê·ªçc d·ªØ li·ªáu t·ª´ file CSV
nodes_df = pd.read_csv(r"E:\OOP\Project_OOP\Python\arixv\train_nodes.csv")
edges_df = pd.read_csv(r"E:\OOP\Project_OOP\Python\arixv\train_edges.csv")

# Chuy·ªÉn ƒë·ªïi d·ªØ li·ªáu th√†nh tensor
node_features = torch.tensor(nodes_df.iloc[:, 2:].values, dtype=torch.float)
edge_index = torch.tensor(edges_df.values.T, dtype=torch.long)

# T·∫°o ƒë·ªëi t∆∞·ª£ng Data cho PyTorch Geometric
data = Data(x=node_features, edge_index=edge_index)

# üìå B∆∞·ªõc 2: Chia d·ªØ li·ªáu th√†nh train/test edges
edges = edges_df.values
train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)

# üìå B∆∞·ªõc 3: T·∫°o negative edges
def create_negative_edges(num_nodes, existing_edges, num_samples):
    existing_edges_set = set(map(tuple, existing_edges))
    negative_edges = set()

    while len(negative_edges) < num_samples:
        u, v = random.randint(0, num_nodes - 1), random.randint(0, num_nodes - 1)
        if u != v and (u, v) not in existing_edges_set and (v, u) not in existing_edges_set:
            negative_edges.add((u, v))
    
    return np.array(list(negative_edges))

num_nodes = node_features.shape[0]
negative_test_edges = create_negative_edges(num_nodes, test_edges, len(test_edges))

# üìå B∆∞·ªõc 4: ƒê·ªãnh nghƒ©a m√¥ h√¨nh GCN
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

# üìå B∆∞·ªõc 5: ƒê·ªãnh nghƒ©a l·ªõp LinkPredictor
class LinkPredictor(torch.nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(embedding_dim * 2, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 1)
        )

    def forward(self, z, edge_index):
        z_u = z[edge_index[0]]
        z_v = z[edge_index[1]]
        edge_embeddings = torch.cat([z_u, z_v], dim=1)
        return torch.sigmoid(self.mlp(edge_embeddings))

# üìå B∆∞·ªõc 6: Kh·ªüi t·∫°o m√¥ h√¨nh
in_channels = node_features.shape[1]
hidden_channels = 64
out_channels = 32

gcn_model = GCN(in_channels, hidden_channels, out_channels)
link_predictor = LinkPredictor(out_channels)

# üìå B∆∞·ªõc 7: ƒê·ªãnh nghƒ©a h√†m hu·∫•n luy·ªán
def train(model, predictor, data, train_edges, test_edges, negative_edges, optimizer, epochs=100):
    model.train()
    predictor.train()
    
    for epoch in range(epochs):
        optimizer.zero_grad()
        
        # Tr√≠ch xu·∫•t node embeddings
        z = model(data.x, data.edge_index)
        
        # Chuy·ªÉn train_edges th√†nh tensor
        positive_edges = torch.tensor(train_edges.T, dtype=torch.long)
        negative_edges = create_negative_edges(data.x.shape[0], train_edges, len(train_edges))
        negative_edges = torch.tensor(negative_edges.T, dtype=torch.long)
        
        # D·ª± ƒëo√°n li√™n k·∫øt
        pos_preds = predictor(z, positive_edges)
        neg_preds = predictor(z, negative_edges)
        
        # T√≠nh loss
        loss = F.binary_cross_entropy(pos_preds, torch.ones_like(pos_preds)) + \
               F.binary_cross_entropy(neg_preds, torch.zeros_like(neg_preds))
        
        # Lan truy·ªÅn ng∆∞·ª£c
        loss.backward()
        optimizer.step()
        
        # In loss v√† accuracy m·ªói 10 epoch
        if (epoch + 1) % 10 == 0:
            accuracy = evaluate(model, predictor, data, test_edges, negative_edges)
            print(f'Epoch {epoch + 1}, Loss: {loss.item()}, Accuracy: {accuracy:.4f}')

# üìå B∆∞·ªõc 8: Kh·ªüi t·∫°o optimizer v√† hu·∫•n luy·ªán m√¥ h√¨nh
optimizer = torch.optim.Adam(list(gcn_model.parameters()) + list(link_predictor.parameters()), lr=0.01)
train(gcn_model, link_predictor, data, train_edges, test_edges, negative_test_edges, optimizer)

# üìå B∆∞·ªõc 9: ƒê√°nh gi√° m√¥ h√¨nh sau khi hu·∫•n luy·ªán
accuracy = evaluate(gcn_model, link_predictor, data, test_edges, negative_test_edges)
print(f"Final Accuracy: {accuracy:.4f}")

# üìå B∆∞·ªõc 10: L∆∞u m√¥ h√¨nh
torch.save({
    'gcn_model': gcn_model.state_dict(),
    'link_predictor': link_predictor.state_dict()
}, 'link_predict.pth')

print("M√¥ h√¨nh ƒë√£ ƒë∆∞·ª£c l∆∞u v√†o 'link_predict.pth'")

# üìå B∆∞·ªõc 11: ƒê·ªãnh nghƒ©a h√†m ƒë√°nh gi√° accuracy
def evaluate(model, predictor, data, test_edges, negative_edges):
    model.eval()
    predictor.eval()
    
    with torch.no_grad():
        # Tr√≠ch xu·∫•t node embeddings
        z = model(data.x, data.edge_index)
        
        # Chuy·ªÉn ƒë·ªïi edges th√†nh tensor
        test_edges_tensor = torch.tensor(test_edges.T, dtype=torch.long)
        negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)
        
        # D·ª± ƒëo√°n li√™n k·∫øt
        positive_preds = predictor(z, test_edges_tensor)
        negative_preds = predictor(z, negative_edges_tensor)
        
        # G√°n nh√£n
        all_preds = torch.cat([positive_preds, negative_preds])
        all_labels = torch.cat([torch.ones_like(positive_preds), torch.zeros_like(negative_preds)])
        
        # T√≠nh accuracy
        accuracy = accuracy_score(all_labels.cpu(), (all_preds.cpu() > 0.5).int())
        return accuracy


  negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)


Epoch 10, Loss: 1.0756607055664062, Accuracy: 0.6530


  negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)


Epoch 20, Loss: 0.9846515655517578, Accuracy: 0.6817


  negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)


Epoch 30, Loss: 0.9188287258148193, Accuracy: 0.7573


  negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)


Epoch 40, Loss: 0.8894703984260559, Accuracy: 0.7199


  negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)


Epoch 50, Loss: 0.8697546720504761, Accuracy: 0.7779


  negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)


Epoch 60, Loss: 0.8561029434204102, Accuracy: 0.7816


  negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)


Epoch 70, Loss: 0.8483686447143555, Accuracy: 0.7889


  negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)


Epoch 80, Loss: 0.8427889943122864, Accuracy: 0.8180


  negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)


Epoch 90, Loss: 0.8344252109527588, Accuracy: 0.8298


  negative_edges_tensor = torch.tensor(negative_edges.T, dtype=torch.long)


Epoch 100, Loss: 0.8278007507324219, Accuracy: 0.7962
Final Accuracy: 0.8073
M√¥ h√¨nh ƒë√£ ƒë∆∞·ª£c l∆∞u v√†o 'link_predict.pth'


In [7]:
import torch
import pandas as pd
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# S·ª≠a l·∫°i class GCN
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, in_channels)  # out_channels = in_channels

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

def map_edge_index_to_new_nodes(edge_index, existing_node_count, new_node_count):
    # L·ªçc b·ªè c√°c c·∫°nh c√≥ ch·ªâ s·ªë node v∆∞·ª£t qu√° s·ªë node hi·ªán c√≥
    max_node_idx = existing_node_count + new_node_count - 1
    valid_edges = (edge_index[0] <= max_node_idx) & (edge_index[1] <= max_node_idx)
    edge_index = edge_index[:, valid_edges]
    
    # √Ånh x·∫° node
    node_map = {i: i for i in range(existing_node_count)}
    node_map.update({existing_node_count + i: existing_node_count + i for i in range(new_node_count)})
    
    edge_index_mapped = edge_index.clone()
    edge_index_mapped[0] = torch.tensor([node_map.get(int(x), x) for x in edge_index[0]], dtype=torch.long)
    edge_index_mapped[1] = torch.tensor([node_map.get(int(x), x) for x in edge_index[1]], dtype=torch.long)
    
    return edge_index_mapped

# S·ª≠a l·∫°i h√†m load_and_train_model
def load_and_train_model(train_edges_file, train_nodes_file, new_nodes_file=None):
    # ƒê·ªçc v√† x·ª≠ l√Ω d·ªØ li·ªáu
    edges_df = pd.read_csv(train_edges_file)
    nodes_df = pd.read_csv(train_nodes_file)
    
    # ƒê·∫£m b·∫£o c√°c ch·ªâ s·ªë trong edges_df kh√¥ng v∆∞·ª£t qu√° s·ªë node
    max_node_id = len(nodes_df) - 1
    edges_df = edges_df[
        (edges_df['source'] <= max_node_id) & 
        (edges_df['target'] <= max_node_id)
    ]

    # ƒê·ªçc th√™m node m·ªõi n·∫øu c√≥
    if new_nodes_file:
        new_nodes_df = pd.read_csv(new_nodes_file)
        new_node_features = torch.tensor(new_nodes_df.iloc[:, 1:].values, dtype=torch.float)
    else:
        new_node_features = torch.tensor([], dtype=torch.float)

    # S·ªë l∆∞·ª£ng node trong ƒë·ªì th·ªã hi·ªán t·∫°i
    existing_node_count = len(nodes_df)
    all_node_features = torch.tensor(nodes_df.iloc[:, 1:].values, dtype=torch.float)

    # √Ånh x·∫° l·∫°i ch·ªâ s·ªë c√°c c·∫°nh
    edge_index = torch.tensor(edges_df.values.T, dtype=torch.long)
    edge_index = map_edge_index_to_new_nodes(edge_index, existing_node_count, new_nodes_df.shape[0] if new_nodes_file else 0)

    # Gh√©p c√°c node m·ªõi v√†o node features
    all_node_features = torch.cat([all_node_features, new_node_features], dim=0)

    # T·∫°o ƒë·ªëi t∆∞·ª£ng d·ªØ li·ªáu cho PyTorch Geometric
    data = Data(x=all_node_features, edge_index=edge_index)

    # Kh·ªüi t·∫°o model v·ªõi out_channels = in_channels
    model = GCN(in_channels=all_node_features.shape[1], hidden_channels=64)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    
    # Hu·∫•n luy·ªán m√¥ h√¨nh
    model.train()
    for epoch in range(100):
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = F.mse_loss(out, data.x)
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')

    # L∆∞u m√¥ h√¨nh
    torch.save(model.state_dict(), "link_predict.pth")

    return model, data

# S·ª≠a l·∫°i h√†m predict_with_trained_model 
def predict_with_trained_model(model_path, data, new_nodes_file):
    # T·∫£i m√¥ h√¨nh ƒë√£ hu·∫•n luy·ªán
    model = GCN(in_channels=data.x.shape[1], hidden_channels=64)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    # ƒê·ªçc node m·ªõi v√† ƒë·∫£m b·∫£o s·ªë features gi·ªëng nhau
    new_nodes_df = pd.read_csv(new_nodes_file)
    expected_features = data.x.shape[1]  # S·ªë features c·ªßa nodes hi·ªán t·∫°i
    
    # Ch·ªâ l·∫•y s·ªë c·ªôt features gi·ªëng v·ªõi nodes hi·ªán t·∫°i
    new_node_features = torch.tensor(new_nodes_df.iloc[:, 1:expected_features+1].values, dtype=torch.float)

    # Gh√©p c√°c node m·ªõi v√†o node_features hi·ªán t·∫°i
    all_node_features = torch.cat([data.x, new_node_features], dim=0)

    # D·ª± ƒëo√°n embedding cho t·∫•t c·∫£ c√°c node, bao g·ªìm node m·ªõi
    edge_index = data.edge_index
    with torch.no_grad():
        output_embeddings = model(all_node_features, edge_index)

    return output_embeddings, all_node_features

def compute_similarity_and_create_edges(output_embeddings, existing_node_count, edges_df, threshold=0.5, max_edges=2):
    embeddings_np = output_embeddings if isinstance(output_embeddings, np.ndarray) else output_embeddings.numpy()
    
    new_nodes_embeddings = embeddings_np[existing_node_count:]
    existing_nodes_embeddings = embeddings_np[:existing_node_count]
    
    similarities = cosine_similarity(new_nodes_embeddings, existing_nodes_embeddings)
    
    new_edges = []
    for i in range(len(new_nodes_embeddings)):
        new_node_idx = i + existing_node_count
        # Get top 2 most similar existing nodes
        top_similar_indices = np.argsort(similarities[i])[-max_edges:]
        
        for target_idx in top_similar_indices:
            if similarities[i][target_idx] > threshold:
                new_edges.append([new_node_idx, int(target_idx)])
    
    # Combine old and new edges
    old_edges = edges_df.values.tolist()
    all_edges = old_edges + new_edges
    
    return all_edges

# Main Code
if __name__ == "__main__":
    # ƒê·ªçc d·ªØ li·ªáu ban ƒë·∫ßu v√† hu·∫•n luy·ªán m√¥ h√¨nh
    model, data = load_and_train_model("E:/OOP/Project_OOP/Python/link_process/edges_user.csv", 
                                       "E:/OOP/Project_OOP/Python/link_process/nodes_user.csv")
    
    # D·ª± ƒëo√°n v·ªõi node m·ªõi
    output_embeddings, all_node_features = predict_with_trained_model("link_predict.pth", data, "E:/OOP/Project_OOP/Python/link_process/node_difference.csv")
    
    # L∆∞u embeddings v√† c√°c node m·ªõi v√†o CSV
    output_node_ids = torch.arange(all_node_features.shape[0]).numpy()
    output_embeddings = output_embeddings.numpy()

    # L∆∞u node embeddings v√†o CSV
    node_embeddings_df = pd.DataFrame(output_embeddings)
    node_embeddings_df.insert(0, 'node_id', output_node_ids)
    node_embeddings_df.to_csv(r'E:\OOP\Project_OOP\Python\link_process\all_embeddings.csv', index=False)

    # Read original edges
    edges_df = pd.read_csv("E:/OOP/Project_OOP/Python/link_process/edges_user.csv")
    
    # Get all edges including predictions
    all_edges = compute_similarity_and_create_edges(
        output_embeddings,
        len(data.x),
        edges_df,
        threshold=0.5
    )
    
    # Save all edges
    all_edges_df = pd.DataFrame(all_edges, columns=['source', 'target'])
    all_edges_df.to_csv(r'E:\OOP\Project_OOP\Python\link_process\all_edges.csv', index=False)


Epoch 0, Loss: 0.09416823089122772
Epoch 10, Loss: 0.01551192905753851
Epoch 20, Loss: 0.012615996412932873
Epoch 30, Loss: 0.011646936647593975
Epoch 40, Loss: 0.010929622687399387
Epoch 50, Loss: 0.010382021777331829
Epoch 60, Loss: 0.009937699884176254
Epoch 70, Loss: 0.009567917324602604
Epoch 80, Loss: 0.009255164302885532
Epoch 90, Loss: 0.008993063122034073


  model.load_state_dict(torch.load(model_path))


In [8]:
import py4cytoscape as p4c
import pandas as pd

# K·∫øt n·ªëi v·ªõi Cytoscape
p4c.cytoscape_ping()

# ƒê·ªçc d·ªØ li·ªáu t·ª´ t·ªáp CSV (source, target)
edges_df = pd.read_csv(r'E:\OOP\Project_OOP\Python\link_process\all_edges.csv')

# Ki·ªÉm tra d·ªØ li·ªáu ƒë√£ ƒë·ªçc
print(edges_df.head())

# L·∫•y danh s√°ch c√°c n√∫t t·ª´ c√°c c·ªôt 'source' v√† 'target'
nodes_list = pd.concat([edges_df['source'], edges_df['target']]).unique()

# T·∫°o DataFrame cho c√°c n√∫t
nodes_df = pd.DataFrame(nodes_list, columns=['id'])

# Th√™m c·ªôt 'interaction' n·∫øu ch∆∞a c√≥
if 'interaction' not in edges_df.columns:
    edges_df['interaction'] = 'interacts'

# Chuy·ªÉn ƒë·ªïi c√°c gi√° tr·ªã th√†nh chu·ªói
edges_df['source'] = edges_df['source'].astype(str)
edges_df['target'] = edges_df['target'].astype(str)
nodes_df['id'] = nodes_df['id'].astype(str)

# T·∫°o m·∫°ng t·ª´ c√°c DataFrame c·ªßa n√∫t v√† c·∫°nh
network = p4c.create_network_from_data_frames(nodes_df, edges_df, title='Link Network', collection='My Collection')


# Ki·ªÉm tra m·∫°ng ƒë√£ ƒë∆∞·ª£c t·∫°o th√†nh c√¥ng
print("Network created successfully!")

You are connected to Cytoscape!
   source  target
0     339     175
1     368     460
2     340     380
3      60      46
4     442      22
Applying default style...
Applying preferred layout
Network created successfully!


In [9]:
import py4cytoscape as cy
import os

# ƒê∆∞·ªùng d·∫´n t·ªõi t·ªáp CSV
file_path = r'E:\OOP\Project_OOP\Python\link_process\all_embeddings.csv'

# Ki·ªÉm tra xem t·ªáp c√≥ t·ªìn t·∫°i kh√¥ng
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
else:
    # T·∫£i d·ªØ li·ªáu b·∫£ng v√†o Cytoscape
    try:
        cy.load_table_data_from_file(file_path)
        # Ki·ªÉm tra c√°c c·ªôt trong b·∫£ng d·ªØ li·ªáu ƒë√£ ƒë∆∞·ª£c t·∫£i v√†o Cytoscape
        print(cy.get_table_columns())
    except Exception as e:
        print(f"Error loading table data: {e}")

  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv


      SUID shared name   id         0         1         2         3         4  \
7169  7169          98   98 -0.109298  0.061719 -0.220922  0.081481  0.026675   
6146  6146          26   26 -0.110200 -0.059621 -0.211268  0.110036  0.061998   
7172  7172         335  335 -0.149694  0.062972 -0.180110  0.112999  0.134093   
6149  6149          73   73 -0.136004 -0.008596 -0.218698  0.026196  0.039039   
7175  7175         112  112 -0.087737  0.026812 -0.100444 -0.105991  0.040443   
...    ...         ...  ...       ...       ...       ...       ...       ...   
6137  6137          11   11 -0.160092 -0.016777 -0.163568 -0.073726  0.068310   
7163  7163         265  265 -0.139076  0.043572 -0.180990 -0.108719  0.035920   
6140  6140          37   37 -0.180965 -0.072114 -0.221447 -0.091723  0.068965   
7166  7166         171  171 -0.184844  0.070447 -0.207689  0.069398 -0.003302   
6143  6143         345  345 -0.087531 -0.034971 -0.164422 -0.094609  0.034036   

             5         6  .

  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
  df[col] = cvv
