In [15]:
import dgl
import torch
import scipy.sparse as sp
import numpy as np
import pandas as pd
import itertools
import networkx as nx
import os
from utils import eids_split, remove_all_edges_of_etype, get_num_nodes_dict
from collections import Counter

In [2]:

def process_grn_data(g,
                     val_ratio,
                     test_ratio,
                     neg):
    
    '''
    Parameters
    ----------
    g : dgl graph
    
    val_ratio : float
    
    test_ratio : float
    
    neg: string
        One of ['pred_etype_neg', 'src_tgt_neg'], different negative sampling modes. See below.
    
    Returns
    ----------
    mp_g: 
        graph for message passing.
    
    graphs containing positive edges and negative edges for train, valid, and test
    '''
    
    u, v = g.edges()

    M = u.shape[0] # number of edges
    eids = torch.arange(M)
    train_pos_eids, val_pos_eids, test_pos_eids = eids_split(eids, val_ratio, test_ratio)

    train_pos_u, train_pos_v = u[train_pos_eids], v[train_pos_eids]
    val_pos_u, val_pos_v = u[val_pos_eids], v[val_pos_eids]
    test_pos_u, test_pos_v = u[test_pos_eids], v[test_pos_eids]

    if neg == 'pred_etype_neg':
        # Edges not in pred_etype as negative edges
        adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(g.num_nodes(), g.num_nodes()))
        adj_neg = 1 - adj.todense()
        neg_u, neg_v = np.where(adj_neg != 0)
    else:
        raise ValueError('Unknow negative argument')
        
    neg_eids = np.random.choice(neg_u.shape[0], min(neg_u.shape[0], M), replace=False)
    train_neg_eids, val_neg_eids, test_neg_eids = eids_split(torch.from_numpy(neg_eids), val_ratio, test_ratio)

    # Avoid losing dimension in single number slicing
    train_neg_u, train_neg_v = np.take(neg_u, train_neg_eids), np.take(neg_v, train_neg_eids)
    val_neg_u, val_neg_v = np.take(neg_u, val_neg_eids),np.take(neg_v, val_neg_eids)
    test_neg_u, test_neg_v = np.take(neg_u, test_neg_eids), np.take(neg_v, test_neg_eids)


    train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=g.num_nodes())
    train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=g.num_nodes())
    val_pos_g = dgl.graph((val_pos_u, val_pos_v), num_nodes=g.num_nodes())
    val_neg_g = dgl.graph((val_neg_u, val_neg_v), num_nodes=g.num_nodes())
    test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=g.num_nodes())

    test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=g.num_nodes())
        # Create message passing graph by removing all edges (그러나 엣지 타입의 구분이 없기때문에 동일.)
    mp_g = g

    return mp_g, train_pos_g, train_neg_g, val_pos_g, val_neg_g, test_pos_g, test_neg_g



In [3]:


def load_grn_dataset(dataset_dir, dataset_name, val_ratio, test_ratio):
    '''
    Parameters
    ----------
    dataset_dir : string
        dataset directory
    
    dataset_name : string
    
    val_ratio : float
    
    test_ratio : float

    Returns:
    ----------
    g: dgl graph
        The original graph

    processed_g: tuple of seven dgl graphs
        The outputs of the function `process_data`, 
        which includes g for message passing, train, valid, and test
  
    '''
    graph_saving_path = f'{dataset_dir}/{dataset_name}'
    graph_list, _ = dgl.load_graphs(graph_saving_path)
    g = graph_list[0] # 리스트로 반환되나 실상 단일 그래프이므로.
 
    neg = 'pred_etype_neg'
    processed_g = process_grn_data(g, val_ratio, test_ratio, neg)
    return g, processed_g





In [4]:
def edge_label_creation(ecoli1_gold,edge_list):

   edge_df = pd.DataFrame(edge_list, columns =['source', 'target'])
   ecoli1_gold[0] = ecoli1_gold[0].str.replace('G', '')
   ecoli1_gold[1] = ecoli1_gold[1].str.replace('G', '')
   ecoli1_gold= ecoli1_gold.astype(int)
   ecoli1_gold[0] = ecoli1_gold[0] - 1
   ecoli1_gold[1] = ecoli1_gold[1] - 1

   edge_df['edge'] = 0
   for i in range(ecoli1_gold.shape[0]):
         r = ecoli1_gold.iat[i,0]
         c = ecoli1_gold.iat[i,1]
         idx= edge_df.loc[(edge_df['source'] == r) & (edge_df['target'] == c)].index
         edge_df.loc[idx,'edge']=ecoli1_gold.iat[i,2]
   return edge_df


def save_graphs_to_folder(graphs, folder_path):
    """
    DGL 그래프들을 지정된 폴더에 저장하는 함수.

    Parameters:
    ----------
    graphs : dict
        저장할 그래프 딕셔너리. 키는 그래프 이름, 값은 DGL 그래프.
    folder_path : str
        그래프를 저장할 폴더 경로.
    """
    # 폴더가 존재하지 않으면 생성
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # 그래프 저장
    for graph_name, graph in graphs.items():
        save_path = os.path.join(folder_path, f"{graph_name}.bin")
        dgl.save_graphs(save_path, graph)
        print(f"그래프 '{graph_name}'이(가) {save_path}에 저장되었습니다.")




In [16]:

def convert_grn_to_dgl_graph(file_hetero,file_null,file_traject,file_gold):
    """
    GRN 데이터를 DGL 그래프 형태로 변환하는 함수

    Parameters:
    ----------
    node_file : str
        노드 특징이 포함된 파일 경로
    edge_file : str
        엣지 데이터가 포함된 파일 경로
    gold_file : str
        라벨링된 표준(Gold Standard) 파일 경로

    Returns:
    ----------
    dgl_graph : dgl.DGLGraph
        DGL 그래프 객체
    """
    default_path="./data/DREAM4/DREAM4_InSilico_Size100/"#+folder_name+"/"+folder_name+"/"
    default_goldpath="./data/DREAM4/gold_std/"

    # Load data
    hetero = pd.read_csv(default_path + file_hetero, sep='\t')
    null = pd.read_csv(default_path + file_null, sep='\t')
    traject = pd.read_csv(default_path + file_traject, sep='\t')
    gold = pd.read_csv(default_goldpath + file_gold, sep='\t', header=None)

    # Extract wildtype values
    wildtype_vals = hetero.loc[1, :].values.tolist()
    hetero['id'] = hetero.index
    
    # Create node features
    node_features = hetero[['id']]
    node_features['wildtype'] = wildtype_vals

    traj = traject.T.iloc[1:, 1:]
    traj = traj.reset_index()
    node_features = pd.concat([node_features, traj], axis=1)
    node_features = node_features.drop(['index'], axis=1)

    # Extract edge features and edge labels
    edge_list = list(itertools.product(node_features["id"], repeat=2))
    edge_lab = edge_label_creation(gold, edge_list)

    null = null.iloc[1:, :].reset_index(drop=True)
    null_list = null.values.flatten()

    hetero = hetero.iloc[1:, :].reset_index(drop=True).drop(['id'], axis=1)
    hetero_list = hetero.values.flatten()

    edge_lab.columns = ['s', 'd', 'edge']
    edge_lab = edge_lab.iloc[100:].reset_index(drop=True)
    edge_lab['KO'] = null_list
    edge_lab['KD'] = hetero_list
    edge_lab = edge_lab[edge_lab['edge'] == 1]

    # Extract source, destination, and edge attributes
    src = edge_lab["s"].tolist()
    dst = edge_lab['d'].tolist()
    #######디버깅 코드1
    # src와 dst 생성 확인
    print("Source nodes:", src[:10])  # src의 일부 확인
    print("Destination nodes:", dst[:10])  # dst의 일부 확인

    # edge_lab 데이터에서 source와 destination 간선 중복 확인
    edge_lab_pairs = list(zip(edge_lab["s"], edge_lab["d"]))
    edge_lab_counts = Counter(edge_lab_pairs)

    # 중복 간선 확인
    duplicates_in_edge_lab = {edge: count for edge, count in edge_lab_counts.items() if count > 1}
    print("edge_lab 중복 간선:", duplicates_in_edge_lab)
    ########
    KO = edge_lab["KO"].tolist()
    KD = edge_lab["KD"].tolist()

    # Create DGL graph for basic_data
    g_basic = dgl.graph((src, dst))
    id_tensor = torch.tensor(node_features['id'].tolist(), dtype=torch.float32).view(-1, 1)
    wildtype_tensor = torch.tensor(node_features['wildtype'].tolist(), dtype=torch.float32).view(-1, 1)
    g_basic.ndata['id'] = id_tensor
    g_basic.ndata['wildtype'] = wildtype_tensor
    g_basic.edata['KO'] = torch.tensor(KO, dtype=torch.float32).view(-1, 1)
    g_basic.edata['KD'] = torch.tensor(KD, dtype=torch.float32).view(-1, 1)

    ######
    # g_basic의 간선 데이터 확인
    src, dst = g_basic.edges()

    # (source, destination) 튜플로 간선 리스트 생성
    edges = list(zip(src.tolist(), dst.tolist()))

    # 중복된 간선 확인
    
    edge_counts = Counter(edges)

    # 중복 간선 출력
    duplicates = {edge: count for edge, count in edge_counts.items() if count > 1}
    print("중복 간선:", duplicates)

    #####
    # Create DGL graph for basic_TS_data
    g_basic_TS = g_basic.clone()
    traj_tensor = torch.tensor(node_features.iloc[:, 2:].values, dtype=torch.float32)
    g_basic_TS.ndata['trajectory'] = traj_tensor

    # Create DGL graph for basic_aug_data
    G = g_basic.to_networkx().to_undirected()
    ############
    print(type(G))  # <class 'networkx.classes.multigraph.MultiGraph'>이면 Multigraph로 변환된 것

    # G의 중복 간선 확인
    if isinstance(G, nx.MultiGraph):
        multi_edges = list(G.edges(data=True))
        print("Multigraph 간선:", multi_edges)
    ############    
    pagerank = nx.pagerank(G)
    clustering_coef = nx.clustering(G)
    betweenness_centrality = nx.betweenness_centrality(G, k=50)
    degree = dict(G.degree())

    pagerank_tensor = torch.tensor([pagerank[i] for i in range(len(pagerank))], dtype=torch.float32).view(-1, 1)
    clustering_tensor = torch.tensor([clustering_coef[i] for i in range(len(clustering_coef))], dtype=torch.float32).view(-1, 1)
    betweenness_tensor = torch.tensor([betweenness_centrality[i] for i in range(len(betweenness_centrality))], dtype=torch.float32).view(-1, 1)
    degree_tensor = torch.tensor([degree[i] for i in range(len(degree))], dtype=torch.float32).view(-1, 1)

    g_basic_aug = g_basic.clone()
    g_basic_aug.ndata['pagerank'] = pagerank_tensor
    g_basic_aug.ndata['clustering_coef'] = clustering_tensor
    g_basic_aug.ndata['betweenness'] = betweenness_tensor
    g_basic_aug.ndata['degree'] = degree_tensor

    # Create DGL graph for basic_TS_aug_data
    g_basic_TS_aug = g_basic_TS.clone()
    g_basic_TS_aug.ndata['pagerank'] = pagerank_tensor
    g_basic_TS_aug.ndata['clustering_coef'] = clustering_tensor
    g_basic_TS_aug.ndata['betweenness'] = betweenness_tensor
    g_basic_TS_aug.ndata['degree'] = degree_tensor

    print("Basic Graph:", g_basic)
    print("Basic TS Graph:", g_basic_TS)
    print("Basic Augmented Graph:", g_basic_aug)
    print("Basic TS Augmented Graph:", g_basic_TS_aug)


    # 그래프를 저장할 폴더 경로
    folder_path = "./datasets"

    # 그래프를 딕셔너리로 저장
    graphs = {
        "basic_graph": g_basic,
        "basic_ts_graph": g_basic_TS,
        "basic_aug_graph": g_basic_aug,
        "basic_ts_aug_graph": g_basic_TS_aug
    }

    # 그래프 저장
    save_graphs_to_folder(graphs, folder_path)
    
    return g_basic, g_basic_TS, g_basic_aug, g_basic_TS_aug

# Example usage:
# g_basic, g_basic_TS, g_basic_aug, g_basic_TS_aug = data_preprocessing_dgl('folder_name', 'file_hetero.tsv', 'file_null.tsv', 'file_traject.tsv', 'file_gold.tsv')



In [17]:
#Path for the files
gold_std = "./data/DREAM4/gold_std/"

InsilicoSize100_org  = {"Ecoli1":  ["InSilicoSize100","insilico_size100_1_knockdowns.tsv","insilico_size100_1_knockouts.tsv","insilico_size100_1_timeseries.tsv","DREAM4_GoldStandard_InSilico_Size100_1.tsv"],
                       "Ecoli2":  ["InSilicoSize100","insilico_size100_2_knockdowns.tsv","insilico_size100_2_knockouts.tsv","insilico_size100_2_timeseries.tsv","DREAM4_GoldStandard_InSilico_Size100_2.tsv"],
                       "Yeast1":  ["InSilicoSize100","insilico_size100_3_knockdowns.tsv","insilico_size100_3_knockouts.tsv","insilico_size100_3_timeseries.tsv","DREAM4_GoldStandard_InSilico_Size100_3.tsv"],
                       "Yeast2":  ["InSilicoSize100","insilico_size100_4_knockdowns.tsv","insilico_size100_4_knockouts.tsv","insilico_size100_4_timeseries.tsv","DREAM4_GoldStandard_InSilico_Size100_4.tsv"],
                       "Yeast3":  ["InSilicoSize100","insilico_size100_5_knockdowns.tsv","insilico_size100_5_knockouts.tsv","insilico_size100_5_timeseries.tsv","DREAM4_GoldStandard_InSilico_Size100_5.tsv"]
                        }

In [18]:

path = "./data/DREAM4/DREAM4_InSilico_Size100/"
for org, files in InsilicoSize100_org.items():
    basic_data,basic_TS_data,basic_aug_data,basic_TS_aug_data = convert_grn_to_dgl_graph(files[1],files[2],files[3],files[4])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  node_features['wildtype'] = wildtype_vals


Source nodes: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
Destination nodes: [1, 2, 3, 5, 6, 7, 8, 9, 10, 11]
edge_lab 중복 간선: {}
중복 간선: {}
<class 'networkx.classes.multigraph.MultiGraph'>
Multigraph 간선: [(0, 36, {'id': 36}), (0, 41, {'id': 63}), (1, 4, {'id': 0}), (2, 4, {'id': 1}), (3, 4, {'id': 2}), (3, 45, {'id': 73}), (3, 84, {'id': 138}), (4, np.int64(5), {'id': 3}), (4, np.int64(6), {'id': 4}), (4, np.int64(7), {'id': 5}), (4, np.int64(8), {'id': 6}), (4, np.int64(9), {'id': 7}), (4, np.int64(10), {'id': 8}), (4, np.int64(11), {'id': 9}), (4, np.int64(12), {'id': 10}), (4, np.int64(13), {'id': 11}), (4, np.int64(14), {'id': 12}), (4, np.int64(15), {'id': 13}), (4, np.int64(16), {'id': 14}), (4, np.int64(17), {'id': 15}), (4, np.int64(18), {'id': 16}), (4, np.int64(19), {'id': 17}), (4, np.int64(20), {'id': 18}), (4, np.int64(21), {'id': 19}), (6, 24, {'id': 25}), (7, 36, {'id': 37}), (8, 37, {'id': 60}), (8, 45, {'id': 74}), (8, 84, {'id': 139}), (9, np.int64(36), {'id': 38}), (9, np.int64(43

NetworkXNotImplemented: not implemented for multigraph type