<a href="https://colab.research.google.com/github/zeinabkamkar98/graph_simulation/blob/main/graph_simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step Zero

Loading datasets and datasets

## Loading Libraries

In [1]:
import random
import numpy as np

from scipy.special import  rel_entr

import networkx as nx
from networkx.drawing import draw_networkx

import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


## Importing datasets

**Load github datasets**

If you wanna use github datasets run below cell. This datasets are limited (10 datasets), They're just for test.

In [2]:
graphs_name = ['DHFR','BZR','COX2','AIDS','ENZYMES','DD','MUTAG','NCI1','PROTEINS_full','PTC_MR']

!git clone https://github.com/zeinabkamkar98/graph_simulation.git

fatal: destination path 'graph_simulation' already exists and is not an empty directory.


## Utils


In [3]:
def read_graph_file(path):
    G = nx.Graph()
    data = np.loadtxt(path, delimiter=',').astype(int)
    data_tuple = list(map(tuple, data))
    G.add_edges_from(data_tuple)
    return G


def create_random_graph(nodes_count, edges_count ):
    return nx.gnm_random_graph(nodes_count, edges_count)


def add_features_to_graph(G: nx.Graph):
    for node in G.nodes:
        neighbors = G.neighbors(node)
        degrees = [G.degree(neighbor) for neighbor in neighbors]
        min_degree = np.min(degrees) if degrees else 0
        max_degree = np.max(degrees) if degrees else 0
        mean_degree = np.mean(degrees) if degrees else 0
        median_degree = np.median(degrees) if degrees else 0
        G.nodes[node]["features"] = np.array(
            [
                mean_degree,
                median_degree,
                max_degree,
                min_degree,
            ]
        )
    for edge in G.edges:
        G.edges[edge]["features"] = np.zeros(5)
        G.edges[edge]["features"][:2] = (
            G.nodes[edge[0]]["features"][:2] + G.nodes[edge[1]]["features"][:2]
        )

        max_deg = max(G.nodes[edge[0]]["features"][2], G.nodes[edge[1]]["features"][2])
        min_deg = min(G.nodes[edge[0]]["features"][3], G.nodes[edge[1]]["features"][3])
        G.edges[edge]["features"][2] = max_deg
        G.edges[edge]["features"][3] = min_deg
        G.edges[edge]["features"][4] = max_deg - min_deg


def calculate_divergence(g1, g2):
    epsilon = 1e-10

    g1_features_data = np.concatenate(
        [np.array([v]) for v in nx.get_edge_attributes(g1, "features").values()]
    )
    g2_features_data = np.concatenate(
        [np.array([v]) for v in nx.get_edge_attributes(g2, "features").values()]
    )
    feature_divergence = np.zeros(g1_features_data.shape[1])

    for i in range(5):
      data = g1_features_data[:, i] + g2_features_data[:, i]
      hist_range = (np.min(data), np.max(data))
      g1_hist, _ = np.histogram(g1_features_data[:, i], range = hist_range)
      g2_hist, _ = np.histogram(g2_features_data[:, i], range = hist_range)
      feature_divergence[i] = np.sum(
          rel_entr(g1_hist + epsilon / (np.sum(g1_hist) + epsilon) , g2_hist + epsilon / (np.sum(g2_hist) + epsilon) )
      )

    return feature_divergence

# Step One: Calculate Features for Original Graph

## Original graph

Import original graph and calculate and features to its nodes and edges

In [4]:
selected_graph_name= graphs_name[6]
graph = read_graph_file('graph_simulation/DATASETS/MUTAG/MUTAG_A.txt')

add_features_to_graph(graph)

Some additional outputs for checking the nodes' labels and edges' labels

In [5]:
# node_labels = {
#     node: f"{node} mean:{attr['features'][0]:.3f} median:{attr['features'][1]} max:{attr['features'][2]} min:{attr['features'][3]}"
#     for node, attr in graph.nodes(data=True)
# }

# edge_labels = {
#     (u, v): {
#         "mean": attr["features"][0],
#         "median": attr["features"][1],
#         "max": attr["features"][2],
#         "min": attr["features"][3],
#         "range": attr["features"][4]
#     }
#     for u, v, attr in graph.edges(data=True)
# }

# display(edge_labels)
# display(node_labels)

# display(draw_networkx(graph,  with_labels = False))

## Complement Of Original Graph


In [6]:
graph_complement = nx.complement(graph)
add_features_to_graph(graph_complement)


In [7]:
# node_labels = {
#     node: f"{node} mean:{attr['features'][0]:.3f} median:{attr['features'][1]} max:{attr['features'][2]} min:{attr['features'][3]}"
#     for node, attr in graph_complement.nodes(data=True)
# }

# edge_labels = {
#     (u, v): {
#         "mean": attr["features"][0],
#         "median": attr["features"][1],
#         "max": attr["features"][2],
#         "min": attr["features"][3],
#         "range": attr["features"][4]
#     }
#     for u, v, attr in graph_complement.edges(data=True)
# }

# display(edge_labels)
# display(node_labels)

# display(draw_networkx(graph_complement,  with_labels=False))


# Step Two: Simulate Graph

In this step, I generated a random graph with same node and edge number of the original graph

In [10]:
random_seed = random.randint(1, 1000)
random.seed(random_seed)

simulated_graph = create_random_graph(len(graph.nodes), len(graph.edges))
add_features_to_graph(simulated_graph)
simulated_graph_div = np.sum(calculate_divergence(graph, simulated_graph))
print("simulated_graph_div", simulated_graph_div)

# simulated_graph_comp = nx.complement(simulated_graph)
# add_features_to_graph(simulated_graph_comp)
# simulated_graph_comp_div = np.sum(calculate_divergence(graph_complement, simulated_graph_comp))

# print("simulated_graph_comp", simulated_graph_comp_div)


simulated_graph_div 8311.068255204942


In [None]:

# # split the train and test dataset
# X_train, X_test,\
# 	y_train, y_test = train_test_split(X, y,
# 									test_size=0.20,
# 									random_state=23)
# # LogisticRegression
# clf = LogisticRegression(random_state=0)
# clf.fit(X_train, y_train)
# # Prediction
# y_pred = clf.predict(X_test)

# acc = accuracy_score(y_test, y_pred)
# print("Logistic Regression model accuracy (in %):", acc*100)
