<a href="https://colab.research.google.com/github/zeinabkamkar98/graph_simulation/blob/main/graph_simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step Zero

Loading datasets and datasets

## Loading Libraries

In [33]:
import random
import numpy as np
import pandas as pd

from scipy.special import  rel_entr

import networkx as nx
from networkx.drawing import draw_networkx

import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


## Importing datasets

**Load github datasets**

If you wanna use github datasets run below cell. This datasets are limited (10 datasets), They're just for test.

In [34]:
graphs_name = ['DHFR','BZR','COX2','AIDS','ENZYMES','DD','MUTAG','NCI1','PROTEINS_full','PTC_MR']

!git clone https://github.com/zeinabkamkar98/graph_simulation.git

fatal: destination path 'graph_simulation' already exists and is not an empty directory.


## Utils


In [35]:
def read_graph_file(path):
    G = nx.Graph()
    data = np.loadtxt(path, delimiter=',').astype(int)
    data_tuple = list(map(tuple, data))
    G.add_edges_from(data_tuple)
    return G


def create_random_graph(nodes_count, edges_count ):
    return nx.gnm_random_graph(nodes_count, edges_count)


def add_features_to_graph(G: nx.Graph):
    for node in G.nodes:
        neighbors = G.neighbors(node)
        degrees = [G.degree(neighbor) for neighbor in neighbors]
        min_degree = np.min(degrees) if degrees else 0
        max_degree = np.max(degrees) if degrees else 0
        mean_degree = np.mean(degrees) if degrees else 0
        median_degree = np.median(degrees) if degrees else 0
        G.nodes[node]["features"] = np.array(
            [
                mean_degree,
                median_degree,
                max_degree,
                min_degree,
            ]
        )
    for edge in G.edges:
        G.edges[edge]["features"] = np.zeros(5)
        G.edges[edge]["features"][:2] = (
            G.nodes[edge[0]]["features"][:2] + G.nodes[edge[1]]["features"][:2]
        )

        max_deg = max(G.nodes[edge[0]]["features"][2], G.nodes[edge[1]]["features"][2])
        min_deg = min(G.nodes[edge[0]]["features"][3], G.nodes[edge[1]]["features"][3])
        G.edges[edge]["features"][2] = max_deg
        G.edges[edge]["features"][3] = min_deg
        G.edges[edge]["features"][4] = max_deg - min_deg

def get_graph_features(graph):
    return np.concatenate(
            [np.array([v]) for v in nx.get_edge_attributes(graph, "features").values()]
        )
def calculate_divergence(g1, g2):
    epsilon = 1e-10

    g1_features_data = get_graph_features(g1)
    g2_features_data = get_graph_features(g2)

    feature_divergence = np.zeros(g1_features_data.shape[1])

    for i in range(5):
      data = g1_features_data[:, i] + g2_features_data[:, i]
      hist_range = (np.min(data), np.max(data))
      g1_hist, _ = np.histogram(g1_features_data[:, i], range = hist_range)
      g2_hist, _ = np.histogram(g2_features_data[:, i], range = hist_range)
      feature_divergence[i] = np.sum(
          rel_entr(g1_hist + epsilon / (np.sum(g1_hist) + epsilon) , g2_hist + epsilon / (np.sum(g2_hist) + epsilon) )
      )

    return feature_divergence

def convet_to_data_frame(data):
  column_labels = ["mean", "median", "max", "min", "range"]

  return pd.DataFrame(data, columns = column_labels)



# Step One: Calculate Features for Original Graph

## Original graph

Import original graph and calculate and features to its nodes and edges

In [22]:
selected_graph_name= graphs_name[6]
graph = read_graph_file('graph_simulation/DATASETS/MUTAG/MUTAG_A.txt')

add_features_to_graph(graph)

Some additional outputs for checking the nodes' labels and edges' labels

In [23]:
# node_labels = {
#     node: f"{node} mean:{attr['features'][0]:.3f} median:{attr['features'][1]} max:{attr['features'][2]} min:{attr['features'][3]}"
#     for node, attr in graph.nodes(data=True)
# }

# edge_labels = {
#     (u, v): {
#         "mean": attr["features"][0],
#         "median": attr["features"][1],
#         "max": attr["features"][2],
#         "min": attr["features"][3],
#         "range": attr["features"][4]
#     }
#     for u, v, attr in graph.edges(data=True)
# }

# display(edge_labels)
# display(node_labels)

# display(draw_networkx(graph,  with_labels = False))

## Complement Of Original Graph


In [24]:
graph_complement = nx.complement(graph)
add_features_to_graph(graph_complement)


In [25]:
# node_labels = {
#     node: f"{node} mean:{attr['features'][0]:.3f} median:{attr['features'][1]} max:{attr['features'][2]} min:{attr['features'][3]}"
#     for node, attr in graph_complement.nodes(data=True)
# }

# edge_labels = {
#     (u, v): {
#         "mean": attr["features"][0],
#         "median": attr["features"][1],
#         "max": attr["features"][2],
#         "min": attr["features"][3],
#         "range": attr["features"][4]
#     }
#     for u, v, attr in graph_complement.edges(data=True)
# }

# display(edge_labels)
# display(node_labels)

# display(draw_networkx(graph_complement,  with_labels=False))


# Step Two: Simulate Graph

In this step, I generated a random graph with same node and edge number of the original graph

In [26]:
random_seed = random.randint(1, 1000)
random.seed(random_seed)

simulated_graph = create_random_graph(len(graph.nodes), len(graph.edges))
add_features_to_graph(simulated_graph)
simulated_graph_div = np.sum(calculate_divergence(graph, simulated_graph))
print("simulated_graph_div", simulated_graph_div)

# simulated_graph_comp = nx.complement(simulated_graph)
# add_features_to_graph(simulated_graph_comp)
# simulated_graph_comp_div = np.sum(calculate_divergence(graph_complement, simulated_graph_comp))

# print("simulated_graph_comp", simulated_graph_comp_div)


simulated_graph_div 10948.307565220308


In [28]:
graph_feature_data_frame = convet_to_data_frame(get_graph_features(graph))
graph_feature_data_frame['label'] = 1
graph_feature_data_frame.describe()


Unnamed: 0,mean,median,max,min,range,label
count,3721.0,3721.0,3721.0,3721.0,3721.0,3721.0
mean,4.917495,4.786885,2.980113,1.719162,1.260951,1.0
std,0.447816,0.742706,0.15074,0.486794,0.522508,0.0
min,3.5,3.0,2.0,1.0,0.0,1.0
25%,4.666667,4.0,3.0,1.0,1.0,1.0
50%,4.833333,5.0,3.0,2.0,1.0,1.0
75%,5.333333,5.0,3.0,2.0,2.0,1.0
max,6.0,6.0,4.0,3.0,3.0,1.0


In [29]:
graph_compelement_feature_data_frame = convet_to_data_frame(get_graph_features(graph_complement))
graph_compelement_feature_data_frame['label'] = 0
graph_compelement_feature_data_frame.describe()


Unnamed: 0,mean,median,max,min,range,label
count,5676414.0,5676414.0,5676414.0,5676414.0,5676414.0,5676414.0
mean,6735.585,6736.0,3369.0,3366.0,2.999999,0.0
std,0.0004596008,0.0,0.0,0.001028107,0.001028107,0.0
min,6735.584,6736.0,3369.0,3366.0,2.0,0.0
25%,6735.585,6736.0,3369.0,3366.0,3.0,0.0
50%,6735.585,6736.0,3369.0,3366.0,3.0,0.0
75%,6735.585,6736.0,3369.0,3366.0,3.0,0.0
max,6735.587,6736.0,3369.0,3367.0,3.0,0.0


In [31]:
train_data_frame = pd.concat([graph_feature_data_frame,graph_compelement_feature_data_frame])
train_data_frame.describe()

Unnamed: 0,mean,median,max,min,range,label
count,5680135.0,5680135.0,5680135.0,5680135.0,5680135.0,5680135.0
mean,6731.176,6731.59,3366.795,3363.796,2.99886,0.0006550901
std,172.2132,172.2271,86.12415,86.07965,0.04647301,0.02558634
min,3.5,3.0,2.0,1.0,0.0,0.0
25%,6735.585,6736.0,3369.0,3366.0,3.0,0.0
50%,6735.585,6736.0,3369.0,3366.0,3.0,0.0
75%,6735.585,6736.0,3369.0,3366.0,3.0,0.0
max,6735.587,6736.0,3369.0,3367.0,3.0,1.0


In [32]:
simulated_graph_feature_data_frame = convet_to_data_frame(get_graph_features(simulated_graph))
simulated_graph_feature_data_frame.describe()

Unnamed: 0,mean,median,max,min,range
count,3721.0,3721.0,3721.0,3721.0,3721.0
mean,6.422467,6.24402,5.037356,1.483741,3.553615
std,1.426265,1.480058,1.501953,0.568261,1.53731
min,2.0,2.0,1.0,1.0,0.0
25%,5.666667,5.5,4.0,1.0,3.0
50%,6.5,6.0,5.0,1.0,4.0
75%,7.333333,7.0,6.0,2.0,4.0
max,12.2,12.0,10.0,3.0,9.0


In [None]:
X = train_data_frame.drop('label', axis=1)
y = train_data_frame['label']

# split the train and test dataset
X_train, X_test, y_train, y_test = train_test_split(train_data_frame, y, shuffle=True,test_size=0.1, random_state=23)
y_test.sum()

# LogisticRegression
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

# Prediction
y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("Logistic Regression model accuracy (in %):", acc*100)


Logistic Regression model accuracy (in %): 100.0
