In [1]:
! pip install node2vec
! pip install gensim
! pip install networkx
! pip install matplotlib

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Installing collected packages: node2vec
Successfully installed node2vec-0.5.0


In [2]:
import networkx as nx
import matplotlib.pyplot as plt
from node2vec import Node2Vec

In [4]:
graphe=nx.read_graphml("database_formated_for_NetworkX.graphml")

In [20]:
node_to_print = 'n0'

# Print the node and its attributes
if node_to_print in graphe.nodes():
    print(f"Node {node_to_print} and its attributes:")
    print(graphe.nodes[node_to_print])
else:
    print(f"Node {node_to_print} does not exist in the graph.")


Node n0 and its attributes:
{'labels': ':Event', 'id': 'fireColorado2012', 'trecisid': 'TRECIS-CTIT-H-001', 'eventType': 'wildfire'}


In [44]:
import pandas as pd
node_classes = [graphe.nodes[node].get('eventType') for node in graphe.nodes()]
len(graphe.nodes())
print(node_classes[0])

node_class_counts = pd.Series(node_classes).value_counts()

print("Class Distribution:")
print(node_class_counts)

wildfire
Class Distribution:
earthquake    8
flood         7
typhoon       6
wildfire      5
shooting      5
bombing       3
Name: count, dtype: int64


In [39]:
import random
import networkx as nx
import pandas as pd

minority_classes = node_class_counts[node_class_counts < node_class_counts.max()].index
print(f"Minority Classes: {minority_classes}")

# Set a target number of nodes for each minority class (use the majority class size)
target_count = node_class_counts.max()

# Create a new graph for oversampling
new_graphe = graphe.copy()

# For each minority class, calculate how many additional nodes are needed
for minority_class in minority_classes:
    current_count = node_class_counts[minority_class]
    nodes_needed = target_count - current_count

    if nodes_needed > 0:
        print(f"Minority class '{minority_class}' needs {nodes_needed} nodes.")

        # List of nodes in the current minority class
        nodes_in_class = [node for node in graphe.nodes() if graphe.nodes[node].get('eventType') == minority_class]

        # Duplicate the nodes to achieve the target count
        nodes_to_add = []
        while len(nodes_to_add) < nodes_needed:
            nodes_to_add.extend(nodes_in_class)

        # Trim to exactly the number of nodes needed
        nodes_to_add = nodes_to_add[:nodes_needed]

        # Add duplicates to the new graph (create new nodes with the same attributes)
        for node in nodes_to_add:
            new_node = f"{node}_duplicate"  # New unique node name
            new_graphe.add_node(new_node, **graphe.nodes[node])  # Copy attributes
            # Add an edge to a random neighboring node (optional, to keep the graph connected)
            if len(list(graphe.neighbors(node))) > 0:
                new_graphe.add_edge(new_node, random.choice(list(graphe.neighbors(node))))

# Verify the updated class distribution in the new graph
new_node_classes = [new_graphe.nodes[node].get('eventType') for node in new_graphe.nodes()]
new_node_class_counts = pd.Series(new_node_classes).value_counts()

print("Updated Class Distribution in new graph:")
print(new_node_class_counts)


Class Distribution before oversampling:
earthquake    8
flood         7
typhoon       6
wildfire      5
shooting      5
bombing       3
Name: count, dtype: int64
Minority Classes: Index(['flood', 'typhoon', 'wildfire', 'shooting', 'bombing'], dtype='object')
Minority class 'flood' needs 1 nodes.
Minority class 'typhoon' needs 2 nodes.
Minority class 'wildfire' needs 3 nodes.
Minority class 'shooting' needs 3 nodes.
Minority class 'bombing' needs 5 nodes.
Updated Class Distribution in new graph:
wildfire      8
earthquake    8
flood         8
typhoon       8
shooting      8
bombing       6
Name: count, dtype: int64


In [42]:
len(new_graphe.nodes())

109639

In [5]:
import random

# Ensure reproducibility
random.seed(42)

# Get all edges and shuffle them
edges = list(graphe.edges())
random.shuffle(edges)

# Split edges into train (70%) and test (30%)
split_idx = int(0.7 * len(edges))
train_edges = edges[:split_idx]
test_edges = edges[split_idx:]

# Create the train graph
train_graph = nx.Graph()
train_graph.add_nodes_from(graphe.nodes(data=True))
train_graph.add_edges_from(train_edges)

# Create the test set (you won't actually create a graph from it, just keep the edge list)
test_set = test_edges

# Print results
print(f"Total edges: {len(edges)}")
print(f"Train edges: {len(train_edges)}")
print(f"Test edges: {len(test_set)}")


Total edges: 311654
Train edges: 218157
Test edges: 93497
