In [12]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import csv
import pandas as pd
import random
from random import choice
from random import sample



In [13]:
# grap init
G_dblp = nx.Graph()


# dblp.tsv
with open('data/dblp/com-dblp/out.com-dblp.tsv', 'r') as file:
    for line in file:
        source, target = line.strip().split(' ')
        G_dblp.add_edge(int(source), int(target))

num_edges = G_dblp.number_of_edges()
num_nodes = G_dblp.number_of_nodes()


print("number of directed edges in dblp.tsv:", num_edges)
print("number of directed nodes in dblp.tsv:", num_nodes)

number of directed edges in dblp.tsv: 1049866
number of directed nodes in dblp.tsv: 317080


## Creating a fraction of graph for teting

In [66]:
# Define the fraction of nodes to include in the subgraph 
fraction = 0.05

# Get a subset of nodes based on the fraction
subset_nodes = list(G_dblp.nodes())[:int(fraction * len(G_dblp))]

# Create a subgraph using the subset of nodes
subgraph_dblp = G_dblp.subgraph(subset_nodes)

# Get the number of edges and nodes in the subgraph
num_edges_sub = subgraph_dblp.number_of_edges()
num_nodes_sub = subgraph_dblp.number_of_nodes()

# subgraph info:
# print("Subgraph Nodes:", subgraph_dblp.nodes())
# print("Subgraph Edges:", subgraph_dblp.edges())
print("-----" + "\n")
print("Number of edges in subgraph:", num_edges_sub)
print("Number of nodes in subgraph:", num_nodes_sub)


-----

Number of edges in subgraph: 72769
Number of nodes in subgraph: 15854


## Choosing random nodes for landmark

In [54]:
def generate_landmark_list(graph, num_landmarks=20):
    """
    Generate a list of random landmarks from the given graph.

    Parameters:
    - graph: NetworkX graph
    - num_landmarks: Number of landmarks to generate (default is 20)

    Returns:
    - landmark_list: List of random landmarks
    """
    landmark_list = []

    for i in range(num_landmarks):
        random_node = choice(list(graph.nodes()))
        if random_node not in landmark_list:
            landmark_list.append(random_node)

    return landmark_list

# Example usage:
num_landmarks = 20

landmark_list = generate_landmark_list(G_dblp, num_landmarks)
print("Landmark List:", landmark_list)


Landmark List: [281365, 268588, 197617, 14306, 116569, 101195, 108644, 21442, 261569, 265311, 57815, 227744, 226747, 293167, 129730, 156599, 243827, 9209, 43918, 37144]


## Calculating each node's degree

In [57]:
# Define the number of top nodes to print
N = 20

# Get the top nodes with the highest degree
top_nodes_degree = sorted(G_dblp.nodes(), key=G_dblp.degree, reverse=True)[:N]

# Print the degree for each of the top nodes
for node in top_nodes_degree:
    node_degree = G_dblp.degree(node)
    print(f"The degree of node {node} is {node_degree}")


The degree of node 3336 is 343
The degree of node 3345 is 296
The degree of node 167 is 290
The degree of node 14690 is 269
The degree of node 13941 is 264
The degree of node 30095 is 244
The degree of node 13842 is 230
The degree of node 865 is 227
The degree of node 3298 is 225
The degree of node 13811 is 221
The degree of node 15326 is 219
The degree of node 3346 is 218
The degree of node 3326 is 218
The degree of node 1827 is 215
The degree of node 1833 is 215
The degree of node 13953 is 215
The degree of node 45 is 208
The degree of node 7227 is 207
The degree of node 2486 is 201
The degree of node 6319 is 201


## Calculate each's node closeness centrality (approximation)

In [58]:
def approximate_closeness_centrality(graph, num_seeds=10):
    # Select a sample of random seed nodes
    seed_nodes = sample(graph.nodes(), num_seeds)

    # Initialize the closeness centrality dictionary
    approx_closeness_centrality = {node: 0.0 for node in graph.nodes()}

    # Perform BFS computations from each seed node
    for seed_node in seed_nodes:
        distances = nx.single_source_shortest_path_length(graph, seed_node)
        for node, distance in distances.items():
            approx_closeness_centrality[node] += distance

    # Normalize the closeness centrality by the number of seed nodes
    for node in graph.nodes():
        approx_closeness_centrality[node] /= num_seeds

    return approx_closeness_centrality

In [59]:
# # Example usage on fraction of graph
# subgraph_dblp = nx.subgraph(G_dblp, subset_nodes)  
# num_seeds = 100  # Adjust as needed

# # Calculate approximate closeness centrality for each node
# approx_closeness_centrality = approximate_closeness_centrality(subgraph_dblp, num_seeds)

# # Count the number of nodes with zero closeness centrality
# num_nodes_with_zero_centrality = sum(1 for centrality in approx_closeness_centrality.values() if centrality == 0)

# # Get the bottom nodes with the lowest approximate closeness centrality and non-zero centrality
# bottom_nodes_closeness = [node for node in sorted(approx_closeness_centrality, key=approx_closeness_centrality.get) if approx_closeness_centrality[node] > 0][:N]

# # Print the approximate closeness centrality for each of the bottom nodes
# for node in bottom_nodes_closeness:
#     centrality = approx_closeness_centrality[node]
#     print(f"The approximate closeness centrality of node {node} is {centrality}")

# # Print the count of nodes with zero closeness centrality
# print(f"Number of nodes with zero closeness centrality: {num_nodes_with_zero_centrality}")

In [64]:
# Whole graph closeness
num_seeds = 20  

# Calculate approximate closeness centrality for each node
approx_closeness_centrality = approximate_closeness_centrality(G_dblp, num_seeds)

# Count the number of nodes with zero closeness centrality
num_nodes_with_zero_centrality = sum(1 for centrality in approx_closeness_centrality.values() if centrality == 0)

# Get the bottom nodes with the lowest approximate closeness centrality and non-zero centrality
bottom_nodes_closeness = [node for node in sorted(approx_closeness_centrality, key=approx_closeness_centrality.get) if approx_closeness_centrality[node] > 0][:N]

# Print the approximate closeness centrality for each of the bottom nodes
for node in bottom_nodes_closeness:
    centrality = approx_closeness_centrality[node]
    print(f"The approximate closeness centrality of node {node} is {centrality}")

# Print the count of nodes with zero closeness centrality
print(f"Number of nodes with zero closeness centrality: {num_nodes_with_zero_centrality}")

since Python 3.9 and will be removed in a subsequent version.
  seed_nodes = sample(graph.nodes(), num_seeds)


The approximate closeness centrality of node 167 is 4.65
The approximate closeness centrality of node 4306 is 4.7
The approximate closeness centrality of node 35572 is 4.7
The approximate closeness centrality of node 2021 is 4.75
The approximate closeness centrality of node 4807 is 4.75
The approximate closeness centrality of node 6341 is 4.75
The approximate closeness centrality of node 3330 is 4.75
The approximate closeness centrality of node 1972 is 4.75
The approximate closeness centrality of node 2175 is 4.75
The approximate closeness centrality of node 1830 is 4.8
The approximate closeness centrality of node 3336 is 4.8
The approximate closeness centrality of node 7430 is 4.8
The approximate closeness centrality of node 13842 is 4.8
The approximate closeness centrality of node 29720 is 4.8
The approximate closeness centrality of node 3300 is 4.8
The approximate closeness centrality of node 45 is 4.85
The approximate closeness centrality of node 2006 is 4.85
The approximate closen

## Calculate each's node betweeness centrality

In [61]:
def approximate_betweenness_centrality(graph, num_seeds=10):
    # Select a sample of random seed nodes
    seed_nodes = sample(graph.nodes(), num_seeds)

    # Initialize the betweenness centrality dictionary
    approx_betweenness_centrality = {node: 0.0 for node in graph.nodes()}

    # Perform BFS computations from each seed node and accumulate dependencies
    for seed_node in seed_nodes:
        paths = nx.single_source_shortest_path(graph, source=seed_node)
        dependencies = {node: 0 for node in graph.nodes()}

        for path in paths.values():
            for node in path[1:-1]:  # Exclude the source and target nodes
                dependencies[node] += 1

        # Accumulate betweenness centrality using dependencies
        for node in graph.nodes():
            if node != seed_node:
                approx_betweenness_centrality[node] += dependencies[node]

    # Normalize the betweenness centrality by the number of seed nodes
    for node in graph.nodes():
        approx_betweenness_centrality[node] /= num_seeds

    return approx_betweenness_centrality


In [62]:
# Whole graph betweenness
num_seeds = 20

# Calculate approximate betweenness centrality for each node
approx_betweenness_centrality = approximate_betweenness_centrality(G_dblp, num_seeds)

# Count the number of nodes with zero betweenness centrality
num_nodes_with_zero_centrality = sum(1 for centrality in approx_betweenness_centrality.values() if centrality == 0)

# Get the top nodes with the highest approximate betweenness centrality and non-zero centrality
top_nodes_between = [node for node in sorted(approx_betweenness_centrality, reverse=True, key=approx_betweenness_centrality.get) if approx_betweenness_centrality[node] > 0][:N]

# Print the approximate betweenness centrality for each of the top nodes
for node in top_nodes_between:
    centrality = approx_betweenness_centrality[node]
    print(f"The approximate betweenness centrality of node {node} is {centrality}")

# Print the count of nodes with zero betweenness centrality
print(f"Number of nodes with zero betweenness centrality: {num_nodes_with_zero_centrality}")

since Python 3.9 and will be removed in a subsequent version.
  seed_nodes = sample(graph.nodes(), num_seeds)


The approximate betweenness centrality of node 19741 is 15975.45
The approximate betweenness centrality of node 161387 is 15881.6
The approximate betweenness centrality of node 41719 is 15877.35
The approximate betweenness centrality of node 133007 is 15872.15
The approximate betweenness centrality of node 207952 is 15869.8
The approximate betweenness centrality of node 60061 is 15867.05
The approximate betweenness centrality of node 19972 is 15862.8
The approximate betweenness centrality of node 207430 is 15858.95
The approximate betweenness centrality of node 161386 is 15856.65
The approximate betweenness centrality of node 112139 is 15855.75
The approximate betweenness centrality of node 194301 is 15819.4
The approximate betweenness centrality of node 38607 is 15721.75
The approximate betweenness centrality of node 1197 is 15706.9
The approximate betweenness centrality of node 93135 is 15696.5
The approximate betweenness centrality of node 37940 is 15281.95
The approximate betweenne

## Area of repulsion

In [31]:
def check_neighborhood(nodes_to_check, landmarks, graph, k=2):
    """
    Check if any node in nodes_to_check is within the k neighborhood of any landmark.

    Parameters:
        - nodes_to_check (list): List of nodes to check
        - landmarks (list): List of landmark nodes
        - graph (NetworkX graph): The graph
        - k (int): Neighborhood size (default is 2)

    Returns:
        - conflicting_nodes (list): List of nodes that are within the k neighborhood of any landmark
    """
    conflicting_nodes = []

    for landmark in landmarks:
        # Exclude nodes within the k neighborhood of the current landmark
        excluded_nodes = set(nx.single_source_shortest_path_length(graph, landmark, cutoff=k).keys())
        
        # Check for conflicts with nodes_to_check
        conflicts = set(nodes_to_check) & excluded_nodes

        # Add conflicting nodes to the list
        conflicting_nodes.extend(conflicts)

    return list(set(conflicting_nodes))




In [65]:
# Create a loop to generate random landmarks until there are no conflicts
while True:
    
    landmark_list = generate_landmark_list(G_dblp)
    # Check for conflicts between the set of landmarks for each centrality measure
    conflicts_degree = check_neighborhood(top_nodes_degree, landmark_list, G_dblp, 2)
    conflicts_closeness = check_neighborhood(bottom_nodes_closeness, landmark_list, G_dblp, 2)
    conflicts_between = check_neighborhood(top_nodes_between, landmark_list, G_dblp, 2)

    # If there are no conflicts for any centrality measure, break out of the loop
    if not (conflicts_degree or conflicts_closeness or conflicts_between):
        break

# Print the final set of landmarks
print("Landmarks:", landmark_list)


Landmarks: [261089, 80272, 41075, 106958, 11958, 167282, 139864, 145453, 180338, 260548, 216110, 279046, 3703, 257204, 192959, 81757, 208892, 268068, 253142, 241438]
