In [12]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import csv
import pandas as pd
import random
from random import choice
from random import sample



In [13]:
# grap init
G_dblp = nx.Graph()


# dblp.tsv
with open('data/dblp/com-dblp/out.com-dblp.tsv', 'r') as file:
    for line in file:
        source, target = line.strip().split(' ')
        G_dblp.add_edge(int(source), int(target))

num_edges = G_dblp.number_of_edges()
num_nodes = G_dblp.number_of_nodes()


print("number of directed edges in dblp.tsv:", num_edges)
print("number of directed nodes in dblp.tsv:", num_nodes)

number of directed edges in dblp.tsv: 1049866
number of directed nodes in dblp.tsv: 317080


## Creating a fraction of graph for teting

In [52]:
# Define the fraction of nodes to include in the subgraph 
fraction = 0.05

# Get a subset of nodes based on the fraction
subset_nodes = list(G_dblp.nodes())[:int(fraction * len(G_dblp))]

# Create a subgraph using the subset of nodes
subgraph_dblp = G_dblp.subgraph(subset_nodes)

# Get the number of edges and nodes in the subgraph
num_edges_sub = subgraph_dblp.number_of_edges()
num_nodes_sub = subgraph_dblp.number_of_nodes()

# Now you can use 'subgraph' for testing purposes
# For example, print the nodes and edges of the subgraph
print("Subgraph Nodes:", subgraph_dblp.nodes())
print("Subgraph Edges:", subgraph_dblp.edges())
print("-----" + "\n")
print("Number of edges in subgraph:", num_edges_sub)
print("Number of nodes in subgraph:", num_nodes_sub)


## Choosing random nodes for landmark

In [54]:
from random import choice

def generate_landmark_list(graph, num_landmarks=20):
    """
    Generate a list of random landmarks from the given graph.

    Parameters:
    - graph: NetworkX graph
    - num_landmarks: Number of landmarks to generate (default is 20)

    Returns:
    - landmark_list: List of random landmarks
    """
    landmark_list = []

    for i in range(num_landmarks):
        random_node = choice(list(graph.nodes()))
        if random_node not in landmark_list:
            landmark_list.append(random_node)

    return landmark_list

# Example usage:
num_landmarks = 20

landmark_list = generate_landmark_list(G_dblp, num_landmarks)
print("Landmark List:", landmark_list)


Landmark List: [281365, 268588, 197617, 14306, 116569, 101195, 108644, 21442, 261569, 265311, 57815, 227744, 226747, 293167, 129730, 156599, 243827, 9209, 43918, 37144]


## Calculating each node's degree

In [16]:
# Define the number of top nodes to print
N = 20

# Get the top nodes with the highest degree
top_nodes_degree = sorted(subgraph_dblp.nodes(), key=subgraph_dblp.degree, reverse=True)[:N]

# Print the degree for each of the top nodes
for node in top_nodes_degree:
    node_degree = subgraph_dblp.degree(node)
    print(f"The degree of node {node} is {node_degree}")


The degree of node 167 is 290
The degree of node 865 is 227
The degree of node 45 is 208
The degree of node 1448 is 191
The degree of node 1197 is 174
The degree of node 993 is 152
The degree of node 906 is 149
The degree of node 875 is 149
The degree of node 480 is 147
The degree of node 162 is 146
The degree of node 980 is 136
The degree of node 1270 is 128
The degree of node 1208 is 121
The degree of node 166 is 119
The degree of node 979 is 118
The degree of node 747 is 117
The degree of node 1206 is 117
The degree of node 765 is 116
The degree of node 1338 is 115
The degree of node 164 is 114


## Calculate each's node closeness centrality (approximation)

In [17]:
def approximate_closeness_centrality(graph, num_seeds=10):
    # Select a sample of random seed nodes
    seed_nodes = sample(graph.nodes(), num_seeds)

    # Initialize the closeness centrality dictionary
    approx_closeness_centrality = {node: 0.0 for node in graph.nodes()}

    # Perform BFS computations from each seed node
    for seed_node in seed_nodes:
        distances = nx.single_source_shortest_path_length(graph, seed_node)
        for node, distance in distances.items():
            approx_closeness_centrality[node] += distance

    # Normalize the closeness centrality by the number of seed nodes
    for node in graph.nodes():
        approx_closeness_centrality[node] /= num_seeds

    return approx_closeness_centrality

In [18]:
# Example usage on fraction of graph
subgraph_dblp = nx.subgraph(G_dblp, subset_nodes)  
num_seeds = 100  # Adjust as needed

# Calculate approximate closeness centrality for each node
approx_closeness_centrality = approximate_closeness_centrality(subgraph_dblp, num_seeds)

# Count the number of nodes with zero closeness centrality
num_nodes_with_zero_centrality = sum(1 for centrality in approx_closeness_centrality.values() if centrality == 0)

# Get the bottom nodes with the lowest approximate closeness centrality and non-zero centrality
bottom_nodes_closeness = [node for node in sorted(approx_closeness_centrality, key=approx_closeness_centrality.get) if approx_closeness_centrality[node] > 0][:N]

# Print the approximate closeness centrality for each of the bottom nodes
for node in bottom_nodes_closeness:
    centrality = approx_closeness_centrality[node]
    print(f"The approximate closeness centrality of node {node} is {centrality}")

# Print the count of nodes with zero closeness centrality
print(f"Number of nodes with zero closeness centrality: {num_nodes_with_zero_centrality}")

since Python 3.9 and will be removed in a subsequent version.
  seed_nodes = sample(graph.nodes(), num_seeds)


The approximate closeness centrality of node 6329 is 3.49
The approximate closeness centrality of node 45 is 3.5
The approximate closeness centrality of node 742 is 3.53
The approximate closeness centrality of node 1827 is 3.54
The approximate closeness centrality of node 2175 is 3.55
The approximate closeness centrality of node 1818 is 3.56
The approximate closeness centrality of node 906 is 3.57
The approximate closeness centrality of node 4101 is 3.57
The approximate closeness centrality of node 979 is 3.59
The approximate closeness centrality of node 1448 is 3.59
The approximate closeness centrality of node 167 is 3.6
The approximate closeness centrality of node 1197 is 3.6
The approximate closeness centrality of node 2016 is 3.61
The approximate closeness centrality of node 1823 is 3.62
The approximate closeness centrality of node 4306 is 3.62
The approximate closeness centrality of node 6319 is 3.62
The approximate closeness centrality of node 993 is 3.63
The approximate closenes

In [24]:
# Example usage on fraction of whole graph
num_seeds = 20  

# Calculate approximate closeness centrality for each node
approx_closeness_centrality = approximate_closeness_centrality(G_dblp, num_seeds)

# Count the number of nodes with zero closeness centrality
num_nodes_with_zero_centrality = sum(1 for centrality in approx_closeness_centrality.values() if centrality == 0)

# Get the bottom nodes with the lowest approximate closeness centrality and non-zero centrality
bottom_nodes_closeness = [node for node in sorted(approx_closeness_centrality, key=approx_closeness_centrality.get) if approx_closeness_centrality[node] > 0][:N]

# Print the approximate closeness centrality for each of the bottom nodes
for node in bottom_nodes_closeness:
    centrality = approx_closeness_centrality[node]
    print(f"The approximate closeness centrality of node {node} is {centrality}")

# Print the count of nodes with zero closeness centrality
print(f"Number of nodes with zero closeness centrality: {num_nodes_with_zero_centrality}")

since Python 3.9 and will be removed in a subsequent version.
  seed_nodes = sample(graph.nodes(), num_seeds)


The approximate closeness centrality of node 4306 is 4.25
The approximate closeness centrality of node 6329 is 4.35
The approximate closeness centrality of node 1823 is 4.4
The approximate closeness centrality of node 2486 is 4.4
The approximate closeness centrality of node 7227 is 4.4
The approximate closeness centrality of node 4721 is 4.4
The approximate closeness centrality of node 3345 is 4.45
The approximate closeness centrality of node 1818 is 4.45
The approximate closeness centrality of node 1827 is 4.45
The approximate closeness centrality of node 6319 is 4.45
The approximate closeness centrality of node 4357 is 4.45
The approximate closeness centrality of node 1500 is 4.45
The approximate closeness centrality of node 6318 is 4.45
The approximate closeness centrality of node 15304 is 4.45
The approximate closeness centrality of node 167 is 4.5
The approximate closeness centrality of node 21237 is 4.5
The approximate closeness centrality of node 6311 is 4.5
The approximate clos

## Calculate each's node betweeness centrality

In [29]:
def approximate_betweenness_centrality(graph, num_seeds=10):
    # Select a sample of random seed nodes
    seed_nodes = sample(graph.nodes(), num_seeds)

    # Initialize the betweenness centrality dictionary
    approx_betweenness_centrality = {node: 0.0 for node in graph.nodes()}

    # Perform BFS computations from each seed node and accumulate dependencies
    for seed_node in seed_nodes:
        paths = nx.single_source_shortest_path(graph, source=seed_node)
        dependencies = {node: 0 for node in graph.nodes()}

        for path in paths.values():
            for node in path[1:-1]:  # Exclude the source and target nodes
                dependencies[node] += 1

        # Accumulate betweenness centrality using dependencies
        for node in graph.nodes():
            if node != seed_node:
                approx_betweenness_centrality[node] += dependencies[node]

    # Normalize the betweenness centrality by the number of seed nodes
    for node in graph.nodes():
        approx_betweenness_centrality[node] /= num_seeds

    return approx_betweenness_centrality


In [30]:
# Example usage on a fraction of the whole graph
num_seeds = 20

# Calculate approximate betweenness centrality for each node
approx_betweenness_centrality = approximate_betweenness_centrality(G_dblp, num_seeds)

# Count the number of nodes with zero betweenness centrality
num_nodes_with_zero_centrality = sum(1 for centrality in approx_betweenness_centrality.values() if centrality == 0)

# Get the top nodes with the highest approximate betweenness centrality and non-zero centrality
top_nodes_between = [node for node in sorted(approx_betweenness_centrality, reverse=True, key=approx_betweenness_centrality.get) if approx_betweenness_centrality[node] > 0][:N]

# Print the approximate betweenness centrality for each of the top nodes
for node in top_nodes_between:
    centrality = approx_betweenness_centrality[node]
    print(f"The approximate betweenness centrality of node {node} is {centrality}")

# Print the count of nodes with zero betweenness centrality
print(f"Number of nodes with zero betweenness centrality: {num_nodes_with_zero_centrality}")

since Python 3.9 and will be removed in a subsequent version.
  seed_nodes = sample(graph.nodes(), num_seeds)


The approximate betweenness centrality of node 28647 is 15882.75
The approximate betweenness centrality of node 2722 is 15880.8
The approximate betweenness centrality of node 72773 is 15869.4
The approximate betweenness centrality of node 22230 is 15862.95
The approximate betweenness centrality of node 9657 is 15862.15
The approximate betweenness centrality of node 3512 is 15860.7
The approximate betweenness centrality of node 238023 is 15860.25
The approximate betweenness centrality of node 2721 is 15857.65
The approximate betweenness centrality of node 189944 is 15755.45
The approximate betweenness centrality of node 865 is 15726.85
The approximate betweenness centrality of node 18201 is 15704.3
The approximate betweenness centrality of node 67325 is 15390.3
The approximate betweenness centrality of node 135994 is 15251.5
The approximate betweenness centrality of node 125275 is 15139.8
The approximate betweenness centrality of node 36 is 14762.8
The approximate betweenness centrality

## Area of repulsion

In [31]:
def check_neighborhood(nodes_to_check, landmarks, graph, k=2):
    """
    Check if any node in nodes_to_check is within the k neighborhood of any landmark.

    Parameters:
        - nodes_to_check (list): List of nodes to check
        - landmarks (list): List of landmark nodes
        - graph (NetworkX graph): The graph
        - k (int): Neighborhood size (default is 2)

    Returns:
        - conflicting_nodes (list): List of nodes that are within the k neighborhood of any landmark
    """
    conflicting_nodes = []

    for landmark in landmarks:
        # Exclude nodes within the k neighborhood of the current landmark
        excluded_nodes = set(nx.single_source_shortest_path_length(graph, landmark, cutoff=k).keys())
        
        # Check for conflicts with nodes_to_check
        conflicts = set(nodes_to_check) & excluded_nodes

        # Add conflicting nodes to the list
        conflicting_nodes.extend(conflicts)

    return list(set(conflicting_nodes))




In [32]:
check_neighborhood(top_nodes_degree, landmark_list, G_dblp)

[993,
 865,
 164,
 166,
 167,
 1448,
 906,
 747,
 45,
 1197,
 979,
 980,
 1206,
 1270,
 1208,
 1338]

In [33]:
check_neighborhood(bottom_nodes_closeness, landmark_list, G_dblp)

[1827,
 4357,
 167,
 6318,
 6319,
 3345,
 4721,
 4306,
 2486,
 6326,
 6329,
 1818,
 1500,
 6333,
 3902,
 1823]

In [34]:
check_neighborhood(top_nodes_between, landmark_list, G_dblp)

[18201, 1428, 865, 36]

In [51]:
# Create a loop to generate random landmarks until there are no conflicts
while True:
    
    landmark_list = generate_landmark_list()
    # Check for conflicts between the set of landmarks for each centrality measure
    conflicts_degree = check_neighborhood(top_nodes_degree, landmark_list, G_dblp, 2)
    conflicts_closeness = check_neighborhood(bottom_nodes_closeness, landmark_list, G_dblp, 2)
    conflicts_between = check_neighborhood(top_nodes_between, landmark_list, G_dblp, 2)

    # If there are no conflicts for any centrality measure, break out of the loop
    if not (conflicts_degree or conflicts_closeness or conflicts_between):
        break

# Print the final set of landmarks
print("Landmarks:", landmark_list)


TypeError: generate_landmark_list() missing 1 required positional argument: 'graph'