In [35]:
import csv
import re
import time
import cProfile
from collections import defaultdict
import networkx as nx
import itertools
#import cProfile
#cProfile.run('create_mention_graph_with_centrality("twitter-small.tsv")')

def reachable_nodes(graph, start_node):
    """Efficiently counts reachable nodes in an undirected graph."""
    try:
        return len(list(nx.node_connected_component(graph, start_node)))
    except nx.NetworkXError:
        return 0

def neighborhood_lower_bound(graph):
    nodes = list(graph.nodes())
    n = len(nodes)
    lower_bounds = {}

    # CORRECTED: Precompute reachability information ONCE *BEFORE* the main loop
    reachable_nodes_dict = {node: reachable_nodes(graph, node) for node in nodes}
    print('1')

    k = 2
    max_iterations = 10  # Limit iterations to prevent excessive runtime.

    while k <= max_iterations:
        Y = {}
        S_un = {}
        nVisited = {}
        finished = {}

        # Initialization - use degrees for efficiency
        for s in nodes:
            degree_s = graph.degree(s)
            Y[(k - 1, s)] = degree_s
            S_un[(k - 1, s)] = degree_s
            nVisited[s] = degree_s + 1
            finished[s] = False

        nFinished = 0
        while nFinished < n:
            for s in nodes:
                if finished[s]:
                    continue
                r_v = reachable_nodes_dict[s]
                print('2')

                if k == 2:
                    Y[(k, s)] = sum(Y[(k - 1, w)] for w in graph.neighbors(s)) - graph.degree(s)
                elif k > 2:
                    Y[(k, s)] = sum(Y[(k - 1, w)] for w in graph.neighbors(s)) - Y.get((k - 2, s), 0) * (
                                graph.degree(s) - 1)
                else:
                    Y[(k, s)] = 0  # Should not happen

                nVisited[s] += Y.get((k - 1, s), 0)

                if nVisited[s] < r_v:
                    S_un[(k, s)] = S_un.get((k - 1, s), 0) + k * Y.get((k - 1, s), 0)
                else:
                    S_un[(k, s)] = S_un.get((k - 1, s), 0) + k * (r_v - (nVisited[s] - Y.get((k - 1, s), 0)))
                    nFinished += 1
                    finished[s] = True

        k += 1

    for v in nodes:
        r_v = reachable_nodes_dict[v]
        lower_bounds[v] = (n - 1) * S_un.get((k - 1, v), 0) / ((r_v - 1) ** 2) if r_v > 1 else 0

    return lower_bounds


def create_mention_graph_with_centrality(filepath):
    start_time = time.time()
    edge_weights = defaultdict(int)

    def process_chunk(chunk):
        for row in chunk:
            try:
                timestamp, user, tweet_text = row
                mentions = re.findall(r'@([a-zA-Z0-9_]+)', tweet_text)
                for mention in mentions:
                    edge_weights[(user, mention)] += 1
            except (ValueError, IndexError) as e:
                print(f"Error processing row: {row}. Skipping. Error: {e}")

    chunk_size = 100000
    current_chunk = []
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            current_chunk.append(row)
            if len(current_chunk) == chunk_size:
                process_chunk(current_chunk)
                current_chunk = []

    if current_chunk:
        process_chunk(current_chunk)

    G = nx.Graph()  # Create an undirected graph directly
    for (u, v), weight in edge_weights.items():
        G.add_edge(u, v, weight=weight)

    print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
    print("Calculating centrality...")
    lower_bounds = neighborhood_lower_bound(G)
    nx.set_node_attributes(G, lower_bounds, 'lower_centrality')

    end_time = time.time()
    total_time = end_time - start_time
    print(f"\nTotal time taken: {total_time:.4f} seconds")

    sorted_centralities = sorted(
        nx.get_node_attributes(G, 'lower_centrality').items(),
        key=lambda x: x[1],
        reverse=True
    )
    print("\nTop 10 nodes by lower centrality bound:")
    for node, centrality in sorted_centralities[:10]:
        print(f"Node: {node}, Lower Centrality: {centrality}")

    return G


if __name__ == "__main__":
    filepath = "twitter-small.tsv"  # Make sure this points to your file
    print("Creating graph...")
    graph_with_centrality = create_mention_graph_with_centrality(filepath)

Creating graph...
Graph created with 94861 nodes and 154719 edges.
Calculating centrality...


KeyboardInterrupt: 

In [1]:
import networkx as nx
import numpy as np
import re
import csv
from collections import defaultdict
from itertools import combinations

def memory_efficient_lin_closeness(G):
    """计算图中所有节点的Lin接近中心性，使用内存效率更高的方法。"""
    n = G.number_of_nodes()
    lin_centrality = {}
    for node in G.nodes():
        length = nx.single_source_shortest_path_length(G, node)
        reachable = len(length)
        total_distance = sum(length.values())
        if reachable > 1:
            lin_centrality[node] = (reachable - 1)**2 / ((n - 1) * total_distance)
        else:
            lin_centrality[node] = 0.0
    return lin_centrality

def top_k_closeness_centrality_nodes(G, k):
    """根据Lin接近中心性查找前k个节点。"""
    closeness_centrality = memory_efficient_lin_closeness(G)
    sorted_nodes = sorted(closeness_centrality.items(), key=lambda item: item[1], reverse=True)
    return [node for node, centrality_value in sorted_nodes[:k]]

def create_mention_graph(filepath):
    edge_weights = defaultdict(int)
    
    def process_chunk(chunk):
        for row in chunk:
            try:
                timestamp, user, tweet_text = row
                mentions = re.findall(r'@([a-zA-Z0-9_]+)', tweet_text)
                for mention in mentions:
                    edge_weights[(user, mention)] += 1
            except (ValueError, IndexError) as e:
                print(f"Error processing row: {row}. Skipping. Error: {e}")
    
    chunk_size = 100000  # 增大块大小以减少 I/O 操作
    current_chunk = []
    
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            current_chunk.append(row)
            if len(current_chunk) == chunk_size:
                process_chunk(current_chunk)
                current_chunk = []
    
    if current_chunk:
        process_chunk(current_chunk)
    
    G = nx.DiGraph()
    for (u, v), weight in edge_weights.items():
        G.add_edge(u, v, weight=weight)
    
    print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
    return G

if __name__ == "__main__":
    filepath = "twitter-small.tsv"
    print("Creating graph...")
    graph = create_mention_graph(filepath)
    k = 10

    if graph.number_of_nodes() == 0:
        print("The graph is empty. Please check your input file.")
    else:
        print("Calculating closeness centrality...")
        top_k_nodes = top_k_closeness_centrality_nodes(graph, k)
        print(f"接近中心性最高的{k}个节点：{top_k_nodes}")

Creating graph...
Graph created with 94861 nodes and 155574 edges.
Calculating closeness centrality...
接近中心性最高的10个节点：['tamaraschilling', 'americandream09', 'nachhi', 'teddy_salad', 'drjennifer', 'medic_ray', 'pcpitcrew', 'judismile', 'kellythomas1', 'june_prissydog']


In [None]:
import cProfile
cProfile.run('create_mention_graph_with_centrality("twitter-small.tsv")')

In [9]:
import csv
import re
import time
import cProfile
from collections import defaultdict
import networkx as nx
import itertools

def reachable_nodes_bfs(graph, start_node):
    """使用 deque 作为队列的高效 BFS 实现。"""
    if start_node not in graph:
        return 0

    visited = set()
    queue = deque([start_node])  # 使用 deque 作为队列
    while queue:
        node = queue.popleft()  # 使用 popleft() 高效地弹出队列元素
        if node not in visited:
            visited.add(node)
            neighbors = list(graph.neighbors(node))
            queue.extend(neighbor for neighbor in neighbors if neighbor not in visited)
    return len(visited)

def neighborhood_lower_bound(graph):
    nodes = list(graph.nodes())
    n = len(nodes)
    lower_bounds = {}
    reachable_nodes_dict = {node: reachable_nodes_bfs(graph, node) for node in nodes}

    k = 2
    max_iterations = 10
    print('2')

    # 预先计算所有节点的邻居
    neighbors_dict = {node: list(graph.neighbors(node)) for node in nodes}
    print('3')

    while k <= max_iterations:
        Y = {}
        S_un = {}
        nVisited = {}
        finished = {}

        for s in nodes:
            degree_s = graph.degree(s)
            Y[(k - 1, s)] = degree_s
            S_un[(k - 1, s)] = degree_s
            nVisited[s] = degree_s + 1
            finished[s] = False

        nFinished = 0
        while nFinished < n:
            for s in nodes:
                if finished[s]:
                    continue
                r_v = reachable_nodes_dict[s]

                if k == 2:
                    Y[(k, s)] = sum(Y[(k - 1, w)] for w in neighbors_dict[s]) - graph.degree(s) # 使用预先计算的邻居
                elif k > 2:
                    Y[(k, s)] = sum(Y[(k - 1, w)] for w in neighbors_dict[s]) - Y.get((k - 2, s), 0) * (
                                graph.degree(s) - 1)
                else:
                    Y[(k, s)] = 0

                nVisited[s] += Y.get((k - 1, s), 0)

                if nVisited[s] < r_v:
                    S_un[(k, s)] = S_un.get((k - 1, s), 0) + k * Y.get((k - 1, s), 0)
                else:
                    S_un[(k, s)] = S_un.get((k - 1, s), 0) + k * (r_v - (nVisited[s] - Y.get((k - 1, s), 0)))
                    nFinished += 1
                    finished[s] = True

        k += 1

    for v in nodes:
        r_v = reachable_nodes_dict[v]
        lower_bounds[v] = (n - 1) * S_un.get((k - 1, v), 0) / ((r_v - 1) ** 2) if r_v > 1 else 0

    return lower_bounds


def create_mention_graph_with_centrality(filepath):
    start_time = time.time()
    edge_weights = defaultdict(int)

    def process_chunk(chunk):
        for row in chunk:
            try:
                timestamp, user, tweet_text = row
                mentions = re.findall(r'@([a-zA-Z0-9_]+)', tweet_text)
                for mention in mentions:
                    edge_weights[(user, mention)] += 1
            except (ValueError, IndexError) as e:
                print(f"Error processing row: {row}. Skipping. Error: {e}")

    chunk_size = 100000
    current_chunk = []
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            current_chunk.append(row)
            if len(current_chunk) == chunk_size:
                process_chunk(current_chunk)
                current_chunk = []

    if current_chunk:
        process_chunk(current_chunk)

    G = nx.Graph()  # Create an undirected graph directly
    for (u, v), weight in edge_weights.items():
        G.add_edge(u, v, weight=weight)

    print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
    print("Calculating centrality...")
    lower_bounds = neighborhood_lower_bound(G)
    nx.set_node_attributes(G, lower_bounds, 'lower_centrality')

    end_time = time.time()
    total_time = end_time - start_time
    print(f"\nTotal time taken: {total_time:.4f} seconds")

    sorted_centralities = sorted(
        nx.get_node_attributes(G, 'lower_centrality').items(),
        key=lambda x: x[1],
        reverse=True
    )
    print("\nTop 10 nodes by lower centrality bound:")
    for node, centrality in sorted_centralities[:10]:
        print(f"Node: {node}, Lower Centrality: {centrality}")

    return G


if __name__ == "__main__":
    filepath = "twitter-small.tsv"  # Make sure this points to your file
    print("Creating graph...")
    cProfile.run('create_mention_graph_with_centrality("twitter-small.tsv")')

Creating graph...
Graph created with 94861 nodes and 154719 edges.
Calculating centrality...
         5205774151 function calls (5205774150 primitive calls) in 862.869 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
1680866949/1680866948  144.511    0.000  144.511    0.000 1310056601.py:21(<genexpr>)
        1    0.080    0.080  862.868  862.868 1310056601.py:24(neighborhood_lower_bound)
        1    0.001    0.001  862.470  862.470 1310056601.py:28(<dictcomp>)
        1    0.000    0.000  862.868  862.868 1310056601.py:83(create_mention_graph_with_centrality)
        1    0.036    0.036    0.068    0.068 1310056601.py:87(process_chunk)
     8212  405.942    0.049 1719.657    0.209 1310056601.py:9(reachable_nodes_bfs)
        1    0.000    0.000    0.000    0.000 <frozen codecs>:260(__init__)
        1    0.000    0.000    0.000    0.000 <frozen codecs>:309(__init__)
      711    0.000    0.000    0.002    0.000 <frozen cod

KeyboardInterrupt: 