In [2]:
import requests
import networkx as nx
from pathlib import Path

# Remote copies of the two graph files (both live under the same URL prefix)
BASE_URL = "https://nus-st5225.netlify.app/assignment_1/A3bk6832"
URL_NODES = f"{BASE_URL}/graph_nodes.txt"
URL_EDGES = f"{BASE_URL}/graph_edges.txt"

# Local fallbacks: first the working directory, then a data/ sub-directory
DATA_DIR = Path("data")
LOCAL_NODES = Path("graph_nodes.txt")
LOCAL_EDGES = Path("graph_edges.txt")
DATA_NODES = DATA_DIR / "graph_nodes.txt"
DATA_EDGES = DATA_DIR / "graph_edges.txt"

# Load nodes and edges: try URL -> local -> data/
def _load_text(label, url, local_paths, timeout=5):
    """Return the text of one graph file, preferring the URL over local copies.

    Parameters
    ----------
    label : str
        Word used in the progress and error messages ("nodes" or "edges").
    url : str
        Address tried first with ``requests.get``.
    local_paths : sequence of pathlib.Path
        Fallback files, tried in the given order when the download fails.
    timeout : float, optional
        Timeout in seconds handed to ``requests.get``.

    Returns
    -------
    str
        The file contents.

    Raises
    ------
    FileNotFoundError
        If the download fails and none of ``local_paths`` exists. The
        download error is attached as the exception's cause.
    """
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as err:
        # Only network/HTTP failures trigger the fallback; any other error is a
        # real bug and is allowed to propagate instead of being masked.
        url_error = err
    else:
        print(f"Loaded {label} from URL: {url}")
        return resp.text

    for path in local_paths:
        try:
            text = path.read_text(encoding="utf-8")
        except FileNotFoundError:
            continue  # try the next candidate location
        print(f"Loaded {label} from local path: {path}")
        return text

    tried = " or ".join(str(path) for path in local_paths)
    raise FileNotFoundError(
        f"Could not load {label} from URL or local files: {tried}"
    ) from url_error


nodes_text = _load_text("nodes", URL_NODES, (LOCAL_NODES, DATA_NODES))
edges_text = _load_text("edges", URL_EDGES, (LOCAL_EDGES, DATA_EDGES))

# Assemble the undirected graph; every node stores its integer 'age' attribute
G = nx.Graph()

# Node file: each data line holds two whitespace-separated integers (node id, age).
# Blank lines and lines starting with '#' are ignored.
for raw in nodes_text.splitlines():
    record = raw.strip()
    if record and not record.startswith("#"):
        nid, age = (int(token) for token in record.split())
        G.add_node(nid, age=age)

# Edge file: each data line holds the two integer endpoints of one edge.
for raw in edges_text.splitlines():
    record = raw.strip()
    if record and not record.startswith("#"):
        u, v = (int(token) for token in record.split())
        G.add_edge(u, v)

# Should give "Graph with 1000 nodes and 25087 edges"
print(G)

Loaded nodes from URL: https://nus-st5225.netlify.app/assignment_1/A3bk6832/graph_nodes.txt
Loaded edges from URL: https://nus-st5225.netlify.app/assignment_1/A3bk6832/graph_edges.txt
Graph with 1000 nodes and 25087 edges


In [3]:
# Basic size statistics of G
n = G.number_of_nodes()
m = G.number_of_edges()
# Mean degree: sum of all node degrees divided by the number of nodes
avg_deg = sum(degree for _node, degree in G.degree()) / n
density = nx.density(G)
is_directed = G.is_directed()

# Truth value of each statement (all range checks are strict on both sides)
a = n == 1000
b = 50 < avg_deg < 52
c = is_directed
d = 0.05 < density < 0.06

# Report the raw numbers, a blank line, then the four answers
print(
    f"n = {n}",
    f"m = {m}",
    f"average degree = {avg_deg}",
    f"density = {density}",
    f"is_directed = {is_directed}",
    "",
    "Statements truth values:",
    f"a. exactly 1000 nodes: {a}",
    f"b. average degree between 50 and 52: {b}",
    f"c. graph is directed: {c}",
    f"d. edge density between 0.05 and 0.06: {d}",
    sep="\n",
)


n = 1000
m = 25087
average degree = 50.174
density = 0.05022422422422423
is_directed = False

Statements truth values:
a. exactly 1000 nodes: True
b. average degree between 50 and 52: True
c. graph is directed: False
d. edge density between 0.05 and 0.06: True


In [4]:
import numpy as np
from collections import Counter

# Degrees of all nodes as an array, with their summary statistics
deg_seq = np.array([degree for _node, degree in G.degree()])
max_deg, min_deg = int(deg_seq.max()), int(deg_seq.min())
mean_deg = float(deg_seq.mean())
std_deg = float(deg_seq.std(ddof=0))

# Standardised 3rd and 4th central moments (population form; the kurtosis is
# the raw ratio, i.e. no 3 is subtracted). Both are 0.0 when the spread is zero.
if std_deg > 0:
    skewness = ((deg_seq - mean_deg) ** 3).mean() / (std_deg ** 3)
    kurtosis = ((deg_seq - mean_deg) ** 4).mean() / (std_deg ** 4)
else:
    skewness = 0.0
    kurtosis = 0.0

# Crude power-law check: R^2 of a straight-line fit of log(frequency) on
# log(degree), restricted to degrees > 0. Needs at least 3 distinct degrees.
deg_counts = Counter(deg_seq.tolist())
xs = np.array(sorted(k for k in deg_counts if k > 0))
ys = np.array([deg_counts[int(x)] for x in xs])
if len(xs) < 3:
    r2 = 0.0
else:
    logx, logy = np.log(xs), np.log(ys)
    coef = np.polyfit(logx, logy, 1)
    pred = np.polyval(coef, logx)
    ss_res = ((logy - pred) ** 2).sum()
    ss_tot = ((logy - logy.mean()) ** 2).sum()
    r2 = 1 - ss_res / ss_tot if ss_tot > 0 else 0.0

power_law_like = r2 > 0.9

# Bell-shape heuristic: nearly symmetric and not flagged as power-law
bell_shaped = (abs(skewness) < 0.5) and (not power_law_like)

# All nodes attaining the maximum degree; the age check uses the first of them
max_nodes = [node for node, degree in G.degree() if degree == max_deg]
top_node = max_nodes[0]
top_node_age = G.nodes[top_node].get('age')

# Statements
stmt_a = 100 < max_deg < 104  # NOTE: question a asks max degree in (100,104)
stmt_b = bell_shaped  # bell_shaped already requires "not power-law"
stmt_c = min_deg == 0
stmt_d = top_node_age is not None and top_node_age > 60

# Diagnostics, a blank line, then the answers
print(
    f"max degree = {max_deg}",
    f"min degree = {min_deg}",
    f"mean degree = {mean_deg:.4f}",
    f"std degree = {std_deg:.4f}",
    f"skewness = {skewness:.4f}",
    f"kurtosis = {kurtosis:.4f}",
    f"power-law fit R^2 (log-log) = {r2:.4f}",
    f"power_law_like (R^2>0.9) = {power_law_like}",
    f"bell_shaped heuristic = {bell_shaped}",
    sep="\n",
)
print()
print(
    "Statements truth values:",
    f"a. max degree strictly between 100 and 104: {stmt_a}",
    f"b. degree distribution approximately bell-shaped and not power-law: {stmt_b}",
    f"c. minimum degree is 0: {stmt_c}",
    f"d. node with highest degree is strictly older than 60: {stmt_d} (node {top_node} age={top_node_age})",
    sep="\n",
)


max degree = 83
min degree = 17
mean degree = 50.1740
std degree = 13.5489
skewness = -0.2573
kurtosis = 2.2439
power-law fit R^2 (log-log) = 0.0060
power_law_like (R^2>0.9) = False
bell_shaped heuristic = True

Statements truth values:
a. max degree strictly between 100 and 104: False
b. degree distribution approximately bell-shaped and not power-law: True
c. minimum degree is 0: False
d. node with highest degree is strictly older than 60: False (node 607 age=59)


In [5]:
# Assortativity of G with respect to the numeric 'age' attribute and to degree
degree_assortativity = nx.degree_assortativity_coefficient(G)
age_assortativity = nx.numeric_assortativity_coefficient(G, "age")

# Statements: a/b test the sign of the age coefficient, c/d test the degree one
stmt_a = age_assortativity < 0
stmt_b = age_assortativity > 0
stmt_c = degree_assortativity < 0
stmt_d = degree_assortativity > 0.26

# Coefficients, a blank line, then the answers
print(
    f"age assortativity = {age_assortativity}",
    f"degree assortativity = {degree_assortativity}",
    "",
    "Statements truth values:",
    f"a. numeric assortativity for 'age' is negative: {stmt_a}",
    f"b. graph is assortative by age (nodes of similar ages tend to connect): {stmt_b}",
    f"c. degree assortativity is negative (disassortative by degree): {stmt_c}",
    f"d. degree assortativity strictly greater than 0.26: {stmt_d}",
    sep="\n",
)


age assortativity = 0.41111515464800547
degree assortativity = 0.25720731284047177

Statements truth values:
a. numeric assortativity for 'age' is negative: False
b. graph is assortative by age (nodes of similar ages tend to connect): True
c. degree assortativity is negative (disassortative by degree): False
d. degree assortativity strictly greater than 0.26: False


In [6]:
# Restrict to the largest connected component before computing centralities
components = list(nx.connected_components(G))
largest_cc_nodes = max(components, key=len)
# .copy() turns the subgraph view into a standalone graph
L = G.subgraph(largest_cc_nodes).copy()

# Closeness and (normalized) betweenness for every node of L
closeness = nx.closeness_centrality(L)
betweenness = nx.betweenness_centrality(L, normalized=True)

closeness_values = closeness.values()
max_closeness = max(closeness_values)
min_closeness = min(closeness_values)
max_betweenness = max(betweenness.values())

# Every node attaining the maximum betweenness; the age check uses the first
top_b_nodes = [node for node, score in betweenness.items() if score == max_betweenness]
top_b_node = top_b_nodes[0]
top_b_age = L.nodes[top_b_node].get('age')

# Statements (the age range in b is inclusive at both ends)
stmt_a = max_closeness > 0.5
stmt_b = top_b_age is not None and 40 <= top_b_age <= 60
stmt_c = min_closeness < 0.1
stmt_d = max_betweenness > 0.0029

# Diagnostics, a blank line, then the answers
print(
    f"largest CC size = {L.number_of_nodes()} nodes, {L.number_of_edges()} edges",
    f"max closeness = {max_closeness:.6f}",
    f"min closeness = {min_closeness:.6f}",
    f"max betweenness = {max_betweenness:.6f}",
    f"top betweenness node(s) = {top_b_nodes}, age(s) = {[L.nodes[n].get('age', None) for n in top_b_nodes]}",
    sep="\n",
)
print()
print(
    "Statements truth values:",
    f"a. max closeness > 0.5: {stmt_a}",
    f"b. node with highest betweenness has age between 40 and 60: {stmt_b} (node {top_b_node} age={top_b_age})",
    f"c. min closeness < 0.1: {stmt_c}",
    f"d. max betweenness > 0.0029: {stmt_d}",
    sep="\n",
)


largest CC size = 1000 nodes, 25087 edges
max closeness = 0.514418
min closeness = 0.409091
max betweenness = 0.002934
top betweenness node(s) = [378], age(s) = [54]

Statements truth values:
a. max closeness > 0.5: True
b. node with highest betweenness has age between 40 and 60: True (node 378 age=54)
c. min closeness < 0.1: False
d. max betweenness > 0.0029: True


In [7]:
# Component structure of G, and a standalone copy of its largest component
components = list(nx.connected_components(G))
sizes = sorted(map(len, components), reverse=True)
largest_cc_nodes = max(components, key=len)
L = G.subgraph(largest_cc_nodes).copy()

# Transitivity (global clustering) and mean shortest-path length, both on L
global_clust = nx.transitivity(L)
avg_spl = nx.average_shortest_path_length(L)

# A component counts as 'significant' when it holds at least 5% of all nodes
total_n = G.number_of_nodes()
sig_thresh = 0.05 * total_n
significant_components = [size for size in sizes if size >= sig_thresh]

# Statements
stmt_a = 0.060 < global_clust < 0.062
stmt_b = avg_spl > 2
stmt_c = len(significant_components) > 1
stmt_d = avg_spl < 3

# Component diagnostics, then the two metrics, then the answers
print(
    f"total nodes = {total_n}",
    f"component sizes (top 10) = {sizes[:10]}",
    f"significant threshold (5% of n) = {sig_thresh}",
    f"significant components (size >= threshold) = {significant_components}",
    "",
    f"global clustering (transitivity) on largest CC = {global_clust:.6f}",
    f"average shortest path length on largest CC = {avg_spl:.6f}",
    "",
    "Statements truth values (assumption: 'significant' = >=5% of total nodes):",
    f"a. global clustering between 0.060 and 0.062: {stmt_a}",
    f"b. average shortest path length > 2: {stmt_b}",
    f"c. multiple disconnected components of significant size: {stmt_c}",
    f"d. average shortest path length < 3: {stmt_d}",
    sep="\n",
)


total nodes = 1000
component sizes (top 10) = [1000]
significant threshold (5% of n) = 50.0
significant components (size >= threshold) = [1000]

global clustering (transitivity) on largest CC = 0.062107
average shortest path length on largest CC = 2.069327

Statements truth values (assumption: 'significant' = >=5% of total nodes):
a. global clustering between 0.060 and 0.062: False
b. average shortest path length > 2: True
c. multiple disconnected components of significant size: False
d. average shortest path length < 3: True


In [8]:
# Age and degree correlation
# Lookup tables: node -> degree and node -> age (age is None when missing)
deg = dict(G.degree())
ages = {node: G.nodes[node].get('age') for node in G.nodes()}

# Nodes strictly younger than 20 and strictly older than 60
young_nodes = [node for node, age in ages.items() if age is not None and age < 20]
old_nodes = [node for node, age in ages.items() if age is not None and age > 60]

# Group means fall back to 0.0 when a group is empty
avg_deg_young = sum(deg[node] for node in young_nodes) / len(young_nodes) if young_nodes else 0.0
avg_deg_old = sum(deg[node] for node in old_nodes) / len(old_nodes) if old_nodes else 0.0

# Statement a
stmt_a = (avg_deg_young > avg_deg_old)
# Statement b
stmt_b = (avg_deg_old > 60)

# Statement c: no edges connecting nodes of age 10 and age 80
# The loop variables are deliberately not called `n` / `a`: a module-level loop
# would otherwise overwrite the notebook-wide `n` (node count) and `a`
# (statement a) defined by the basic-statistics cell.
age_to_nodes = {}
for node_id, node_age in ages.items():
    age_to_nodes.setdefault(node_age, []).append(node_id)
nodes_age10 = age_to_nodes.get(10, [])
nodes_age80 = age_to_nodes.get(80, [])
# Set gives O(1) membership tests; any() stops at the first matching edge
age80_lookup = set(nodes_age80)
exists_edge_10_80 = any(
    nbr in age80_lookup
    for src in nodes_age10
    for nbr in G.neighbors(src)
)
stmt_c = (not exists_edge_10_80)

# Statement d: average age of top 50 nodes by degree > 50
# sorted() is stable, so ties in degree keep the graph's node order
top50 = sorted(deg.items(), key=lambda item: item[1], reverse=True)[:50]
top50_nodes = [node for node, _ in top50]
top50_ages = [ages[node] for node in top50_nodes if ages.get(node) is not None]
avg_age_top50 = sum(top50_ages) / len(top50_ages) if top50_ages else 0.0
stmt_d = (avg_age_top50 > 50)

# Print diagnostics and answers in the same format as previous questions
print(f"# nodes with known age = {sum(1 for a in ages.values() if a is not None)}")
print(f"young (<20) count = {len(young_nodes)}, avg degree = {avg_deg_young:.4f}")
print(f"old (>60) count = {len(old_nodes)}, avg degree = {avg_deg_old:.4f}")
print(f"nodes age 10 count = {len(nodes_age10)}, nodes age 80 count = {len(nodes_age80)}")
print(f"top50 by degree considered = {len(top50_nodes)} (may be <50 if graph smaller)")
print(f"avg age of top50 by degree = {avg_age_top50:.4f}")

print()
print("Statements truth values:")
print(f"a. The average degree of nodes strictly younger than 20 is higher than that of nodes strictly older than 60: {stmt_a}")
print(f"b. The average degree of nodes strictly older than 60 is strictly larger than 60: {stmt_b}")
print(f"c. There are no edges connecting nodes of age 10 and age 80: {stmt_c}")
print(f"d. The average age of the top 50 nodes by degree is strictly greater than 50: {stmt_d}")


# nodes with known age = 1000
young (<20) count = 156, avg degree = 29.9167
old (>60) count = 284, avg degree = 60.4683
nodes age 10 count = 15, nodes age 80 count = 17
top50 by degree considered = 50 (may be <50 if graph smaller)
avg age of top50 by degree = 60.9400

Statements truth values:
a. The average degree of nodes strictly younger than 20 is higher than that of nodes strictly older than 60: False
b. The average degree of nodes strictly older than 60 is strictly larger than 60: True
c. There are no edges connecting nodes of age 10 and age 80: False
d. The average age of the top 50 nodes by degree is strictly greater than 50: True


In [9]:
import numpy as np

# Clustering vs configuration-model nulls
# Compare the transitivity of G with that of random graphs built from the same
# degree sequence. Each replicate gets its own fixed seed, so re-running the
# cell reproduces exactly the same simulated values.
N_SIMULATIONS = 100
CONFIG_MODEL_SEED = 5225  # arbitrary base seed; replicate i uses base + i

real_trans = nx.transitivity(G)

deg_seq = [d for _, d in G.degree()]
sim_trans = []
for i in range(N_SIMULATIONS):
    MG = nx.configuration_model(deg_seq, seed=CONFIG_MODEL_SEED + i)
    H = nx.Graph(MG)  # collapse multi-edges
    H.remove_edges_from(nx.selfloop_edges(H))  # remove self-loops
    sim_trans.append(nx.transitivity(H))

S = np.array(sim_trans)
mean_sim = float(S.mean())
std_sim = float(S.std(ddof=0))
max_sim = float(S.max())
min_sim = float(S.min())

# simple significance rule: real > mean + 2*std
significant_higher = (real_trans > mean_sim + 2 * std_sim)

# Statements
stmt_a = significant_higher
stmt_b = (abs(real_trans - mean_sim) < 0.02)
stmt_c = (mean_sim < 0.1)
stmt_d = (max_sim > real_trans)

# Print diagnostics and answers
print(f"real global clustering (transitivity) = {real_trans:.6f}")
print(f"simulated mean = {mean_sim:.6f}, std = {std_sim:.6f}, min = {min_sim:.6f}, max = {max_sim:.6f}")
print()
print("Statements truth values:")
print(f"a. The global clustering coefficient of the original graph G is significantly higher than the average of the 100 simulated coefficients: {stmt_a}")
print(f"b. The difference between the global clustering coefficient of G and the average of the simulated values is less than 0.02: {stmt_b}")
print(f"c. The average of the 100 simulated global clustering coefficients is less than 0.1: {stmt_c}")
print(f"d. Across the 100 simulations, the maximum observed global clustering coefficient is greater than the global clustering coefficient of the original graph G: {stmt_d}")


real global clustering (transitivity) = 0.062107
simulated mean = 0.053887, std = 0.000331, min = 0.053106, max = 0.054568

Statements truth values:
a. The global clustering coefficient of the original graph G is significantly higher than the average of the 100 simulated coefficients: True
b. The difference between the global clustering coefficient of G and the average of the simulated values is less than 0.02: True
c. The average of the 100 simulated global clustering coefficients is less than 0.1: True
d. Across the 100 simulations, the maximum observed global clustering coefficient is greater than the global clustering coefficient of the original graph G: False


In [10]:
# Snowball sample: start from three seed nodes and expand for three waves.
# A neighbor joins the sample only if both ages are known and differ by at most 5.
seeds = [1, 13, 15]
sample = set(seeds)
current_wave = list(seeds)

for wave in range(3):
    next_wave = []
    for u in current_wave:
        age_u = G.nodes[u].get('age')
        if age_u is None:
            # without an age for u, none of its neighbors can qualify
            continue
        for v in G.neighbors(u):
            age_v = G.nodes[v].get('age')
            if v not in sample and age_v is not None and abs(age_v - age_u) <= 5:
                sample.add(v)
                next_wave.append(v)
    # only nodes added in this wave are expanded in the next one
    current_wave = next_wave

sample = sorted(sample)

# Population-level reference values (nodes without an age are left out of the age mean)
deg = dict(G.degree())
pop_avg_deg = sum(deg.values()) / G.number_of_nodes()
pop_ages = [age for age in (G.nodes[node].get('age') for node in G.nodes()) if age is not None]
pop_avg_age = sum(pop_ages) / len(pop_ages) if pop_ages else 0.0

# Same quantities for the sample; means default to 0.0 on empty input
sample_degs = [deg[node] for node in sample]
sample_ages = [age for age in (G.nodes[node].get('age') for node in sample) if age is not None]
sample_avg_deg = sum(sample_degs) / len(sample_degs) if sample_degs else 0.0
sample_avg_age = sum(sample_ages) / len(sample_ages) if sample_ages else 0.0

# Statements
stmt_a = sample_avg_deg < pop_avg_deg
stmt_b = sample_avg_age < pop_avg_age
stmt_c = len(sample) < 50
# Heuristic for bias: sample and population mean ages differ by more than 0.5 years
stmt_d = abs(sample_avg_age - pop_avg_age) > 0.5

# Diagnostics, a blank line, then the answers
print(
    f"seed nodes = {seeds}",
    f"final sample size = {len(sample)}",
    f"sample nodes = {sample}",
    f"population average degree = {pop_avg_deg:.4f}",
    f"sample average degree = {sample_avg_deg:.4f}",
    f"population average age = {pop_avg_age:.4f}",
    f"sample average age = {sample_avg_age:.4f}",
    sep="\n",
)

print()
print(
    "Statements truth values:",
    f"a. The average degree of the nodes in the sample is lower than the average degree of the entire graph G: {stmt_a}",
    f"b. The average age of the nodes in the sample is lower than the average age of the entire graph G: {stmt_b}",
    f"c. The final sample contains fewer than 50 nodes: {stmt_c}",
    f"d. The average age in the sample is biased (|diff|>0.5yr): {stmt_d}",
    sep="\n",
)


seed nodes = [1, 13, 15]
final sample size = 391
sample nodes = [1, 7, 8, 13, 15, 20, 27, 28, 29, 31, 32, 34, 37, 39, 42, 44, 48, 56, 59, 61, 62, 66, 67, 70, 73, 76, 77, 83, 85, 87, 88, 89, 93, 99, 101, 102, 104, 111, 115, 118, 119, 121, 122, 127, 131, 132, 137, 138, 139, 140, 144, 146, 150, 151, 156, 161, 164, 167, 168, 170, 171, 172, 174, 175, 180, 183, 191, 193, 200, 201, 203, 204, 205, 207, 208, 210, 211, 213, 216, 218, 223, 225, 226, 228, 229, 231, 232, 234, 239, 243, 244, 246, 250, 257, 258, 259, 261, 262, 263, 264, 266, 268, 272, 273, 280, 281, 286, 293, 298, 304, 305, 306, 308, 309, 311, 312, 314, 320, 322, 325, 328, 331, 332, 334, 335, 338, 340, 345, 349, 351, 354, 357, 358, 360, 362, 364, 366, 368, 370, 382, 384, 389, 391, 393, 399, 400, 405, 406, 407, 409, 410, 412, 413, 416, 417, 420, 421, 423, 426, 427, 429, 434, 437, 440, 442, 444, 445, 451, 452, 455, 457, 468, 469, 470, 471, 475, 478, 482, 486, 488, 490, 491, 492, 494, 496, 500, 502, 506, 507, 510, 512, 513, 522, 528, 53