In [2]:
import networkx as nx
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# =========================
# Dataset
# =========================
# One tuple per transit stop, in the column order given by _columns.
_columns = ("stop_id", "neighborhood", "avg_time_min", "peak_load_pax_per_hr")
_rows = [
    ("S1", "Downtown", 4.2, 620),
    ("S2", "Downtown", 5.1, 580),
    ("S3", "Midtown", 6.8, 400),
    ("S4", "Uptown", 9.2, 220),
    ("S5", "Riverside", 11.3, 150),
    ("S6", "Industrial", 12.5, 90),
    ("S7", "Midtown", 6.5, 700),
    ("S8", "Downtown", 8.9, 260),
    ("S9", "Uptown", 8.9, 250),
    ("S10", "Riverside", 10.8, 210),
]

# Transpose the row-wise records into the column-name -> list-of-values
# mapping that the DataFrame is built from.
data = {
    column: list(values)
    for column, values in zip(_columns, zip(*_rows))
}
df = pd.DataFrame(data)

# =========================
# Part A – Build Graph
# =========================
# Undirected graph with one node per stop id.
G = nx.Graph()
G.add_nodes_from(df['stop_id'])

# Connect every pair of stops that share a neighborhood. Groups are visited
# in order of first appearance (sort=False) and pairs are emitted in row order.
for _, area_stops in df.groupby('neighborhood', sort=False)['stop_id']:
    members = area_stops.tolist()
    for pos, first in enumerate(members):
        for second in members[pos + 1:]:
            G.add_edge(first, second)

# a) Count edges
num_edges = G.number_of_edges()
print("Number of edges:", num_edges)

# b) Highest degree stop
# Degree lookup keyed by stop id. When several stops tie for the maximum,
# max() keeps the first one it encounters, so only that stop is reported.
degrees = {stop: deg for stop, deg in G.degree()}
highest_stop = max(degrees, key=lambda stop: degrees[stop])
print("Highest degree stop:", highest_stop, "with degree", degrees[highest_stop])

# Handshake theorem check: the two printed numbers should be equal.
degree_total = sum(degrees.values())
print("Sum of degrees:", degree_total)
print("2 * edges:", 2 * num_edges)

# Shortest path S1 to S6
# Only the lookup itself sits in the try body; the success message is
# printed from the else branch.
try:
    path = nx.shortest_path(G, "S1", "S6")
except nx.NetworkXNoPath:
    print("No path exists between S1 and S6")
else:
    print("Shortest path S1–S6:", path)

# =========================
# Part B – Centrality & Eccentricity
# =========================
centrality = nx.degree_centrality(G)
print("\nDegree Centrality:")
for stop, score in centrality.items():
    print(stop, ":", round(score, 3))

# The graph may be disconnected, so eccentricity, center and periphery are
# all evaluated per connected component. Build the component subgraphs once
# and reuse them for both passes below.
component_graphs = [G.subgraph(members) for members in nx.connected_components(G)]

eccentricity = {}
for part in component_graphs:
    if len(part) > 1:
        eccentricity.update(nx.eccentricity(part))
    else:
        # Single-node component: its eccentricity is recorded as 0 directly.
        eccentricity.update(dict.fromkeys(part.nodes(), 0))


print("\nEccentricity:")
for stop, value in eccentricity.items():
    print(stop, ":", value)

# Identify center and periphery for each connected component
all_center = []
all_periphery = []
for part in component_graphs:
    if len(part) <= 1:
        # A lone node is listed as both center and periphery.
        lone_nodes = list(part.nodes())
        all_center.extend(lone_nodes)
        all_periphery.extend(lone_nodes)
        continue
    all_center.extend(nx.center(part))
    all_periphery.extend(nx.periphery(part))


print("Center nodes:", all_center)
print("Periphery nodes:", all_periphery)

# =========================
# Part C – Hub Label + Logistic Regression
# =========================
# Add degree column: each stop's degree in G, looked up by stop_id
df['degree'] = df['stop_id'].map(dict(G.degree()))

# Define hub label (>= 80th percentile)
threshold = np.percentile(df['degree'], 80)
df['hub'] = (df['degree'] >= threshold).astype(int)

print("\nUpdated Dataset with Hub label:\n", df)

# Features and target
X = df[['avg_time_min','peak_load_pax_per_hr']]
y = df['hub']

# Split (small dataset, so test size small).
# With this few rows an unstratified shuffle can put every hub on one side of
# the split: the classifier then either cannot be fitted (one class in the
# training set) or precision/recall fall back to the zero_division value
# (no hub in the test set). Stratify on the label so both sides keep the
# hub / non-hub mix. Stratification needs at least two members per class,
# so fall back to a plain shuffle when a class is rarer than that.
stratify_on = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=stratify_on
)

# Train logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Metrics (zero_division=0 reports 0 instead of warning when a score's
# denominator is empty)
print("\nLogistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=0))
print("Recall:", recall_score(y_test, y_pred, zero_division=0))

Number of edges: 6
Highest degree stop: S1 with degree 2
Sum of degrees: 12
2 * edges: 12
No path exists between S1 and S6

Degree Centrality:
S1 : 0.222
S2 : 0.222
S3 : 0.111
S4 : 0.111
S5 : 0.111
S6 : 0.0
S7 : 0.111
S8 : 0.222
S9 : 0.111
S10 : 0.111

Eccentricity:
S1 : 1
S2 : 1
S8 : 1
S3 : 1
S7 : 1
S4 : 1
S9 : 1
S5 : 1
S10 : 1
S6 : 0
Center nodes: ['S1', 'S2', 'S8', 'S3', 'S7', 'S4', 'S9', 'S5', 'S10', 'S6']
Periphery nodes: ['S1', 'S2', 'S8', 'S3', 'S7', 'S4', 'S9', 'S5', 'S10', 'S6']

Updated Dataset with Hub label:
   stop_id neighborhood  avg_time_min  peak_load_pax_per_hr  degree  hub
0      S1     Downtown           4.2                   620       2    1
1      S2     Downtown           5.1                   580       2    1
2      S3      Midtown           6.8                   400       1    0
3      S4       Uptown           9.2                   220       1    0
4      S5    Riverside          11.3                   150       1    0
5      S6   Industrial          12.5     