# Final Exam (Python)

# Instructions
- This is an open internet exam. You can use any materials you like, but you are not allowed to communicate with other people during the exam.
- The cell below will load the network data you will use for the exam. You can run the cell to generate the data, but do not modify it.

In [1]:
# Load data and build a NetworkX graph with node attributes
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt 

# Read CSV files (relative to this notebook)
nodes_df = pd.read_csv("C:\\Users\\Li Yuxin\\Downloads\\graph_nodes.csv")
edges_df = pd.read_csv("C:\\Users\\Li Yuxin\\Downloads\\graph_edges.csv")

# Create a directed graph
G = nx.DiGraph()

# Add nodes first (ensures nodes without edges are included)
G.add_nodes_from(nodes_df["ID"])  # add all IDs as nodes

# Attach all node attributes keyed by node ID
attr_dict = nodes_df.set_index("ID").to_dict(orient="index")
nx.set_node_attributes(G, attr_dict)

# Add edges from the edges dataframe
G.add_edges_from(edges_df[["ID1", "ID2"]].itertuples(index=False, name=None))

# Minimal confirmation output
print(G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges")

# Print first few nodes to confirm
for node in list(G.nodes(data=True))[:5]:
    print(node)

# You should be seeing the following output:
# 120 nodes, 3583 edges
# (1, {'gender': 'female', 'class': 4, 'age': 17, 'GPA': 2.5})
# (2, {'gender': 'female', 'class': 6, 'age': 17, 'GPA': 3.31})
# (3, {'gender': 'female', 'class': 2, 'age': 17, 'GPA': 2.59})
# (4, {'gender': 'male', 'class': 1, 'age': 17, 'GPA': 3.85})
# (5, {'gender': 'female', 'class': 1, 'age': 16, 'GPA': 3.45})


120 nodes, 3583 edges
(1, {'gender': 'female', 'class': 4, 'age': 17, 'GPA': 2.5})
(2, {'gender': 'female', 'class': 6, 'age': 17, 'GPA': 3.31})
(3, {'gender': 'female', 'class': 2, 'age': 17, 'GPA': 2.59})
(4, {'gender': 'male', 'class': 1, 'age': 17, 'GPA': 3.85})
(5, {'gender': 'female', 'class': 1, 'age': 16, 'GPA': 3.45})


**1. Which of the following statements about the graph is/are true?**

a. The graph has exactly 3583 directed edges.

b. The edge density of the directed graph is between 0.25 and 0.27.

c. The average directed path length is between 2 and 3.

d. The directed graph is strongly connected.

In [2]:
# You can add your code here
import networkx as nx

def analyze_digraph(G: nx.DiGraph):
    # 1. Number of edges
    num_edges = G.number_of_edges()
    
    # 2. Edge density (for directed graph, includes both directions as possible edges)
    edge_density = nx.density(G)   # or nx.edge_density(G)
    
    # 3. Strongly connected?
    is_strongly_connected = nx.is_strongly_connected(G)
    
    # 4. Average directed path length
    #    Only defined if the directed graph is strongly connected.
    if is_strongly_connected:
        avg_directed_path_length = nx.average_shortest_path_length(G)
    else:
        avg_directed_path_length = None  # or handle it differently if you want
    
    return {
        "num_edges": num_edges,
        "edge_density": edge_density,
        "is_strongly_connected": is_strongly_connected,
        "avg_directed_path_length": avg_directed_path_length,
    }

# Example usage:
if __name__ == "__main__":

    stats = analyze_digraph(G)
    for k, v in stats.items():
        print(f"{k}: {v}")


num_edges: 3583
edge_density: 0.25091036414565826
is_strongly_connected: True
avg_directed_path_length: 1.780392156862745


**2. Which of the following statements about degrees in the directed graph is/are true?**

a. The maximum in-degree is greater than 50.

b. The node with the highest out-degree is in class 2.

c. The average out-degree of the graph is between 30 and 35.

d. The node's in-degree and its 'class' attribute are independent (do an ANOVA test, where the in-degree is the dependent variable and the class the categorical co-variate; use 0.05 as testing threshold).

In [3]:
# You can add your code here
from collections import defaultdict
from scipy.stats import f_oneway  # ANOVA

def analyze_degrees_and_class(G: nx.DiGraph, class_attr: str = "class"):
    # 1. Maximum in-degree
    in_degrees = dict(G.in_degree())
    max_in_degree = max(in_degrees.values()) if in_degrees else 0
    
    # 2. Node(s) with highest out-degree and their class
    out_degrees = dict(G.out_degree())
    if out_degrees:
        max_out_degree = max(out_degrees.values())
        # all nodes tied for max out-degree
        nodes_with_max_out = [n for n, d in out_degrees.items() if d == max_out_degree]
    else:
        max_out_degree = 0
        nodes_with_max_out = []
    
    # get their classes (may be multiple)
    node_class_mapping = {}
    for n in nodes_with_max_out:
        node_class_mapping[n] = G.nodes[n].get(class_attr, None)  # None if missing
    
    # 3. Average out-degree
    avg_out_degree = float(np.mean(list(out_degrees.values()))) if out_degrees else 0.0
    
    # 4. ANOVA: in-degree vs class
    #    Build groups: for each class, collect in-degrees of nodes in that class
    class_to_indegrees = defaultdict(list)
    for n, indeg in in_degrees.items():
        c = G.nodes[n].get(class_attr, None)
        if c is not None:
            class_to_indegrees[c].append(indeg)
    
    # Only keep classes with at least 2 nodes, otherwise ANOVA can be unstable
    groups = [vals for vals in class_to_indegrees.values() if len(vals) >= 2]
    
    if len(groups) >= 2:
        F_stat, p_val = f_oneway(*groups)
        anova_result = {
            "F_statistic": F_stat,
            "p_value": p_val,
            "independent": bool(p_val >= 0.05)  # True if we *fail* to reject independence at 5% level
        }
    else:
        anova_result = {
            "F_statistic": None,
            "p_value": None,
            "independent": None,
            "note": "Not enough classes with >=2 nodes to run ANOVA."
        }
    
    return {
        "max_in_degree": max_in_degree,
        "max_out_degree": max_out_degree,
        "nodes_with_max_out_degree_and_class": node_class_mapping,
        "avg_out_degree": avg_out_degree,
        "anova_indegree_vs_class": anova_result,
    }

analyze_degrees_and_class(G)


{'max_in_degree': 48,
 'max_out_degree': 50,
 'nodes_with_max_out_degree_and_class': {82: 2},
 'avg_out_degree': 29.858333333333334,
 'anova_indegree_vs_class': {'F_statistic': np.float64(3.057671943569375),
  'p_value': np.float64(0.012558954383250035),
  'independent': False}}

**3. Create an undirected graph by having an edge between two nodes only if both directed edges are present. Which of the following statements about attribute assortativity in the undirected graph is/are true?**

a. The graph is disassortative with respect to 'gender' (correlation bounded away from 0 by at least 0.1).

b. The graph is assortative with respect to 'age' (correlation bounded away from 0 by at least 0.1).

c. The degree assortativity of the graph is positive.

d. The graph is assortative with respect to 'GPA' (correlation bounded away from 0 by at least 0.1).


In [4]:
# You can add your code here

import networkx as nx
from networkx.algorithms import assortativity as nx_assort
import math

def directed_to_mutual_undirected(G: nx.DiGraph) -> nx.Graph:
    """
    Create an undirected graph H where an edge {u,v} exists
    iff both directed edges (u,v) and (v,u) exist in G.
    """
    H = nx.Graph()
    # copy nodes and their attributes
    H.add_nodes_from(G.nodes(data=True))
    
    mutual_edges = []
    for u, v in G.edges():
        # ensure we only add each undirected edge once
        if u < v and G.has_edge(v, u):
            mutual_edges.append((u, v))
    H.add_edges_from(mutual_edges)
    return H

def check_assortativity(H: nx.Graph, gender_attr="gender",
                        age_attr="age", gpa_attr="GPA",
                        bound=0.1):
    """
    Compute assortativity coefficients and check:
      - disassortative w.r.t gender (<= -bound)
      - assortative w.r.t age (>= bound)
      - assortative w.r.t GPA (>= bound)
      - degree assortativity positive (> 0)
    """
    # Gender: categorical attribute assortativity
    gender_r = nx_assort.attribute_assortativity_coefficient(H, gender_attr)
    
    # Age: numeric attribute assortativity
    age_r = nx_assort.numeric_assortativity_coefficient(H, age_attr)
    
    # GPA: numeric attribute assortativity
    gpa_r = nx_assort.numeric_assortativity_coefficient(H, gpa_attr)
    
    # Degree assortativity
    degree_r = nx_assort.degree_assortativity_coefficient(H)
    
    # Handle possible NaNs (no edges / no variation)
    def safe_check(r, pred):
        if r is None or isinstance(r, float) and math.isnan(r):
            return None  # cannot decide
        return pred(r)
    
    result = {
        "gender_assortativity": gender_r,
        "is_disassortative_gender": safe_check(gender_r, lambda r: r <= -bound),
        
        "age_assortativity": age_r,
        "is_assortative_age": safe_check(age_r, lambda r: r >= bound),
        
        "gpa_assortativity": gpa_r,
        "is_assortative_gpa": safe_check(gpa_r, lambda r: r >= bound),
        
        "degree_assortativity": degree_r,
        "is_degree_assortative_positive": safe_check(degree_r, lambda r: r > 0),
    }
    return result

# Example usage
if __name__ == "__main__":
    
    H = directed_to_mutual_undirected(G)
    assort_stats = check_assortativity(H)
    
    print("Mutual undirected graph edges:", H.edges())
    for k, v in assort_stats.items():
        print(f"{k}: {v}")


Mutual undirected graph edges: [(1, 10), (1, 14), (1, 16), (1, 21), (1, 26), (1, 27), (1, 47), (1, 49), (1, 50), (1, 56), (1, 58), (1, 59), (1, 60), (1, 63), (1, 64), (1, 88), (1, 89), (1, 97), (1, 98), (1, 101), (1, 103), (1, 107), (1, 108), (1, 110), (1, 120), (2, 13), (2, 15), (2, 16), (2, 18), (2, 24), (2, 25), (2, 26), (2, 46), (2, 48), (2, 53), (2, 60), (2, 67), (2, 78), (2, 79), (2, 80), (2, 81), (2, 95), (2, 102), (2, 116), (2, 118), (2, 119), (3, 7), (3, 8), (3, 9), (3, 12), (3, 19), (3, 29), (3, 40), (3, 45), (3, 47), (3, 48), (3, 50), (3, 58), (3, 61), (3, 64), (3, 69), (3, 70), (3, 71), (3, 73), (3, 77), (3, 82), (3, 84), (3, 88), (3, 89), (3, 94), (3, 96), (3, 104), (3, 106), (3, 108), (3, 109), (3, 110), (3, 119), (4, 5), (4, 11), (4, 14), (4, 23), (4, 34), (4, 36), (4, 53), (4, 54), (4, 83), (4, 87), (4, 117), (5, 8), (5, 23), (5, 32), (5, 37), (5, 50), (5, 53), (5, 83), (5, 92), (5, 107), (5, 117), (6, 10), (6, 12), (6, 21), (6, 22), (6, 28), (6, 35), (6, 43), (6, 45), 

**4. Create an undirected graph by having an edge between two nodes only if both directed edges are present. Which of the following statements is/are true after running the Louvain community detection algorithm on the undirected graph 100 times? (Use the Louvain method from the NetworkX package)**

a. The average modularity score across the 100 runs is greater than 0.3.

b. The most frequently observed number of communities is 6.

c. The average size of the largest community found is between 35 and 40.

d. The three nodes with the largest GPA are always assigned to the same community.


In [5]:
# You can add your code here

import networkx as nx
from networkx.algorithms.community import louvain_communities, modularity
from collections import Counter

def directed_to_mutual_undirected(G: nx.DiGraph) -> nx.Graph:
    """
    Create an undirected graph H where an edge {u,v} exists
    iff both directed edges (u,v) and (v,u) exist in G.
    """
    H = nx.Graph()
    # copy nodes + attributes
    H.add_nodes_from(G.nodes(data=True))
    
    mutual_edges = []
    for u, v in G.edges():
        # enforce ordering so we only consider each pair once
        if u < v and G.has_edge(v, u):
            mutual_edges.append((u, v))
    H.add_edges_from(mutual_edges)
    return H


def louvain_stats_on_mutual_graph(G: nx.DiGraph,
                                  n_runs: int = 100,
                                  gpa_attr: str = "GPA",
                                  weight: str | None = None):
    """
    1. Build mutual undirected graph from G.
    2. Run Louvain n_runs times on the undirected graph.
    3. Return:
       - average modularity score
       - most frequently observed number of communities
       - average size of the largest community
       - whether the three nodes with largest GPA are always
         in the same community.
    """
    H = directed_to_mutual_undirected(G)

    # --- Get the three nodes with largest GPA ---
    gpa_list = []
    for node, data in H.nodes(data=True):
        if gpa_attr in data and data[gpa_attr] is not None:
            gpa_list.append((node, data[gpa_attr]))
    # sort descending by GPA
    gpa_list.sort(key=lambda x: x[1], reverse=True)
    top3_nodes = [n for n, _ in gpa_list[:3]]

    if len(top3_nodes) < 3:
        raise ValueError("Graph must have at least 3 nodes with a GPA attribute.")

    modularities = []
    num_communities_list = []
    largest_community_sizes = []
    top3_always_same = True

    for run in range(n_runs):
        # different seed each run
        communities = louvain_communities(H, weight=weight, seed=run)

        # modularity for this partition
        Q = modularity(H, communities, weight=weight)
        modularities.append(Q)

        # number of communities
        num_communities_list.append(len(communities))

        # size of largest community
        largest_size = max(len(c) for c in communities)
        largest_community_sizes.append(largest_size)

        # map node -> community index for this run
        node_to_comm = {}
        for idx, comm in enumerate(communities):
            for node in comm:
                node_to_comm[node] = idx

        # check if the three top-GPA nodes are in the same community
        comm0 = node_to_comm.get(top3_nodes[0], None)
        same = (
            comm0 is not None and
            all(node_to_comm.get(n, None) == comm0 for n in top3_nodes)
        )
        if not same:
            top3_always_same = False

    # average modularity
    avg_modularity = sum(modularities) / len(modularities)

    # most frequently observed number of communities
    counts = Counter(num_communities_list)
    most_freq_num_comms, _ = counts.most_common(1)[0]

    # average size of largest community
    avg_largest_size = sum(largest_community_sizes) / len(largest_community_sizes)

    return {
        "average_modularity": avg_modularity,
        "most_frequent_num_communities": most_freq_num_comms,
        "average_largest_community_size": avg_largest_size,
        "top3_largest_GPA_nodes": top3_nodes,
        "top3_always_same_community": top3_always_same,
    }


# Example usage (toy graph)
if __name__ == "__main__":
    

    stats = louvain_stats_on_mutual_graph(G, n_runs=100, gpa_attr="GPA")
    for k, v in stats.items():
        print(f"{k}: {v}")


average_modularity: 0.3343562101482054
most_frequent_num_communities: 6
average_largest_community_size: 30.03
top3_largest_GPA_nodes: [68, 107, 97]
top3_always_same_community: False


**5. Which of the following statements about reciprocity in the directed graph is/are true?**

a. The overall reciprocity of the graph, defined as the ratio of the number of reciprocated edges to the total number of edges, is greater than 0.7.

b. Every node is involved in at least one mutual pair (that is, a pair (u,v) where both u->v and v->u exist).

c. There is a negative correlation between the in-degree and out-degree of nodes in the graph.

d. The number of mutually connected pairs is greater than the number of pairs with only one directed edge.

In [6]:
# You can add your code here

import networkx as nx
import numpy as np

def reciprocity_and_degree_stats(G: nx.DiGraph):
    m = G.number_of_edges()
    
    # --- Count mutual and single-direction pairs ---
    mutual_pairs = 0
    single_directed_pairs = 0
    nodes_in_mutual = set()
    
    # work on unordered node pairs by enforcing u < v
    for u, v in G.edges():
        if u < v:  # only handle each unordered pair once
            has_uv = True
            has_vu = G.has_edge(v, u)
        elif v < u:
            # (v,u) will be handled when we see that edge, skip here
            continue
        else:
            # self-loop; ignore for mutual-pair counting
            continue

        if has_uv and has_vu:
            mutual_pairs += 1
            nodes_in_mutual.add(u)
            nodes_in_mutual.add(v)
        elif has_uv or has_vu:
            single_directed_pairs += 1

    # number of reciprocated edges (each mutual pair has 2 directed edges)
    reciprocated_edges = 2 * mutual_pairs

    # overall reciprocity = reciprocated edges / total edges
    reciprocity = reciprocated_edges / m if m > 0 else float("nan")

    # --- Check if every node is in at least one mutual pair ---
    all_nodes_have_mutual = len(G) > 0 and len(nodes_in_mutual) == G.number_of_nodes()

    # --- Correlation between in-degree and out-degree ---
    in_degrees = []
    out_degrees = []
    for n in G.nodes():
        in_degrees.append(G.in_degree(n))
        out_degrees.append(G.out_degree(n))
    
    if len(G) >= 2 and len(set(in_degrees)) > 1 and len(set(out_degrees)) > 1:
        corr = float(np.corrcoef(in_degrees, out_degrees)[0, 1])
    else:
        corr = float("nan")  # not defined (too few nodes or zero variance)

    return {
        "reciprocity": reciprocity,
        "num_mutual_pairs": mutual_pairs,
        "num_single_direction_pairs": single_directed_pairs,
        "every_node_has_mutual_pair": all_nodes_have_mutual,
        "in_out_degree_correlation": corr,
    }

# Example usage
if __name__ == "__main__":

    stats = reciprocity_and_degree_stats(G)
    for k, v in stats.items():
        print(f"{k}: {v}")


reciprocity: 0.8311470834496232
num_mutual_pairs: 1489
num_single_direction_pairs: 277
every_node_has_mutual_pair: True
in_out_degree_correlation: 0.9457176422352114


**6. Create an undirected graph by having an edge between two nodes only if both directed edges are present. Which of the following statements about triangles and clustering in the undirected graph is/are true?**

a. The total number of triangles in the graph is greater than 5,000.

b. The global clustering coefficient (transitivity) of the graph is between 0.4 and 0.5.

c. The average clustering coefficient is smaller than the global clustering coefficient.

d. The node with the highest degree has a local clustering coefficient strictly bigger than 0.35.


In [7]:
# You can add your code here
import networkx as nx

def directed_to_mutual_undirected(G: nx.DiGraph) -> nx.Graph:
    """
    Create an undirected graph H where an edge {u, v} exists
    iff both directed edges (u -> v) and (v -> u) exist in G.
    """
    H = nx.Graph()
    # copy nodes and attributes
    H.add_nodes_from(G.nodes(data=True))

    mutual_edges = []
    for u, v in G.edges():
        # enforce an ordering so each unordered pair is handled once
        if u < v and G.has_edge(v, u):
            mutual_edges.append((u, v))
    H.add_edges_from(mutual_edges)
    return H


def triangle_and_clustering_stats(G: nx.DiGraph):
    """
    1. Build mutual undirected graph H from directed G.
    2. Compute:
       - total number of triangles
       - global clustering coefficient (transitivity)
       - average clustering coefficient
       - local clustering coefficient of node with highest degree
    """
    H = directed_to_mutual_undirected(G)

    # --- Total number of triangles ---
    # nx.triangles(H) returns dict {node: #triangles incident to node}
    tri_dict = nx.triangles(H)
    total_triangles = sum(tri_dict.values()) // 3  # each triangle counted 3 times

    # --- Global clustering coefficient (transitivity) ---
    global_clustering = nx.transitivity(H)

    # --- Average clustering coefficient ---
    avg_clustering = nx.average_clustering(H)

    # --- Node with highest degree & its local clustering ---
    if H.number_of_nodes() > 0:
        degrees = dict(H.degree())
        # pick node with max degree (tie-broken by smallest node id)
        max_deg_node = max(degrees, key=lambda n: (degrees[n], -int(n) if isinstance(n, int) else 0))
        local_clustering_max_deg = nx.clustering(H, max_deg_node)
    else:
        max_deg_node = None
        local_clustering_max_deg = None

    return {
        "total_triangles": total_triangles,
        "global_clustering_transitivity": global_clustering,
        "average_clustering": avg_clustering,
        "max_degree_node": max_deg_node,
        "local_clustering_max_degree_node": local_clustering_max_deg,
    }


# Example usage
if __name__ == "__main__":
    

    stats = triangle_and_clustering_stats(G)
    for k, v in stats.items():
        print(f"{k}: {v}")


total_triangles: 5190
global_clustering_transitivity: 0.3849767579863515
average_clustering: 0.3836126723443971
max_degree_node: 49
local_clustering_max_degree_node: 0.3488372093023256
