In [None]:
import pandas as pd

file_path = "/content/Student Survey - Jan.xlsx"  # Adjust if needed

# sheet_name=None asks pandas for every sheet, returned as {sheet name: DataFrame}.
sheet_dict = pd.read_excel(file_path, sheet_name=None)

# Non-network sheets. `.get` yields None when the workbook has no sheet of that name.
df_affiliations, df_participants, df_responses = (
    sheet_dict.get(sheet) for sheet in ("affiliations", "participants", "responses")
)

# One sheet per nomination network.
(df_friends, df_influential, df_feedback,
 df_more_time, df_advice, df_disrespect) = (
    sheet_dict.get(sheet)
    for sheet in (
        "net_0_Friends",
        "net_1_Influential",
        "net_2_Feedback",
        "net_3_MoreTime",
        "net_4_Advice",
        "net_5_Disrespect",
    )
)

df_school_activity = sheet_dict.get("net_affiliation_0_SchoolActivit")

In [None]:
import networkx as nx

#### Community detection

In [None]:
# Remove self-loops (where Source == Target)
df_disrespect = df_disrespect[df_disrespect["Source"] != df_disrespect["Target"]]

# Create a directed graph (DiGraph) with one edge Source -> Target per row.
# Iterating the two columns directly avoids iterrows(), which builds a full
# Series object for every row just to read two values from it.
G = nx.DiGraph()
for source, target in zip(df_disrespect["Source"], df_disrespect["Target"]):
    G.add_edge(source, target)

In [None]:
!pip install python-louvain



In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [None]:
!pip install deap

Collecting deap
  Downloading deap-1.4.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading deap-1.4.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m133.1/135.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deap
Successfully installed deap-1.4.3


In [None]:
def directed_modularity(G, communities):
    """
    Compute directed modularity (Leicht-Newman) for a given partition.

    Q = (1/m) * sum_{c in communities} sum_{i,j in c} [A_ij - (k_out(i) * k_in(j)) / m]

    Parameters:
      G: A NetworkX DiGraph (only number_of_edges, has_edge, out_degree and
         in_degree are used).
      communities: A list of sets, where each set contains the nodes in one community.

    Returns:
      Q: The modularity value as a float (0.0 for a graph without edges).
    """
    m = G.number_of_edges()
    if m == 0:
        return 0.0

    total = 0.0
    for community in communities:
        # Look every degree up once per community instead of once per (i, j)
        # pair; the degrees do not change while the pairs are enumerated.
        members = list(community)
        out_degrees = [G.out_degree(node) for node in members]
        in_degrees = [G.in_degree(node) for node in members]

        # Same (i, j) visiting order and same per-term arithmetic as a plain
        # double loop over the community, so the floating-point sum is unchanged.
        for i, k_out_i in zip(members, out_degrees):
            for j, k_in_j in zip(members, in_degrees):
                a_ij = 1 if G.has_edge(i, j) else 0
                total += a_ij - (k_out_i * k_in_j) / m
    return total / m

def greedy_leicht_newman(G):
    """
    Greedy agglomerative optimisation of directed (Leicht-Newman) modularity.

    Every node starts as a singleton community. On each round all pairs of
    communities are tried, the merge with the largest strictly positive gain
    in directed modularity is applied, and the search stops as soon as no
    merge improves the score. Progress is printed after every merge.

    Parameters:
      G: A NetworkX DiGraph.

    Returns:
      communities: A list of sets, each set is a community of nodes.
    """
    communities = [{node} for node in G.nodes()]
    current_modularity = directed_modularity(G, communities)
    print("Initial modularity:", current_modularity)

    while True:
        best_delta = 0
        best_pair = None
        n_comms = len(communities)

        # Score every candidate merge against the current partition.
        for i in range(n_comms):
            for j in range(i + 1, n_comms):
                merged = communities[i] | communities[j]
                candidate = [c for k, c in enumerate(communities) if k != i and k != j]
                candidate.append(merged)
                delta = directed_modularity(G, candidate) - current_modularity
                # Strict comparison: among equal gains the first pair found wins.
                if delta > best_delta:
                    best_delta, best_pair = delta, (i, j, merged)

        # No improving merge left: the partition is final.
        if best_pair is None or not best_delta > 0:
            return communities

        i, j, merged = best_pair
        communities = [c for k, c in enumerate(communities) if k != i and k != j]
        communities.append(merged)
        current_modularity += best_delta
        print("Merged communities, new modularity:", current_modularity)

# Detect communities on the directed graph G with the greedy Leicht-Newman routine.
final_communities = greedy_leicht_newman(G)

# List every community with its members in sorted order.
print("\nFinal communities:")
for number, group in enumerate(final_communities):
    print(f"Community {number}: {sorted(group)}")

Initial modularity: -0.005886426592797783
Merged communities, new modularity: 0.007098337950138504
Merged communities, new modularity: 0.02008310249307479
Merged communities, new modularity: 0.03306786703601108
Merged communities, new modularity: 0.046052631578947366
Merged communities, new modularity: 0.05886426592797784
Merged communities, new modularity: 0.07167590027700832
Merged communities, new modularity: 0.0844875346260388
Merged communities, new modularity: 0.09729916897506925
Merged communities, new modularity: 0.11011080332409973
Merged communities, new modularity: 0.12292243767313019
Merged communities, new modularity: 0.13573407202216065
Merged communities, new modularity: 0.1485457063711911
Merged communities, new modularity: 0.16135734072022156
Merged communities, new modularity: 0.174168975069252
Merged communities, new modularity: 0.18680747922437665
Merged communities, new modularity: 0.19944598337950134
Merged communities, new modularity: 0.21191135734072017
Merged c

In [None]:
import networkx as nx
from sklearn.preprocessing import minmax_scale

# ---------- 1. Degree-based bully score  ----------
def bully_score_degree(subg, weight=None):
    """Score each node as (in-degree - out-degree) within `subg`.

    Parameters:
      subg:   graph exposing in_degree(weight=...), out_degree(weight=...) and nodes().
      weight: edge attribute forwarded to both degree calls (None = unweighted).

    Returns:
      dict mapping node -> nominations received minus nominations made;
      a positive value marks a likely bully.
    """
    received = dict(subg.in_degree(weight=weight))   # nominations received
    made = dict(subg.out_degree(weight=weight))      # nominations made

    scores = {}
    for node in subg.nodes():
        scores[node] = received[node] - made[node]
    return scores

# ---------- 2. PageRank on the graph as given (edges are NOT reversed) ----------
def bully_score_pagerank(subg, weight=None, alpha=0.85):
    """Return nx.pagerank scores for `subg`, keyed by node.

    Parameters:
      subg:   graph passed straight to nx.pagerank.
      weight: edge attribute name forwarded as `weight` (None = unweighted).
      alpha:  damping factor forwarded as `alpha`.
    """
    # NOTE(review): earlier comments here said the edges are reversed first, but
    # the graph is passed to nx.pagerank unchanged. Unreversed PageRank rewards
    # nodes that receive edges, which matches bully_score_degree treating a high
    # in-degree as the bully signal. Confirm the intended edge direction before
    # adding a subg.reverse() here.
    return nx.pagerank(subg, alpha=alpha, weight=weight)

# ---------- 3. Combine & pick top candidate ----------
def combine_scores(deg_dict, pr_dict, w_deg=0.6, w_pr=0.4):
    """Blend degree-based and PageRank-based scores into one score per node.

    Both score sets are passed through minmax_scale independently and then
    mixed as w_deg * degree + w_pr * pagerank.

    Parameters:
      deg_dict: node -> degree-based score; its keys define the node set and order.
      pr_dict:  node -> PageRank score; must contain every key of deg_dict.
      w_deg:    weight of the scaled degree score.
      w_pr:     weight of the scaled PageRank score.

    Returns:
      dict mapping node -> combined score.
    """
    nodes = list(deg_dict)
    deg_scaled = minmax_scale([deg_dict[node] for node in nodes])
    pr_scaled = minmax_scale([pr_dict[node] for node in nodes])

    combined = {}
    for node, deg_value, pr_value in zip(nodes, deg_scaled, pr_scaled):
        combined[node] = w_deg * deg_value + w_pr * pr_value
    return combined

def community_subgraphs(G, communities):
    """Yield (community id, copied induced subgraph) for each community.

    Ids are the 0-based positions of the communities in `communities`; each
    subgraph is G.subgraph(nodes).copy().
    """
    cid = 0
    for nodes in communities:
        yield cid, G.subgraph(nodes).copy()
        cid += 1

def bully_candidates(G_d, communities, weight=None):
    """Rank the members of every community by combined bully score.

    Parameters:
      G_d:         directed graph the communities were detected on.
      communities: iterable of node collections, one per community.
      weight:      edge attribute forwarded to both scoring functions.

    Returns:
      dict mapping community id -> {"primary_bully": top-scoring node,
      "ranking": list of (node, score) sorted by descending score}.
    """
    results = {}
    for cid, subg in community_subgraphs(G_d, communities):
        combo = combine_scores(
            bully_score_degree(subg, weight),
            bully_score_pagerank(subg, weight),
        )
        ranked = sorted(combo.items(), key=lambda item: item[1], reverse=True)
        results[cid] = {
            "primary_bully": max(combo, key=combo.get),  # top candidate
            "ranking": ranked,
        }
    return results

In [None]:
# Score every detected community and report its top-ranked candidate.
bully_info = bully_candidates(G, final_communities)

for community_id, details in bully_info.items():
    print(f"Community {community_id}: primary bully → {details['primary_bully']}")

Community 0: primary bully → 32536
Community 1: primary bully → 32485
Community 2: primary bully → 32414
Community 3: primary bully → 32405
Community 4: primary bully → 32422
Community 5: primary bully → 32455
Community 6: primary bully → 32466
Community 7: primary bully → 32500
Community 8: primary bully → 32441


In [None]:
import networkx as nx

# Inputs from the cells above:
#   `final_communities` - list of sets of student ids
#   `bully_info`        - {community_id: {"primary_bully": id, "ranking": [...]}}

# 1. Reverse lookup: student id -> index of the community containing it
student_to_community = {
    student: cid
    for cid, members in enumerate(final_communities)
    for student in members
}

# 2. Map each primary bully to the other members of their community
bully_dict = {}
for cid, info in bully_info.items():
    primary_bully = info["primary_bully"]
    victims = [sid for sid in final_communities[cid] if sid != primary_bully]
    bully_dict[primary_bully] = victims

# 3. Victim subgraph: every student in any community, minus the primary bullies.
#    (`all_victims` holds *all* community members until the bullies are removed.)
all_bullies = set(bully_dict)
all_victims = set().union(*final_communities)
victim_nodes = all_victims - all_bullies

victim_graph = G.subgraph(victim_nodes).copy()

#### GPA Regression

In [None]:
import torch, numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from torch_geometric.data import HeteroData
from torch_geometric.nn import HeteroConv, GATConv
from pathlib import Path

# ╒══════════════════════════════════════════════════════════════════════╕
# 1. Load workbook & node table
# ╘══════════════════════════════════════════════════════════════════════╛
file_path = "/content/Student Survey - Jan.xlsx"
sheets = pd.read_excel(file_path, sheet_name=None)   # {sheet name: DataFrame}

# Left-join survey responses onto participants; response columns whose names
# clash with participant columns get the "_resp" suffix. Rows without a
# Perc_Academic value are dropped so that only labelled nodes remain.
df_nodes = pd.merge(
    sheets["participants"],
    sheets["responses"],
    on="Participant-ID",
    how="left",
    suffixes=("", "_resp"),
).dropna(subset=["Perc_Academic"])

# ╒══════════════════════════════════════════════════════════════════════╕
# 2. Tabular preprocessing ➜ numpy feature matrix
# ╘══════════════════════════════════════════════════════════════════════╛
# 2-a.  Label-aware normalisation  (z-score within House)
df_nodes["House"] = df_nodes["House"].astype("category")

# `observed` is passed explicitly: grouping on a categorical key without it
# raises a pandas FutureWarning about the changing default.
house_groups = df_nodes.groupby("House", observed=True)["Perc_Academic"]
mu  = house_groups.transform("mean")
# A House with a single labelled student has an undefined (NaN) sample std,
# which clip() leaves as NaN. Treat it like a zero-variance House so the
# scaled target stays finite (it becomes 0 because the value equals the mean).
std = house_groups.transform("std").fillna(0.0).clip(lower=1e-6)

y_raw = df_nodes["Perc_Academic"].to_numpy(dtype="float32")        # keep raw
y     = ((y_raw - mu) / std).to_numpy(dtype="float32")             # scaled

# 2-b.  *now* drop the target so it isn’t used as an input feature
df_nodes = df_nodes.drop(columns=["Perc_Academic"])
df_nodes = df_nodes.dropna(axis=1, how="all")             # drop empty columns

# The participant identifier is a join key, not a predictor. It is still a
# column of df_nodes (needed later to map edges onto nodes), so it must be
# excluded here or it would be scaled / one-hot encoded and fed to the model.
# NOTE(review): removing it changes X, so previously recorded metrics no
# longer correspond to this feature set.
id_col = "Participant-ID"
feature_frame = df_nodes.drop(columns=[id_col], errors="ignore")

num_cols = feature_frame.select_dtypes(["int64", "float64"]).columns
cat_cols = feature_frame.select_dtypes(["object", "category", "bool"]).columns

pre = ColumnTransformer([
    # numeric: median-impute, then standardise
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("std", StandardScaler())
    ]), num_cols),

    # categorical: mode-impute, then dense one-hot
    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        # ↓ set sparse_output=False (sklearn ≥1.2) or sparse=False (≤1.1)
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ]), cat_cols)
])

# Only the listed columns are transformed, so passing the full frame is safe:
# columns not named in num_cols / cat_cols are dropped by ColumnTransformer's default.
X = pre.fit_transform(df_nodes).astype("float32")   # now a NumPy ndarray

# ╒══════════════════════════════════════════════════════════════════════╕
# 3. Build HeteroData with **six relations + reverse edges**
# ╘══════════════════════════════════════════════════════════════════════╛
# Row position in df_nodes (and therefore in X / y) for every participant id.
pid_arr = df_nodes["Participant-ID"].to_numpy()
pid2idx = {pid: i for i, pid in enumerate(pid_arr)}

data = HeteroData()
data["student"].x = torch.from_numpy(X)
data["student"].y = torch.from_numpy(y)

# relation name -> workbook sheet holding its (Source, Target) edge list
edge_sheets = {
    "friends"     : "net_0_Friends",
    "influential" : "net_1_Influential",
    "feedback"    : "net_2_Feedback",
    "moretime"    : "net_3_MoreTime",
    "advice"      : "net_4_Advice",
    "disrespect"  : "net_5_Disrespect"
}

for rel, sheet_name in edge_sheets.items():
    df_e = sheets[sheet_name][["Source", "Target"]].dropna()
    # keep only edges whose two endpoints are both labelled nodes
    mask = df_e["Source"].isin(pid_arr) & df_e["Target"].isin(pid_arr)
    src = df_e.loc[mask, "Source"].map(pid2idx).to_numpy()
    dst = df_e.loc[mask, "Target"].map(pid2idx).to_numpy()
    if len(src) == 0:             # skip empty relations
        continue
    # Stack into one (2, E) ndarray first: building a tensor from a Python
    # list of ndarrays is the slow path that torch warns about.
    ei = torch.tensor(np.array([src, dst]), dtype=torch.long)
    data["student", rel, "student"].edge_index = ei
    # add explicit reverse relation to aid message flow
    data["student", f"{rel}_rev", "student"].edge_index = ei.flip(0)

# ╒══════════════════════════════════════════════════════════════════════╕
# 4. Masks
# ╘══════════════════════════════════════════════════════════════════════╛
from sklearn.model_selection import StratifiedShuffleSplit
seed = 42
# Stratify on quartiles of the scaled target so every split sees the full range.
bins = pd.qcut(y, q=4, labels=False, duplicates="drop")
sss  = StratifiedShuffleSplit(n_splits=1, test_size=0.30, random_state=seed)
train_idx, tmp_idx = next(sss.split(np.arange(len(y)), bins))

# split the remaining 30 % in half (stratified)
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=seed)
# split() returns *positions into the array it was given* (here tmp_idx), not
# node ids. They must be mapped back through tmp_idx; using them directly
# would select nodes 0..len(tmp_idx)-1, most of which are training nodes.
val_pos, test_pos = next(sss2.split(tmp_idx, bins[tmp_idx]))
val_idx, test_idx = tmp_idx[val_pos], tmp_idx[test_pos]

# The three index sets must partition the nodes (no overlap, nothing missing).
all_idx = np.concatenate([train_idx, val_idx, test_idx])
assert len(all_idx) == len(y) and len(np.unique(all_idx)) == len(y), \
    "train/val/test indices must be disjoint and cover every node"

# build boolean masks
for name, idx_arr in [("train_mask", train_idx),
                      ("val_mask",   val_idx),
                      ("test_mask",  test_idx)]:
    mask = torch.zeros(data["student"].num_nodes, dtype=torch.bool)
    mask[idx_arr] = True
    data["student"][name] = mask

# ╒══════════════════════════════════════════════════════════════════════╕
# 5. 2-layer Relational GAT
# ╘══════════════════════════════════════════════════════════════════════╛
# Seed NumPy, then torch (CPU), then torch (all CUDA devices) with one value.
seed = 42
for _seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)

class RGAT(torch.nn.Module):
    """Relational GAT regressor over the "student" node type.

    Architecture: Linear(in_dim -> hid) + ReLU, then `num_layers` HeteroConv
    layers (one GATConv per edge type, averaged across relations), each
    followed by ReLU and dropout, then Linear(out_dim * heads -> 1).

    Parameters:
      metadata:     (node_types, edge_types) as returned by HeteroData.metadata();
                    only the edge types (metadata[1]) are used.
      in_dim:       number of input features per student.
      hid:          width of the input projection.
      heads:        attention heads per GATConv (outputs are concatenated).
      dropout:      dropout applied to node embeddings after every conv layer.
      out_dim:      per-head output width of every GATConv.
      num_layers:   number of HeteroConv layers.
      attn_dropout: dropout passed to GATConv itself.
    """

    def __init__(self, metadata, in_dim, hid=64, heads=3, dropout=0.1,
                 out_dim=32, num_layers=2, attn_dropout=0.2):
        super().__init__()
        self.lin_in = torch.nn.Linear(in_dim, hid)

        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv_dict = {
                et: GATConv(
                    (-1, -1),              # input sizes are left for GATConv to infer
                    out_dim,
                    heads=heads,
                    concat=True,
                    dropout=attn_dropout,
                    add_self_loops=False,  # ← keep this
                )
                for et in metadata[1]
            }
            self.convs.append(HeteroConv(conv_dict, aggr="mean"))

        self.lin_out = torch.nn.Linear(out_dim * heads, 1)   # concat heads → 1
        self.dp = torch.nn.Dropout(dropout)

    def forward(self, data):
        """Return one prediction per student as a 1-D tensor."""
        x_dict = {"student": torch.relu(self.lin_in(data["student"].x))}
        for conv in self.convs:
            x_dict = conv(x_dict, data.edge_index_dict)
            x_dict = {k: torch.relu(v) for k, v in x_dict.items()}
            x_dict = {k: self.dp(v)    for k, v in x_dict.items()}
        # squeeze(-1) drops only the trailing size-1 feature axis; a bare
        # squeeze() would also collapse a single-node graph to a 0-d scalar.
        return self.lin_out(x_dict["student"]).squeeze(-1)

# Pick the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
data = data.to(device)

# Default-configuration model with its optimiser, plus the shared MSE loss.
n_features = data["student"].x.size(1)
model = RGAT(data.metadata(), in_dim=n_features).to(device)
opt = torch.optim.Adam(params=model.parameters(), lr=1.0e-3, weight_decay=5e-4)
loss_fn = torch.nn.MSELoss()

# NOTE: training below uses the House-normalised target `y` (stored in data["student"].y), not `y_raw`.

# ===================================================================
#  B.  tiny grid search: depth ∈ {2,3}, heads ∈ {2–6}, dropout ∈ {0–0.4}
# ===================================================================
hyper_grid = {
    "depth":   [2, 3],
    "heads":   [2, 3, 4, 5, 6],
    "dropout": [0.0, 0.1, 0.2, 0.3, 0.4],
}
best_val = float("inf")
best_cfg, best_state = None, None


# Defined once, outside the loops, with the searched hyper-parameters as
# constructor arguments (instead of re-declaring the class for every
# configuration and reading the loop variables through a closure).
class RGAT(torch.nn.Module):
    """RGAT with variable depth / heads / dropout.

    Linear(in_dim -> 64) + ReLU, then `depth` HeteroConv layers (one GATConv
    with 32 channels per head for every edge type, averaged across relations),
    each followed by ReLU and dropout, then Linear(32 * heads -> 1).
    `dropout` is used both inside GATConv and on the node embeddings.
    """

    def __init__(self, metadata, in_dim, depth=2, heads=3, dropout=0.1):
        super().__init__()
        self.lin_in = torch.nn.Linear(in_dim, 64)
        self.convs = torch.nn.ModuleList()
        for _ in range(depth):
            conv_dict = {
                et: GATConv((-1, -1), 32,
                            heads=heads, concat=True,
                            dropout=dropout, add_self_loops=False)
                for et in metadata[1]
            }
            self.convs.append(HeteroConv(conv_dict, aggr="mean"))
        self.lin_out = torch.nn.Linear(32 * heads, 1)
        self.dp = torch.nn.Dropout(dropout)

    def forward(self, d):
        x = {"student": torch.relu(self.lin_in(d["student"].x))}
        for conv in self.convs:
            x = conv(x, d.edge_index_dict)
            x = {k: torch.relu(v) for k, v in x.items()}
            x = {k: self.dp(v)    for k, v in x.items()}
        # squeeze only the trailing feature axis so the result is always 1-D
        return self.lin_out(x["student"]).squeeze(-1)


# Masks and the validation target do not change between epochs or configurations.
train_mask = data["student"].train_mask
val_mask   = data["student"].val_mask
y_train    = data["student"].y[train_mask]
y_val_cpu  = data["student"].y[val_mask].cpu()

for depth in hyper_grid["depth"]:
    for heads in hyper_grid["heads"]:
        for dp in hyper_grid["dropout"]:
            torch.manual_seed(seed)           # reproducible per run
            np.random.seed(seed)

            model = RGAT(data.metadata(), in_dim=data["student"].x.size(1),
                         depth=depth, heads=heads, dropout=dp).to(device)
            opt   = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
            sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min",
                                                               factor=0.5, patience=10)
            # cfg_state holds the best weights of THIS configuration only.
            best_rmse, wait, cfg_state = float("inf"), 0, None
            for epoch in range(1, 401):
                model.train(); opt.zero_grad()
                out  = model(data)
                loss = loss_fn(out[train_mask], y_train)
                loss.backward(); opt.step()

                # validation
                model.eval()
                with torch.no_grad():
                    v_pred = model(data)[val_mask].cpu()
                v_rmse = np.sqrt(mean_squared_error(y_val_cpu, v_pred))
                sched.step(v_rmse)

                if v_rmse + 1e-3 < best_rmse:
                    best_rmse, wait = v_rmse, 0
                    # state_dict() returns references to the live parameters;
                    # without a copy, later optimiser steps would overwrite the
                    # "best" weights and the checkpoint would hold the last epoch.
                    cfg_state = {k: v.detach().cpu().clone()
                                 for k, v in model.state_dict().items()}
                else:
                    wait += 1
                if wait >= 30:               # early stop
                    break

            # keep global best
            if cfg_state is not None and best_rmse < best_val:
                best_val, best_cfg = best_rmse, (depth, heads, dp)
                best_state = cfg_state
                torch.save(best_state, "rgat_best_overall.pt")

if best_cfg is None:
    raise RuntimeError("grid search found no configuration with a finite validation RMSE")

print(f"Best config depth={best_cfg[0]} heads={best_cfg[1]} dropout={best_cfg[2]:.1f}"
      f"  |  val-RMSE={best_val:5.2f}")

# ===================================================================
#  C.  final test metrics on best model
# ===================================================================
# ── rebuild model with the winning hyper-params ─────────────────────
# Unpack the winning (depth, heads, dropout) triple chosen by the grid search.
best_depth, best_heads, best_dp = best_cfg
class RGAT_Best(torch.nn.Module):
    """RGAT rebuilt with the winning hyper-parameters.

    The architecture is read from the module-level names best_depth,
    best_heads and best_dp at construction time: Linear(in_dim -> 64),
    `best_depth` HeteroConv layers (one GATConv per edge type, aggr="mean"),
    then Linear(32 * best_heads -> 1).
    """

    def __init__(self, metadata, in_dim):
        # metadata[1] supplies the edge types; in_dim is the input feature count.
        super().__init__()
        self.lin_in = torch.nn.Linear(in_dim, 64)
        self.convs  = torch.nn.ModuleList()
        for _ in range(best_depth):
            conv_dict = {
                et: GATConv((-1, -1), 32,
                            heads=best_heads, concat=True,
                            dropout=best_dp, add_self_loops=False)
                for et in metadata[1]
            }
            self.convs.append(HeteroConv(conv_dict, aggr="mean"))
        self.lin_out = torch.nn.Linear(32 * best_heads, 1)
        self.dp = torch.nn.Dropout(best_dp)

    def forward(self, d):
        """Project the student features, apply each conv + ReLU + dropout, then regress."""
        x = {"student": torch.relu(self.lin_in(d["student"].x))}
        for conv in self.convs:
            x = conv(x, d.edge_index_dict)
            x = {k: torch.relu(v) for k, v in x.items()}
            x = {k: self.dp(v)    for k, v in x.items()}
        # NOTE(review): bare squeeze() removes every size-1 axis, so a graph with
        # a single student would yield a 0-d tensor; squeeze(-1) may be intended.
        return self.lin_out(x["student"]).squeeze()

model = RGAT_Best(data.metadata(), in_dim=data["student"].x.size(1)).to(device)
# NOTE(review): strict=False makes load_state_dict tolerate missing or unexpected
# keys instead of raising, so a checkpoint that does not match this architecture
# would load only partially without an error. Confirm this is intended.
model.load_state_dict(torch.load("rgat_best_overall.pt"), strict=False)
model.eval()   # inference mode: disables dropout

# ── test inference ───────────────────────────────────────────────────
# Predict for every student at once, in the House-normalised target space.
with torch.no_grad():
    y_hat_scaled = model(data).cpu().numpy()

# Invert the per-House z-score to return to raw Perc_Academic units.
y_hat = mu.to_numpy() + std.to_numpy() * y_hat_scaled

# Score only the nodes selected by the test mask.
test_mask = data["student"].test_mask.cpu().numpy()
y_true_test = y_raw[test_mask]
y_pred_test = y_hat[test_mask]

mae  = mean_absolute_error(y_true_test, y_pred_test)
rmse = np.sqrt(mean_squared_error(y_true_test, y_pred_test))
r2   = r2_score(y_true_test, y_pred_test)

print("\n=== RGAT  (grid-tuned, label-normalised) ===")
print(f"MAE : {mae:5.2f}")
print(f"RMSE: {rmse:5.2f}")
print(f"R²  : {r2:5.2f}")

  mu  = df_nodes.groupby("House")["Perc_Academic"].transform("mean")
  std = df_nodes.groupby("House")["Perc_Academic"].transform("std").clip(lower=1e-6)
  ei = torch.tensor([src, dst], dtype=torch.long)


Best config depth=2 heads=2 dropout=0.0  |  val-RMSE= 0.52

=== RGAT  (grid-tuned, label-normalised) ===
MAE :  2.35
RMSE:  7.81
R²  :  0.82
