In [None]:
# 📓 01_ogbl_biokg_eda.ipynb

# -------------------------------
# 🧬 BioKG EDA for DDI Prediction
# -------------------------------

%pip install torch-scatter torch-sparse torch-geometric
%pip install torch-geometric
%pip install ogb

# -------------------------------
import matplotlib.pyplot as plt
import pandas as pd
import torch
from ogb.linkproppred import LinkPropPredDataset
from torch_geometric.data import Data
from torch_geometric.transforms import ToUndirected
from torch_geometric.utils import degree

# Load dataset
# LinkPropPredDataset is OGB's library-agnostic loader: dataset[0] is a plain
# dict of NumPy arrays, not a PyG Data object. For ogbl-biokg the edges are
# stored per (head_type, relation, tail_type) triplet and node ids start at 0
# within each node type, so the pieces are merged into one homogeneous graph.
dataset = LinkPropPredDataset(name='ogbl-biokg')
hetero_graph = dataset[0]

# Give every node type its own contiguous slice of a single global id space.
# Sorting the type names keeps the id assignment reproducible across runs.
node_offsets = {}
total_nodes = 0
for node_type in sorted(hetero_graph['num_nodes_dict']):
    node_offsets[node_type] = total_nodes
    total_nodes += int(hetero_graph['num_nodes_dict'][node_type])

edge_index_parts = []
edge_type_parts = []
for triplet, local_edge_index in hetero_graph['edge_index_dict'].items():
    head_type, _, tail_type = triplet
    # Shift heads and tails separately: they may belong to different node types.
    id_shift = torch.tensor(
        [[node_offsets[head_type]], [node_offsets[tail_type]]], dtype=torch.long
    )
    edge_index_parts.append(
        torch.as_tensor(local_edge_index, dtype=torch.long) + id_shift
    )
    # Relation ids are flattened to one id per edge.
    edge_type_parts.append(
        torch.as_tensor(
            hetero_graph['edge_reltype'][triplet], dtype=torch.long
        ).reshape(-1)
    )

directed_edge_index = torch.cat(edge_index_parts, dim=1)
directed_edge_type = torch.cat(edge_type_parts)

# Convert to undirected
# Reverse edges are appended by hand instead of calling ToUndirected():
# that transform coalesces duplicate (source, target) pairs and, with its
# default reduce="add", would sum the relation ids of parallel edges, which
# is meaningless for a multi-relational graph. No edges are merged here, so
# every original edge appears exactly twice (once per direction).
graph = Data(
    edge_index=torch.cat(
        [directed_edge_index, directed_edge_index.flip(0)], dim=1
    ),
    edge_type=torch.cat([directed_edge_type, directed_edge_type]),
    num_nodes=total_nodes,
)

# -----------------------
# 🔍 Basic Graph Insights
# -----------------------
# Headline size figures: node count, edge count (columns of edge_index) and
# the number of distinct relation ids found in edge_type.
basic_stats = (
    ("Number of nodes:", graph.num_nodes),
    ("Number of edges:", graph.edge_index.size(1)),
    ("Number of unique edge types:", graph.edge_type.unique().numel()),
)
for label, value in basic_stats:
    print(label, value)

# -----------------------
# 📊 Edge Type Distribution
# -----------------------
# Tally how many edges carry each relation id, then draw one bar per id.
edge_types, counts = graph.edge_type.unique(return_counts=True)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(edge_types.tolist(), counts.tolist())
ax.set_xlabel("Edge Type ID")
ax.set_ylabel("Count")
ax.set_title("Edge Type Frequency Distribution in BioKG")
ax.grid(True)
plt.show()

# -----------------------
# 📈 Node Degree Distribution
# -----------------------
# Count how often each node id appears as an edge source.
# num_nodes is passed explicitly: without it degree() sizes its output from
# the largest index it sees, so nodes with the highest ids that never occur
# as a source would be dropped instead of being counted with degree 0,
# which would bias both the histogram and the mean degree.
deg = degree(graph.edge_index[0], num_nodes=graph.num_nodes)
plt.figure(figsize=(10, 6))
plt.hist(deg.numpy(), bins=100, color="skyblue")
plt.title("Node Degree Distribution")
plt.xlabel("Degree")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

# -----------------------
# 🧠 Graph Summary Table
# -----------------------
# Collect the headline statistics as (column label, value) pairs.
summary_rows = [
    ("Num Nodes", graph.num_nodes),
    ("Num Edges", graph.edge_index.size(1)),
    ("Unique Relations", graph.edge_type.unique().numel()),
    ("Max Node Degree", deg.max().item()),
    ("Mean Node Degree", deg.mean().item()),
]
# Each value is wrapped in a one-element list so pandas builds a single-row
# table with one column per statistic.
summary = {label: [value] for label, value in summary_rows}
# Left as the final bare expression so the notebook renders the table.
pd.DataFrame(summary)
