# Phase 8: Graph Neural Networks
## Часть 3: Node Classification - Практика

### Задача

Классификация узлов в социальной сети:
- Предсказать категорию пользователя по его признакам и связям в графе
- Semi-supervised learning (мало меток)

### В этом ноутбуке:

1. Генерация синтетического графа
2. Сравнение GCN, GAT, GraphSAGE
3. Анализ влияния количества размеченных узлов
4. Inductive vs Transductive learning (кратко, в итогах)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv, SAGEConv
from torch_geometric.utils import to_networkx
import networkx as nx
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings
# NOTE(review): this silences every warning, including deprecation notices
# from the imported libraries; narrow the filter if those matter.
warnings.filterwarnings('ignore')

# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in
# matplotlib 3.6. Try the new name first and fall back to the old one so the
# notebook also runs on older installs; if neither exists the default style
# is kept.
for _style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style in plt.style.available:
        plt.style.use(_style)
        break

# Seed the NumPy and torch global RNGs for reproducible runs.
np.random.seed(42)
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

## 1. Генерация Синтетического Графа

Создаём граф социальной сети с community structure.

In [None]:
def generate_community_graph(n_communities=4, nodes_per_community=100,
                             p_intra=0.1, p_inter=0.01, n_features=16):
    """
    Generate a random undirected graph with planted communities.

    Nodes are numbered so that community ``c`` owns the contiguous index range
    ``[c * nodes_per_community, (c + 1) * nodes_per_community)``; the community
    id is also the node label.

    Args:
        n_communities: number of communities (= number of classes).
        nodes_per_community: number of nodes in every community.
        p_intra: probability of an edge between two nodes of the same
            community.
        p_inter: probability of an edge between two nodes of different
            communities.
        n_features: dimensionality of the node feature vectors.

    Returns:
        A ``Data`` object with
        ``x``          float tensor ``[n_nodes, n_features]``,
        ``edge_index`` long tensor ``[2, 2 * n_undirected_edges]`` (each
                       undirected edge is stored in both directions),
        ``y``          long tensor ``[n_nodes]`` of community labels.

    Notes:
        Randomness comes from the global NumPy RNG (``np.random``).
        All ``n_nodes * (n_nodes - 1) / 2`` node pairs are materialised at
        once, so memory grows quadratically with the number of nodes.
    """
    n_nodes = n_communities * nodes_per_community

    # Labels: community id repeated for every node of that community.
    labels = np.repeat(np.arange(n_communities), nodes_per_community)

    # Features: a community-specific block of ones plus Gaussian noise
    # (std 0.5). Each community gets its own slice of width
    # n_features // n_communities; leftover feature columns (and all columns
    # when n_features < n_communities) carry noise only.
    block_width = n_features // n_communities
    features = np.zeros((n_nodes, n_features))
    for community in range(n_communities):
        start = community * nodes_per_community
        end = start + nodes_per_community
        community_pattern = np.zeros(n_features)
        community_pattern[community * block_width:(community + 1) * block_width] = 1
        noise = np.random.randn(nodes_per_community, n_features) * 0.5
        features[start:end] = community_pattern + noise

    # Edges: enumerate every unordered pair i < j in row-major order and draw
    # one uniform sample per pair, keeping the pair when the sample falls
    # below the intra-/inter-community probability.
    src, dst = np.triu_indices(n_nodes, k=1)
    edge_prob = np.where(labels[src] == labels[dst], p_intra, p_inter)
    keep = np.random.random(src.shape[0]) < edge_prob
    src, dst = src[keep], dst[keep]

    # Undirected graph: store (i, j) immediately followed by (j, i).
    # The explicit (2 * E, 2) shape keeps edge_index at [2, 0] when no edge
    # was sampled, instead of collapsing to a 1-D empty tensor.
    directed = np.stack([src, dst, dst, src], axis=1).reshape(2 * src.shape[0], 2)

    edge_index = torch.tensor(directed, dtype=torch.long).t().contiguous()
    x = torch.tensor(features, dtype=torch.float)
    y = torch.tensor(labels, dtype=torch.long)

    return Data(x=x, edge_index=edge_index, y=y)

# Build the graph used by the rest of the notebook: 4 communities of 200
# nodes each, 32 features per node.
data = generate_community_graph(n_communities=4, nodes_per_community=200,
                                p_intra=0.05, p_inter=0.005, n_features=32)

# Print a short summary of the generated graph, one item per line.
print('\n'.join([
    'Граф создан:',
    f'  Узлов: {data.num_nodes}',
    f'  Рёбер: {data.num_edges}',
    f'  Признаков: {data.num_node_features}',
    f'  Классов: {torch.unique(data.y).numel()}',
]))

In [None]:
# Graph visualisation: spring layout, nodes coloured by community label.
G = to_networkx(data, to_undirected=True)

fig, ax = plt.subplots(figsize=(10, 8))
pos = nx.spring_layout(G, seed=42, k=0.5)

# Spread the class ids over [0, 1] before the colormap lookup. The divisor is
# derived from the labels (largest class id) rather than hard-coded, so the
# colouring stays valid if the number of communities changes; max(..., 1)
# avoids a division by zero for a single-class graph.
node_labels = data.y.cpu().numpy()
colors = plt.cm.Set1(node_labels / max(int(node_labels.max()), 1))
nx.draw(G, pos, ax=ax,
        node_color=colors,
        node_size=30,
        width=0.1,
        alpha=0.8)

ax.set_title('Synthetic Community Graph')
plt.show()

In [None]:
# Random node split: first 10% of a permutation for training (the
# semi-supervised setting), next 10% for validation, remaining 80% for test.
n_nodes = data.num_nodes
indices = torch.randperm(n_nodes)
train_end = int(0.1 * n_nodes)
val_end = int(0.2 * n_nodes)

# Start from three all-False boolean masks and switch on each slice.
train_mask, val_mask, test_mask = (
    torch.zeros(n_nodes, dtype=torch.bool) for _ in range(3)
)
train_mask[indices[:train_end]] = True
val_mask[indices[train_end:val_end]] = True
test_mask[indices[val_end:]] = True

# Attach the masks to the graph object so the training code can read them.
data.train_mask, data.val_mask, data.test_mask = train_mask, val_mask, test_mask

print(f'Train: {train_mask.sum().item()} ({train_mask.sum().item()/n_nodes*100:.1f}%)')
print(f'Val: {val_mask.sum().item()}')
print(f'Test: {test_mask.sum().item()}')

## 2. Модели для Сравнения

In [None]:
class GCN(nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: size of the input node features.
        hidden_channels: size of the hidden representation.
        out_channels: number of output scores per node.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return raw per-node scores (no softmax is applied here)."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        # Dropout (p=0.5) is active only while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv2(hidden, edge_index)

class GAT(nn.Module):
    """Two-layer graph attention network for node classification.

    Args:
        in_channels: size of the input node features.
        hidden_channels: size of the hidden representation per attention head.
        out_channels: number of output scores per node.
        heads: number of attention heads in the first layer (default 4).
            The second layer always uses a single head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, heads=4):
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads)
        # The first layer's head outputs are concatenated (GATConv's default),
        # so the second layer receives hidden_channels * heads features. Using
        # the same `heads` value in both places keeps the two sizes in sync.
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1)

    def forward(self, x, edge_index):
        """Return raw per-node scores (no softmax is applied here)."""
        x = F.elu(self.conv1(x, edge_index))
        # Dropout (p=0.5) is active only while the module is in training mode.
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x

class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE network for node classification.

    Both layers are applied to the full ``edge_index`` passed to ``forward``;
    no neighbour sampling is performed inside this module.

    Args:
        in_channels: size of the input node features.
        hidden_channels: size of the hidden representation.
        out_channels: number of output scores per node.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return raw per-node scores (no softmax is applied here)."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        # Dropout (p=0.5) is active only while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv2(hidden, edge_index)

# Confirmation message printed once the three model classes above are defined.
print('Модели определены: GCN, GAT, GraphSAGE')

In [None]:
def _masked_accuracy(pred, target, mask):
    """Return the share of correct predictions among nodes selected by ``mask``.

    An empty mask yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    n_selected = int(mask.sum().item())
    if n_selected == 0:
        return 0.0
    return (pred[mask] == target[mask]).sum().item() / n_selected


def train_and_evaluate(model, data, epochs=200, lr=0.01):
    """
    Train ``model`` full-batch on ``data`` and report its test accuracy.

    Every epoch performs one Adam step (weight decay 5e-4) on the
    cross-entropy over the ``data.train_mask`` nodes, then re-runs the model in
    eval mode and measures accuracy on the validation and test nodes.

    Args:
        model: module called as ``model(data.x, data.edge_index)`` that
            returns one row of class scores per node.
        data: object exposing ``x``, ``edge_index``, ``y`` and the boolean
            masks ``train_mask``, ``val_mask``, ``test_mask``.
        epochs: number of training epochs.
        lr: Adam learning rate.

    Returns:
        ``(best_test_acc, losses)`` where ``losses`` is the list of per-epoch
        training losses and ``best_test_acc`` is the test accuracy at the
        first epoch that reached the highest validation accuracy. When
        ``val_mask`` selects no nodes, model selection is impossible and the
        test accuracy of the final epoch is returned instead. With
        ``epochs == 0`` the result is ``(0, [])``.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

    has_val_nodes = bool(data.val_mask.any())
    # Start below any reachable accuracy so that a validation accuracy of
    # exactly 0 still records the matching test accuracy.
    best_val_acc = float('-inf')
    best_test_acc = 0
    losses = []

    for _ in range(epochs):
        # One full-batch optimisation step on the labelled training nodes.
        model.train()
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

        # Evaluation with the freshly updated weights (dropout disabled).
        model.eval()
        with torch.no_grad():
            pred = model(data.x, data.edge_index).argmax(dim=1)

        val_acc = _masked_accuracy(pred, data.y, data.val_mask)
        test_acc = _masked_accuracy(pred, data.y, data.test_mask)

        # Strict '>' keeps the earliest epoch among ties.
        if not has_val_nodes or val_acc > best_val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

    return best_test_acc, losses

# Train each architecture on the same graph and split, with a shared hidden
# size, and collect its test accuracy and training-loss curve.
n_features = data.num_node_features
n_classes = torch.unique(data.y).numel()
hidden = 64

results, all_losses = {}, {}

model_names = ('GCN', 'GAT', 'GraphSAGE')
model_classes = (GCN, GAT, GraphSAGE)
for name, Model in zip(model_names, model_classes):
    model = Model(n_features, hidden, n_classes)
    acc, losses = train_and_evaluate(model, data)
    results[name], all_losses[name] = acc, losses
    print(f'{name}: {acc:.4f}')

In [None]:
# Visualisation: training-loss curves (left) and test accuracy bars (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
loss_ax, acc_ax = axes

# Left panel: one loss curve per model.
for name, losses in all_losses.items():
    loss_ax.plot(losses, label=name)
loss_ax.set(xlabel='Epoch', ylabel='Loss', title='Training Loss')
loss_ax.legend()

# Right panel: one bar per model, annotated with its accuracy.
names = list(results)
accs = [results[name] for name in names]
colors = ['steelblue', 'coral', 'seagreen']
acc_ax.bar(names, accs, color=colors)
acc_ax.set(ylabel='Test Accuracy', title='Model Comparison')
for i, acc in enumerate(accs):
    # Place the label slightly above the top of the bar.
    acc_ax.text(i, acc + 0.01, f'{acc:.3f}', ha='center')

plt.tight_layout()
plt.show()

## 3. Влияние Количества Меток

In [None]:
# Experiment: how does the share of labelled training nodes affect accuracy?
# NOTE: this cell overwrites data.train_mask / val_mask / test_mask.
train_ratios = [0.01, 0.05, 0.1, 0.2, 0.3]
ratio_results = {name: [] for name in ['GCN', 'GAT', 'GraphSAGE']}

# The validation set is always 10% of the nodes. Its size is computed once as
# an integer so it cannot drift by one node through float rounding of
# (ratio + 0.1) * n_nodes.
n_val = int(0.1 * n_nodes)

for ratio in train_ratios:
    # Fresh random split for this ratio: train | val | test along a permutation.
    indices = torch.randperm(n_nodes)
    n_train = int(ratio * n_nodes)
    val_end = n_train + n_val

    train_mask = torch.zeros(n_nodes, dtype=torch.bool)
    val_mask = torch.zeros(n_nodes, dtype=torch.bool)
    test_mask = torch.zeros(n_nodes, dtype=torch.bool)

    train_mask[indices[:n_train]] = True
    val_mask[indices[n_train:val_end]] = True
    test_mask[indices[val_end:]] = True

    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask

    # Train a fresh instance of every architecture on this split.
    for name, Model in [('GCN', GCN), ('GAT', GAT), ('GraphSAGE', GraphSAGE)]:
        model = Model(n_features, hidden, n_classes)
        acc, _ = train_and_evaluate(model, data, epochs=100)
        ratio_results[name].append(acc)

    print(f'Ratio {ratio}: GCN={ratio_results["GCN"][-1]:.3f}, '
          f'GAT={ratio_results["GAT"][-1]:.3f}, '
          f'SAGE={ratio_results["GraphSAGE"][-1]:.3f}')

# Visualisation: test accuracy against the percentage of labelled nodes.
fig, ax = plt.subplots(figsize=(8, 5))
label_percent = [ratio * 100 for ratio in train_ratios]
for name, accuracies in ratio_results.items():
    ax.plot(label_percent, accuracies, 'o-', label=name)
ax.set(xlabel='Training Labels (%)', ylabel='Test Accuracy',
       title='Effect of Label Ratio')
ax.legend()
plt.show()

## Итоги

### Результаты:

- Все GNN хорошо работают на задаче node classification
- GraphSAGE - inductive, может работать с новыми узлами
- GAT часто лучше при достаточном количестве данных
- GNN эффективны даже при малом % labels (semi-supervised)

### Ключевые выводы:

1. **Структура графа** важна - GNN используют информацию о связях
2. **Semi-supervised** - можно обучаться на малом количестве меток
3. **Выбор модели** зависит от задачи (transductive vs inductive)

### Применения:

- Рекомендательные системы
- Fraud detection
- Drug discovery (молекулярные графы)
- Knowledge graphs