In [13]:
import sys
sys.path.append("..")  # 상위 폴더로 이동
import os
import re
import random
import argparse
import pickle
from tqdm import tqdm
import time
import torch
import torch.nn as nn
import torch.optim as optim
import networkx as nx
import torch.nn.functional as F
import numpy as np
import pandas as pd
import seaborn as sns
import json
from datetime import datetime
import scipy.sparse as sp
import scipy.sparse.linalg as spla
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix, coo_matrix
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from PyPDF2 import PdfMerger
from torch.distributions.normal import Normal
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import from_networkx, to_networkx
from torch_geometric.nn import SAGEConv, GATConv, GCNConv, GraphSAGE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch_geometric.transforms import RandomNodeSplit

In [None]:
# Project helpers from utills.function, grouped in one parenthesized import.
from utills.function import (
    set_seed,
    generate_graph_data,
    generate_noisy_graph_data,
    load_county_graph_data,
    load_twitch_graph_data,
    load_wiki_graph_data,
    load_trans_graph_data,
    create_ba_graph_pyg,
    create_er_graph_pyg,
    create_grid_graph_pyg,
    create_tree_graph_pyg,
    normalize,
    split_graph_data,
    split_cp_graph_data,
    evaluate_model_performance,
    sort_by_y,
    coverage_width,
    get_gpu_memory,
    get_cpu_memory,
    count_parameters,
)
# Networks and losses referenced by the per-method cells below.
from utills.model import (
    GQNN_R,
    GQNN_N,
    BayesianGNN,
    MCDropoutGNN,
    GQNN,
    QRLoss,
    RQRLoss,
    GQNNLoss,
)

# Seed once, before any data generation or model construction.
set_seed(1127)

ImportError: cannot import name 'GQNNLoss2' from 'utills.model' (/home/sypark/QpiGNN/training/../utills/model.py)

In [None]:
# ---- Experiment configuration -------------------------------------------
# Every later cell reads these module-level names.

# Dataset selection and synthetic-graph parameters.
dataset = 'basic'        # key matched by the dataset dispatch in the loading cell
nodes = 1000             # node count passed to the synthetic graph generators
noise = 0.3              # noise_level passed to generate_noisy_graph_data
target_coverage = 0.9    # nominal coverage of the prediction intervals

# Network and optimizer settings.
hidden_dim = 64
learning_rate = 1e-3
weight = 1e-3            # passed to Adam as weight_decay

# Training and uncertainty-sampling settings.
epochs = 500
runs = 10
lambda_factor = 1        # loss weight used when `optimal` is False
num_samples = 100        # forward passes per node for the BNN / MC cells
dropout = 0.2

# Run settings read by later cells but previously never assigned in this
# notebook, so the first `if model == ...` raised NameError.
model = 'GQNN'           # method to run: 'SQR', 'RQR', 'BNN', 'MC', 'GQNN' or 'CP'
pdf = False              # True -> also create the ./pred/<model>/img folder
optimal = False          # True -> read the tuned lambda from ./lambda/*/lambda_optimized_results.csv
result_this_run = {}     # filled by the per-method cell with timings and metrics

device = torch.device('cuda:0')

In [None]:
# ---- Build the graph for the selected dataset ----------------------------
# The dataset key is routed to the matching generator/loader. An unknown
# non-empty key fails here with a clear message instead of leaving
# `graph_data` unbound and crashing later with a NameError.
if dataset != '':
    if dataset == 'basic':
        graph_data = generate_graph_data(num_nodes=nodes)
    elif dataset in ('gaussian', 'uniform', 'outlier', 'edge'):
        graph_data = generate_noisy_graph_data(num_nodes=nodes, noise_type=dataset, noise_level=noise)
    elif dataset in ('education', 'election', 'income', 'unemployment'):
        graph_data = load_county_graph_data(dataset, 2012)
    elif dataset in ('DE', 'ENGB', 'ES', 'FR', 'PTBR', 'RU'):
        graph_data = load_twitch_graph_data(dataset)
    elif dataset in ('chameleon', 'crocodile', 'squirrel'):
        graph_data = load_wiki_graph_data(dataset)
    elif dataset in ('Anaheim', 'ChicagoSketch'):
        graph_data = load_trans_graph_data(dataset)
    elif dataset == 'BA':
        graph_data = create_ba_graph_pyg(n=nodes)
    elif dataset == 'ER':
        graph_data = create_er_graph_pyg(n=nodes)
    elif dataset == 'grid':
        graph_data = create_grid_graph_pyg()
    elif dataset == 'tree':
        graph_data = create_tree_graph_pyg()
    else:
        raise ValueError(f"Unknown dataset: {dataset!r}")

# ---- Split and normalize -------------------------------------------------
train_data, test_data = split_graph_data(graph_data, test_ratio=0.2)

# Scaling statistics come from the training split only and are reused for
# the test split, so no test information leaks into the scaling.
train_min, train_max = train_data.x.min(), train_data.x.max()
y_min, y_max = train_data.y.min(), train_data.y.max()

train_data.x = normalize(train_data.x, train_min, train_max)
test_data.x = normalize(test_data.x, train_min, train_max)
train_data.y = normalize(train_data.y, y_min, y_max)
test_data.y = normalize(test_data.y, y_min, y_max)

print(f"Train data: {train_data.x.shape[0]} nodes, {train_data.edge_index.shape[1]} edges")
print(f"Train edge_index 최대값: {train_data.edge_index.max().item()}")
print(f"Test data: {test_data.x.shape[0]} nodes, {test_data.edge_index.shape[1]} edges")
print(f"Test edge_index 최대값: {test_data.edge_index.max().item()}")

# The 'CP' method additionally splits a calibration set off the training split.
if model == 'CP':
    cp_train_data, calibration_data = split_cp_graph_data(train_data, cali_ratio=0.2)

    print(f"Train data: {cp_train_data.x.shape[0]} nodes, {cp_train_data.edge_index.shape[1]} edges")
    print(f"Train edge_index 최대값: {cp_train_data.edge_index.max().item()}")
    print(f"Calibration data: {calibration_data.x.shape[0]} nodes, {calibration_data.edge_index.shape[1]} edges")
    print(f"Calibration edge_index 최대값: {calibration_data.edge_index.max().item()}")
    print(f"Test data: {test_data.x.shape[0]} nodes, {test_data.edge_index.shape[1]} edges")
    print(f"Test edge_index 최대값: {test_data.edge_index.max().item()}")

# ---- Result folder and file name -----------------------------------------
root_dir = f"./pred/{model}/"
os.makedirs(root_dir, exist_ok=True)

if pdf:
    pdf_dir = os.path.join(root_dir, 'img')
    os.makedirs(pdf_dir, exist_ok=True)

if optimal:
    # Synthetic and real datasets keep their tuned lambdas in separate CSVs;
    # the lookup itself is the same for both.
    synthetic_datasets = ['basic', 'gaussian', 'uniform', 'outlier', 'edge', 'BA', 'ER', 'grid', 'tree']
    if dataset in synthetic_datasets:
        lambda_csv = "./lambda/syn/lambda_optimized_results.csv"
    else:
        lambda_csv = "./lambda/real/lambda_optimized_results.csv"
    df = pd.read_csv(lambda_csv)
    optimal_lambda = df[df['Dataset'] == dataset]['Best Lambda'].values[0]

# Output file stem: "<dataset>_<model>", plus the lambda actually used for GQNN.
file_name = dataset + '_' + model
if model == 'GQNN':
    if optimal:
        file_name += f'_lf({optimal_lambda})'
    else:
        file_name += f'_lf({lambda_factor})'

In [None]:
# Seaborn 'Dark2' palette; each per-method cell picks one entry as `color`.
pastel_colors = sns.color_palette('Dark2')
# Move the training graph to the compute device.
train_data = train_data.to(device)
# Feature width of the node matrix, used as the input size of every network.
in_dim = train_data.x.shape[1]

In [None]:
# ---- SQR: quantile network trained on random quantile levels --------------
# Each epoch draws one random tau per node; intervals are then read off at
# the symmetric levels tau_low / tau_upper implied by target_coverage.
if model == 'SQR':
    # `model` is rebound to the network below, so keep the method name for
    # the log banners (previously they printed the whole module repr).
    model_name = model
    tau_low = (1 - target_coverage)/2
    tau_upper = 1 - tau_low
    color = pastel_colors[0]
    model = GQNN_R(in_dim=in_dim, hidden_dim=hidden_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight)
    criterion = QRLoss()

    # Reset the GPU peak-memory counter before timing the training loop.
    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        # One uniformly sampled quantile level per node.
        taus = torch.rand(train_data.x.size(0), 1, dtype=torch.float32, device=device)
        preds = model(train_data.x, train_data.edge_index, taus)
        loss = criterion(preds, train_data.y, taus)

        loss.backward()
        optimizer.step()

    end_time = time.time()
    training_time = end_time - start_time
    gpu_mem = get_gpu_memory()
    cpu_mem = get_cpu_memory()
    param_count = count_parameters(model)

    result_this_run['training_time_sec'] = round(training_time, 2)
    result_this_run['gpu_mem_MB'] = round(gpu_mem, 2)
    result_this_run['cpu_mem_MB'] = round(cpu_mem, 2)
    result_this_run['param_count'] = param_count

    print(f"Training Time: {training_time:.2f}s | GPU Peak: {gpu_mem:.1f}MB | CPU: {cpu_mem:.1f}MB | Params: {param_count:,}")

    print('-' * 40, f'{model_name}: {dataset} Train Evaluation... ', '-' * 40)
    model.eval()
    # Constant quantile levels for the lower and upper interval bounds.
    tau_lows = torch.full((train_data.x.size(0), 1), tau_low, dtype=torch.float32, device=device)
    tau_uppers = torch.full((train_data.x.size(0), 1), tau_upper, dtype=torch.float32, device=device)

    with torch.no_grad():
        train_low_preds = model(train_data.x, train_data.edge_index, tau_lows).cpu().numpy()
        train_upper_preds = model(train_data.x, train_data.edge_index, tau_uppers).cpu().numpy()
        train_targets = train_data.y.cpu().numpy()
    train_eval = evaluate_model_performance(train_low_preds, train_upper_preds, train_targets, target=target_coverage)
    result_this_run['train_metrics'] = train_eval

    print('-' * 40, f'{model_name}: {dataset} Test Evaluation... ', '-' * 40)
    test_data = test_data.to(device)
    tau_lows = torch.full((test_data.x.size(0), 1), tau_low, dtype=torch.float32, device=device)
    tau_uppers = torch.full((test_data.x.size(0), 1), tau_upper, dtype=torch.float32, device=device)

    with torch.no_grad():
        test_low_preds = model(test_data.x, test_data.edge_index, tau_lows).cpu().numpy()
        test_upper_preds = model(test_data.x, test_data.edge_index, tau_uppers).cpu().numpy()
        test_targets = test_data.y.cpu().numpy()
    test_eval = evaluate_model_performance(test_low_preds, test_upper_preds, test_targets, target=target_coverage)
    result_this_run['test_metrics'] = test_eval
    


In [None]:
# ---- RQR: network that outputs both interval bounds directly --------------
# Written as a standalone `if` (not `elif`) so the cell compiles on its own.
# If an earlier method cell ran, `model` is a network and the test is False.
if model == 'RQR':
    # `model` is rebound to the network below, so keep the method name for
    # the log banners (previously they printed the whole module repr).
    model_name = model
    color = pastel_colors[1]
    model = GQNN_N(in_dim=in_dim, hidden_dim=hidden_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight)
    criterion = RQRLoss(target=target_coverage, lambda_factor=lambda_factor)  # lambda_factor is kept fixed here

    torch.cuda.reset_peak_memory_stats()  # reset the GPU peak-memory counter
    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        preds = model(train_data.x, train_data.edge_index)
        loss = criterion(preds, train_data.y)

        loss.backward()
        optimizer.step()

    end_time = time.time()
    training_time = end_time - start_time
    gpu_mem = get_gpu_memory()
    cpu_mem = get_cpu_memory()
    param_count = count_parameters(model)

    result_this_run['training_time_sec'] = round(training_time, 2)
    result_this_run['gpu_mem_MB'] = round(gpu_mem, 2)
    result_this_run['cpu_mem_MB'] = round(cpu_mem, 2)
    result_this_run['param_count'] = param_count

    print(f"Training Time: {training_time:.2f}s | GPU Peak: {gpu_mem:.1f}MB | CPU: {cpu_mem:.1f}MB | Params: {param_count:,}")

    print('-' * 40, f'{model_name}: {dataset} Train Evaluation... ', '-' * 40)
    model.eval()

    with torch.no_grad():
        train_preds = model(train_data.x, train_data.edge_index)
        # Column 0 is used as the lower bound, column 1 as the upper bound.
        train_low_preds = train_preds[:, 0].cpu().numpy()
        train_upper_preds = train_preds[:, 1].cpu().numpy()
        train_targets = train_data.y.cpu().numpy()
    train_eval = evaluate_model_performance(train_low_preds, train_upper_preds, train_targets, target=target_coverage)
    result_this_run['train_metrics'] = train_eval

    print('-' * 40, f'{model_name}: {dataset} Test Evaluation... ', '-' * 40)
    test_data = test_data.to(device)

    with torch.no_grad():
        test_preds = model(test_data.x, test_data.edge_index)
        test_low_preds = test_preds[:, 0].cpu().numpy()
        test_upper_preds = test_preds[:, 1].cpu().numpy()
        test_targets = test_data.y.cpu().numpy()
    test_eval = evaluate_model_performance(test_low_preds, test_upper_preds, test_targets, target=target_coverage)
    result_this_run['test_metrics'] = test_eval


In [None]:

# ---- BNN: MSE-trained network, intervals from repeated forward passes -----
# Written as a standalone `if` (not `elif`) so the cell compiles on its own.
# If an earlier method cell ran, `model` is a network and the test is False.
if model == 'BNN':
    # `model` is rebound to the network below, so keep the method name for
    # the log banners (previously they printed the whole module repr).
    model_name = model
    color = pastel_colors[3]
    model = BayesianGNN(in_dim=in_dim, hidden_dim=hidden_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight)

    torch.cuda.reset_peak_memory_stats()  # reset the GPU peak-memory counter
    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        preds = model(train_data.x, train_data.edge_index)
        loss = F.mse_loss(preds, train_data.y.squeeze())

        loss.backward()
        optimizer.step()

    end_time = time.time()
    training_time = end_time - start_time
    gpu_mem = get_gpu_memory()
    cpu_mem = get_cpu_memory()
    param_count = count_parameters(model)

    result_this_run['training_time_sec'] = round(training_time, 2)
    result_this_run['gpu_mem_MB'] = round(gpu_mem, 2)
    result_this_run['cpu_mem_MB'] = round(cpu_mem, 2)
    result_this_run['param_count'] = param_count

    print(f"Training Time: {training_time:.2f}s | GPU Peak: {gpu_mem:.1f}MB | CPU: {cpu_mem:.1f}MB | Params: {param_count:,}")

    print('-' * 40, f'{model_name}: {dataset} Train Evaluation... ', '-' * 40)
    model.eval()

    # Collect num_samples forward passes.
    # NOTE(review): presumably each pass draws fresh weights inside
    # BayesianGNN; otherwise the std below is zero. Confirm in utills.model.
    preds_list = []
    with torch.no_grad():
        for _ in range(num_samples):
            preds = model(train_data.x, train_data.edge_index)
            preds_list.append(preds.cpu().numpy())

    preds_array = np.array(preds_list)  # stacked along a leading sample axis
    mean_preds = preds_array.mean(axis=0)  # mean prediction per node
    std_preds = preds_array.std(axis=0)    # standard deviation per node

    # Two-sided standard-normal multiplier for the requested coverage.
    # Tabulated values stay exactly as before (80%: 1.28 / 90%: 1.645 /
    # 95%: 1.96 / 99%: 2.576). Any other coverage is computed from the normal
    # quantile instead of leaving `t` undefined.
    z_scores = {0.8: 1.28, 0.9: 1.645, 0.95: 1.96, 0.99: 2.576}
    t = z_scores.get(target_coverage)
    if t is None:
        t = Normal(0.0, 1.0).icdf(torch.tensor((1 + target_coverage) / 2)).item()

    train_low_preds = mean_preds - t * std_preds
    train_upper_preds = mean_preds + t * std_preds
    train_targets = train_data.y.cpu().numpy()
    train_eval = evaluate_model_performance(train_low_preds, train_upper_preds, train_targets, target=target_coverage)
    result_this_run['train_metrics'] = train_eval

    print('-' * 40, f'{model_name}: {dataset} Test Evaluation... ', '-' * 40)
    test_data = test_data.to(device)

    preds_list = []
    with torch.no_grad():
        for _ in range(num_samples):
            preds = model(test_data.x, test_data.edge_index)  # one sampled prediction per pass
            preds_list.append(preds.cpu().numpy())

    preds_array = np.array(preds_list)
    mean_preds = preds_array.mean(axis=0)
    std_preds = preds_array.std(axis=0)

    test_low_preds = mean_preds - t * std_preds
    test_upper_preds = mean_preds + t * std_preds
    test_targets = test_data.y.cpu().numpy()
    test_eval = evaluate_model_performance(test_low_preds, test_upper_preds, test_targets, target=target_coverage)
    result_this_run['test_metrics'] = test_eval
    


In [None]:
# ---- MC dropout: MSE-trained network, intervals from stochastic passes ----
# Written as a standalone `if` (not `elif`) so the cell compiles on its own.
# If an earlier method cell ran, `model` is a network and the test is False.
if model == 'MC':
    # `model` is rebound to the network below, so keep the method name for
    # the log banners (previously they printed the whole module repr).
    model_name = model
    color = pastel_colors[4]
    model = MCDropoutGNN(in_dim=in_dim, hidden_dim=hidden_dim, dropout=dropout).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight)

    torch.cuda.reset_peak_memory_stats()  # reset the GPU peak-memory counter
    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        preds = model(train_data.x, train_data.edge_index, training=True)
        loss = F.mse_loss(preds, train_data.y.squeeze())

        loss.backward()
        optimizer.step()

    end_time = time.time()
    training_time = end_time - start_time
    gpu_mem = get_gpu_memory()
    cpu_mem = get_cpu_memory()
    param_count = count_parameters(model)

    result_this_run['training_time_sec'] = round(training_time, 2)
    result_this_run['gpu_mem_MB'] = round(gpu_mem, 2)
    result_this_run['cpu_mem_MB'] = round(cpu_mem, 2)
    result_this_run['param_count'] = param_count

    print(f"Training Time: {training_time:.2f}s | GPU Peak: {gpu_mem:.1f}MB | CPU: {cpu_mem:.1f}MB | Params: {param_count:,}")

    print('-' * 40, f'{model_name}: {dataset} Train Evaluation... ', '-' * 40)
    model.eval()

    # Collect num_samples forward passes. training=True is passed explicitly
    # so dropout stays active even though the module is in eval mode.
    preds_list = []
    with torch.no_grad():
        for _ in range(num_samples):
            preds = model(train_data.x, train_data.edge_index, training=True)  # keep dropout on
            preds_list.append(preds.cpu().numpy())

    preds_array = np.array(preds_list)  # stacked along a leading sample axis
    mean_preds = preds_array.mean(axis=0)  # mean prediction per node
    std_preds = preds_array.std(axis=0)    # standard deviation per node

    # Two-sided standard-normal multiplier for the requested coverage.
    # Tabulated values stay exactly as before (80%: 1.28 / 90%: 1.645 /
    # 95%: 1.96 / 99%: 2.576). Any other coverage is computed from the normal
    # quantile instead of leaving `t` undefined.
    z_scores = {0.8: 1.28, 0.9: 1.645, 0.95: 1.96, 0.99: 2.576}
    t = z_scores.get(target_coverage)
    if t is None:
        t = Normal(0.0, 1.0).icdf(torch.tensor((1 + target_coverage) / 2)).item()

    train_low_preds = mean_preds - t * std_preds
    train_upper_preds = mean_preds + t * std_preds
    train_targets = train_data.y.cpu().numpy()
    train_eval = evaluate_model_performance(train_low_preds, train_upper_preds, train_targets, target=target_coverage)
    result_this_run['train_metrics'] = train_eval

    print('-' * 40, f'{model_name}: {dataset} Test Evaluation... ', '-' * 40)
    test_data = test_data.to(device)

    preds_list = []
    with torch.no_grad():
        for _ in range(num_samples):
            preds = model(test_data.x, test_data.edge_index, training=True)  # keep dropout on
            preds_list.append(preds.cpu().numpy())

    preds_array = np.array(preds_list)  # stacked along a leading sample axis
    mean_preds = preds_array.mean(axis=0)  # mean prediction per node
    std_preds = preds_array.std(axis=0)    # standard deviation per node

    test_low_preds = mean_preds - t * std_preds  # lower bound at target_coverage
    test_upper_preds = mean_preds + t * std_preds  # upper bound at target_coverage
    test_targets = test_data.y.cpu().numpy()
    test_eval = evaluate_model_performance(test_low_preds, test_upper_preds, test_targets, target=target_coverage)
    result_this_run['test_metrics'] = test_eval
    


In [None]:
# ---- GQNN: network returning (lower, upper) bounds, trained with GQNNLoss --
# Written as a standalone `if` (not `elif`) so the cell compiles on its own.
# If an earlier method cell ran, `model` is a network and the test is False.
if model == 'GQNN':
    # `model` is rebound to the network below, so keep the method name for
    # the log banners (previously they printed the whole module repr).
    model_name = model
    color = pastel_colors[6]
    model = GQNN(in_dim=in_dim, hidden_dim=hidden_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight)

    # Use the tuned lambda when `optimal` is set, otherwise the configured one.
    criterion = GQNNLoss(
        target_coverage=target_coverage,
        lambda_factor=optimal_lambda if optimal else lambda_factor,
    )

    torch.cuda.reset_peak_memory_stats()  # reset the GPU peak-memory counter
    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        preds_low, preds_upper = model(train_data.x, train_data.edge_index)
        loss = criterion(preds_low, preds_upper, train_data.y)

        loss.backward()
        optimizer.step()

        # NOTE(review): cvg / wdt are not read anywhere in this cell, yet
        # they are computed inside the timed loop. Confirm whether a later
        # cell uses the last-epoch values.
        cvg, wdt = coverage_width(train_data.y, preds_low, preds_upper)

    end_time = time.time()
    training_time = end_time - start_time
    gpu_mem = get_gpu_memory()
    cpu_mem = get_cpu_memory()
    param_count = count_parameters(model)

    result_this_run['training_time_sec'] = round(training_time, 2)
    result_this_run['gpu_mem_MB'] = round(gpu_mem, 2)
    result_this_run['cpu_mem_MB'] = round(cpu_mem, 2)
    result_this_run['param_count'] = param_count

    print(f"Training Time: {training_time:.2f}s | GPU Peak: {gpu_mem:.1f}MB | CPU: {cpu_mem:.1f}MB | Params: {param_count:,}")

    print('-' * 40, f'{model_name}: {dataset} Train Evaluation... ', '-' * 40)
    model.eval()

    with torch.no_grad():
        preds_low, preds_upper = model(train_data.x, train_data.edge_index)
        train_low_preds = preds_low.cpu().numpy()
        train_upper_preds = preds_upper.cpu().numpy()
        train_targets = train_data.y.cpu().numpy()
    train_eval = evaluate_model_performance(train_low_preds, train_upper_preds, train_targets, target=target_coverage)
    result_this_run['train_metrics'] = train_eval

    print('-' * 40, f'{model_name}: {dataset} Test Evaluation... ', '-' * 40)
    test_data = test_data.to(device)

    with torch.no_grad():
        preds_low, preds_upper = model(test_data.x, test_data.edge_index)
        test_low_preds = preds_low.cpu().numpy()
        test_upper_preds = preds_upper.cpu().numpy()
        test_targets = test_data.y.cpu().numpy()
    test_eval = evaluate_model_performance(test_low_preds, test_upper_preds, test_targets, target=target_coverage)
    result_this_run['test_metrics'] = test_eval


