In [1]:
# 정의 함수 load
from utils.data import *
from utils.model import *
from utils.metric import *
from utils.analysis import *

In [2]:
# Dataset List: dt_config/datasets
dt_config = dataset_config()
datasets= list(dt_config.keys())

# print('Dataset List:')
# print(datasets)

In [3]:
# Dataset Load: dt_dict
# datasets = [
#     'region_job_r', 'region_job_g', 'region_job_2_r', 'region_job_2_g', 
#     'nba_p', 'nba_m', 'german_g', 'german_f', 'german_s', 'german_t', 'german_h', 'german_e'
# ]

dt_dict = {}
for ds in datasets:
    data, df, cfg = load_and_prepare_dataset(dt_config, config_name=ds, seed=1127)
    dt_dict[ds] = {
        'data': data,
        'df': df,
        'cfg': cfg
    }

# Check Dataset: Train/Val/Test 민감속성 비율 동일하게 설정
# print_sensitive_attr_distribution(dt_dict['nba_p']['data'])

Loading region_job dataset from ./dataset/pokec
[INFO] 유효하지 않은 user_id로 인해 제거된 edge 수: 0
[region_job] sens=0: 43962, sens=1: 23834
--------------------------------------------------
Loading region_job dataset from ./dataset/pokec
[INFO] 유효하지 않은 user_id로 인해 제거된 edge 수: 0
[region_job] sens=0: 34308, sens=1: 33488
--------------------------------------------------
Loading region_job_2 dataset from ./dataset/pokec
[INFO] 유효하지 않은 user_id로 인해 제거된 edge 수: 0
[region_job_2] sens=0: 47338, sens=1: 19231
--------------------------------------------------
Loading region_job_2 dataset from ./dataset/pokec
[INFO] 유효하지 않은 user_id로 인해 제거된 edge 수: 0
[region_job_2] sens=0: 34125, sens=1: 32444
--------------------------------------------------
Loading nba dataset from ./dataset/NBA
[INFO] 유효하지 않은 user_id로 인해 제거된 edge 수: 1641
[nba] sens=0: 296, sens=1: 107
--------------------------------------------------
Loading nba dataset from ./dataset/NBA
[INFO] 유효하지 않은 user_id로 인해 제거된 edge 수: 1641
[nba] sens=0: 29

  return torch.sparse.FloatTensor(indices, values, shape)


[german] sens=0: 310, sens=1: 690
--------------------------------------------------
Loading german dataset from ./dataset/NIFTY
[german] sens=0: 310, sens=1: 690
--------------------------------------------------
Loading german dataset from ./dataset/NIFTY
[german] sens=0: 310, sens=1: 690
--------------------------------------------------
Loading german dataset from ./dataset/NIFTY
[german] sens=0: 310, sens=1: 690
--------------------------------------------------
Loading german dataset from ./dataset/NIFTY
[german] sens=0: 310, sens=1: 690
--------------------------------------------------
Loading german dataset from ./dataset/NIFTY
[german] sens=0: 310, sens=1: 690
--------------------------------------------------


In [None]:
# Evluate Dataset: eval_dt_df
eval_dt = {}
ev_dt_list = ['region_job_r', 'region_job_g', 'region_job_2_r', 'region_job_2_g', 'nba_p', 'german_g']
for ds in ev_dt_list:
    data = dt_dict[ds]['data']
    features = data.x
    edge_index = data.edge_index
    sens = data.sensitive_attr

    eval_dt[ds] = {
        "Homophily Ratio": homophily_ratio(edge_index, sens),
        "Assortativity Coefficient": assortativity_coefficient(data),
        "Local Neighborhood Fairness": local_neighborhood_fairness(edge_index, sens),
        "Degree Balance": degree_balance(edge_index, sens),
        'Structural Bias': structural_bias(features, edge_index, sens)
    }

eval_dt_df = pd.DataFrame(eval_dt).T
print(eval_dt_df)

In [4]:
# Experiments Setting
runs=5
epochs=500

cuda=torch.cuda.is_available()
device = torch.device('cuda:0' if cuda else 'cpu')
print(f"Using device: {device}")

seed=1127
np.random.seed(seed)
torch.manual_seed(seed)
if cuda:
    torch.cuda.manual_seed(seed)

# 공통
lr=0.001
weight_decay=1e-5

# model mapping
model_map = {
    'FnRGNN': lambda data: FnRGNN(nfeat=data.x.size(1), hidden_dim=64, dropout=0.5, lm=3, ld=1, mmd_sample=500, lr=lr, weight_decay=weight_decay),
    'FairGNN': lambda data: FairGNN(nfeat=data.x.size(1), hidden_dim=64, model='GCN', dropout=0.5, hidden=128, alpha=4, beta=0.01, lr=lr, weight_decay=weight_decay),
    'FMP': lambda data: FMP(data, num_hidden=64, num_layers=5, num_gnn_layer=2, lambda1=3, lambda2=3, dropout=0.5, num_classes=1, L2=True, cached=False),
    'GMMD': lambda data: GMMD(in_channels=data.x.size(1), hidden_channels=64),
    'EDITS': lambda data: EDITS(nfeat=data.x.size(1), node_num=data.x.size(0), nfeat_out=int(data.x.size(0) / 10), adj_lambda=1e-1, layer_threshold=2, dropout=0.2, lr=lr, weight_decay=weight_decay),
    'MLPRegressor': lambda data: MLPRegressor(in_dim=data.x.size(1), hidden_dim=64),
    'GCNRegressor': lambda data: GCNRegressor(in_dim=data.x.size(1), hidden_dim=64),
    'GATRegressor': lambda data: GATRegressor(in_dim=data.x.size(1), hidden_dim=64, heads=1),
    'GraphSAGERegressor': lambda data: GraphSAGERegressor(in_dim=data.x.size(1), hidden_dim=64),
    'GINRegressor': lambda data: GINRegressor(in_dim=data.x.size(1), hidden_dim=64),
}


Using device: cuda:0


In [6]:
# Training
tr_md_list = ['EDITS', 'FnRGNN', 'FairGNN', 'FMP', 'GMMD', 'MLPRegressor', 'GCNRegressor', 'GATRegressor', 'GraphSAGERegressor', 'GINRegressor']
tr_dt_list = ['region_job_r', 'region_job_2_r', 'nba_p', 'nba_m', 'german_g', 'german_f', 'german_s', 'german_t', 'german_h', 'german_e']

for ds in tr_dt_list:
    data = dt_dict[ds]['data']
    data = data.to(device)

    cfg = dt_dict[ds]['cfg']
    dn = cfg['dn']

    for md in tr_md_list:
        print(f'Train {md} dataset from {ds}')
        os.makedirs(f'./model/{md}', exist_ok=True) 
        model_path = f'./model/{md}/{dn}_md.pth'

        if md == 'FairGNN':
            print(f'Train FairGNN Sensitive Model dataset from {ds}')
            
            fair_sen_model = GCN(nfeat=data.x.shape[1], nhid=128, nclass=1, dropout=0.5).to(device)
            fair_optimizer = optim.Adam(fair_sen_model.parameters(), lr=lr, weight_decay=weight_decay)
            criterion = torch.nn.MSELoss()
            best_mse = float('inf')
            best_result = {}

            for epoch in range(epochs + 1):    
                fair_sen_model.train()
                fair_optimizer.zero_grad()
                output = fair_sen_model(data.x, data.edge_index)
                loss = criterion(output[data.idx_train], data.y[data.idx_train].unsqueeze(1))
                loss.backward()
                fair_optimizer.step()

                fair_fastmode=False
                if not fair_fastmode:
                    fair_sen_model.eval()
                    with torch.no_grad():
                        output = fair_sen_model(data.x, data.edge_index)
                        mse_val = mean_squared_error(data.sensitive_attr[data.idx_val].cpu().numpy(), output[data.idx_val].cpu().numpy())
                        mse_test = mean_squared_error(data.sensitive_attr[data.idx_test].cpu().numpy(), output[data.idx_test].cpu().numpy())

                if epoch % 50 == 0:
                    print(f"Epoch [{epoch}] Test set results:",
                        f"mse_val={mse_val:.4f}, mse_test={mse_test:.4f}")

                    if mse_val < best_mse:
                        best_mse = mse_val
                        best_result = {'mse': mse_test}
                        torch.save(fair_sen_model.state_dict(), f"./checkpoint/GCN_sens_{ds}_ns_{cfg['sens_number']}")

            print(f"The best MSE of estimator: {best_result['mse']:.4f}")
            print("Optimization Finished!")
        
        for run in range(runs + 1):
            model = model_map[md](data).to(device)

            if md not in ['FnRGNN', 'FairGNN', 'EDITS']:
                optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
                criterion = torch.nn.MSELoss()
            elif md == 'FairGNN':
                try:
                    model.estimator.load_state_dict(torch.load( f"./checkpoint/GCN_sens_{ds}_ns_{cfg['sens_number']}", map_location=torch.device(device) ))
                except Exception as e:
                    print(f"Checkpoint load failed: {e}")
            elif md == 'EDITS':
                features = data.x.to(device).to(torch.float32)
                labels = data.y.to(device).to(torch.float32)
                sens = data.sensitive_attr.to(device).to(torch.float32)

                adj = data.adj
                if isinstance(adj, torch.Tensor):
                    adj = adj.to(device).to(torch.float32)
                else:
                    adj = torch.FloatTensor(adj.toarray()).to(device).to(torch.float32)

                idx_train = data.idx_train
                idx_val = data.idx_val
                idx_test = data.idx_test
            
            best_score = float('inf')
            best_model_state = None

            for epoch in range(epochs + 1):
                if md == 'FnRGNN':
                    loss = model.optimize(data)
                elif md == 'FairGNN':
                    model.optimize(data)
                elif md == 'EDITS':
                    lr_adj = 0.001 if epoch > 400 else 0.003
                    model.train()
                    model.optimize(adj, features, idx_train, sens, epoch, lr_adj)
                elif md == 'GMMD':
                    model.train()
                    optimizer.zero_grad()
                    pred, mmd_loss = model(data)
                    pred = pred.squeeze()
                    loss = mmd_loss + criterion(pred[data.idx_train], data.y[data.idx_train].squeeze())
                    loss.backward()
                    optimizer.step()
                else:
                    model.train()
                    optimizer.zero_grad()
                    output = model(data)
                    loss = criterion(output[data.idx_train], data.y[data.idx_train].unsqueeze(1))
                    loss.backward()
                    optimizer.step()

                # validation
                model.eval()
                with torch.no_grad():
                    if md == 'EDITS':
                        adj_sparse = to_scipy_sparse_matrix(data.edge_index).tocoo()
                        adj = sparse_mx_to_torch_sparse_tensor(adj_sparse).to(device)
                        features = data.x
                        if not hasattr(model.adj_renew, "estimator"):
                            model.adj_renew.fit(adj, lr=0.003)
                        output = model(adj, features)[2]
                    elif md in ['FnRGNN', 'FairGNN', 'GMMD']:
                        output, _ = model(data)
                    else:
                        output = model(data)

                    y_true, idx_val, sensitive_attr = data.y, data.idx_val, data.sensitive_attr
                    mse_val = mean_squared_error(y_true[idx_val].cpu(), output[idx_val].cpu())
                    mae_val = mean_absolute_error(y_true[idx_val].cpu(), output[idx_val].cpu())
                    mse_diff, mae_diff, mean_diff = fair_metric_regression(output[idx_val].cpu(), y_true[idx_val].cpu(), sensitive_attr[idx_val].cpu())
                    dist_val = group_distribution_metrics(y_true[idx_val].cpu().numpy().squeeze(), output[idx_val].cpu().numpy().squeeze(), sensitive_attr[idx_val].cpu().numpy())
                    
                    if md == 'FnRGNN':
                        val_score = mse_val + 0.5 * mean_diff
                    else:
                        val_score = mse_val

                    if val_score < best_score:
                        best_mse = val_score
                        best_model_state = copy.deepcopy(model.state_dict())
                        torch.save(best_model_state, model_path)
                    
                    if epoch % 50 == 0:
                        print(f"[{md}] Run {run}, Epoch {epoch} | "
                            f"Val MSE: {mse_val:.2f}, MAE: {mae_val:.2f}, \n"
                            f"MSE Diff: {mse_diff:.2f}, MAE Diff: {mae_diff:.2f}, MEAN Diff: {mean_diff:.2f}, "
                            f"Wasserstein Diff: {dist_val['wasserstein_diff']:.2f}, JS Diff: {dist_val['js_diff']:.2f}"
                        )

Train EDITS dataset from region_job_r


OutOfMemoryError: CUDA out of memory. Tried to allocate 17.12 GiB. GPU 0 has a total capacity of 11.99 GiB of which 0 bytes is free. Of the allocated memory 17.60 GiB is allocated by PyTorch, and 145.20 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Testing
ts_md_list = ['FnRGNN', 'FairGNN', 'FMP', 'GMMD', 'EDITS', 'MLPRegressor', 'GCNRegressor', 'GATRegressor', 'GraphSAGERegressor', 'GINRegressor']
ts_dt_list = ['region_job_r', 'region_job_2_r', 'nba_p', 'nba_m', 'german_g', 'german_f', 'german_s', 'german_t', 'german_h', 'german_e']

ts_results = {}
for ds in ts_dt_list:
    data = dt_dict[ds]['data']
    data = data.to(device)

    cfg = dt_dict[ds]['cfg']
    dn = cfg['dn']

    md_results = {}
    for md in ts_md_list:
        print(f'Test {md} dataset from {ds}')

        result = []
        for run in range(runs + 1):
            model = model_map[md](data).to(device)
            model.load_state_dict(torch.load(model_path))
            
            model.eval()
            with torch.no_grad():
                if md == 'EDITS':
                    adj_sparse = to_scipy_sparse_matrix(data.edge_index).tocoo()
                    adj = sparse_mx_to_torch_sparse_tensor(adj_sparse).to(device)
                    features = data.x
                    if not hasattr(model.adj_renew, "estimator"):
                        model.adj_renew.fit(adj, lr=0.003)
                    output = model(adj, features)[2]
                elif md in ['FnRGNN', 'FairGNN', 'GMMD']:
                    output, _ = model(data)
                else:
                    output = model(data)

                y_true, idx_test, sensitive_attr = data.y, data.idx_test, data.sensitive_attr
                mse_test = mean_squared_error(y_true[idx_test].cpu(), output[idx_test].cpu())
                mae_test = mean_absolute_error(y_true[idx_test].cpu(), output[idx_test].cpu())
                mse_diff, mae_diff, mean_diff = fair_metric_regression(output[idx_test].cpu(), y_true[idx_test].cpu(), sensitive_attr[idx_test].cpu())
                dist_test = group_distribution_metrics(y_true[idx_test].cpu().numpy().squeeze(), output[idx_test].cpu().numpy().squeeze(), sensitive_attr[idx_test].cpu().numpy())
                
            result.append([mse_test, mae_test, mse_diff, mae_diff, mean_diff, dist_test])
                
        md_results[ds] = result

    ts_results[md] = md_results