In [36]:
from graphMatching import *
from networkx import read_edgelist
from scipy.io import loadmat
from model import *
from utils import *

In [37]:
data = "ACM_DBLP" # args.dataset

device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
train_features = {}
if (data == "ACM_DBLP"):
    train_set = ["ACM", "DBLP"]
    input_dim = 17
    b = np.load('data/ACM-DBLP.npz')
    train_features["ACM"] = [torch.from_numpy(b["x1"]).float()]
    train_features["DBLP"] = [torch.from_numpy(b["x2"]).float()]
    test_pairs = b['test_pairs'].astype(np.int32)
    NUM_HIDDEN_LAYERS = 12
    HIDDEN_DIM = 1024
    output_feature_size = 1024
    lr = 0.0001
    epoch = 100
elif (data == "Douban Online_Offline"):
    a1, f1, a2, f2, test_pairs = load_douban()
    f1 = f1.A
    f2 = f2.A
    train_set = ["Online", "Offline"]
    input_dim = 538
    test_pairs = torch.tensor(np.array(test_pairs, dtype=int)) - 1
    test_pairs = test_pairs.numpy()
    train_features["Online"] = [torch.from_numpy(f1).float()]
    train_features["Offline"] = [torch.from_numpy(f2).float()]
    NUM_HIDDEN_LAYERS = 6
    HIDDEN_DIM = 512
    output_feature_size = 512
    lr = 0.0001
    epoch = 100


In [38]:
""" temp = torch.from_numpy(b["x1"]).float() # G1, features
print(temp.shape)
temp """

' temp = torch.from_numpy(b["x1"]).float() # G1, features\nprint(temp.shape)\ntemp '

In [39]:
""" temp = torch.from_numpy(b["x2"]).float() # G2, features
print(temp.shape)
temp """

' temp = torch.from_numpy(b["x2"]).float() # G2, features\nprint(temp.shape)\ntemp '

In [40]:
""" temp = b["test_pairs"]
print(temp.shape)
temp """

' temp = b["test_pairs"]\nprint(temp.shape)\ntemp '

In [41]:
test_pairs

array([[   0, 6829],
       [   2, 3102],
       [   3, 3584],
       ...,
       [9841, 3392],
       [9850,  306],
       [9868, 9011]], dtype=int32)

In [42]:
train_set

['ACM', 'DBLP']

In [43]:
encoder = "GIN"
use_input_augmentation = True
use_output_augmentation = False
print("Loading training datasets")
train_loader = {}
for dataset in train_set:
    train_loader[dataset] = [load_adj(dataset)]

train_loader

Loading training datasets


{'ACM': [tensor([[0., 1., 1.,  ..., 0., 0., 0.],
          [1., 0., 1.,  ..., 0., 0., 0.],
          [1., 1., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]])],
 'DBLP': [tensor([[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]])]}

In [44]:
train_loader.keys()

dict_keys(['ACM', 'DBLP'])

In [45]:
""" temp = train_loader["ACM"][0]
print(temp.shape)
temp """

' temp = train_loader["ACM"][0]\nprint(temp.shape)\ntemp '

In [46]:
""" temp = train_loader["DBLP"][0]
print(temp.shape)
temp """

' temp = train_loader["DBLP"][0]\nprint(temp.shape)\ntemp '

In [47]:
model = GAE(NUM_HIDDEN_LAYERS,
            input_dim,
            HIDDEN_DIM,
            output_feature_size, activation=F.relu,
            use_input_augmentation=use_input_augmentation,
            use_output_augmentation=use_output_augmentation,
            encoder=encoder).to(device)
model

GAE(
  (base_gcn): GIN(
    (in_proj): Linear(in_features=17, out_features=1024, bias=True)
    (convs): ModuleList(
      (0-13): 14 x GINConv(
        (linear): Linear(in_features=1041, out_features=1024, bias=True)
      )
    )
    (out_proj): Linear(in_features=15360, out_features=1024, bias=True)
  )
)

In [48]:
print("Generating training features")


Generating training features


In [49]:
print("Fitting model")
# fit_GAE_real(data, len(train_set) * (1 + 1), model, epoch, train_loader, train_features, device, lr,test_pairs)
# fit_GAE_real(data, no_samples, GAE, epoch, train_loader, train_features, device, lr, test_pairs):

no_samples = len(train_set) * (1 + 1)
GAE = model
# ---

best_hitAtOne = 0
best_hitAtFive = 0
best_hitAtTen = 0
best_hitAtFifty = 0
optimizer = Adam(GAE.parameters(), lr=lr, weight_decay=5e-4)

for step in tqdm(range(epoch)):
    loss = 0
    
    for dataset in train_loader.keys():
        S = train_loader[dataset][0]
        initial_features = train_features[dataset]
        
        for i in range(len(train_loader[dataset])):
            adj_tensor = train_loader[dataset][i]
            adj = coo_matrix(adj_tensor.numpy())
            adj_norm = preprocess_graph(adj)
            pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
            norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

            adj_label = coo_matrix(S.numpy())
            adj_label = sparse_to_tuple(adj_label)

            adj_norm = torch.sparse.FloatTensor(torch.LongTensor(adj_norm[0].T),
                                                torch.FloatTensor(adj_norm[1]),
                                                torch.Size(adj_norm[2])).to(device)
            adj_label = torch.sparse.FloatTensor(torch.LongTensor(adj_label[0].T),
                                                torch.FloatTensor(adj_label[1]),
                                                torch.Size(adj_label[2])).to(device)

            initial_feature = initial_features[i].to(device)

            weight_mask = adj_label.to_dense().view(-1) == 1
            weight_tensor = torch.ones(weight_mask.size(0))
            weight_tensor[weight_mask] = pos_weight
            weight_tensor = weight_tensor.to(device)
            z = GAE(initial_feature, adj_norm)
            A_pred = torch.sigmoid(torch.matmul(z,z.t()))
            loss += norm * F.binary_cross_entropy(A_pred.view(-1), adj_label.to_dense().view(-1),
                                                        weight=weight_tensor)
    
    optimizer.zero_grad()
    loss = loss / no_samples
    loss.backward()
    optimizer.step()

    # ---
    keys = list(train_loader.keys())
    S1 = train_loader[keys[0]][0]
    S2 = train_loader[keys[1]][0]
    
    adj_S1 = coo_matrix(S1.numpy())
    adj_norm_1 = preprocess_graph(adj_S1)
    adj_norm_1 = torch.sparse.FloatTensor(torch.LongTensor(adj_norm_1[0].T),
                                            torch.FloatTensor(adj_norm_1[1]),
                                            torch.Size(adj_norm_1[2])).to(device)
    adj_S2 = coo_matrix(S2.numpy())
    adj_norm_2 = preprocess_graph(adj_S2)
    adj_norm_2 = torch.sparse.FloatTensor(torch.LongTensor(adj_norm_2[0].T),
                                            torch.FloatTensor(adj_norm_2[1]),
                                            torch.Size(adj_norm_2[2])).to(device)
    if (data == "ACM_DBLP"):
        S1_feat = train_features["ACM"][0]
        S2_feat = train_features["DBLP"][0]
    elif (data == "Douban Online_Offline"):
        S1_feat = train_features["Online"][0]
        S2_feat = train_features["Offline"][0]

    # ---
    S1_emb = GAE(S1_feat.to(device), adj_norm_1).detach()
    S2_emb = GAE(S2_feat.to(device), adj_norm_2).detach()

    D = torch.cdist(S1_emb, S2_emb, 2) # Euclidean distance
    
    if (data == "ACM_DBLP"):
        test_idx = test_pairs[:, 0].astype(np.int32)
        labels = test_pairs[:, 1].astype(np.int32)
    elif (data == "Douban Online_Offline"):
        test_idx = test_pairs[0, :].astype(np.int32)
        labels = test_pairs[1, :].astype(np.int32)
    
    hitAtOne = 0
    hitAtFive = 0
    hitAtTen = 0
    hitAtFifty = 0
    hitAtHundred = 0
    
    # test
    
    for i in range(len(test_idx)): # here
        dist_list = D[test_idx[i]]
        # print(i, test_idx[i], dist_list)
        sorted_neighbors = torch.argsort(dist_list).cpu()
        label = labels[i]
        
        """ if i == 0:
            print(label, sorted_neighbors[0].item(), sorted_neighbors)
            # 6829 6829 tensor([6829, 3102,  601,  ..., 7878, 9701, 2044]) """
        
        for j in range(100):
            if (sorted_neighbors[j].item() == label):
                if (j == 0):
                    hitAtOne += 1
                    hitAtFive += 1
                    hitAtTen += 1
                    hitAtFifty += 1
                    hitAtHundred += 1
                    break
                elif (j <= 4):
                    hitAtFive += 1
                    hitAtTen += 1
                    hitAtFifty += 1
                    hitAtHundred += 1
                    break
                elif (j <= 9):
                    hitAtTen += 1
                    hitAtFifty += 1
                    hitAtHundred += 1
                    break
                elif (j <= 49):
                    hitAtFifty += 1
                    hitAtHundred += 1
                    break
                elif (j <= 100):
                    hitAtHundred += 1
                    break
    
    cur_hitAtOne = hitAtOne / len(test_idx)
    cur_hitAtFive = hitAtFive / len(test_idx)
    cur_hitAtTen = hitAtTen / len(test_idx)
    cur_hitAtFifty = hitAtFifty / len(test_idx)

    if(cur_hitAtOne > best_hitAtOne): best_hitAtOne = cur_hitAtOne
    if (cur_hitAtFive > best_hitAtFive): best_hitAtFive = cur_hitAtFive
    if (cur_hitAtTen > best_hitAtTen): best_hitAtTen = cur_hitAtTen
    if (cur_hitAtFifty > best_hitAtFifty): best_hitAtFifty = cur_hitAtFifty

Fitting model


100%|██████████| 100/100 [05:35<00:00,  3.36s/it]


In [50]:
print("The best results achieved:")
print("Hit@1: ", end="")
print(best_hitAtOne)
print("Hit@5: ", end="")
print(best_hitAtFive)
print("Hit@10: ", end="")
print(best_hitAtTen)
print("Hit@50: ", end="")
print(best_hitAtFifty)

The best results achieved:
Hit@1: 0.7363636363636363
Hit@5: 0.9144268774703558
Hit@10: 0.9515810276679841
Hit@50: 0.9814229249011858


In [51]:
D.shape

torch.Size([9872, 9916])

In [52]:
D

tensor([[17.2530, 18.2046, 16.9607,  ..., 17.5494, 17.1913, 16.9879],
        [15.4623, 14.3858, 15.0783,  ..., 15.6386, 15.3168, 15.1629],
        [20.6827, 21.3635, 20.4721,  ..., 20.8756, 20.5913, 20.4415],
        ...,
        [ 2.2970,  4.4574,  1.3301,  ...,  3.6336,  2.2012,  1.8838],
        [ 1.9505,  4.5780,  1.9546,  ...,  3.5356,  1.7899,  0.4235],
        [ 1.7773,  4.6257,  1.7787,  ...,  3.5053,  1.4415,  1.2437]],
       device='cuda:0')

In [53]:
def hungarian(D):
    print("0")
    P = torch.zeros_like(D)
    matrix = D.tolist()
    m = Munkres()
    print("1")
    indexes = m.compute(matrix)
    print("2")
    total = 0
    for r, c in tqdm(indexes):
        print(r)
        P[r][c] = 1
        total += matrix[r][c]
    return P.t()

# hungarian(D)

Truth

In [54]:
print(test_pairs.shape)
print(test_pairs)

(5060, 2)
[[   0 6829]
 [   2 3102]
 [   3 3584]
 ...
 [9841 3392]
 [9850  306]
 [9868 9011]]


In [55]:
if (data == "ACM_DBLP"):
    test_pairs_ = test_pairs
elif (data == "Douban Online_Offline"):
    test_pairs_ = test_pairs.T
    
truth = test_pairs_[test_pairs_[:, 1].argsort()]
print(truth.shape)
truth

(5060, 2)


array([[3615,    1],
       [1302,    2],
       [ 466,    3],
       ...,
       [6148, 9907],
       [3826, 9911],
       [7275, 9915]], dtype=int32)

Option 1

In [56]:
import numpy as np
from scipy.optimize import linear_sum_assignment

def hungarian_algorithm(cost_matrix):
    # Use the linear_sum_assignment method from scipy
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    
    # Total cost
    total_cost = cost_matrix[row_ind, col_ind].sum()
    
    # The assignments are returned as (row, col) pairs
    assignments = list(zip(row_ind, col_ind))
    
    return total_cost, assignments

total_cost, assignments = hungarian_algorithm(D.cpu())

# print(f"Total Cost: {total_cost}")
# print(f"Assignments: {assignments}")

In [57]:
option1 = np.array(assignments)
option1 = option1[option1[:, 1].argsort()]
print(option1.shape)
option1

(9872, 2)


array([[2985,    0],
       [3615,    1],
       [1637,    2],
       ...,
       [2873, 9913],
       [8365, 9914],
       [7276, 9915]])

In [58]:
def test_matching(truth, test):
    matching = []
    test = test.tolist()
    truth = truth.tolist()

    for item in test:
        if item in truth:
            matching.append(item)
    return matching

matching = test_matching(truth, option1)
print(len(matching))
print(len(matching) / len(truth))
matching

3394
0.6707509881422925


[[3615, 1],
 [466, 3],
 [1971, 4],
 [685, 9],
 [5778, 11],
 [4103, 12],
 [7144, 14],
 [3608, 15],
 [1928, 18],
 [3708, 20],
 [1304, 23],
 [3382, 28],
 [1141, 29],
 [480, 31],
 [2369, 33],
 [2437, 39],
 [1624, 41],
 [279, 42],
 [4986, 43],
 [610, 46],
 [6660, 59],
 [9066, 60],
 [3513, 62],
 [7322, 65],
 [7778, 67],
 [1625, 68],
 [526, 70],
 [4932, 71],
 [566, 72],
 [1602, 77],
 [3787, 78],
 [4798, 82],
 [344, 83],
 [1322, 84],
 [5151, 87],
 [4019, 94],
 [1475, 97],
 [7869, 98],
 [2067, 100],
 [7975, 105],
 [1108, 106],
 [4160, 107],
 [5080, 109],
 [2530, 110],
 [738, 114],
 [5234, 115],
 [1981, 118],
 [408, 126],
 [1608, 127],
 [1075, 134],
 [6379, 136],
 [9661, 137],
 [7440, 139],
 [730, 141],
 [2384, 143],
 [5862, 147],
 [3997, 148],
 [597, 154],
 [6635, 156],
 [364, 157],
 [2438, 158],
 [135, 162],
 [839, 164],
 [4735, 165],
 [1421, 168],
 [6376, 175],
 [3841, 179],
 [1420, 180],
 [3508, 182],
 [5235, 183],
 [2871, 194],
 [1149, 195],
 [2924, 196],
 [6168, 199],
 [7241, 200],
 [6696,

Option 2

In [59]:
import pygmtools as pygm

X = pygm.hungarian(D.cpu().numpy())
print(X.shape)
X

(9872, 9916)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [60]:
indices = []
row, col = X.shape
for i in range(row):
    for j in range(col):
        if X[i][j] == 1:
            indices.append([i, j])
option2 = np.array(indices)
option2 = option2[option2[:, 1].argsort()]
print(option2.shape)
option2

(9872, 2)


array([[6393,    0],
       [9026,    1],
       [9213,    2],
       ...,
       [6411, 9913],
       [6278, 9914],
       [  32, 9915]])

In [61]:
matching = test_matching(truth, option2)
print(len(matching))
print(len(matching) / len(truth))
matching

0
0.0


[]

Option 3

In [62]:
indices = []
for i in range(D.shape[0]):
    dist_list = D[i]
    sorted_neighbors = torch.argsort(dist_list).cpu()
    indices.append([i, sorted_neighbors[0]])

option3 = np.array(indices)
option3 = option3[option3[:, 1].argsort()]
print(option3.shape)
option3

(9872, 2)


array([[2985,    0],
       [8494,    1],
       [3615,    1],
       ...,
       [3826, 9911],
       [7920, 9911],
       [2873, 9913]])

In [63]:
matching = test_matching(truth, option3)
print(len(matching))
print(len(matching) / len(truth))
matching

3437
0.6792490118577075


[[3615, 1],
 [1302, 2],
 [466, 3],
 [1971, 4],
 [2855, 7],
 [685, 9],
 [5778, 11],
 [4103, 12],
 [7144, 14],
 [3608, 15],
 [1928, 18],
 [7646, 21],
 [1304, 23],
 [3382, 28],
 [1141, 29],
 [480, 31],
 [9099, 32],
 [7672, 38],
 [2437, 39],
 [1624, 41],
 [279, 42],
 [610, 46],
 [5633, 47],
 [6788, 54],
 [6660, 59],
 [9066, 60],
 [3513, 62],
 [7322, 65],
 [7778, 67],
 [1625, 68],
 [526, 70],
 [4932, 71],
 [566, 72],
 [1602, 77],
 [3787, 78],
 [4798, 82],
 [344, 83],
 [1322, 84],
 [5151, 87],
 [5731, 88],
 [4019, 94],
 [1475, 97],
 [7869, 98],
 [2067, 100],
 [3806, 101],
 [7975, 105],
 [1108, 106],
 [4160, 107],
 [5080, 109],
 [2530, 110],
 [738, 114],
 [5234, 115],
 [1981, 118],
 [408, 126],
 [1608, 127],
 [6180, 131],
 [6379, 136],
 [9661, 137],
 [2207, 140],
 [730, 141],
 [2384, 143],
 [3171, 146],
 [5862, 147],
 [3997, 148],
 [4691, 151],
 [597, 154],
 [6635, 156],
 [364, 157],
 [2438, 158],
 [135, 162],
 [839, 164],
 [4735, 165],
 [1421, 168],
 [3841, 179],
 [1420, 180],
 [7753, 181],


---

In [None]:
a = torch.tensor([[0.0,  0.0], [0.0, 1.0], [0.0,  2.0]])
print(a)
b = torch.tensor([[0.0, 1.0 ], [1.0,  1.0]])
print(b)
torch.cdist(a, b, p=2)

In [None]:
a = torch.tensor([[4.01, 3.0, 2.0, 0.1, 4.0]])
print(a)
torch.argsort(a, dim=1)

In [None]:
a = np.array([[9, 2, 3],
              [4, 5, 6],
              [7, 0, 5]])

a[a[:, 2].argsort()]
