In [14]:
import scipy.io as sio
from scipy.sparse import csc_matrix
import numpy as np
import os

In [None]:
def reformat_data_to_mat(dataset, graph1, graph2, gnd_name, ratio):
    """Re-key a raw ``.mat`` dataset and attach a sparse matrix of sampled anchor pairs.

    Reads ``datasets/mat/{dataset}.mat`` and writes ``datasets/{dataset}.mat`` with
    these entries:

    * ``graph1`` / ``graph2``: copied unchanged from the input file.
    * ``{graph1}_node_feat`` / ``{graph2}_node_feat``: copied from the input's
      ``{graph1}_node_label`` / ``{graph2}_node_label`` entries.
    * ``gnd``: the input's ``gnd_name`` entry, unchanged.
    * ``H``: a CSC matrix of shape ``(n2, n1)`` holding a 1 at ``[j, i]`` for every
      sampled ground-truth row ``(i, j)``, after shifting the stored indices by -1.

    :param dataset: base name of the ``.mat`` file (without extension)
    :param graph1: key of the first graph in the input file
    :param graph2: key of the second graph in the input file
    :param gnd_name: key of the ground-truth pair array in the input file
    :param ratio: fraction of ground-truth rows to sample into ``H``
    :return: None (the result is written to disk)
    """
    data = sio.loadmat(f"datasets/mat/{dataset}.mat")
    gnd = data[gnd_name]

    new_data = {
        graph1: data[graph1],
        graph2: data[graph2],
        f'{graph1}_node_feat': data[f'{graph1}_node_label'],
        f'{graph2}_node_feat': data[f'{graph2}_node_label'],
        'gnd': gnd,
    }

    # loadmat hands scalars back as 2-D arrays of whatever dtype was stored;
    # plain ints are needed for the matrix shape.
    n1, n2 = int(data['n1'][0][0]), int(data['n2'][0][0])

    # A private RandomState seeded with 1234 draws exactly the same sample as
    # np.random.seed(1234) followed by np.random.choice, but leaves the global
    # NumPy RNG untouched for the rest of the notebook.
    rng = np.random.RandomState(1234)
    train_indices = rng.choice(gnd.shape[0], int(gnd.shape[0] * ratio), replace=False)
    train_indices.sort()

    # Shift the stored indices by -1; cast first so unsigned/float storage
    # dtypes cannot wrap around or be rejected as indices.
    train = gnd[train_indices].astype(np.int64) - 1

    # Build the transposed anchor matrix directly in sparse form instead of
    # allocating a dense n1 x n2 array first. Out-of-range (e.g. negative)
    # indices now raise instead of silently wrapping.
    H = csc_matrix(
        (np.ones(train.shape[0]), (train[:, 1], train[:, 0])),
        shape=(n2, n1),
    )
    # The constructor sums duplicate coordinates; clamp so repeated pairs still
    # yield a single 1, as dense assignment would.
    H.data[:] = 1.0
    new_data['H'] = H

    sio.savemat(f"datasets/{dataset}.mat", new_data)

In [None]:
# Convert Douban ("online" vs. "offline" graphs); 20% of the "ground_truth" pairs are sampled into H.
reformat_data_to_mat("Douban", "online", "offline", "ground_truth", 0.2)

In [None]:
# Convert flickr-lastfm ("flickr" vs. "lastfm" graphs); 20% of the "gndtruth" pairs are sampled into H.
reformat_data_to_mat("flickr-lastfm", "flickr", "lastfm", "gndtruth", 0.2)

In [47]:
def reformat_data_to_np(dataset, graph1, graph2):
    """Export ``datasets/{dataset}.mat`` as ``datasets/np/{dataset}.npz``.

    The archive contains:

    * ``edge_index1`` / ``edge_index2``: ``(2, E)`` arrays with the row and column
      indices of every adjacency entry equal to 1 in ``graph1`` / ``graph2``.
    * ``x1`` / ``x2``: the ``{graph}_node_feat`` entries as dense float64 arrays,
      or ``None`` when the key is absent.
    * ``pos_pairs``: ``(k, 2)`` array of positions where ``H.T == 1``.
    * ``test_pairs``: rows of ``gnd - 1`` that do not appear in ``pos_pairs``.

    The shapes of the four index arrays are printed as a sanity check.

    :param dataset: base name of the ``.mat`` file (without extension)
    :param graph1: key of the first adjacency matrix in the file
    :param graph2: key of the second adjacency matrix in the file
    :return: None (the result is written to disk)
    """
    def setdiff(a, b):
        """
        Find the difference of two 2D arrays by excluding common rows.

        The result holds the unique rows of ``a`` that are absent from ``b``,
        in sorted order (as produced by ``np.setdiff1d``).

        :param a: array 1 (n1 x 2)
        :param b: array 2 (n2 x 2)
        :return: c: difference of a and b (n3 x 2); shape (0, 2) when empty
        """
        # View each row as one structured record so 1-D set operations apply.
        dtype = [('col1', a.dtype), ('col2', a.dtype)]
        structured_a = np.array([tuple(row) for row in a], dtype=dtype)
        structured_b = np.array([tuple(row) for row in b], dtype=dtype)

        structured_diff = np.setdiff1d(structured_a, structured_b)

        # reshape keeps the result two-dimensional even when no rows remain.
        return np.array(structured_diff.tolist(), dtype=a.dtype).reshape(-1, 2)

    def to_dense(m):
        """Return ``m`` as a plain ndarray, densifying sparse containers."""
        # The sparse ``.A`` shortcut no longer exists in recent SciPy releases;
        # ``toarray()`` is the supported spelling.
        return m.toarray() if hasattr(m, "toarray") else np.asarray(m)

    data = sio.loadmat(f"datasets/{dataset}.mat")

    A1 = to_dense(data[graph1].astype(int))
    A2 = to_dense(data[graph2].astype(int))
    H = to_dense(data['H'].astype(int))
    # Stored ground-truth indices are shifted by -1 before use.
    gnd = data['gnd'].astype(np.int64) - 1

    x1, x2 = None, None
    if f'{graph1}_node_feat' in data:
        x1 = to_dense(data[f'{graph1}_node_feat'].astype(np.float64))
    if f'{graph2}_node_feat' in data:
        x2 = to_dense(data[f'{graph2}_node_feat'].astype(np.float64))

    edge_index1 = np.array(np.where(A1 == 1))
    edge_index2 = np.array(np.where(A2 == 1))
    print("edge_index1: ", edge_index1.shape)
    print("edge_index2: ", edge_index2.shape)

    anchor_links = np.array(np.where(H.T == 1)).T
    test_pairs = setdiff(gnd, anchor_links)
    print("anchor_links: ", anchor_links.shape)
    print("test_pairs: ", test_pairs.shape)

    # exist_ok avoids the check-then-create race and also creates "datasets".
    os.makedirs("datasets/np", exist_ok=True)
    # NOTE: a missing feature matrix is stored as None, i.e. an object array
    # that needs allow_pickle=True when the archive is loaded.
    np.savez(f'datasets/np/{dataset}.npz', edge_index1=edge_index1, edge_index2=edge_index2, x1=x1, x2=x2, pos_pairs=anchor_links, test_pairs=test_pairs)

In [41]:
# Export ACM-DBLP-A (graphs "ACM" and "DBLP") to datasets/np/ACM-DBLP-A.npz; prints the resulting array shapes.
reformat_data_to_np("ACM-DBLP-A", "ACM", "DBLP")

edge_index1:  (2, 79122)
edge_index2:  (2, 89616)
anchor_links:  (1265, 2)
test_pairs:  (5060, 2)


In [48]:
# Export foursquare-twitter (graphs "foursquare" and "twitter") to datasets/np/foursquare-twitter.npz.
reformat_data_to_np("foursquare-twitter", "foursquare", "twitter")

edge_index1:  (2, 108466)
edge_index2:  (2, 261150)
anchor_links:  (321, 2)
test_pairs:  (1288, 2)


In [49]:
# Export cora (graphs "cora1" and "cora2") to datasets/np/cora.npz.
reformat_data_to_np("cora", "cora1", "cora2")

edge_index1:  (2, 12668)
edge_index2:  (2, 9084)
anchor_links:  (541, 2)
test_pairs:  (2167, 2)


In [50]:
# Export phone-email (graphs "phone" and "email") to datasets/np/phone-email.npz.
reformat_data_to_np("phone-email", "phone", "email")

edge_index1:  (2, 82382)
edge_index2:  (2, 9255)
anchor_links:  (200, 2)
test_pairs:  (800, 2)
