In [None]:
import os
import pickle
import time

import networkx as nx
import numpy as np
import scipy.io

import func
import graph
import task
from algorithms import dissim, netmf

#### prepare random states for train-test-split in node classification (for reproduction)

In [None]:
# Random states for the train/test splits used in node classification:
# one row per round, one column per sub-round.
#
# The matrix is drawn once and then reloaded on every later run. Redrawing it
# each time the cell is executed would overwrite the saved states with new
# random values and make earlier experiments impossible to reproduce.
# Delete the file to force a fresh draw.
rounds = 10
sub_rounds = 10
file_rs = 'node_classification_rs.npy'
if os.path.exists(file_rs):
    rs_mat = np.load(file_rs)
    if rs_mat.shape != (rounds, sub_rounds):
        raise ValueError(
            'saved random states in %s have shape %s, expected %s; '
            'delete the file to regenerate them'
            % (file_rs, rs_mat.shape, (rounds, sub_rounds)))
else:
    # integers drawn uniformly from [1, 1000)
    rs_mat = np.random.randint(low=1, high=1000, size=(rounds, sub_rounds), dtype=int)
    np.save(file_rs, rs_mat)
# print(rs_mat)

#### prepare node labels for node classification

In [None]:
def save_list_str(data, file):
    """Write the items of ``data`` to the text file ``file``, one per line.

    Each item is formatted with ``%s`` and followed by a newline. The file is
    opened in write mode, so existing content is replaced. Returns None.
    """
    with open(file, 'w') as out:
        out.writelines('%s\n' % entry for entry in data)

def read_list_str(file):
    """Read the text file ``file`` into a list of strings, one per line.

    Only the line terminator is removed from each line. A final line that has
    no trailing newline is returned in full (slicing off the last character
    unconditionally would drop a real character in that case).
    """
    with open(file, 'r') as fp:
        return [line.rstrip('\n') for line in fp]

# Build and save the binary label-indicator matrix Y (one row per node, one
# column per label) for a multi-label dataset, together with the label list.
graph_name = 'PPI'
node_attr = 'label'
file_labels = 'labels/' + graph_name + '_labels.txt'
file_Y = 'labels/' + graph_name + '_Y.npy'

# save data
nx_G = graph.gen_graph(graph_name)
# the count returned by find_labels is not used; n_labels is taken from the list itself
labels, _ = graph.find_labels(nx_G, node_attr)
save_list_str(labels, file_labels)
n_samples = len(nx_G)
n_labels = len(labels)

# label -> column index, built once instead of scanning `labels` for every
# assignment; the first occurrence wins, which matches list.index
label_col = {}
for col, label in enumerate(labels):
    label_col.setdefault(label, col)

Y = np.zeros((n_samples, n_labels), dtype=int)
# nodes(data=node_attr) yields (node, attribute value) pairs; rows follow that order
for row, (_, node_labels) in enumerate(nx_G.nodes(data=node_attr)):
    for label in node_labels:
        Y[row, label_col[label]] = 1
np.save(file_Y, Y)

# load data
# labels = read_list_str(file_labels)
# Y = np.load(file_Y)

#### for single-label datasets (Citeseer and Cora)

In [None]:
# Build and save the integer class vector Y (one entry per node) for a
# single-label dataset.
graph_name = 'Cora'
node_attr = 'label'

file_Y = 'labels/' + graph_name + '_TrueLabels.npy'

# class name -> integer id, for each supported single-label dataset
label_dicts = {
    'Citeseer': {'DB': 0, 'Agents': 1, 'ML': 2, 'HCI': 3, 'IR': 4, 'AI': 5},
    'Cora': {'Probabilistic_Methods': 0, 'Case_Based': 1, 'Neural_Networks': 2, 'Theory': 3,
             'Reinforcement_Learning': 4, 'Rule_Learning': 5, 'Genetic_Algorithms': 6},
}
# fail early on an unsupported dataset instead of silently applying the Cora mapping
if graph_name not in label_dicts:
    raise ValueError('no label mapping defined for dataset %r (expected one of %s)'
                     % (graph_name, sorted(label_dicts)))
label_dict = label_dicts[graph_name]

# save data
nx_G = graph.gen_graph(graph_name)
n_samples = len(nx_G)
Y = np.zeros(n_samples, dtype=int)
# nodes(data=node_attr) yields (node, attribute value) pairs; entries follow that order
for row, (_, node_labels) in enumerate(nx_G.nodes(data=node_attr)):
    assert len(node_labels) == 1  # single label
    Y[row] = label_dict[node_labels[0]]
np.save(file_Y, Y)

# load data
# Y = np.load(file_Y)
# print(Y)

### Prepare matrices (node classification)
#### HOPE: prepare adjacency

In [None]:
# Export the unweighted adjacency matrix of the full graph as a .mat file
# (stored under the key 'adj').
graph_name = 'BlogCatalog'
nx_G = graph.gen_graph(graph_name)
# weight=None makes every edge count as 1; consider undirected graphs here.
# to_numpy_array returns an ndarray directly; to_numpy_matrix was removed in NetworkX 3.0.
A = nx.to_numpy_array(nx_G, weight=None)
file = 'mats/Katz_nc/unweighted_adj/' + graph_name + '.mat'
scipy.io.savemat(file, mdict={'adj': A})

#### Katz (results may differ from those given by Matlab for large graphs)

In [None]:
# Katz similarity on the full graph: S = (I - beta*A)^{-1} (beta*A).
graph_name = 'BlogCatalog'
ratio = 0.95  # only used in the output file name
nx_G = graph.gen_graph(graph_name)
graph_size = nx_G.number_of_nodes()
# weight=None makes every edge count as 1.
# to_numpy_array returns an ndarray directly; to_numpy_matrix was removed in NetworkX 3.0.
A = nx.to_numpy_array(nx_G, weight=None)
# per-dataset decay factor
# NOTE(review): presumably precomputed from `ratio` and the spectral radius of A -- confirm
beta_dict = {'Citeseer': 0.0691, 'Cora': 0.0660, 'PPI': 0.0126,'BlogCatalog': 0.0029}
beta = beta_dict[graph_name]
beta_A = beta * A
# solve the linear system instead of forming the inverse explicitly and then
# multiplying: same result mathematically, with less work and rounding error
S = np.linalg.solve(np.eye(graph_size) - beta_A, beta_A)
file = 'mats/Katz_nc/katz/' + graph_name + '_' + str(ratio) + '.npy'
np.save(file, S)

#### prepare deepwalk based similarity

In [None]:
# DeepWalk-based similarity matrix on the full graph.
window = 10
b = 1
graph_name = 'Cora'
file = 'mats/DW_nc/{}_T{}_b{}.npy'.format(graph_name, window, b)
nx_G = graph.gen_graph(graph_name)
# NOTE(review): to_scipy_sparse_matrix was removed in NetworkX 3.0; switching to
# to_scipy_sparse_array requires netmf.comp_S_DW to accept SciPy sparse arrays -- confirm first
A = nx.to_scipy_sparse_matrix(nx_G, weight='weight')
S = netmf.comp_S_DW(A, window, b) # numpy matrix
np.save(file, S)

#### prepare FE based similarities

In [None]:
# inf = 1e8
# graph_name = 'Cora'
# beta_list = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]

# nx_G = graph.gen_graph(graph_name)
# A = np.array(nx.to_numpy_matrix(nx_G, weight='weight'))
# d = np.sum(A, 0)
# C = dissim.adj2cost(A, inf)
# graph_size = len(nx_G)
# H = np.eye(graph_size) - np.ones((graph_size, graph_size))/graph_size

# for beta in beta_list:
#     print(beta)
#     #start = time.time()
#     Delta_FE = dissim.FE(A, C, d, beta)
#     #end = time.time()
#     #print(end-start)
#     K = -0.5*np.matmul(np.matmul(H,Delta_FE),H) 
#     file_fe = 'mats/FE_nc/'+graph_name+'_'+str(beta)+'.npy'
#     np.save(file_fe, K)

In [None]:
# FE dissimilarity matrices on the full graph, one saved file per beta.
inf = 1e8
graph_name = 'Citeseer'
# beta_list = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]
beta_list = [20]

nx_G = graph.gen_graph(graph_name)
# to_numpy_array returns an ndarray directly; to_numpy_matrix was removed in NetworkX 3.0
A = nx.to_numpy_array(nx_G, weight='weight')
d = np.sum(A, 0)  # column sums of the weighted adjacency matrix
# NOTE(review): presumably derives edge costs from A, with `inf` standing in for
# missing edges -- confirm against dissim.adj2cost
C = dissim.adj2cost(A, inf)

for beta in beta_list:
    print(beta)
    Delta_FE = dissim.FE(A, C, d, beta)
    file_fe = 'mats_v2/FE_nc/'+graph_name+'_'+str(beta)+'.npy'
    np.save(file_fe, Delta_FE)

#### prepare data for link prediction

In [None]:
# Create the train/test splits for link prediction: one split per random state
# in `rs`, each saved as a training graph plus edge and label lists.
graph_name = 'Citeseer'
test_ratio = 0.3
rs = [965, 177, 218, 342, 383, 254, 108, 37, 760, 404]
rounds = len(rs) # 10
dir_preprocessed = 'LP_data/' + graph_name + '_' + str(test_ratio) + '/'
# create the output directory up front so it no longer has to exist in advance
os.makedirs(dir_preprocessed, exist_ok=True)
nx_G = graph.gen_graph(graph_name)
for round_i, seed in enumerate(rs):
    train_graph, train_edges, train_labels, test_edges, test_labels = task.lp_split_train_test(nx_G, test_ratio, seed)
    print(round_i, train_graph.number_of_nodes(), len(train_edges), len(test_edges))
    f_train_graph  = dir_preprocessed + 'train_graph_'  + str(round_i) + '.gpickle'
    f_train_edges  = dir_preprocessed + 'train_edges_'  + str(round_i) + '.txt'
    f_train_labels = dir_preprocessed + 'train_labels_' + str(round_i) + '.txt'
    f_test_edges   = dir_preprocessed + 'test_edges_'   + str(round_i) + '.txt'
    f_test_labels  = dir_preprocessed + 'test_labels_'  + str(round_i) + '.txt'
    # nx.write_gpickle was removed in NetworkX 3.0; it was a plain pickle dump
    # with the highest protocol, so this writes the same file on any version
    with open(f_train_graph, 'wb') as fp:
        pickle.dump(train_graph, fp, pickle.HIGHEST_PROTOCOL)
    func.save_list(train_edges, f_train_edges)
    func.save_list(train_labels, f_train_labels)
    func.save_list(test_edges, f_test_edges)
    func.save_list(test_labels, f_test_labels)



In [None]:
# # check the data
# nx_G_edges = list(map(set,list(nx_G.edges())))
# train_graph_edges = list(map(set,list(train_graph.edges())))

# ns = 0
# for edge in train_edges:
#     if train_labels[ns] == 1:
#         assert set(edge) in train_graph_edges
#     else:
#         assert set(edge) not in nx_G_edges
#     ns += 1    

# ns = 0
# for edge in test_edges:
#     if test_labels[ns] == 1:
#         assert set(edge) not in train_graph_edges
#         assert set(edge) in nx_G_edges
#     else:
#         assert set(edge) not in nx_G_edges
#     ns += 1

### Prepare matrices (link prediction)
#### HOPE: prepare adjacency

In [None]:
# Export the unweighted adjacency matrix of every link-prediction training
# graph as a .mat file (stored under the key 'adj').
graph_name = 'BlogCatalog'
test_ratio = 0.2
rounds = 10
dir_preprocessed = 'LP_data/' + graph_name + '_' + str(test_ratio) + '/'
for round_i in range(rounds):
    print(round_i)
    f_train_graph  = dir_preprocessed + 'train_graph_'  + str(round_i) + '.gpickle'
    train_graph  = func.load_list(f_train_graph)
    # weight=None makes every edge count as 1.
    # to_numpy_array returns an ndarray directly; to_numpy_matrix was removed in NetworkX 3.0.
    A = nx.to_numpy_array(train_graph, weight=None)
    file = 'mats/Katz_lp_' + str(test_ratio) + '/unweighted_adj/' + graph_name + '_' + str(round_i) + '.mat'
    scipy.io.savemat(file, mdict={'adj': A})

#### Katz (to be changed)

In [None]:
# Katz similarity on the link-prediction training graphs:
# S = (I - beta*A)^{-1} (beta*A), with one beta per dataset and round.
graph_name = 'BlogCatalog'
test_ratio = 0.5
rounds = 1
dir_preprocessed = 'LP_data/' + graph_name + '_' + str(test_ratio) + '/'
ratio = 0.95  # only used in the output file name
# per-dataset decay factors, indexed by round
# NOTE(review): presumably precomputed from `ratio` and each training graph's spectral radius -- confirm
beta_dict = {'Citeseer': [0.1229, 0.1122, 0.1133, 0.1110, 0.1176, 0.1162, 0.1215, 0.1120, 0.1095, 0.1202],
             'Cora': [0.1049, 0.0967, 0.0955, 0.1044, 0.0964, 0.1015, 0.0977, 0.0922, 0.1017, 0.1018],
             'PPI': [0.0246, 0.0248, 0.0241, 0.0246, 0.0243, 0.0246, 0.0247, 0.0248, 0.0249, 0.0245],
             'BlogCatalog': [0.0058, 0.0058, 0.0058, 0.0058, 0.0058, 0.0058, 0.0058, 0.0058, 0.0058, 0.0058]}
for round_i in range(rounds):
    print(round_i)
    f_train_graph  = dir_preprocessed + 'train_graph_'  + str(round_i) + '.gpickle'
    nx_G  = func.load_list(f_train_graph)
    graph_size = nx_G.number_of_nodes()
    # weight=None makes every edge count as 1.
    # to_numpy_array returns an ndarray directly; to_numpy_matrix was removed in NetworkX 3.0.
    A = nx.to_numpy_array(nx_G, weight=None)
    beta = beta_dict[graph_name][round_i]
    beta_A = beta * A
    # solve the linear system instead of forming the inverse explicitly
    S = np.linalg.solve(np.eye(graph_size) - beta_A, beta_A)
    # TODO: this path does not include test_ratio, unlike the other link-prediction outputs
    file = 'mats/Katz_lp/katz/' + graph_name + '_' + str(ratio) + '_' + str(round_i) + '.npy'
#     np.save(file, S)

#### prepare deepwalk based similarity

In [None]:
# DeepWalk-based similarity matrix for every link-prediction training graph.
window = 10
b = 1
rounds = 10
test_ratio = 0.3
graph_name = 'Citeseer'
dir_preprocessed = 'LP_data/{}_{}/'.format(graph_name, test_ratio)
for round_i in range(rounds):
    print(round_i)
    f_train_graph = '{}train_graph_{}.gpickle'.format(dir_preprocessed, round_i)
    train_graph = func.load_list(f_train_graph)
    # NOTE(review): to_scipy_sparse_matrix was removed in NetworkX 3.0; switching to
    # to_scipy_sparse_array requires netmf.comp_S_DW to accept SciPy sparse arrays -- confirm first
    A = nx.to_scipy_sparse_matrix(train_graph, weight='weight')
    S = netmf.comp_S_DW(A, window, b) # numpy matrix
    file = 'mats/DW_lp_{}/{}_T{}_b{}_{}.npy'.format(test_ratio, graph_name, window, b, round_i)
    np.save(file, S)

#### prepare FE based similarities

In [None]:
# graph_name = 'BlogCatalog'
# test_ratio = 0.2
# rounds = 10
# inf = 1e8
# beta_list = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]
# dir_preprocessed = 'LP_data/' + graph_name + '_' + str(test_ratio) + '/'

# for round_i in range(rounds):
#     print(round_i)
#     f_train_graph  = dir_preprocessed + 'train_graph_'  + str(round_i) + '.gpickle'
#     nx_G  = func.load_list(f_train_graph)
#     A = np.array(nx.to_numpy_matrix(nx_G, weight='weight'))
#     d = np.sum(A, 0)
#     C = dissim.adj2cost(A, inf)
#     graph_size = len(nx_G)
#     H = np.eye(graph_size) - np.ones((graph_size, graph_size))/graph_size
    
#     for beta in beta_list:
#         print(beta)
#         Delta_FE = dissim.FE(A, C, d, beta)
#         K = -0.5*np.matmul(np.matmul(H,Delta_FE),H) 
#         file = 'mats/FE_lp_'+str(test_ratio)+'/'+graph_name+'_'+str(beta)+'_'+str(round_i)+'.npy'
#         np.save(file, K)

In [None]:
# FE dissimilarity matrices for every link-prediction training graph,
# one saved file per (beta, round) pair.
graph_name = 'BlogCatalog'
test_ratio = 0.3
rounds = 10
inf = 1e8
beta_list = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]
dir_preprocessed = 'LP_data/' + graph_name + '_' + str(test_ratio) + '/'

for round_i in range(rounds):
    print(round_i)
    f_train_graph  = dir_preprocessed + 'train_graph_'  + str(round_i) + '.gpickle'
    nx_G  = func.load_list(f_train_graph)
    # to_numpy_array returns an ndarray directly; to_numpy_matrix was removed in NetworkX 3.0
    A = nx.to_numpy_array(nx_G, weight='weight')
    d = np.sum(A, 0)  # column sums of the weighted adjacency matrix
    # NOTE(review): presumably derives edge costs from A, with `inf` standing in for
    # missing edges -- confirm against dissim.adj2cost
    C = dissim.adj2cost(A, inf)

    for beta in beta_list:
        print(beta)
        Delta_FE = dissim.FE(A, C, d, beta)
        file = 'mats_v2/FE_lp_'+str(test_ratio)+'/'+graph_name+'_'+str(beta)+'_'+str(round_i)+'.npy'
        np.save(file, Delta_FE)