In [1]:
import pickle
import random
import numpy as np
import networkx as nx
from sklearn.decomposition import PCA

import torch

from smartsampling import *
from evaluation import *

In [2]:
# Reproducibility setup: a single seed is pushed into every random number
# generator this notebook touches (Python, NumPy, PyTorch).
seed = 1

# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Seed the Python, NumPy and PyTorch generators, in that order.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)

# The CUDA generator is seeded explicitly only when the GPU is the chosen device.
if device == "cuda":
    torch.cuda.manual_seed(seed)

In [3]:
# Load the pre-computed node features from disk.
# The context manager guarantees the file handle is closed once the payload
# has been read (the bare `pickle.load(open(...))` form never closed it).
# NOTE(review): pickle can execute arbitrary code while loading; only read
# files that were produced by a trusted run of this project.
with open('data/node_features.pkl', 'rb') as node_features_file:
    node_features = pickle.load(node_features_file)

# Optional inputs, currently disabled:
# coauthors_matrix = pickle.load(open('data/coauthors_matrix.pkl', 'rb'))
# shortest_path_length = pickle.load(open('data/shortest_path_length.pkl', 'rb'))

In [4]:
# Stack the values stored in `node_features` into a single array.
# presumably one feature vector per node, giving (nnodes, feature_dim) — TODO confirm
text_features = np.array([vector for vector in node_features.values()])

# Fit a 100-component PCA on the stacked features, then project the same
# features onto those components.
pca = PCA(n_components=100).fit(text_features)
emb_features = pca.transform(text_features)

# Size of the first axis of the stacked feature array.
nnodes = len(text_features)
# Same device selection as in the seeding cell, recomputed here.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Graph construction, currently disabled:
# G = nx.from_scipy_sparse_matrix(coauthors_matrix)
# G = list(G.subgraph(c) for c in nx.connected_components(G))[0].copy()

In [5]:
# --- Search spaces handed to SmartSampling ---------------------------------
# NOTE(review): p/q look like node2vec-style walk parameters and c some
# per-sampler integer setting — confirm against the smartsampling module.
p_space = [0.2, 0.5, 1, 2, 5]
q_space = list(p_space)        # same candidate values as p_space, separate list object
c_space = list(range(1, 6))    # [1, 2, 3, 4, 5]

# --- Model / optimisation settings -----------------------------------------
nsamplers = 1000       # passed to the SmartSampling constructor
lr = 0.005             # passed to the SmartSampling constructor
weight_decay = 5e-4    # passed to the SmartSampling constructor
dropout = 0.4          # passed to the SmartSampling constructor
nepoch = 100           # first argument of model.train
penalty = 3e-4         # second argument of model.train

# --- Evaluation protocol ----------------------------------------------------
ratio = 0.1            # only used by the (commented-out) prepare() step
nfold = 1              # number of folds to run; the original trailing note reads "#5"

In [6]:
# Disabled one-off preprocessing step, kept for reference.
# For each fold it would call prepare() on a copy of the graph G with `ratio`
# and pickle the resulting (train_G, prediction_links) pair to
# 'link_prediction/input_<fold>.pkl' (folds numbered from 1) — the same files
# the training cell reads. Re-enabling it also requires G, whose construction
# is currently commented out as well.
# for fold in range(nfold):
#     train_G, prediction_links = prepare(G.copy(), ratio)
#     pickle.dump((train_G, prediction_links), open('link_prediction/input_{}.pkl'.format(fold+1), 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
#     print(fold+1)

In [7]:
# Per-fold link-prediction experiment.
# For every fold: load the prepared training graph and held-out links, train a
# SmartSampling model, persist its weights and embeddings, then score the
# embeddings with lp_evaluate.
lp_results = []    # one (accuracy, F1) tuple per fold
save_results = []  # one formatted result line per fold, written to the log later

for fold in range(1, nfold+1):
    print("Start Fold: {}".format(fold))

    # Read the fold's inputs; `with` closes the handle as soon as loading is done.
    # NOTE(review): pickle can execute arbitrary code on load — trusted files only.
    with open('link_prediction/input_{}.pkl'.format(fold), 'rb') as input_file:
        train_G, prediction_links = pickle.load(input_file)

    model = SmartSampling(text_features, emb_features, p_space, q_space, c_space,
                      nnodes, nsamplers, lr, weight_decay, dropout, device,
                      train_G)
    embeddings = model.train(nepoch, penalty)

    # Persist the trained weights and the learned embeddings for this fold.
    torch.save(model.state_dict(), 'link_prediction/model_{}.pkl'.format(fold))
    # Closing the handle explicitly guarantees the embeddings are flushed to disk.
    with open('link_prediction/embeddings_{}.pkl'.format(fold), 'wb') as embeddings_file:
        pickle.dump(embeddings, embeddings_file, protocol=pickle.HIGHEST_PROTOCOL)

    # lp_evaluate receives the embeddings as a NumPy array on the host.
    lp_acc, lp_f1 = lp_evaluate(embeddings.cpu().numpy(), prediction_links)
    lp_results.append((lp_acc, lp_f1))
    save_results.append("Fold: {}, Acc: {:.4f}, F1: {:.4f}".format(fold, lp_acc, lp_f1))
    print(save_results[-1])

    # Drop the model and release cached GPU memory before the next fold.
    del model
    torch.cuda.empty_cache()
    
# Summarise the per-fold scores and append the run to the text log.
# `results` has one row per fold and one column per metric (accuracy, F1);
# `analysis` holds a (mean, std) pair for each metric column.
results = np.array(lp_results)
analysis = [(np.mean(metric), np.std(metric)) for metric in results.T]

print()
# analysis[0] is the accuracy (mean, std), analysis[1] the F1 (mean, std).
final_results = "Final results: Acc = {:.4f} +- {:.4f}, F1 = {:.4f} +- {:.4f}".format(
    *analysis[0], *analysis[1])
print(final_results)

parameters = "Parameters: nepoch: {}, nsampler: {}, penalty: {}".format(
    nepoch, nsamplers, penalty)

# Append mode keeps the results of earlier runs; the context manager closes
# the file, so no explicit close() call is needed.
with open('train_link_prediction.txt','a') as log_file:
    log_file.write(parameters+'\n')
    for save_result in save_results:
        log_file.write(save_result+'\n')
    log_file.write(final_results+'\n')
    log_file.write('\n')

Start Fold: 1
Epoch: 20, Gain: 9.0649, Time: 111.5093s
Epoch: 40, Gain: 11.2474, Time: 98.7959s
Epoch: 60, Gain: 16.1540, Time: 98.5421s
Epoch: 80, Gain: 19.2911, Time: 98.4587s
Epoch: 100, Gain: 19.2616, Time: 100.1872s
Fold: 1, Acc: 0.5047, F1: 0.5584

Final results: Acc = 0.5047 +- 0.0000, F1 = 0.5584 +- 0.0000
