# In [None]:
import numpy as np
import random
from tqdm import *
import os
import sklearn.preprocessing
from utils import *
from graph_utils import *
from rank_metrics import *

import time

# Read the run configuration and build a descriptive name from the walk
# settings it carries.
params = get_cmdline_params()
model_name = "STHgraph_{}_{}_step{}".format(
    params.walk_type,
    params.modelinfo,
    params.walk_steps,
)

##################################################################################################

# Resolve the file names for the chosen dataset, then load the dataset from
# the bag-of-words file.
nameManager = createGraphNameManager(params.dataset)
data = Load_Graph_Dataset(nameManager.bow_fn)

# Print the dataset dimensions, one line per quantity.
dataset_summary = (
    ('num train:{}', data.n_trains),
    ('num test:{}', data.n_tests),
    ('num vocabs:{}', data.n_feas),
    ('num labels:{}', data.n_tags),
)
for template, count in dataset_summary:
    print(template.format(count))

##################################################################################################

# Graph wrappers for the train and test splits.
train_graph = GraphData(nameManager.train_graph)
test_graph = GraphData(nameManager.test_graph)
    
#################################################################################################

from scipy.sparse.linalg import eigsh
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import LinearSVC

class STH:
    """Hashing model that learns binary codes from a document graph.

    Training builds a sparse similarity matrix from graph neighbourhoods,
    solves a generalized eigenproblem on the resulting Laplacian, binarizes
    the eigenvectors with ``MedianHashing`` and finally fits one
    ``LinearSVC`` per bit so that unseen documents can be encoded.
    """

    def __init__(self, num_bits):
        """Allocate one linear SVM per output bit.

        Args:
            num_bits: Number of bits in the binary code; one ``LinearSVC``
                is created for each bit.
        """
        super().__init__()

        self.num_bits = num_bits
        self.clfs = [LinearSVC() for _ in range(num_bits)]

    def create_weight_matrix(self, train_mat, num_train, graph):
        """Build the sparse node-to-neighbour similarity matrix.

        For every node ``i`` in ``range(num_train)`` the neighbours are read
        from ``graph.graph[i]`` and weighted by the cosine similarity between
        row ``i`` and the neighbour rows of ``train_mat``. A node without
        neighbours is linked to itself so its row is never empty.

        Args:
            train_mat: Feature matrix indexed by node id. Row indexing must
                yield a 2-D result accepted by ``cosine_similarity``
                (presumably a scipy sparse matrix -- confirm with the loader).
            num_train: Number of training nodes; also the side length of the
                returned square matrix.
            graph: Object whose ``graph`` attribute maps a node id to a
                sequence of neighbour ids.

        Returns:
            A ``coo_matrix`` of shape ``(num_train, num_train)``.
        """
        rows = []
        columns = []
        weights = []
        for node_id in range(num_train):
            neighbors = graph.graph[node_id]
            # len() rather than truthiness: the container type is not known
            # here and may not define an unambiguous boolean value.
            if len(neighbors) == 0:
                neighbors = [node_id]

            sims = cosine_similarity(train_mat[node_id], train_mat[neighbors])

            rows.extend([node_id] * len(neighbors))
            columns.extend(neighbors)
            weights.extend(sims[0])

        return coo_matrix((weights, (rows, columns)), shape=(num_train, num_train))

    def fit_transform(self, train_mat, num_train, graph):
        """Learn binary codes for the training nodes and fit the bit classifiers.

        Args:
            train_mat: Training feature matrix (see ``create_weight_matrix``).
            num_train: Number of training nodes.
            graph: Training graph (see ``create_weight_matrix``).

        Returns:
            The binary code matrix produced by ``MedianHashing.fit_transform``
            for the training nodes.

        Raises:
            RuntimeError: If the eigen-solver keeps returning non-real
                eigenvectors after all retries.
        """
        # Imported here because the file only imports `eigsh` and
        # `coo_matrix` by name, which does not bind the `scipy` package name.
        from scipy.sparse import csc_matrix, diags

        W = self.create_weight_matrix(train_mat, num_train, graph)
        # Small damping value added to the degrees for numerical stability.
        # ravel() keeps the vector 1-D even when there is a single row.
        degrees = np.asarray(W.sum(axis=1)).ravel() + 0.0001
        D = diags(degrees)
        L = csc_matrix(D - W)
        D = csc_matrix(D)

        # One initial attempt plus `max_attempts` retries. The loop is bounded
        # explicitly instead of relying on `assert`, which is stripped under
        # `python -O` and would leave the loop without an exit.
        max_attempts = 3
        Y = None
        for _ in range(max_attempts + 1):
            _, Y = eigsh(L, k=self.num_bits + 1, M=D, which='SM')
            if np.all(np.isreal(Y)):
                break
            print("Warning: Some eigenvalues are not real values. Retry to solve Eigen-decomposition.")
        else:
            raise RuntimeError(
                "Eigen-decomposition kept returning non-real values; "
                "re-run fit or check your data."
            )

        # Discard the first returned eigenvector (presumably the trivial one
        # for the smallest eigenvalue -- confirm), keeping `num_bits` columns.
        Y = np.real(Y)[:, 1:]

        medHash = MedianHashing()
        cbTrain = medHash.fit_transform(Y)
        # Train one classifier per bit to predict that bit from the features.
        for b in range(cbTrain.shape[1]):
            self.clfs[b].fit(train_mat, cbTrain[:, b])
        return cbTrain

    def transform(self, test_mat, num_test):
        """Predict binary codes for unseen documents.

        Args:
            test_mat: Feature matrix passed to each classifier's ``predict``.
            num_test: Number of rows in ``test_mat``.

        Returns:
            An ``int64`` array of shape ``(num_test, num_bits)`` whose column
            ``b`` holds the predictions of the ``b``-th classifier.
        """
        cbTest = np.zeros((num_test, self.num_bits), dtype=np.int64)
        for b in range(self.num_bits):
            cbTest[:, b] = self.clfs[b].predict(test_mat)
        return cbTest
   
os.environ["CUDA_VISIBLE_DEVICES"]=params.gpu_num

# Fit the hashing model on the training split and encode both splits.
sth_model = STH(params.nbits)

cbTrain = sth_model.fit_transform(data.train, data.n_trains, train_graph)
cbTest = sth_model.transform(data.test, data.n_tests)

gnd_train = data.gnd_train.toarray()
gnd_test = data.gnd_test.toarray()

eval_results = DotMap()

# Retrieve the top-100 training documents for every test document and look up
# their relevance against the ground-truth labels.
top_k_indices = retrieveTopKDoc(cbTrain, cbTest, batchSize=params.test_batch_size, TopK=100)
relevances = countNumRelevantDoc(gnd_train, gnd_test, top_k_indices)
relevances = relevances.cpu().numpy()

# Rank cutoffs at which both metrics are reported.
eval_cutoffs = (5, 10, 20, 50, 100)

# NDCG is computed on the raw relevance values returned above.
for cutoff in eval_cutoffs:
    setattr(
        eval_results,
        'ndcg_at_{}'.format(cutoff),
        np.mean([ndcg_at_k(r, cutoff) for r in relevances[:, :cutoff]]),
    )

# Precision only needs to know whether each retrieved document is relevant.
relevances = (relevances > 0)
for cutoff in eval_cutoffs:
    # precision@k = (relevant documents among the top k) / k. The divisor must
    # be the cutoff itself; a fixed 100 caps e.g. prec@5 at 0.05.
    setattr(
        eval_results,
        'prec_at_{}'.format(cutoff),
        np.mean(np.sum(relevances[:, :cutoff], axis=1)) / cutoff,
    )

best_results = EvalResult(eval_results)

print('*' * 80)
model_name = "STH_graph"
if params.save:
    import scipy.io
    data_path = os.path.join(os.environ['HOME'], 'projects/graph_embedding/save_bincode', params.dataset)
    save_fn = os.path.join(data_path, '{}.bincode.{}.mat'.format(model_name, params.nbits))

    # Make sure the target directory exists before writing into it.
    os.makedirs(data_path, exist_ok=True)

    print("save the binary code to {} ...".format(save_fn))
    # Save the codes computed earlier in the script instead of re-fitting:
    # a second fit repeats the eigen-solve and SVM training, and the codes
    # written to disk should be the ones that were evaluated.
    scipy.io.savemat(save_fn, mdict={'train': cbTrain, 'test': cbTest})
    print('save data to {}'.format(save_fn))

if params.save_results:
    fn = "results/{}/results.{}.csv".format(params.dataset, params.nbits)
    save_eval_results(fn, model_name, best_results)

print('*' * 80)
print("{}".format(model_name))

# Print one comma-separated line per metric family, ordered by rank cutoff.
report_cutoffs = ['5', '10', '20', '50', '100']
for family in ('prec', 'ndcg'):
    scores = ",".join(
        "{:.3f}".format(best_results.best_scores['{}_at_{}'.format(family, n)])
        for n in report_cutoffs
    )
    print("{}: {}".format(family, scores))
