In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from datasets import *
from utils import *
from scipy import sparse

from model.EdgeReg import *
from model.EdgeReg_v2 import *

In [2]:
gpunum = "2"
nbits = 128

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"]=gpunum
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
num_samples = 1

dataset_name = 'pubmed'
data_dir = os.path.join('dataset/clean', dataset_name)

train_batch_size=100
test_batch_size=100

train_set = TextDataset(dataset_name, data_dir, subset='train')
test_set = TextDataset(dataset_name, data_dir, subset='test')
train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=train_batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=test_batch_size, shuffle=True)

In [5]:
y_dim = train_set.num_classes()
num_bits = nbits
num_features = train_set[0][1].size(0)
num_nodes = len(train_set)
edge_weight = 1.0
dropout_prob = 0.1

In [6]:
if num_samples == 1:
    model = EdgeReg(dataset_name, num_features, num_nodes, num_bits, dropoutProb=dropout_prob, device=device)
else:
    print("number of samples (T) = {}".format(num_samples))
    model = EdgeReg_v2(dataset_name, num_features, num_nodes, num_bits, dropoutProb=dropout_prob, device=device, T=num_samples)


In [7]:
if num_samples == 1:
    saved_model_file = 'saved_models/node2hash.{}.T{}.bit{}.pth'.format(dataset_name, num_samples, nbits)
else:
    saved_model_file = 'saved_models/node2hash_v2.{}.T{}.bit{}.pth'.format(dataset_name, num_samples, nbits)

model.load_state_dict(torch.load(saved_model_file))
model.to(device)

EdgeReg(
  (encoder): Sequential(
    (0): Linear(in_features=500, out_features=1000, bias=True)
    (1): ReLU(inplace)
    (2): Linear(in_features=1000, out_features=1000, bias=True)
    (3): ReLU(inplace)
    (4): Dropout(p=0.1)
  )
  (h_to_mu): Linear(in_features=1000, out_features=128, bias=True)
  (h_to_logvar): Sequential(
    (0): Linear(in_features=1000, out_features=128, bias=True)
    (1): Sigmoid()
  )
  (decoder): Sequential(
    (0): Linear(in_features=128, out_features=500, bias=True)
    (1): LogSoftmax()
  )
  (nn_decoder): Sequential(
    (0): Linear(in_features=128, out_features=18230, bias=True)
    (1): LogSoftmax()
  )
)

In [8]:
import torch.nn.functional as F

# get non-binary code
with torch.no_grad():
    train_zy = [(model.encode(xb.to(model.device))[0], yb) for _, xb, yb, _ in train_loader]
    train_z, train_y = zip(*train_zy)
    train_z = torch.cat(train_z, dim=0)
    train_y = torch.cat(train_y, dim=0)
    
    test_zy = [(model.encode(xb.to(model.device))[0], yb) for _, xb, yb, _ in test_loader]
    test_z, test_y = zip(*test_zy)
    test_z = torch.cat(test_z, dim=0)
    test_y = torch.cat(test_y, dim=0)
    
    train_z_batch = train_z.unsqueeze(-1).transpose(2,0)
    test_z_batch = test_z.unsqueeze(-1)
    
    # compute cosine similarity
    dist = F.cosine_similarity(test_z_batch, train_z_batch, dim=1)
    ranklist = torch.argsort(dist, dim=1, descending=True)
    top100 = ranklist[:, :100]
    
    prec_at_100 = []
    for eval_index in range(0, test_y.size(0)):
        top100_labels = torch.index_select(train_y.to(device), 0, top100[eval_index]).type(torch.cuda.ByteTensor)
        groundtruth_label = test_y[eval_index].type(torch.cuda.ByteTensor)
        matches = (groundtruth_label.unsqueeze(0) & top100_labels).sum(dim=1) > 0
        num_corrects = matches.sum().type(torch.cuda.FloatTensor)
        prec_at_100.append((num_corrects/100.).item())   

    print('average prec at 100 = {:.4f}'.format(np.mean(prec_at_100)))

average prec at 100 = 0.7731
