In [1]:
import os
from os.path import join
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

# Train data

In [2]:
# Load the training split for the selected corpus.
dataset_name = 'reuters'
data_dir = join('../VDSH/dataset/', dataset_name)
fn = 'train.NN.pkl'
# NOTE: unpickling can execute arbitrary code -- only read files from a trusted source.
train_df = pd.read_pickle(join(data_dir, fn))

# Stack the per-document `bow` entries vertically into a single sparse matrix.
bows_mat = sparse.vstack(train_df['bow'].tolist())
num_trains = train_df.shape[0]

In [13]:
# Build `label_mat` (one sparse row per training document) and `num_labels`.
if dataset_name in ['ng20']:
    # Labels are used below as integer row indices into an identity matrix,
    # turning each one into a one-hot row.
    labels = list(train_df.label)
    # Size the identity matrix by the largest label value rather than by the
    # label range: `one_hot_mat[labels]` indexes with the raw label values, so
    # a range-based size (max - min + 1) goes out of bounds whenever the
    # smallest label is not 0. When the smallest label is 0 both agree.
    num_labels = int(np.max(labels)) + 1
    one_hot_mat = np.eye(num_labels, dtype=int)
    label_mat = sparse.csr_matrix(one_hot_mat[labels])
else:
    # Labels are already stored as sparse rows; stack them as-is.
    label_mat = sparse.vstack(list(train_df.label))
    num_labels = label_mat.shape[1]

In [14]:
# Pairwise cosine similarity between all training documents.
# Despite its name, larger values in `dist` mean *more* similar.
dist = cosine_similarity(bows_mat, bows_mat)

# Pin each document to rank first in its own row. The neighbour extraction
# further down drops column 0 on the assumption that it is the document itself;
# without this, an exact duplicate (tie up to float rounding) or an all-zero
# BoW row (self-similarity 0) can leave the document somewhere else in its row,
# so it would end up listed as its own neighbour.
np.fill_diagonal(dist, np.inf)

# Negate so that argsort's ascending order yields most-similar-first.
indices = np.argsort(-dist, axis=1)

# Two-way lookup between docids (the index values of train_df) and row positions.
docid2index = {docid: index for index, docid in enumerate(train_df.index)}
index2docid = dict(enumerate(train_df.index))

In [15]:
import functools  # not used in this cell; kept in case other cells rely on it

# Translate the matrix of ranked row positions into the corresponding docids.
# A single NumPy fancy-index over an N-element lookup array replaces the
# Python-level map over all N*N entries; values and dtype are the same because
# every docid appears in both constructions.
docid_lookup = np.array([index2docid[i] for i in range(num_trains)])
top_nn = docid_lookup[indices].reshape(num_trains, num_trains)

# Sanity check: every docid in the first column must exist in train_df.
assert np.all([v in train_df.index for v in top_nn[:, 0]])

In [16]:
# Assemble the cleaned training frame: one row per document, indexed by doc_id.
data = {'doc_id': list(train_df.index),
        'bow': list(train_df.bow),
        'label': list(label_mat),  # iterating the sparse matrix yields one row at a time
        # Column 0 of top_nn is skipped (assumed to be the document itself);
        # the next 100 ranked docids are kept as neighbours.
        'neighbors': [list(arr) for arr in top_nn[:, 1:101]]}

new_df = pd.DataFrame.from_dict(data).set_index('doc_id')

out_path = 'dataset/clean/{}/{}.train.pkl'.format(dataset_name, dataset_name)
# to_pickle does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
new_df.to_pickle(out_path)

# Test data

In [17]:
# Load the test split from the same dataset directory as the training data.
fn = 'test.NN.pkl'
data_dir = '../VDSH/dataset/' + dataset_name
# NOTE: unpickling can execute arbitrary code -- only read files from a trusted source.
test_df = pd.read_pickle(join(data_dir, fn))

# Stack the per-document `bow` entries vertically into a single sparse matrix.
test_bows_mat = sparse.vstack(test_df['bow'].tolist())
num_tests = test_df.shape[0]

In [18]:
# Build the test-split label matrix, mirroring the two cases used for training.
if dataset_name not in ['ng20']:
    # Labels are already stored as sparse rows; stack them as-is.
    label_mat = sparse.vstack(test_df['label'].tolist())
else:
    # Reuse the identity matrix built for the training labels so that train
    # and test share the same one-hot layout.
    labels = test_df['label'].tolist()
    label_mat = sparse.csr_matrix(one_hot_mat[labels])

In [19]:
# Cosine similarity of every test document (rows) against every training
# document (columns). Larger values mean more similar, despite the name.
dist = cosine_similarity(test_bows_mat, bows_mat)
# Per row, order the training columns from most to least similar.
indices = (-dist).argsort(axis=1)

In [20]:
# Translate ranked training-row positions into training docids.
# A single NumPy fancy-index over an N-element lookup array replaces the
# Python-level map over all num_tests * num_trains entries.
docid_lookup = np.array([index2docid[i] for i in range(num_trains)])
top_nn = docid_lookup[indices].reshape(num_tests, num_trains)

In [21]:
# Assemble the cleaned test frame: one row per document, indexed by doc_id.
data = {'doc_id': list(test_df.index),
        'bow': list(test_df.bow),
        'label': list(label_mat),  # iterating the sparse matrix yields one row at a time
        # Keep the 100 top-ranked training docids per test document. Unlike the
        # training frame, column 0 is kept as well.
        'neighbors': [list(arr) for arr in top_nn[:, :100]]}

new_df = pd.DataFrame.from_dict(data).set_index('doc_id')

out_path = 'dataset/clean/{}/{}.test.pkl'.format(dataset_name, dataset_name)
# to_pickle does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
new_df.to_pickle(out_path)