In [None]:
import os
import numpy as np
import pandas as pd
import json
import pickle
from scipy import sparse
import scipy.io

# Name of the dataset being processed; it is also the name of its raw-data folder.
dataset_name = 'dblp'
# Directory holding the raw files of that dataset.
data_path = '../dataset/raw/{}'.format(dataset_name)

In [None]:
# Every paper record read from the raw files, in reading order.
citations = []
# Maps a referenced paper id to the list of ids of the papers that cite it.
incomming = {}

for shard in range(4):
    shard_path = os.path.join(data_path, 'dblp-ref-{}.json'.format(shard))
    with open(shard_path) as shard_file:
        # Each line is parsed as one standalone JSON document.
        for raw_line in shard_file:
            paper = json.loads(raw_line.strip())
            citations.append(paper)

            # Register this paper as an incoming edge of every paper it references.
            for ref_id in paper.get('references', ()):
                incomming.setdefault(ref_id, []).append(paper['id'])

df = pd.DataFrame(citations)

In [None]:
# Parse the venue table into `conferences`: column 0 is the venue name and column 2
# a comma-separated list of integer labels (column 1 is not read here).
is_first_line = True
conferences = {}
with open('../dataset/clean/dblp/venue_info.tsv') as in_csv:
    for line in in_csv:
        if is_first_line:
            # the first line is skipped: it is treated as the header row
            is_first_line = False
            continue

        tokens = line.strip().split('\t')
        conf_name = tokens[0]

        raw_labels = [int(num_str) for num_str in tokens[2].split(',')]
        # remove the first label (signal processing has too many documents):
        # label ids <= 1 are dropped and the remaining ids are shifted down by 2
        labels = [n - 2 for n in raw_labels if n > 1]

        conferences[conf_name] = {'name': conf_name, 'label': labels}

# Aggregate over the flattened label lists. A venue whose labels were all dropped
# above keeps an empty list, and taking np.max/np.min of that empty list per venue
# would raise ValueError; flattening gives the same result without that failure.
all_labels = [label for venue in conferences.values() for label in venue['label']]
max_labels = np.max(all_labels)
min_labels = np.min(all_labels)
num_labels = max_labels - min_labels + 1
print('label min:{} max:{} total:{}'.format(min_labels, max_labels, num_labels))

In [None]:
# keep only the papers whose venue is one of the selected venues
def is_selected_venue(row):
    """Return True when `row` (a single venue value) is a key of `conferences`."""
    return row in conferences


print("num paper (before): {}".format(len(df)))
selected_mask = df.venue.apply(is_selected_venue)
df = df[selected_mask]
print("num paper (after): {}".format(len(df)))

In [None]:
# Papers with a year before the cut-off form the train split; papers from the
# cut-off year onwards form the test split.
cut_off_years = 2016

published_before_cutoff = df.year < cut_off_years
df_train = df[published_before_cutoff]
df_test = df[df.year >= cut_off_years]

num_trains, num_tests = len(df_train), len(df_test)
test_train_ratio = num_tests / num_trains
print("num trains: {} num tests: {} ratio: {:.4f}".format(num_trains, num_tests, test_train_ratio))

In [None]:
#venue_count = df_train.groupby('venue').count().sort_values(['abstract'], ascending=False).abstract

In [None]:
def assign_labels(venue):
    """Build the label vector of a venue.

    Args:
        venue: venue name; must be a key of `conferences`.

    Returns:
        1-D integer array of length `num_labels` counting, per label index, how
        often that index appears in the venue's label list (all zeros when the
        list is empty).

    Raises:
        KeyError: if `venue` is not in `conferences`.

    Labels are used directly as row indices into an identity matrix of size
    `num_labels`, so every label must be smaller than `num_labels`.
    """
    label_list = conferences[venue]['label']
    # np.int was a plain alias of the builtin int and has been removed from NumPy;
    # the builtin yields the same dtype and works on every NumPy version.
    return np.sum(np.eye(num_labels)[label_list], axis=0).astype(int)

# Attach the label vector to every train paper and use the paper id as row index.
# `assign` works on a copy, so the frame that df_train was sliced from is untouched.
df_train = df_train.assign(label=df_train.venue.apply(assign_labels)).set_index('id')

# Same treatment for the test split.
df_test = df_test.assign(label=df_test.venue.apply(assign_labels)).set_index('id')

# Column-wise sums of the stacked label vectors: number of documents per label.
num_train_doc_per_labels = np.array(list(df_train.label)).sum(axis=0)
num_test_doc_per_labels = np.array(list(df_test.label)).sum(axis=0)
print(num_train_doc_per_labels)
print(num_test_doc_per_labels)

In [None]:
# remove any train row that is missing its abstract, venue, year, or label
required_columns = ['abstract', 'venue', 'year', 'label']
print("num paper = {}".format(len(df_train)))
df_train = df_train.dropna(axis=0, subset=required_columns)
print("num paper = {}".format(len(df_train)))

In [None]:
# This method adds incoming edges to each node as well as removing any edge that points outside the train set
def createEdges(row):
    """Collect the train-set neighbours of one paper.

    Args:
        row: one row of the paper frame; `row.references` holds the ids the paper
            cites and `row.name` (the row's index label) is the paper id.

    Returns:
        List of paper ids: the cited papers followed by the citing papers (taken
        from `incomming`), keeping only ids present in `df_train.index`.
    """
    refs = row.references
    # Presumably pandas stored NaN for records without a 'references' key. Test
    # the value rather than its identity with np.nan: `is not np.nan` only
    # recognises that one singleton object, and any other NaN (or None) would be
    # iterated and raise a TypeError.
    has_no_refs = refs is None or (isinstance(refs, float) and np.isnan(refs))
    if has_no_refs:
        outgoing_edges = []
    else:
        outgoing_edges = [r for r in refs if r in df_train.index]

    incomming_edges = [r for r in incomming.get(row.name, []) if r in df_train.index]
    return outgoing_edges + incomming_edges

df_train['links'] = df_train.apply(createEdges, axis=1)

# Remove any row that has no link
print("num paper = {}".format(len(df_train)))
df_train = df_train[df_train.links.apply(len) > 0]
print("num paper = {}".format(len(df_train)))

# After the filtering above, no train node may link to a node outside the train set
def count_invalid_edges(refs):
    """Return how many ids in `refs` are absent from `df_train.index`."""
    return len([r for r in refs if r not in df_train.index])

assert len(df_train[df_train.links.apply(count_invalid_edges) > 0]) == 0, \
    "a train node links to a node outside the train set"

In [None]:
# Map every global paper id to its row position in the train frame.
global_id_2_train_id = {node_id: idx for idx, node_id in enumerate(df_train.index)}

def convert_2_train_id(ref):
    """Translate a list of global paper ids into train row positions.

    Raises KeyError if an id is not part of the train frame.
    """
    return [global_id_2_train_id[r] for r in ref]

train_edges = df_train.links.apply(convert_2_train_id)

# Adjacency list keyed by train row position.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (index label, value) pairs and exists in older versions as well.
train_graph = {}
for node_id, value in train_edges.items():
    train_graph[global_id_2_train_id[node_id]] = value

print('num train: {}'.format(len(train_graph)))

# Process Test Data

In [None]:
# remove any test row that is missing its abstract, venue, year, or label
required_columns = ['abstract', 'venue', 'year', 'label']
print("num paper = {}".format(len(df_test)))
df_test = df_test.dropna(axis=0, subset=required_columns)
print("num paper = {}".format(len(df_test)))

In [None]:
# This method adds incoming edges to each node as well as removing any edge that points outside the train set
def createEdges(row):
    """Collect the train-set neighbours of one paper.

    Args:
        row: one row of the paper frame; `row.references` holds the ids the paper
            cites and `row.name` (the row's index label) is the paper id.

    Returns:
        List of paper ids: the cited papers followed by the citing papers (taken
        from `incomming`), keeping only ids present in `df_train.index`.
    """
    refs = row.references
    # Presumably pandas stored NaN for records without a 'references' key. Test
    # the value rather than its identity with np.nan: `is not np.nan` only
    # recognises that one singleton object, and any other NaN (or None) would be
    # iterated and raise a TypeError.
    has_no_refs = refs is None or (isinstance(refs, float) and np.isnan(refs))
    if has_no_refs:
        outgoing_edges = []
    else:
        outgoing_edges = [r for r in refs if r in df_train.index]

    incomming_edges = [r for r in incomming.get(row.name, []) if r in df_train.index]
    return outgoing_edges + incomming_edges

# Every link of a test paper points to a train paper.
df_test['links'] = df_test.apply(createEdges, axis=1)

# Remove any row that has no link
print("num paper = {}".format(len(df_test)))
df_test = df_test[df_test.links.apply(len) > 0]
print("num paper = {}".format(len(df_test)))

# No test node may link to a node outside the train set
def count_invalid_edges(refs):
    """Return how many ids in `refs` are absent from `df_train.index`."""
    return len([r for r in refs if r not in df_train.index])

assert len(df_test[df_test.links.apply(count_invalid_edges) > 0]) == 0, \
    "a test node links to a node outside the train set"

In [None]:
# Map every global paper id to its row position in the test frame.
global_id_2_test_id = {node_id: idx for idx, node_id in enumerate(df_test.index)}

# each link MUST point to the train nodes, so neighbours are translated with the
# train id mapping while the keys below use the test id mapping
test_edges = df_test.links.apply(convert_2_train_id)

# Adjacency list: test row position -> list of train row positions.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (index label, value) pairs and exists in older versions as well.
test_graph = {}
for node_id, value in test_edges.items():
    test_graph[global_id_2_test_id[node_id]] = value

print('num test: {}'.format(len(test_graph)))

# Save Graph Data

In [None]:
# From here on data_path points to the cleaned-data folder, where outputs are written.
data_path = '../dataset/clean/dblp'

# Write each graph inside a `with` block so the file handle is closed (and the
# pickle fully flushed to disk) as soon as the dump finishes.
save_fn = os.path.join(data_path, 'ind.{}.train.graph.pk'.format(dataset_name))
with open(save_fn, 'wb') as out_file:
    pickle.dump(train_graph, out_file)
print('save graph data to {}'.format(save_fn))

save_fn = os.path.join(data_path, 'ind.{}.test.graph.pk'.format(dataset_name))
with open(save_fn, 'wb') as out_file:
    pickle.dump(test_graph, out_file)
print('save graph data to {}'.format(save_fn))

# Process contents

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.8,
    min_df=5,
    sublinear_tf=True,
    max_features=10000,
)

# The vectorizer is fitted on the train abstracts only; the test abstracts are
# transformed with that fitted vocabulary.
train_abstracts = list(df_train.abstract)
train_feas = vectorizer.fit_transform(train_abstracts)
train_row_sums = np.sum(train_feas, axis=1)
print(np.nonzero(train_row_sums)[0].shape)  # train rows whose feature sum is non-zero

test_abstracts = list(df_test.abstract)
test_feas = vectorizer.transform(test_abstracts)
test_row_sums = np.sum(test_feas, axis=1)
print(np.nonzero(test_row_sums)[0].shape)  # test rows whose feature sum is non-zero

# Stack the per-document label vectors into sparse ground-truth matrices.
train_label_rows = np.array(list(df_train.label))
test_label_rows = np.array(list(df_test.label))
gnd_train = sparse.csr_matrix(train_label_rows)
gnd_test = sparse.csr_matrix(test_label_rows)

In [None]:

# Sanity checks: train and test share the feature and label dimensions, and each
# split has exactly one label row per document.
assert train_feas.shape[1] == test_feas.shape[1]
assert gnd_train.shape[1] == gnd_test.shape[1]
assert train_feas.shape[0] == gnd_train.shape[0]
assert test_feas.shape[0] == gnd_test.shape[0]

data_path = '../dataset/clean/dblp'
save_fn = os.path.join(data_path, 'ind.{}.mat'.format(dataset_name))

# The 'cv' entries reuse the test matrices.
mat_contents = {
    'train': train_feas,
    'test': test_feas,
    'cv': test_feas,
    'gnd_train': gnd_train,
    'gnd_test': gnd_test,
    'gnd_cv': gnd_test,
}
scipy.io.savemat(save_fn, mdict=mat_contents)

print('save data to {}'.format(save_fn))

# Convert to dataframe with the format as doc_id, bow, label, and neighbors

In [None]:
# create a connection matrix: entry (i, j) counts the links from train doc i to train doc j
n_train = train_feas.shape[0]
row, col = [], []
for doc_id, neighbor_ids in train_graph.items():
    # one (doc_id, neighbor) coordinate pair per link
    row.extend([doc_id] * len(neighbor_ids))
    col.extend(neighbor_ids)
data = [1] * len(row)
train_connections = sparse.csr_matrix((data, (row, col)), shape=(n_train, n_train))

In [None]:
# connection matrix of the test documents: entry (i, j) counts the links from
# test doc i to train doc j
n_test = test_feas.shape[0]
row, col = [], []
for doc_id, neighbor_ids in test_graph.items():
    # one (doc_id, neighbor) coordinate pair per link
    row.extend([doc_id] * len(neighbor_ids))
    col.extend(neighbor_ids)
data = [1] * len(row)
test_connections = sparse.csr_matrix((data, (row, col)), shape=(n_test, n_train)) # test graph points to train graph

In [None]:
from tqdm import tqdm

save_dir = os.path.join('../dataset/clean', dataset_name)
##########################################################################################

# One record per train document: its feature row, label row and neighbour row.
train = [
    {'doc_id': doc_id, 'bow': train_feas[doc_id],
     'label': gnd_train[doc_id], 'neighbors': train_connections[doc_id]}
    for doc_id in tqdm(train_graph)
]

train_df = pd.DataFrame.from_dict(train).set_index('doc_id')

fn = os.path.join(save_dir, '{}.train.pkl'.format(dataset_name))
train_df.to_pickle(fn)
##########################################################################################

# Same layout for the test documents; their neighbour rows index train documents.
test = [
    {'doc_id': doc_id, 'bow': test_feas[doc_id],
     'label': gnd_test[doc_id], 'neighbors': test_connections[doc_id]}
    for doc_id in tqdm(test_graph)
]

test_df = pd.DataFrame.from_dict(test).set_index('doc_id')

fn = os.path.join(save_dir, '{}.test.pkl'.format(dataset_name))
test_df.to_pickle(fn)