# This script preprocesses the Reddit dataset

In [1]:
import os
import numpy as np
import pandas as pd
from dotmap import DotMap
import json
from scipy import sparse
import networkx as nx
from networkx.readwrite import json_graph
from tqdm import *

# Name of the dataset; it is substituted into every input/output path below.
dataset_name = 'reddit'
# Directory holding the raw input files for the dataset.
data_path = '../dataset/raw/{}'.format(dataset_name)

In [2]:
def load_data(normalize=True):
    """Load the graph, node features, id map and class map for ``dataset_name``.

    All inputs are read from ``data_path``: ``<name>-G.json`` (node-link
    graph), ``<name>-feats.npy``, ``<name>-id_map.json`` and
    ``<name>-class_map.json``.

    Nodes lacking a ``val`` or ``test`` attribute are removed from the graph,
    and every remaining edge receives a boolean ``train_removed`` attribute
    that is true when either endpoint is a validation or test node.

    Args:
        normalize: When true, standardize ``feats`` with a ``StandardScaler``
            fitted only on the rows of nodes that are neither val nor test.

    Returns:
        Tuple ``(G, feats, id_map, class_map)``.
    """
    def read_json(fn):
        # Close the file deterministically instead of leaking the handle.
        with open(fn) as fp:
            return json.load(fp)

    graph_fn = os.path.join(data_path, '{}-G.json'.format(dataset_name))

    print('load graph data ...')
    G = json_graph.node_link_graph(read_json(graph_fn))

    # networkx 1.x/2.0-2.3 expose node attributes as ``G.node``; the attribute
    # was removed in networkx 2.4, where ``G.nodes[n]`` is the replacement.
    node_attrs = G.node if hasattr(G, 'node') else G.nodes

    print('load features, id map, and class map ...')
    features_fn = os.path.join(data_path, '{}-feats.npy'.format(dataset_name))
    feats = np.load(features_fn)

    id_map_fn = os.path.join(data_path, '{}-id_map.json'.format(dataset_name))
    id_map = {k: int(v) for k, v in read_json(id_map_fn).items()}

    class_fn = os.path.join(data_path, '{}-class_map.json'.format(dataset_name))
    class_map = read_json(class_fn)
    # List-valued labels are kept as they are; scalar labels are cast to int.
    labels_are_lists = isinstance(list(class_map.values())[0], list)
    if not labels_are_lists:
        class_map = {k: int(v) for k, v in class_map.items()}

    ## Remove all nodes that do not have val/test annotations
    ## (necessary because of networkx weirdness with the Reddit data)
    broken_nodes = [node for node in G.nodes()
                    if 'val' not in node_attrs[node] or 'test' not in node_attrs[node]]
    G.remove_nodes_from(broken_nodes)
    print("Removed {:d} nodes that lacked proper annotations due to networkx versioning issues".format(len(broken_nodes)))

    ## Make sure the graph has edge train_removed annotations
    ## (some datasets might already have this..)
    print("Loaded data.. now preprocessing..")
    for u, v in G.edges():
        G[u][v]['train_removed'] = bool(
            node_attrs[u]['val'] or node_attrs[v]['val'] or
            node_attrs[u]['test'] or node_attrs[v]['test'])

    if normalize and feats is not None:
        from sklearn.preprocessing import StandardScaler
        # Fit the scaler on training rows only so that no val/test statistics
        # leak into the normalization.
        train_ids = np.array([id_map[n] for n in G.nodes()
                              if not node_attrs[n]['val'] and not node_attrs[n]['test']])
        scaler = StandardScaler()
        scaler.fit(feats[train_ids])
        feats = scaler.transform(feats)

    return G, feats, id_map, class_map

In [3]:
# Load the graph and its side tables with normalize=False, i.e. the feature
# matrix is returned without the StandardScaler step inside load_data.
G, feats, id_map, class_map = load_data(normalize=False)
print(feats.shape)

load graph data ...
load features, id map, and class map ...
Removed 231443 nodes that lacked proper annotations due to networkx versioning issues
Loaded data.. now preprocessing..
(232965, 602)


In [4]:
# Adjacency structure keyed by node id. Each value is a DotMap carrying
# `node_id`, `outgoing` (ids listed on the node's adjlist line) and
# `incoming` (ids of nodes that list this node as outgoing).
graphs = {}

adjlist_fn = os.path.join(data_path, '{}-adjlist.txt'.format(dataset_name))
with open(adjlist_fn) as in_fn:
    for line in in_fn:
        line = line.strip()
        # Skip blank lines as well as '#' comment lines; indexing line[0] on
        # an empty string would raise IndexError.
        if not line or line.startswith('#'):
            continue

        tokens = line.split()
        node_id = tokens[0]
        assert(node_id not in graphs)

        node = DotMap()
        node.node_id = node_id
        node.outgoing = tokens[1:]
        node.incoming = []
        graphs[node_id] = node

# Nodes that only ever appear as a target (no adjlist line of their own).
# They are collected separately because `graphs` is being iterated.
sink_nodes = {}
for node_id in tqdm(graphs):
    for out_node_id in graphs[node_id].outgoing:
        if out_node_id in graphs:
            graphs[out_node_id].incoming.append(node_id)
        elif out_node_id in sink_nodes:
            sink_nodes[out_node_id].incoming.append(node_id)
        else:
            node = DotMap()
            node.node_id = out_node_id
            node.incoming = [node_id]
            node.outgoing = []
            sink_nodes[out_node_id] = node

# Merge the sink nodes into the main table once iteration has finished.
graphs.update(sink_nodes)

100%|██████████| 134999/134999 [02:10<00:00, 1034.12it/s]


In [5]:
# Markers stored in each node's `kind` field to encode the train/test/cv split.
TRAIN_FLAG = 0
TEST_FLAG = 1
CV_FLAG = 2

# Tag every annotated graph node that is also present in the adjacency table.
# The test flag takes precedence over the validation flag.
for node_id in G.nodes():
    if node_id not in graphs:
        continue
    is_validate = G.node[node_id]['val']
    is_test = G.node[node_id]['test']
    graphs[node_id].kind = (
        TEST_FLAG if is_test else (CV_FLAG if is_validate else TRAIN_FLAG)
    )

# Attach the class label to each known node.
for node_id, class_id in class_map.items():
    if node_id not in graphs:
        continue
    graphs[node_id].class_id = class_id

# Attach the feature row (looked up through id_map) to each known node.
for node_id, index in tqdm(id_map.items()):
    if node_id not in graphs:
        continue
    graphs[node_id].features = list(feats[index])

100%|██████████| 232965/232965 [00:18<00:00, 12449.51it/s]


In [6]:
# One record per node; string post ids are translated to integer ids via id_map.
graph_data = []
for node_id, node in tqdm(graphs.items()):
    # De-duplicate each direction, then take the union as the neighbor list.
    out_edges = list(set(id_map[n] for n in node.outgoing))
    in_edges = list(set(id_map[n] for n in node.incoming))
    neighbors = list(set(out_edges + in_edges))

    graph_data.append(dict(
        post_id=node.node_id,
        node_id=id_map[node.node_id],
        neighbors=neighbors,
        in_edges=in_edges,
        out_edges=out_edges,
        label=node.class_id,
        kind=node.kind,
        features=node.features,
    ))

# Use the integer node id as the row index.
df = pd.DataFrame(graph_data).set_index('node_id')

100%|██████████| 232383/232383 [01:11<00:00, 3229.77it/s]


In [None]:
save_data_path = os.path.join('../dataset/clean/{}'.format(dataset_name))
# to_pickle does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs(save_data_path, exist_ok=True)
save_fn = os.path.join(save_data_path, '{}.data.pkl'.format(dataset_name))
df.to_pickle(save_fn)

# Preprocess Graph Dataset

In [None]:
# Reload the node table written by the save cell.
save_data_path = os.path.join('../dataset/clean/{}'.format(dataset_name))
data_fn = os.path.join(save_data_path, '{}.data.pkl'.format(dataset_name))
# pandas exposes read_pickle (there is no pd.from_pickle), and the path
# variable defined above is data_fn.
df = pd.read_pickle(data_fn)

In [7]:
# Drop every node whose neighbor list is empty, reporting the row count
# before and after the filter.
print("num nodes = {}".format(len(df)))
has_neighbors = df.neighbors.apply(len) > 0
df = df.loc[has_neighbors]
print("num nodes = {}".format(len(df)))

num nodes = 232383
num nodes = 232383


In [8]:
# Partition the node table by its `kind` marker.
df_train, df_test, df_cv = (
    df[df.kind == flag] for flag in (TRAIN_FLAG, TEST_FLAG, CV_FLAG)
)

split_sizes = (len(df_train), len(df_test), len(df_cv))
print("num train: {} num test: {} num cv: {}".format(*split_sizes))

num train: 152157 num test: 55228 num cv: 23660


In [9]:
# Remove any non-train neighbors
def remove_test_and_cv_edges(row):
    """Return the ids in ``row`` that are present in the current ``df_train`` index.

    ``df_train`` is looked up as a global at call time, so the result reflects
    whatever frame that name is bound to when the function runs.
    """
    kept = []
    for neighbor in row:
        if neighbor in df_train.index:
            kept.append(neighbor)
    return kept

# Work on a private copy so the column assignment does not touch `df`.
df_train = df_train.copy()
df_train.neighbors = df_train.neighbors.apply(remove_test_and_cv_edges)

df_train = df_train[df_train.neighbors.apply(len) > 0]
print("num trains: {}".format(len(df_train)))

# Dropping rows above can leave edges that point at the removed train nodes,
# and pruning those edges can in turn empty other nodes' neighbor lists.
# Repeat prune-then-drop until every remaining train node has at least one
# neighbor; when nothing cascades this is a single pruning pass.
while True:
    # Copy first: assigning a column on a filtered slice is ambiguous in pandas.
    df_train = df_train.copy()
    df_train.neighbors = df_train.neighbors.apply(remove_test_and_cv_edges)
    has_neighbors = df_train.neighbors.apply(len) > 0
    if has_neighbors.all():
        break
    df_train = df_train[has_neighbors]

print("num trains: {}".format(len(df_train)))

num trains: 151741
num trains: 151741


# Process Test and Validation Set

In [10]:
# Test nodes: keep only edges into the training set, then drop nodes that
# end up without any neighbor.
print("num test: {}".format(len(df_test)))
df_test = df_test.assign(neighbors=df_test.neighbors.apply(remove_test_and_cv_edges))
test_has_neighbors = df_test.neighbors.apply(len) > 0
df_test = df_test[test_has_neighbors]
print("num test: {}".format(len(df_test)))

# Validation nodes: same treatment.
print("num cv: {}".format(len(df_cv)))
df_cv = df_cv.assign(neighbors=df_cv.neighbors.apply(remove_test_and_cv_edges))
cv_has_neighbors = df_cv.neighbors.apply(len) > 0
df_cv = df_cv[cv_has_neighbors]
print("num cv: {}".format(len(df_cv)))

num test: 55228
num test: 53736
num cv: 23660
num cv: 23012


# Save Data

In [11]:
# Map each global node id to its row position within df_train.
global_id_2_train_id = dict(zip(df_train.index, range(len(df_train.index))))

def convert_2_train_id(row):
    """Translate each global node id in ``row`` through ``global_id_2_train_id``.

    Raises ``KeyError`` for an id that has no entry in the mapping.
    """
    converted = []
    for global_id in row:
        converted.append(global_id_2_train_id[global_id])
    return converted

# Re-express every train neighbor list in train-local ids.
train_edges = df_train.neighbors.apply(convert_2_train_id)

# Adjacency dict keyed by train-local id. Series.items() replaces
# Series.iteritems(), which was removed in pandas 2.0.
train_graph = {global_id_2_train_id[node_id]: value
               for node_id, value in train_edges.items()}
    

In [15]:
import pickle
save_data_path = os.path.join('../dataset/clean/{}'.format(dataset_name))
# NOTE(review): this file uses the prefix 'int.' while the test/cv graphs and
# the .mat file use 'ind.' -- confirm which name downstream loaders expect.
save_fn = os.path.join(save_data_path, 'int.{}.train.graph.pkl'.format(dataset_name))
# Use a context manager so the file is flushed and closed after the dump.
with open(save_fn, 'wb') as out_fn:
    pickle.dump(train_graph, out_fn)
print('save graph data to {}'.format(save_fn))

save graph data to ../dataset/clean/reddit/int.reddit.train.graph.pkl


In [16]:
# Map each global node id to its row position within df_test.
global_id_2_test_id = {global_idx: idx for idx, global_idx in enumerate(df_test.index)}

# Convert each globalId to trainId because all test nodes only point to train nodes
test_edges = df_test.neighbors.apply(convert_2_train_id)

# Keys are test-local ids, values are lists of train-local ids.
# Series.items() replaces Series.iteritems(), removed in pandas 2.0.
test_graph = {global_id_2_test_id[node_id]: value
              for node_id, value in test_edges.items()}


In [17]:
save_fn = os.path.join(save_data_path, 'ind.{}.test.graph.pkl'.format(dataset_name))
# Use a context manager so the file is flushed and closed after the dump.
with open(save_fn, 'wb') as out_fn:
    pickle.dump(test_graph, out_fn)
print('save graph data to {}'.format(save_fn))

save graph data to ../dataset/clean/reddit/ind.reddit.test.graph.pkl


In [18]:
# Map each global node id to its row position within df_cv.
global_id_2_cv_id = {global_idx: idx for idx, global_idx
                        in enumerate(df_cv.index)}

# Convert each globalId to trainId because all cv nodes only point to train nodes
cv_edges = df_cv.neighbors.apply(convert_2_train_id)

# Keys are cv-local ids, values are lists of train-local ids.
# Series.items() replaces Series.iteritems(), removed in pandas 2.0.
cv_graph = {global_id_2_cv_id[node_id]: value
            for node_id, value in cv_edges.items()}

save_fn = os.path.join(save_data_path, 'ind.{}.cv.graph.pkl'.format(dataset_name))
# Dump cv_graph (the original wrote test_graph into the cv file), and close
# the file deterministically.
with open(save_fn, 'wb') as out_fn:
    pickle.dump(cv_graph, out_fn)
print('save graph data to {}'.format(save_fn))

save graph data to ../dataset/clean/reddit/ind.reddit.cv.graph.pkl


# Get Document features

In [19]:
# Feature matrices are built from the per-node feature lists; label matrices
# are one-hot rows selected from an identity matrix.
train_features = list(df_train.features)
train_features = sparse.csr_matrix(train_features)
train_labels = list(df_train.label)

######################################################################################
min_class_id = np.min(train_labels)
max_class_id = np.max(train_labels)
num_classes = max_class_id - min_class_id + 1

# num_classes spans [min_class_id, max_class_id], so labels must be shifted by
# min_class_id before they index the identity matrix. When the smallest label
# is 0 the shift is a no-op.
one_hot = np.eye(num_classes)
gnd_train = sparse.csr_matrix(one_hot[np.asarray(train_labels) - min_class_id])

######################################################################################
test_features = list(df_test.features)
test_features = sparse.csr_matrix(test_features)

test_labels = list(df_test.label)
gnd_test = sparse.csr_matrix(one_hot[np.asarray(test_labels) - min_class_id])

######################################################################################
cv_features = list(df_cv.features)
cv_features = sparse.csr_matrix(cv_features)

cv_labels = list(df_cv.label)
gnd_cv = sparse.csr_matrix(one_hot[np.asarray(cv_labels) - min_class_id])

In [20]:
# All three splits must share the same feature width and label width.
assert train_features.shape[1] == test_features.shape[1] == cv_features.shape[1]
assert gnd_train.shape[1] == gnd_test.shape[1] == gnd_cv.shape[1]

# Within each split, features and labels must have one row per node.
assert train_features.shape[0] == gnd_train.shape[0]
assert test_features.shape[0] == gnd_test.shape[0]
assert cv_features.shape[0] == gnd_cv.shape[0]

# data_path = os.path.join(os.environ['HOME'], 'projects/graph_embedding/graph_dataset', dataset_name)
# save_fn = os.path.join(data_path, 'ind.{}.mat'.format(dataset_name))

# scipy.io.savemat(save_fn, 
#                  mdict={'train': train_features, 
#                         'test': test_features, 
#                         'cv': cv_features,
#                         'gnd_train': gnd_train, 
#                         'gnd_test': gnd_test,
#                         'gnd_cv': gnd_cv})

# print('save data to {}'.format(save_fn))

In [23]:
import scipy.io

# Bundle the feature and one-hot label matrices of all three splits into a
# single MATLAB-format file.
save_fn = os.path.join(save_data_path, 'ind.{}.mat'.format(dataset_name))
mat_contents = {
    'train': train_features,
    'test': test_features,
    'cv': cv_features,
    'gnd_train': gnd_train,
    'gnd_test': gnd_test,
    'gnd_cv': gnd_cv,
}
scipy.io.savemat(save_fn, mdict=mat_contents)

print('save data to {}'.format(save_fn))

save data to ../dataset/clean/reddit/ind.reddit.mat
