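Data source: CiteSeer citation dataset (`citeseer.cites`, `citeseer.content`) from the LINQS dataset collection: https://linqs.soe.ucsc.edu/data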

In [20]:
import torch
import numpy as np
import scipy.sparse as sp
import networkx as nx
import pickle as pkl
import random as rd
import pandas as pd
import warnings; warnings.filterwarnings('ignore')

### Import raw data

In [89]:
# Load the citation edge list (tab-separated, no header row) and cast both
# endpoint columns to str, since paper ids are a mix of numeric-looking and
# textual keys.
elist = pd.read_csv(
    'data/raw/citeseer.cites',
    sep='\t',
    header=None,
    names=['source', 'target'],
).astype(str)
elist

Unnamed: 0,source,target
0,100157,100157
1,100157,364207
2,100157,38848
3,100157,bradshaw97introduction
4,100157,bylund99coordinating
...,...,...
4727,zhao98empirical,zhao99discriminant
4728,zheng98stochastic,90601
4729,zheng98stochastic,zheng98integrating
4730,zhu00incorporating,clarke01exploiting


In [93]:
# Load the per-paper content table (tab-separated, no header row). Column 0 is
# the paper id and is cast to str so it compares equal to the ids in `elist`.
feat = pd.read_csv('data/raw/citeseer.content', sep='\t', header=None).astype({0: str})
feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3695,3696,3697,3698,3699,3700,3701,3702,3703,3704
0,100157,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Agents
1,100598,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,IR
2,105684,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Agents
3,11099,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,DB
4,114091,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3307,zhang99query,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,DB
3308,zhang99situated,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ML
3309,zhang99towards,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,IR
3310,zhou00implementation,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,DB


### Find and remove nodes and edges that have no matching feature row

In [None]:
# Build a graph from the raw (name-based) edge list and record every paper id
# that appears as an endpoint of at least one citation.
G = nx.from_pandas_edgelist(elist, source='source', target='target')
connected_nodes_list = list(G)

In [95]:
# Row labels of `feat` whose paper id never appears in the citation graph.
# `isin` does a hashed membership test, so this is a single linear pass rather
# than re-scanning the whole node list once per feature row.
feat_idx2drop = feat.index[~feat[0].isin(connected_nodes_list)].tolist()

In [96]:
# Show the feature rows that have no node in the citation graph.
feat_idx2drop 

[]

In [104]:
# Map each paper id (column 0 of `feat`) to its row label in `feat`; that row
# label is used as the paper's integer node id when the edge list is translated.
name2id = {paper_id: row for row, paper_id in feat[0].items()}

In [105]:
# Translate the paper names in the edge list into integer row ids of `feat`.
#
# The lookup is done column-wise with `Series.map` instead of writing through
# `elist_id.iloc[i][k] = ...`. That chained assignment targets a temporary row
# object, so under pandas Copy-on-Write it silently changes nothing, and the
# integer key `[k]` on a label-indexed row raises KeyError on recent pandas,
# which an `except KeyError` would mistake for "paper not found".
elist_id = elist.copy()
known_names = set(name2id)
resolved = elist_id['source'].isin(known_names) & elist_id['target'].isin(known_names)

# Positions of edges with at least one endpoint that has no row in `feat`.
# They are positional because the following cell drops them through
# `elist_id.index[idx2drop]`.
idx2drop = np.flatnonzero((~resolved).values).tolist()

# Replace each known name by its id. Unknown names are kept unchanged; they
# can only occur in the rows listed in `idx2drop`.
for col in ('source', 'target'):
    elist_id[col] = elist_id[col].map(lambda name: name2id.get(name, name))

In [108]:
# Number of citation edges that reference a paper with no entry in `name2id`.
len(idx2drop)

17

In [109]:
# Remove the edges flagged in `idx2drop` (positions into the current index) and
# renumber the surviving rows from 0.
rows_to_drop = elist_id.index[idx2drop]
elist_id = elist_id.drop(index=rows_to_drop).reset_index(drop=True)
elist_id

Unnamed: 0,source,target
0,0,0
1,0,99
2,0,111
3,0,381
4,0,415
...,...,...
4710,2171,2172
4711,2174,1385
4712,2174,2173
4713,1008,455


### Adjacency matrix

In [110]:
# Rebuild the graph on integer ids. Every row label of `feat` is also
# registered as a node, so a paper left without any edge still gets its own
# row/column in the adjacency matrix and the matrix stays aligned row-for-row
# with the feature and label arrays.
G = nx.from_pandas_edgelist(elist_id, 'source', 'target')
G.add_nodes_from(feat.index)

In [111]:
# Sanity check: number of nodes in the id-based graph.
len(G.nodes())

3312

In [129]:
# Adjacency matrix with rows/columns ordered by ascending node id, converted
# to a dense matrix so the diagonal can be edited in place afterwards.
node_order = sorted(G.nodes())
adj = nx.adjacency_matrix(G, nodelist=node_order).todense()

In [132]:
# Zero the diagonal (removes self-citations such as the edge 0 -> 0), then
# store the result as a CSR sparse matrix.
adj[np.diag_indices_from(adj)] = 0
adj = sp.csr_matrix(adj)

In [133]:
# Optional export (disabled): uncomment to pickle the sparse adjacency matrix
# `adj` to data/citeseer.graph.
# with open('data/citeseer.graph', 'wb') as f:
#     pkl.dump(adj, f)

### Feature

In [117]:
# Keep only the attribute columns: drop column 0 (paper id) and the last
# column (class label), then relabel the remaining columns 0..n-1.
# The columns are renamed directly instead of via a double transpose, which
# copies the whole matrix twice and would coerce every value to object dtype
# if any column had a different dtype.
feature = feat.iloc[:, 1:-1].copy()
feature.columns = range(feature.shape[1])
feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3693,3694,3695,3696,3697,3698,3699,3700,3701,3702
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3307,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3308,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3309,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3310,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [135]:
# Sparse (CSR) float copy of the attribute matrix.
sp_feature = sp.csr_matrix(feature.values, dtype=float)

In [136]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
sp_feature

<3312x3703 sparse matrix of type '<class 'numpy.float64'>'
	with 105165 stored elements in Compressed Sparse Row format>

In [137]:
# Optional export (disabled): uncomment to pickle the sparse feature matrix
# `sp_feature` to data/citeseer.feature.
# with open('data/citeseer.feature', 'wb') as f:
#     pkl.dump(sp_feature, f)

### Labels

In [122]:
# Class labels: the last column of `feat`, as a NumPy array in row order.
label_column = feat.columns[-1]
labels = feat[label_column].values
labels

array(['Agents', 'IR', 'Agents', ..., 'IR', 'DB', 'ML'], dtype=object)

In [126]:
# Optional export (disabled): uncomment to pickle the label array `labels` to
# data/citeseer.labels.
# with open('data/citeseer.labels', 'wb') as f:
#     pkl.dump(labels, f)