In [1]:
import pandas as pd
import numpy as np
import os
import pickle

import dgl

Using backend: pytorch


In [2]:
# Input locations, expressed relative to this notebook's directory.
base_path = '../'
publish_path = 'data'

# The node list lives at <base_path>/<publish_path>/IDandLabels_pre.csv
data_dir = os.path.join(base_path, publish_path)
nodes_path = os.path.join(data_dir, 'IDandLabels_pre.csv')

### 读取节点列表

In [3]:
# Load the node list; the `Label` column is explicitly parsed as str.
nodes_df = pd.read_csv(
    nodes_path,
    dtype={'Label': str},
)
print(nodes_df.shape)
nodes_df.tail(4)

(3030932, 3)


Unnamed: 0,node_idx,paper_id,Label
3030928,3030928,ed617beed24821ae803225bb2370b4b7,
3030929,3030929,bc733f44393aa8ce8c6f82d99def90fb,
3030930,3030930,63522b926aec18a4a0204594366d015e,
3030931,3030931,f0a9a2c5981746028c74dc495f438d82,


### 转换标签为数字

In [4]:
# Inspect the label distribution first: row counts per distinct `Label`.
# Rows with a missing Label are not counted (groupby drops NaN keys by default).
label_dist = nodes_df.groupby('Label').count()
print(label_dist.shape)
label_dist

(23, 2)


Unnamed: 0_level_0,node_idx,paper_id
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2503,2503
B,57716,57716
C,95938,95938
D,96366,96366
E,43082,43082
F,31004,31004
G,38486,38486
H,62084,62084
I,19924,19924
J,21446,21446


#### 可以看到一共有23个标签，A类最少，D类最多，基本每类都有几万个。下面从0开始，重构标签


In [5]:
# Encode the letter labels as 0-based integers, following the order of
# label_dist's index (A..W for this dataset).
label_to_id = {letter: idx for idx, letter in enumerate(label_dist.index.to_list())}

# One vectorised pass instead of one boolean-mask assignment per label.
# Rows with a missing Label map to NaN and are encoded as -1.
# The column is assigned in a single statement rather than via
# `nodes_df.label.fillna(..., inplace=True)`: an in-place call on an
# attribute-accessed column is chained assignment, which does not update the
# frame under pandas Copy-on-Write. It also makes the cell safe to re-run.
nodes_df['label'] = nodes_df.Label.map(label_to_id).fillna(-1).astype('int')
nodes_df.head(4)

Unnamed: 0,node_idx,paper_id,Label,label
0,0,78f43b8b62f040347fec0be44e5f08bd,,-1
1,1,a971601a0286d2701aa5cde46e63a9fd,G,6
2,2,a48c92cc8f67a8327adac7ff62d24a53,W,22
3,3,4736ef4d2512bb23954118adcb605b5e,H,7


#### 只保留新的node index和数字标签

In [7]:
# Narrow the frame to the two columns used downstream.
kept_columns = ['node_idx', 'label']
nodes = nodes_df[kept_columns]
nodes.tail(4)

Unnamed: 0,node_idx,label
3030928,3030928,-1
3030929,3030929,-1
3030930,3030930,-1
3030931,3030931,-1


## 划分Train/Test

这里按照9:1的比例划分Train/Test

In [8]:
# Keep only the rows that carry a real label (label >= 0).
train_test_labels_df = nodes[nodes.label >= 0]

# Stratified split: within every label, the leading `split_ratio` share of its
# node indices (in row order) goes to train and the remainder goes to test.
# NOTE(review): the split is positional, not shuffled; if row order is not
# arbitrary, consider shuffling with a fixed seed before splitting.
split_ratio = 0.9

# Start both collections with an empty array of the index dtype, so the final
# concatenation also works when there are no labelled rows. No dummy leading
# element is used, so nothing has to be stripped afterwards and the cell can be
# re-run safely.
idx_dtype = train_test_labels_df.node_idx.dtype
train_chunks = [np.empty(0, dtype=idx_dtype)]
test_chunks = [np.empty(0, dtype=idx_dtype)]

# Walk the labels actually present, in ascending order, instead of relying on
# a hard-coded label count.
for _, label_nodes in train_test_labels_df.groupby('label', sort=True).node_idx:
    label_idx = label_nodes.to_numpy()
    split_point = int(label_idx.shape[0] * split_ratio)

    # Collect this label's train/test indices; they are joined once below.
    train_chunks.append(label_idx[:split_point])
    test_chunks.append(label_idx[split_point:])

# Final train/test node-index arrays.
train_labels_idx = np.concatenate(train_chunks)
test_labels_idx = np.concatenate(test_chunks)

# Every labelled node must land in exactly one of the two sets.
assert train_labels_idx.shape[0] + test_labels_idx.shape[0] == train_test_labels_df.shape[0]

In [12]:
# Full label vector: one entry per row of `nodes`.
labels = nodes['label'].to_numpy()

In [13]:
# Persist the label vector together with the train/test indices in binary
# (pickle) form so the modelling step can load them quickly.
label_path = os.path.join(base_path, publish_path, 'labels_pre.pkl')

label_payload = {
    'train_labels_idx': train_labels_idx,
    'test_labels_idx': test_labels_idx,
    'label': labels,
}

with open(label_path, 'wb') as f:
    pickle.dump(label_payload, f)