In [1]:
import pandas as pd
import numpy as np
import os
import gc

In [2]:
# Input file locations, resolved relative to the notebook's working directory.
base_path = '../'
publish_path = 'data'

# Both inputs live in the same directory, so join that prefix once.
data_dir = os.path.join(base_path, publish_path)
link_p1_path = os.path.join(data_dir, 'link_phase1_pre.csv')
nodes_path = os.path.join(data_dir, 'nodes_pre.csv')

### 读取边列表并统计节点数量

In [3]:
# Load the edge list CSV, then report its dimensions and preview the first rows.
edge_df = pd.read_csv(filepath_or_buffer=link_p1_path)
print(edge_df.shape)
edge_df.head(n=5)

(29162247, 3)


Unnamed: 0,paper_id,reference_paper_id,phase
0,f10da75ad1eaf16eb2ffe0d85b76b332,711ef25bdb2c2421c0131af77b3ede1d,phase1
1,9ac5a4327bd4f3dcb424c93ca9b84087,2d91c73304c5e8a94a0e5b4956093f71,phase1
2,9d91bfd4703e55dd814dfffb3d63fc33,33d4fdfe3967a1ffde9311bfe6827ef9,phase1
3,e1bdbce05528952ed6579795373782d4,4bda690abec912b3b7b228b01fb6819a,phase1
4,eb623ac4b10df96835921edabbde2951,c1a05bdfc88a73bf2830e705b2f39dbb,phase1


In [4]:
# Summary statistics of the 'phase' column.
edge_df['phase'].describe()

count     29162247
unique           1
top         phase1
freq      29162247
Name: phase, dtype: object

In [5]:
# Stack both endpoint columns of every edge, keep the first occurrence of each
# ID, and expose the result as a one-column DataFrame named 'paper_id'.
edge_nodes = (
    pd.concat([edge_df['paper_id'], edge_df['reference_paper_id']])
    .drop_duplicates()
    .to_frame(name='paper_id')
)

print(edge_nodes.shape)
edge_nodes.head(n=4)

(3030932, 1)


Unnamed: 0,paper_id
0,f10da75ad1eaf16eb2ffe0d85b76b332
1,9ac5a4327bd4f3dcb424c93ca9b84087
2,9d91bfd4703e55dd814dfffb3d63fc33
3,e1bdbce05528952ed6579795373782d4


#### 在边列表中，一共出现了3,030,932个节点(paper_id)

In [6]:
def process_node(line, feat_dim=300):
    """Parse one data row of the nodes file into ``(nid, feat_list, label)``.

    The row is split on double quotes and must yield exactly three segments:
    ``<id>,`` / ``['f1', 'f2', ...]`` / ``,<label>``. In other words, the only
    double quotes on the line are the pair wrapping the feature list.

    Args:
        line: One raw text line; surrounding whitespace/newline is stripped.
        feat_dim: Expected number of features. A row with a different count
            is reported via ``print`` but still returned. Defaults to 300.

    Returns:
        A tuple ``(nid, feat_list, label)``: the ID segment without its last
        character (the separating comma), the features as a list of floats,
        and the label segment without its first character (the separating
        comma). ``label`` is ``''`` when nothing follows that comma.

    Raises:
        ValueError: If the line does not split into exactly three segments
            on ``"``, or if a feature token cannot be converted to float.
    """
    parts = line.strip().split('"')
    if len(parts) != 3:
        raise ValueError(
            'expected exactly one double-quoted feature field, '
            'got {} segment(s): {!r}'.format(len(parts), line)
        )
    id_part, feat_json, label_part = parts

    # Remove the enclosing [] and, per token, the padding and single quotes.
    # Blank tokens are skipped so an empty list "[]" parses to [] instead of
    # failing on float('').
    feat_list = [
        float(token.strip().strip("'"))
        for token in feat_json.strip().strip('[]').split(',')
        if token.strip()
    ]

    if len(feat_list) != feat_dim:
        print('此行数据有问题 {}'.format(line))

    # id_part ends with the comma before the quoted field; label_part starts
    # with the comma after it.
    return id_part[:-1], feat_list, label_part[1:]

In [7]:
# Build the ID <-> Label table first. Rows are appended in file order, so the
# ID order stays consistent with the order of the feature rows in the same file.
nid_list = []
label_list = []

# Pin the encoding so decoding does not depend on the platform locale.
# NOTE(review): assumes the nodes file is UTF-8 (or plain ASCII) - confirm.
with open(nodes_path, 'r', encoding='utf-8') as f:
    # line_no counts every physical line, including the skipped first one,
    # so the progress messages match a counter incremented once per line.
    for line_no, line in enumerate(f, start=1):
        # The first line is skipped (treated as the header). Blank lines,
        # e.g. a stray newline at end of file, carry no node and would
        # otherwise make process_node raise.
        if line_no > 1 and line.strip():
            nid, _, label = process_node(line)
            nid_list.append(nid)
            label_list.append(label)  # some labels are empty
        if line_no % 100000 == 0:
            print('Processed {} train rows'.format(line_no))


nid_arr = np.array(nid_list)
label_arr = np.array(label_list)

nid_label_df = pd.DataFrame({'paper_id':nid_arr, 'Label': label_arr})

Processed 100000 train rows
Processed 200000 train rows
Processed 300000 train rows
Processed 400000 train rows
Processed 500000 train rows
Processed 600000 train rows
Processed 700000 train rows
Processed 800000 train rows
Processed 900000 train rows
Processed 1000000 train rows
Processed 1100000 train rows
Processed 1200000 train rows
Processed 1300000 train rows
Processed 1400000 train rows
Processed 1500000 train rows
Processed 1600000 train rows
Processed 1700000 train rows
Processed 1800000 train rows
Processed 1900000 train rows
Processed 2000000 train rows
Processed 2100000 train rows
Processed 2200000 train rows
Processed 2300000 train rows
Processed 2400000 train rows
Processed 2500000 train rows
Processed 2600000 train rows
Processed 2700000 train rows
Processed 2800000 train rows
Processed 2900000 train rows
Processed 3000000 train rows


In [8]:
# Materialise each row's index as an explicit 'node_idx' column.
# The guard keeps this cell idempotent: without it, re-running the cell would
# call reset_index again and leave the frame with a duplicate 'node_idx' column.
if 'node_idx' not in nid_label_df.columns:
    nid_label_df = nid_label_df.reset_index().rename(columns={'index': 'node_idx'})

print(nid_label_df.shape)
nid_label_df.head(4)

(3030932, 3)


Unnamed: 0,node_idx,paper_id,Label
0,0,78f43b8b62f040347fec0be44e5f08bd,
1,1,a971601a0286d2701aa5cde46e63a9fd,G
2,2,a48c92cc8f67a8327adac7ff62d24a53,W
3,3,4736ef4d2512bb23954118adcb605b5e,H


In [9]:
# Check for duplicate IDs: the de-duplicated length shown below can be compared
# against the row count of nid_label_df.
ids = nid_label_df['paper_id'].drop_duplicates()
ids.shape

(3030932,)

In [10]:
# Save the ID / Label table to a local CSV file, without the DataFrame index.
id_label_csv_path = os.path.join(base_path, publish_path, './IDandLabels_pre.csv')
nid_label_df.to_csv(id_label_csv_path, index=False)

In [11]:
# Preview the first rows of the saved table.
nid_label_df.head(n=5)

Unnamed: 0,node_idx,paper_id,Label
0,0,78f43b8b62f040347fec0be44e5f08bd,
1,1,a971601a0286d2701aa5cde46e63a9fd,G
2,2,a48c92cc8f67a8327adac7ff62d24a53,W
3,3,4736ef4d2512bb23954118adcb605b5e,H
4,4,917de373f8b3cb2dfe245b25ac72a73e,
