In [1]:
import numpy as np
import pandas as pd

# 1. 读入cora数据

In [2]:
# Load the tab-separated .content file; header=None means the columns get integer labels.
cora_content = pd.read_csv(
    './data/cora/cora.content',
    sep='\t',
    header=None,
)

# Show the frame's dimensions, then its first five rows.
print(cora_content.shape, cora_content.head(5), sep='\n')

(2708, 1435)
      0     1     2     3     4     5     6     7     8     9     ...  1425  \
0    31336     0     0     0     0     0     0     0     0     0  ...     0   
1  1061127     0     0     0     0     0     0     0     0     0  ...     0   
2  1106406     0     0     0     0     0     0     0     0     0  ...     0   
3    13195     0     0     0     0     0     0     0     0     0  ...     0   
4    37879     0     0     0     0     0     0     0     0     0  ...     0   

   1426  1427  1428  1429  1430  1431  1432  1433                    1434  
0     0     1     0     0     0     0     0     0         Neural_Networks  
1     1     0     0     0     0     0     0     0           Rule_Learning  
2     0     0     0     0     0     0     0     0  Reinforcement_Learning  
3     0     0     0     0     0     0     0     0  Reinforcement_Learning  
4     0     0     0     0     0     0     0     0   Probabilistic_Methods  

[5 rows x 1435 columns]


In [6]:
# Load the tab-separated .cites file; header=None means the two columns are labelled 0 and 1.
cora_cites = pd.read_csv(
    './data/cora/cora.cites',
    sep='\t',
    header=None,
)

# Show the frame's dimensions, then its first five rows.
print(cora_cites.shape, cora_cites.head(5), sep='\n')

(5429, 2)
    0        1
0  35     1033
1  35   103482
2  35   103515
3  35  1050679
4  35  1103960


# 2. paper_id和[0, 2707]一一映射

In [7]:
# Map every paper id (first column of cora_content) to its row label in the frame.
paper_id = cora_content.iloc[:, 0].tolist()
content_idx = cora_content.index.tolist()  # [0 ... 2707]
paper_map = {pid: row for pid, row in zip(paper_id, content_idx)}

# Spot check: row label stored for paper id 1061127.
print(paper_map[1061127])

1


# 3. 提取feature matrix (2708, 1433)

In [8]:
# Feature matrix: every column except the first (used as the paper id) and
# the last (used as the class label).
feature = cora_content.iloc[:, 1:-1]

# Show the shape, then the first five rows.
print(feature.shape, feature.head(5), sep='\n')

(2708, 1433)
   1     2     3     4     5     6     7     8     9     10    ...  1424  \
0     0     0     0     0     0     0     0     0     0     0  ...     0   
1     0     0     0     0     0     0     0     0     0     0  ...     0   
2     0     0     0     0     0     0     0     0     0     0  ...     0   
3     0     0     0     0     0     0     0     0     0     0  ...     0   
4     0     0     0     0     0     0     0     0     0     0  ...     0   

   1425  1426  1427  1428  1429  1430  1431  1432  1433  
0     0     0     1     0     0     0     0     0     0  
1     0     1     0     0     0     0     0     0     0  
2     0     0     0     0     0     0     0     0     0  
3     0     0     0     0     0     0     0     0     0  
4     0     0     0     0     0     0     0     0     0  

[5 rows x 1433 columns]


# 4. 标签one-hot编码

In [10]:
# One-hot encode the class label held in the last column of cora_content.
# dtype is pinned to uint8 (the default before pandas 2.0): newer pandas
# returns boolean columns when dtype is omitted, which would turn the 0/1
# table into True/False and change the dtype of any array built from `label`.
label = pd.get_dummies(cora_content.iloc[:, -1], dtype=np.uint8)
print(label.head(5))

   Case_Based  Genetic_Algorithms  Neural_Networks  Probabilistic_Methods  \
0           0                   0                1                      0   
1           0                   0                0                      0   
2           0                   0                0                      0   
3           0                   0                0                      0   
4           0                   0                0                      1   

   Reinforcement_Learning  Rule_Learning  Theory  
0                       0              0       0  
1                       0              1       0  
2                       1              0       0  
3                       1              0       0  
4                       0              0       0  


# 5. 创建adjacency matrix（邻接矩阵）

In [14]:
# Build a dense, symmetric adjacency matrix from the citation pairs.
size = cora_content.shape[0]  # number of papers (2708)
adj_mat = np.zeros((size, size))  # float matrix of shape (size, size)

# Translate the paper ids in each column of cora_cites into matrix indices.
# The dict lookup raises KeyError for an id that is missing from paper_map.
n_edges = len(cora_cites)
src = np.fromiter((paper_map[pid] for pid in cora_cites[0]), dtype=np.intp, count=n_edges)
dst = np.fromiter((paper_map[pid] for pid in cora_cites[1]), dtype=np.intp, count=n_edges)

# Mark both directions so the matrix is symmetric (papers i and j are linked
# by a citation); repeated pairs simply write 1 again.
adj_mat[src, dst] = 1
adj_mat[dst, src] = 1

# Total number of non-zero entries, reduced by NumPy rather than the builtin sum.
print(adj_mat.sum())


10556.0


# 6. 数据格式转换（如果需要的话）

In [16]:
# Before this cell, feature and label are pandas DataFrames; np.array copies
# their values into independent NumPy arrays.
feature = np.array(feature)
label = np.array(label)
# adj_mat was created with np.zeros and is already an ndarray, so np.asarray
# hands it back unchanged instead of duplicating the (size, size) matrix.
adj_mat = np.asarray(adj_mat)