In [1]:
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import contains_isolated_nodes, contains_self_loops

## 1.Cora数据集的处理

### 1.1 下载数据集

In [2]:
# Download the Cora dataset and store the preprocessed copy under ./cora/
dataset_cora = Planetoid(root='./cora/', name='Cora')

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [3]:
# Print the dataset object
print(dataset_cora)

Cora()


### 1.2 法一：使用[0]方式从dataset中提取data

In [16]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cpu


In [23]:
# Take the graph at index 0 out of the dataset and move it to `device`.
data_cora = dataset_cora[0].to(device)
# Dataset-level summary, one value per line.
print(
    dataset_cora.num_classes,        # number of label classes
    dataset_cora.num_node_features,  # feature dimension per node
    len(dataset_cora),               # number of graphs in the dataset
    sep='\n',
)
# Show the graph object itself.
print(data_cora)

7
1433
1
Data(edge_index=[2, 10556], test_mask=[2708], train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708])


### 1.3 查看data的各项属性

In [18]:
# Pull out the individual tensors of the Cora graph:
#   x          - node feature matrix [N, input_dim]
#   edge_index - edge indices [2, E]
#   y          - node label array
x, edge_index, y = data_cora.x, data_cora.edge_index, data_cora.y
# Print the three shapes, one per line.
print(x.shape, edge_index.shape, y.shape, sep='\n')

torch.Size([2708, 1433])
torch.Size([2, 10556])
torch.Size([2708])


In [19]:
# Train / validation / test masks, each of shape [N].
train_mask, val_mask, test_mask = (
    data_cora.train_mask,
    data_cora.val_mask,
    data_cora.test_mask,
)
# Summing a mask gives the number of nodes in that split.
print(
    train_mask.sum().item(),
    val_mask.sum().item(),
    test_mask.sum().item(),
    sep='\n',
)

140
500
1000


In [20]:
# Basic size statistics of the Cora graph, one value per line.
print(
    data_cora.num_nodes,          # number of nodes
    data_cora.num_edges,          # number of edges
    data_cora.num_node_features,  # feature dimension per node
    sep='\n',
)

2708
10556
1433


In [22]:
# Structural checks on the Cora graph.
# Data.contains_isolated_nodes() / Data.contains_self_loops() are deprecated in
# PyG 2.x (replaced there by has_isolated_nodes() / has_self_loops()); the
# torch_geometric.utils functions used here are available in both 1.x and 2.x.
print(data_cora.is_directed())  # is the graph directed?
print(contains_isolated_nodes(data_cora.edge_index, data_cora.num_nodes))  # any isolated nodes?
print(contains_self_loops(data_cora.edge_index))  # any self-loops?

False
False
False


### 1.4 法二：使用DataLoader来载入data

In [24]:
# PyG 2.0 moved DataLoader to torch_geometric.loader and deprecated the old
# torch_geometric.data location; fall back to the old path on PyG 1.x.
try:
    from torch_geometric.loader import DataLoader
except ImportError:
    from torch_geometric.data import DataLoader

In [27]:
# Load the data through a DataLoader instead of indexing the dataset.
# Note: batch_size is the number of graphs per mini-batch, not the number of nodes.
cora_loader = DataLoader(dataset_cora, batch_size=32, shuffle=True)
# Mini-batch iteration: Cora holds a single graph, so the loop body runs once.
for batch in cora_loader:
    print(batch)

Batch(batch=[2708], edge_index=[2, 10556], test_mask=[2708], train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708])


In [28]:
# `batch` is the loop variable left over from iterating cora_loader
print(batch.num_graphs)  # number of graphs in the batch

1


## 2.Citeseer数据集的处理

In [29]:
# Download the Citeseer dataset and store the preprocessed copy under ./citeseer/
dataset_citeseer = Planetoid(root='./citeseer/', name='Citeseer')

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...
Done!


### 其他操作同上面的Cora数据集

In [30]:
# Print the dataset object
print(dataset_citeseer)

Citeseer()


In [31]:
# Take the graph at index 0 out of the dataset and move it to `device`.
data_citeseer = dataset_citeseer[0].to(device)
# Dataset-level summary, one value per line.
print(
    dataset_citeseer.num_classes,        # number of label classes
    dataset_citeseer.num_node_features,  # feature dimension per node
    len(dataset_citeseer),               # number of graphs in the dataset
    sep='\n',
)
# Show the graph object itself.
print(data_citeseer)

6
3703
1
Data(edge_index=[2, 9104], test_mask=[3327], train_mask=[3327], val_mask=[3327], x=[3327, 3703], y=[3327])


In [32]:
# Node features, edge indices and node labels of the Citeseer graph.
x, edge_index, y = data_citeseer.x, data_citeseer.edge_index, data_citeseer.y
# Print the three shapes, one per line.
print(x.shape, edge_index.shape, y.shape, sep='\n')

torch.Size([3327, 3703])
torch.Size([2, 9104])
torch.Size([3327])


In [33]:
# Train / validation / test masks of the Citeseer graph.
train_mask, val_mask, test_mask = (
    data_citeseer.train_mask,
    data_citeseer.val_mask,
    data_citeseer.test_mask,
)
# Summing a mask gives the number of nodes in that split.
print(
    train_mask.sum().item(),
    val_mask.sum().item(),
    test_mask.sum().item(),
    sep='\n',
)

120
500
1000


In [34]:
# Basic size statistics of the Citeseer graph, one value per line.
print(
    data_citeseer.num_nodes,          # number of nodes
    data_citeseer.num_edges,          # number of edges
    data_citeseer.num_node_features,  # feature dimension per node
    sep='\n',
)

3327
9104
3703


In [36]:
# Structural checks on the Citeseer graph.
# Data.contains_isolated_nodes() / Data.contains_self_loops() are deprecated in
# PyG 2.x (replaced there by has_isolated_nodes() / has_self_loops()); the
# torch_geometric.utils functions used here are available in both 1.x and 2.x.
print(data_citeseer.is_directed())  # is the graph directed?
# Any isolated nodes? Note: Citeseer DOES contain isolated nodes.
print(contains_isolated_nodes(data_citeseer.edge_index, data_citeseer.num_nodes))
print(contains_self_loops(data_citeseer.edge_index))  # any self-loops?

False
True
False


## 3.Pubmed数据集的处理

In [37]:
# Download the Pubmed dataset and store the preprocessed copy under ./pubmed/
dataset_pubmed = Planetoid(root='./pubmed/', name='Pubmed')

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...
Done!


### 其他操作同上面的Cora数据集

In [38]:
# Take the graph at index 0 out of the dataset and move it to `device`.
data_pubmed = dataset_pubmed[0].to(device)
# Dataset-level summary, one value per line.
print(
    dataset_pubmed.num_classes,        # number of label classes
    dataset_pubmed.num_node_features,  # feature dimension per node
    len(dataset_pubmed),               # number of graphs in the dataset
    sep='\n',
)
# Show the graph object itself.
print(data_pubmed)

3
500
1
Data(edge_index=[2, 88648], test_mask=[19717], train_mask=[19717], val_mask=[19717], x=[19717, 500], y=[19717])


In [39]:
# Node features, edge indices and node labels of the Pubmed graph.
x, edge_index, y = data_pubmed.x, data_pubmed.edge_index, data_pubmed.y
# Print the three shapes, one per line.
print(x.shape, edge_index.shape, y.shape, sep='\n')

torch.Size([19717, 500])
torch.Size([2, 88648])
torch.Size([19717])


In [40]:
# Train / validation / test masks of the Pubmed graph.
train_mask, val_mask, test_mask = (
    data_pubmed.train_mask,
    data_pubmed.val_mask,
    data_pubmed.test_mask,
)
# Summing a mask gives the number of nodes in that split.
print(
    train_mask.sum().item(),
    val_mask.sum().item(),
    test_mask.sum().item(),
    sep='\n',
)

60
500
1000


In [41]:
# Basic size statistics of the Pubmed graph, one value per line.
print(
    data_pubmed.num_nodes,          # number of nodes
    data_pubmed.num_edges,          # number of edges
    data_pubmed.num_node_features,  # feature dimension per node
    sep='\n',
)

19717
88648
500


In [42]:
# Structural checks on the Pubmed graph.
# Data.contains_isolated_nodes() / Data.contains_self_loops() are deprecated in
# PyG 2.x (replaced there by has_isolated_nodes() / has_self_loops()); the
# torch_geometric.utils functions used here are available in both 1.x and 2.x.
print(data_pubmed.is_directed())  # is the graph directed?
print(contains_isolated_nodes(data_pubmed.edge_index, data_pubmed.num_nodes))  # any isolated nodes?
print(contains_self_loops(data_pubmed.edge_index))  # any self-loops?

False
False
False
