# GCN

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from tqdm import tqdm
from jupyterthemes import jtplot
from functools import reduce
import scipy.sparse as sp
jtplot.style()

# Cora 数据集处理

In [4]:
# Load the tab-separated Cora node table. Column 0 is renamed to 'artical'
# (paper id) and column 1434 to 'type' (class name); the remaining integer
# columns keep their positional names.
df_content = (
    pd.read_csv('./cora.content', sep='\t', header=None)
    .rename(columns={0: 'artical', 1434: 'type'})
)
df_content

Unnamed: 0,artical,1,2,3,4,5,6,7,8,9,...,1425,1426,1427,1428,1429,1430,1431,1432,1433,type
0,31336,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Neural_Networks
1,1061127,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Rule_Learning
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,1128975,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2704,1128977,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2705,1128978,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Genetic_Algorithms
2706,117328,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Case_Based


In [5]:
# Label encoding: pd.factorize returns a (codes, uniques) pair; the integer
# codes become the 'label' column.
df_label = pd.factorize(df_content['type'])
label_codes, _label_names = df_label
df_content['label'] = label_codes
df_content

Unnamed: 0,artical,1,2,3,4,5,6,7,8,9,...,1426,1427,1428,1429,1430,1431,1432,1433,type,label
0,31336,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,Neural_Networks,0
1,1061127,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,Rule_Learning,1
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Reinforcement_Learning,2
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Reinforcement_Learning,2
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Probabilistic_Methods,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,1128975,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Genetic_Algorithms,5
2704,1128977,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Genetic_Algorithms,5
2705,1128978,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Genetic_Algorithms,5
2706,117328,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,Case_Based,6


In [6]:
# Paper id -> node index mapping, numbered in order of first appearance.
content2idx = {
    paper_id: node_idx
    for node_idx, paper_id in enumerate(df_content['artical'].unique())
}
df_content['idx'] = df_content['artical'].map(content2idx)
df_content

Unnamed: 0,artical,1,2,3,4,5,6,7,8,9,...,1427,1428,1429,1430,1431,1432,1433,type,label,idx
0,31336,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,Neural_Networks,0,0
1,1061127,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Rule_Learning,1,1
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Reinforcement_Learning,2,2
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Reinforcement_Learning,2,3
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Probabilistic_Methods,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703,1128975,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Genetic_Algorithms,5,2703
2704,1128977,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Genetic_Algorithms,5,2704
2705,1128978,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,Genetic_Algorithms,5,2705
2706,117328,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,Case_Based,6,2706


In [7]:
# Print a summary of the node table: row/column counts, dtypes and memory usage.
df_content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2708 entries, 0 to 2707
Columns: 1437 entries, artical to idx
dtypes: int64(1436), object(1)
memory usage: 29.7+ MB


In [8]:
# Read the tab-separated citation edge list and translate both paper-id
# columns into node indices via content2idx (an unknown id raises KeyError).
df_cites = (
    pd.read_csv('./cora.cites', sep='\t', header=None)
    .rename(columns={0: 'cited', 1: 'original'})
)
df_cites['cited_idx'] = df_cites['cited'].map(content2idx.__getitem__)
df_cites['original_idx'] = df_cites['original'].map(content2idx.__getitem__)
df_cites

Unnamed: 0,cited,original,cited_idx,original_idx
0,35,1033,163,402
1,35,103482,163,659
2,35,103515,163,1696
3,35,1050679,163,2295
4,35,1103960,163,1274
...,...,...,...,...
5424,853116,19621,1886,745
5425,853116,853155,1886,1902
5426,853118,1140289,1887,2258
5427,853155,853118,1902,1887


In [9]:
# Print a summary of the edge table: row count, per-column non-null counts and dtypes.
df_cites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5429 entries, 0 to 5428
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   cited         5429 non-null   int64
 1   original      5429 non-null   int64
 2   cited_idx     5429 non-null   int64
 3   original_idx  5429 non-null   int64
dtypes: int64(4)
memory usage: 169.8 KB


In [10]:
# Select the edge row(s) with cited_idx == 1886 and original_idx == 745.
df_cites[(df_cites['cited_idx'] == 1886) & (df_cites['original_idx'] == 745)]

Unnamed: 0,cited,original,cited_idx,original_idx
5424,853116,19621,1886,745


In [11]:
# Select the edge row(s) with cited_idx == 745 and original_idx == 1886
# (i.e. node pair 745/1886 with the two roles swapped relative to 1886 -> 745).
df_cites[(df_cites['cited_idx'] == 745) & (df_cites['original_idx'] == 1886)]

Unnamed: 0,cited,original,cited_idx,original_idx
2228,19621,853116,745,1886


# 数据拆分

In [12]:
# Split the node table into model inputs (integer-named columns 1..1433)
# and a one-column frame holding the integer class label.
feature_columns = list(range(1, 1434))
df_features = df_content.loc[:, feature_columns]
df_label = df_content.loc[:, ['label']]

# GCN model

In [13]:
import torch.nn as nn
import torch
import pytorch_lightning as pl
from torch.utils.data import TensorDataset, DataLoader, Dataset
from pytorch_lightning import loggers as pl_loggers

## 构造图卷积算子

In [14]:
# Adjacency matrix A (dense, N x N) built from the citation edge list.
num_nodes = len(content2idx)
edge_index = df_cites[['cited_idx', 'original_idx']].values
cited_idx_arr, original_idx_arr = edge_index[:, 0], edge_index[:, 1]

A = np.zeros((num_nodes, num_nodes))
A[original_idx_arr, cited_idx_arr] = 1
# Treat every citation as an undirected edge. The symmetric normalisation
# below is only a symmetric operator (as the name L_sym implies) when A itself
# is symmetric; filling one direction only leaves the operator asymmetric.
A[cited_idx_arr, original_idx_arr] = 1

# Add self-loops: A_hat = A + I.
A_hat = A + np.eye(num_nodes)

# Node degrees of A_hat, kept both as a vector and as the diagonal matrix D_hat.
degrees = A_hat.sum(axis=1)
D_hat = np.diag(degrees)

# D_hat^{-1/2}. D_hat is diagonal, so its inverse square root is the
# element-wise reciprocal square root of the degrees -- no dense matrix
# inversion is needed. Every degree is >= 1 because of the self-loops, so
# there is no division by zero.
inv_sqrt_degrees = 1.0 / np.sqrt(degrees)
D_hat_i = np.diag(inv_sqrt_degrees)

# Symmetrically normalised operator L_sym = D_hat^{-1/2} A_hat D_hat^{-1/2},
# computed by broadcasting (row scaling, then column scaling) instead of two
# dense N x N matrix products.
L_sym = inv_sqrt_degrees[:, None] * A_hat * inv_sqrt_degrees[None, :]

## Network

In [15]:
class GCN(nn.Module):
    """Two-layer graph convolutional network over one fixed graph.

    Each layer is a linear map followed by a multiplication with the graph
    operator ``L``::

        hidden = relu(L @ fc1(x))
        logits = L @ fc2(hidden)

    The output is returned as raw logits (no activation) so it can be fed
    directly to ``nn.CrossEntropyLoss``, which applies log-softmax itself;
    clamping the logits with ReLU would remove all negative scores.

    Args:
        in_dim: number of input features per node.
        out_dim: number of output classes.
        L: ``(N, N)`` propagation matrix (tensor or array-like). It is stored
            as a non-persistent buffer so it follows ``.to(device)`` /
            ``.double()`` with the module but is not written to ``state_dict``.
        hidden_dim: width of the hidden layer (defaults to 36, the value that
            was previously hard-coded).
    """

    def __init__(self, in_dim, out_dim, L, hidden_dim=36):
        super(GCN, self).__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.relu = nn.ReLU()
        # Match the dtype of the layer weights so torch.mm never mixes dtypes.
        self.register_buffer(
            'L',
            torch.as_tensor(L, dtype=self.fc1.weight.dtype),
            persistent=False,
        )

    def forward(self, x):
        """Return per-node class logits for the full feature matrix ``x`` of shape (N, in_dim)."""
        # The operator is N x N, so x must contain one row per graph node
        # (in node-index order); mini-batches of rows cannot be propagated.
        if x.shape[0] != self.L.shape[0]:
            raise ValueError(
                'GCN expects features for all %d nodes, got %d rows'
                % (self.L.shape[0], x.shape[0])
            )
        hidden = self.relu(torch.mm(self.L, self.fc1(x)))
        return torch.mm(self.L, self.fc2(hidden))

In [16]:
class TrainModel(pl.LightningModule):
    """LightningModule that trains a classification network with cross-entropy.

    Args:
        network: the ``nn.Module`` mapping a feature batch to class logits;
            stored as ``self.backbone``.
    """

    def __init__(self, network):
        super(TrainModel, self).__init__()
        self.backbone = network
        self.loss = nn.CrossEntropyLoss()

    def _compute_loss(self, batch):
        """Run the backbone on an ``(x, y)`` batch and return the cross-entropy loss."""
        x, y = batch
        # CrossEntropyLoss takes class-index targets of shape (N,). Labels built
        # from a one-column DataFrame arrive as (N, 1), so drop that trailing axis.
        if y.dim() == 2 and y.shape[1] == 1:
            y = y.squeeze(1)
        output = self.backbone(x)
        return self.loss(output, y)

    def training_step(self, batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        loss = self._compute_loss(batch)
        self.log('Training Loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log and return the validation loss for one batch."""
        loss = self._compute_loss(batch)
        self.log('Validation Loss', loss)
        return loss

    def configure_optimizers(self):
        """Use Adam with its default hyper-parameters over all module parameters."""
        optimizer = torch.optim.Adam(self.parameters())
        return optimizer

    def forward(self, x):
        """Return the backbone's prediction for ``x``."""
        predict = self.backbone(x)
        return predict

In [17]:
# Node features as float32, one row per node in node-index order.
x_tensor = torch.tensor(df_features.values, dtype=torch.float32)
# df_label is a one-column frame, so .values is (N, 1); CrossEntropyLoss needs
# 1-D class indices, hence the flatten.
y_tensor = torch.tensor(df_label.values, dtype=torch.long).reshape(-1)

# Wrap features and labels with the stock TensorDataset.
data_set = TensorDataset(x_tensor, y_tensor)

# The GCN multiplies by a fixed N x N graph operator, so it must see all N
# nodes at once and in their original order: use a single full-graph batch
# and do not shuffle (a shuffled 32-row batch cannot be multiplied by L_sym).
data_loader = DataLoader(data_set, batch_size=len(data_set), shuffle=False)

In [18]:
%load_ext tensorboard

In [None]:
# Derive the layer sizes from the data instead of hard-coding 1433 / 7.
num_features = df_features.shape[1]
num_classes = int(df_content['label'].nunique())

gcn = GCN(num_features, num_classes, torch.Tensor(L_sym))
tb_logger = pl_loggers.TensorBoardLogger('logs/')

model = TrainModel(gcn)
# Pass the TensorBoard logger to the trainer -- it was created but never used,
# so nothing was written to 'logs/'. log_every_n_steps=1 makes sure the loss
# is recorded even when an epoch consists of very few steps.
# NOTE(review): `gpus=0` is kept as in the original cell; newer Lightning
# releases spell CPU training as accelerator='cpu' -- confirm against the
# installed version.
trainer = pl.Trainer(max_epochs=50, gpus=0, logger=tb_logger, log_every_n_steps=1)
trainer.fit(model, data_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
