# base : train.py
# data : cora

元データはhttps://linqs.soe.ucsc.edu/dataにある
- content
- cites
の2つのファイル

1.cites
<ID of cited paper> <ID of citing paper>
引用されている論文のID、引用している論文のID
5430のエッジ

2.content
<paper_id> <word_attributes>+ <class_label>
2708x1435の行列
論文ID+単語(1433)+class
(例)31336, 0,0,1,…,0,Neural_Networks

In [1]:
from keras.layers import Input, Dropout
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2

from kegra.layers.graph import GraphConvolution
from kegra.utils import *

import time

Using TensorFlow backend.


In [2]:
# Define parameters
DATASET = 'cora'
FILTER = 'localpool'  # 'chebyshev'
MAX_DEGREE = 2  # maximum polynomial degree
SYM_NORM = True  # symmetric (True) vs. left-only (False) normalization
NB_EPOCH = 200
PATIENCE = 10  # early stopping patience

In [3]:
# Get data
X, A, y = load_data(dataset=DATASET)
y_train, y_val, y_test, idx_train, idx_val, idx_test, train_mask = get_splits(y)

# Normalize X
X /= X.sum(1).reshape(-1, 1)

Loading cora dataset...
Dataset has 2708 nodes, 5429 edges, 1433 features.


In [20]:
get_splits?

[0;31mSignature:[0m [0mget_splits[0m[0;34m([0m[0my[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/anaconda3/lib/python3.6/site-packages/kegra-0.0.1-py3.6.egg/kegra/utils.py
[0;31mType:[0m      function


In [4]:
X.shape

(2708, 1433)

In [5]:
A.shape

(2708, 2708)

In [19]:
y.shape

(2708, 7)

In [22]:
y_test.shape

(2708, 7)

In [25]:
idx_test

range(500, 1500)

In [26]:
y_test.shape

(2708, 7)

# ↑この辺の処理（split）がよくわからん

In [6]:
if FILTER == 'localpool':
    """ Local pooling filters (see 'renormalization trick' in Kipf & Welling, arXiv 2016) """
    print('Using local pooling filters...')
    A_ = preprocess_adj(A, SYM_NORM)
    support = 1
    graph = [X, A_]
    G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True)]

elif FILTER == 'chebyshev':
    """ Chebyshev polynomial basis filters (Defferard et al., NIPS 2016)  """
    print('Using Chebyshev polynomial basis filters...')
    L = normalized_laplacian(A, SYM_NORM)
    L_scaled = rescale_laplacian(L)
    T_k = chebyshev_polynomial(L_scaled, MAX_DEGREE)
    support = MAX_DEGREE + 1
    graph = [X]+T_k
    G = [Input(shape=(None, None), batch_shape=(None, None), sparse=True) for _ in range(support)]
    

else:
    raise Exception('Invalid filter type.')

Using local pooling filters...


In [9]:
X_in = Input(shape=(X.shape[1],))

# Define model architecture
# NOTE: We pass arguments for graph convolutional layers as a list of tensors.
# This is somewhat hacky, more elegant options would require rewriting the Layer base class.
H = Dropout(0.5)(X_in)
H = GraphConvolution(16, support, activation='relu', kernel_regularizer=l2(5e-4))([H]+G)
H = Dropout(0.5)(H)
Y = GraphConvolution(y.shape[1], support, activation='softmax')([H]+G)

# Compile model
model = Model(inputs=[X_in]+G, outputs=Y)
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.01))

In [11]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_3 (InputLayer)             (None, 1433)          0                                            
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 1433)          0           input_3[0][0]                    
____________________________________________________________________________________________________
input_1 (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
graph_convolution_1 (GraphConvol (None, 16)            22944       dropout_1[0][0]                  
                                                                   input_1[0][0]           

In [13]:
from keras.utils import plot_model
plot_model(model, to_file='model.png')

<img src="model.png">

In [14]:
# Helper variables for main training loop
wait = 0
preds = None
best_val_loss = 99999

In [15]:
# Fit
for epoch in range(1, NB_EPOCH+1):

    # Log wall-clock time
    t = time.time()

    # Single training iteration (we mask nodes without labels for loss calculation)
    model.fit(graph, y_train, sample_weight=train_mask,
              batch_size=A.shape[0], epochs=1, shuffle=False, verbose=0)

    # Predict on full dataset
    preds = model.predict(graph, batch_size=A.shape[0])

    # Train / validation scores
    train_val_loss, train_val_acc = evaluate_preds(preds, [y_train, y_val],
                                                   [idx_train, idx_val])
    print("Epoch: {:04d}".format(epoch),
          "train_loss= {:.4f}".format(train_val_loss[0]),
          "train_acc= {:.4f}".format(train_val_acc[0]),
          "val_loss= {:.4f}".format(train_val_loss[1]),
          "val_acc= {:.4f}".format(train_val_acc[1]),
          "time= {:.4f}".format(time.time() - t))

    # Early stopping
    if train_val_loss[1] < best_val_loss:
        best_val_loss = train_val_loss[1]
        wait = 0
    else:
        if wait >= PATIENCE:
            print('Epoch {}: early stopping'.format(epoch))
            break
        wait += 1


Epoch: 0001 train_loss= 1.9343 train_acc= 0.3571 val_loss= 1.9363 val_acc= 0.3300 time= 1.4581
Epoch: 0002 train_loss= 1.9241 train_acc= 0.2286 val_loss= 1.9287 val_acc= 0.1800 time= 0.0169
Epoch: 0003 train_loss= 1.9132 train_acc= 0.2214 val_loss= 1.9203 val_acc= 0.1733 time= 0.0170
Epoch: 0004 train_loss= 1.9013 train_acc= 0.2429 val_loss= 1.9112 val_acc= 0.1933 time= 0.0170
Epoch: 0005 train_loss= 1.8885 train_acc= 0.2786 val_loss= 1.9015 val_acc= 0.2233 time= 0.0168
Epoch: 0006 train_loss= 1.8756 train_acc= 0.2786 val_loss= 1.8919 val_acc= 0.2167 time= 0.0163
Epoch: 0007 train_loss= 1.8624 train_acc= 0.2857 val_loss= 1.8821 val_acc= 0.2233 time= 0.0167
Epoch: 0008 train_loss= 1.8488 train_acc= 0.2929 val_loss= 1.8718 val_acc= 0.2200 time= 0.0166
Epoch: 0009 train_loss= 1.8352 train_acc= 0.3214 val_loss= 1.8616 val_acc= 0.2333 time= 0.0168
Epoch: 0010 train_loss= 1.8217 train_acc= 0.3286 val_loss= 1.8514 val_acc= 0.2533 time= 0.0171
Epoch: 0011 train_loss= 1.8081 train_acc= 0.3571 v

In [16]:
# Testing
test_loss, test_acc = evaluate_preds(preds, [y_test], [idx_test])
print("Test set results:",
      "loss= {:.4f}".format(test_loss[0]),
      "accuracy= {:.4f}".format(test_acc[0]))

Test set results: loss= 0.9387 accuracy= 0.7690


# RFでXからyを予測する

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
RF = RandomForestClassifier()
RF.fit(X[idx_train], y[idx_train])
pred = RF.predict(X[idx_test])

In [36]:
accuracy_score(y, pred)

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.]])

In [33]:
y_train.shape

(2708, 7)

In [39]:
y == y_train

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ..., 
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ..., False,  True,  True],
       [ True, False,  True, ...,  True,  True,  True]], dtype=bool)