In [72]:
from model import GCN
from utils import load_data, split, accuracy
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
import time
from sklearn.metrics import confusion_matrix
import warnings; warnings.filterwarnings("ignore")

In [22]:
with open('space.labels', 'rb') as f:
    labels_num = pkl.load(f)

#### Import & preprocess dataset

In [23]:
dataset = 'space'
names = ['graph.jac', 'feature', 'labels']

In [24]:
adj, feature, labels = load_data(dataset, names)

#### Setting

In [25]:
epochs = 200
learning_rate = 0.01
weight_decay = 5e-4
num_hidden = 64
dropout = 0.5

#### Split 

In [62]:
train_ratio = 0.7
val_ratio = 0 # len_val = len_train * val_ratio

In [63]:
len_train= int(len(feature)*train_ratio);             print('Train: {} %'.format(np.round((len_train/len(feature))*100, 2)))
len_val= int(len_train*val_ratio);                    print('Val: {} %'.format(np.round((len_val/len(feature))*100, 2)))
len_test= len(feature)-(len_train+len_val);           print('Test: {} %'.format(np.round((len_test/len(feature))*100, 2)))
idx_train, idx_val, idx_test = split(len(feature), len_train, len_val, len_test)
print('num_train: {} num_val: {} num_test: {}'.format(len(idx_train), len(idx_val), len(idx_test)))

Train: 69.64 %
Val: 0.0 %
Test: 30.36 %
num_train: 172 num_val: 0 num_test: 75


#### Define GCN model

In [64]:
model = GCN( nfeat=feature.shape[1],
             nhid=num_hidden,
             nclass=labels.max().item() + 1,
             dropout=dropout
            )
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

#### Define train & test process

In [65]:
train_accs= []; train_losses= []; val_accs= []; val_losses= []
def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(feature, adj)
    train_loss = F.nll_loss(output[idx_train], labels[idx_train])
    train_acc = accuracy(output[idx_train], labels[idx_train])
    train_loss.backward()
    optimizer.step()
    
    model.eval()
    output = model(feature, adj)
    
    val_loss = F.nll_loss(output[idx_val], labels[idx_val])
    val_acc = accuracy(output[idx_val], labels[idx_val])
    
    train_accs.append(train_acc.item()); train_losses.append(train_loss.item())
    val_accs.append(val_acc.item()); val_losses.append(val_loss.item())
    
    if epoch % 50 == 0 or epoch == 199:
        print('Epoch: {:04d}'.format(epoch+1),
              'train_loss: {:.4f}'.format(train_loss.item()),
              'train_acc: {:.4f}'.format(train_acc.item()),
              'val_loss: {:.4f}'.format(val_loss.item()),
              'val_acc: {:.4f}'.format(val_acc.item()),
              'time: {:.4f}s'.format(time.time() - t))

In [66]:
def test():
    model.eval()
    output = model(feature, adj)
    loss_test = F.nll_loss(output[idx_test], labels[idx_test])
    acc_test = accuracy(output[idx_test], labels[idx_test])
    print('Test accuracy: {:.4f}'.format(np.round(acc_test.item(), 4)), '\nTest loss: {:.4f}'.format(np.round(loss_test.item(), 4)))
    preds = output[idx_test].max(1)[1].type_as(labels)
    return preds, output[idx_test]

#### Execution

In [67]:
t_total = time.time()
for epoch in range(epochs):
    train(epoch)
print("Optimization Finished!")
print("Total time elapsed: {:.4f}s\n".format(time.time() - t_total))
preds, output = test()

Epoch: 0001 train_loss: 2.3683 train_acc: 0.0174 val_loss: nan val_acc: nan time: 0.0090s
Epoch: 0051 train_loss: 0.3495 train_acc: 0.9244 val_loss: nan val_acc: nan time: 0.0090s
Epoch: 0101 train_loss: 0.2373 train_acc: 0.9244 val_loss: nan val_acc: nan time: 0.0066s
Epoch: 0151 train_loss: 0.2002 train_acc: 0.9302 val_loss: nan val_acc: nan time: 0.0040s
Epoch: 0200 train_loss: 0.2197 train_acc: 0.9128 val_loss: nan val_acc: nan time: 0.0040s
Optimization Finished!
Total time elapsed: 1.1088s

Test accuracy: 0.9200 
Test loss: 0.5661


#### Get softmax value

In [68]:
softmax = pd.DataFrame(output.detach().numpy(), index=idx_test)
softmax['Pred_labels'] = preds
softmax['True_labels'] = labels[idx_test]
print('\nTest set num: {}\n'.format(len(idx_test)))
softmax.iloc[:10]


Test set num: 75



Unnamed: 0,0,1,2,3,4,5,6,7,8,Pred_labels,True_labels
187,-10.997313,-6.35808,-4.758696,-5.678112,-8.792611,-10.929997,-0.019912,-8.28654,-5.194512,6,6
232,-10.997313,-6.35808,-4.758696,-5.678112,-8.792611,-10.929997,-0.019912,-8.28654,-5.194512,6,6
53,-0.011254,-10.380276,-14.413383,-11.06674,-8.173734,-15.186069,-23.705299,-4.525486,-10.355816,0,0
126,-0.035224,-19.599194,-19.58713,-9.888286,-9.49565,-13.978822,-32.60733,-3.367578,-11.433852,0,0
116,-2.355193,-23.528435,-25.175152,-11.137051,-0.099703,-15.955678,-44.175915,-14.080028,-12.743876,4,4
77,-0.035224,-19.599194,-19.58713,-9.888286,-9.49565,-13.978822,-32.60733,-3.367578,-11.433852,0,0
218,-10.997313,-6.35808,-4.758696,-5.678112,-8.792611,-10.929997,-0.019912,-8.28654,-5.194512,6,6
215,-15.088444,-0.111076,-6.870939,-10.481592,-10.093784,-11.333372,-2.340819,-7.748065,-4.915768,1,1
154,-0.02184,-17.807791,-17.853971,-8.652423,-7.114151,-12.385812,-30.288214,-3.883855,-10.135798,0,0
60,-18.641581,-16.631819,-18.309448,-8.059993,-7.816788,-0.00136,-21.835138,-12.211949,-7.361725,5,5


In [None]:
# print(metrics.classification_report(preds, labels[idx_test]))

In [78]:
report = metrics.classification_report(preds, labels[idx_test], output_dict=True)
report_df = pd.DataFrame(report).transpose(); report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.961538,0.925926,0.943396,27.0
1,1.0,0.833333,0.909091,12.0
2,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,6.0
4,0.75,1.0,0.857143,3.0
5,1.0,1.0,1.0,5.0
6,1.0,1.0,1.0,15.0
7,0.666667,1.0,0.8,2.0
8,1.0,0.6,0.75,5.0
accuracy,0.92,0.92,0.92,0.92


- save to csv

In [None]:
# softmax.to_csv('sample_name.csv', index=False)
# report_df.to_csv('sample_name_2.csv', index=False)

### Confusion matrix

In [12]:
y_pred = preds
y_test = np.array(labels[idx_test])
labels_ = set(np.array(labels_num))

In [13]:
cm= pd.DataFrame(confusion_matrix(y_test, y_pred), index=labels_, columns=labels_)
cmv= cm.values
acc= [np.round((cmv[i][i])/(np.sum(cmv[i])), 2) for i in range(len(cm))]
cm['acc by class']= acc
cm

Unnamed: 0,33606,33416,33227,33101,33105,33234,33109,33400,33114,acc by class
33606,17,0,0,0,0,0,0,0,0,1.0
33416,0,8,0,0,0,0,0,0,0,1.0
33227,0,1,0,0,0,0,0,0,2,0.0
33101,0,0,0,5,0,0,0,0,0,1.0
33105,0,0,0,0,1,0,0,0,0,1.0
33234,0,0,0,0,0,4,0,0,0,1.0
33109,0,1,0,0,0,0,7,0,0,0.88
33400,0,0,0,0,0,0,0,2,0,1.0
33114,0,0,0,0,0,0,0,0,2,1.0


In [13]:
# elements = {'train_accs': train_accs, 'train_losses':train_losses, 'val_accs':val_accs, 'val_losses':val_losses}
# for k, v in elements.items():
#     with open('{}.pkl'.format(k), 'wb') as f:
#         pkl.dump(v, f)