In [1]:
import itertools

from sklearn.model_selection import train_test_split
import sklearn.metrics

import pandas as pd
import numpy as np

import dgl
from dgl.nn import GraphConv

import torch
import torch.nn as nn
import torch.nn.functional as F

import time
from tqdm import tqdm

Using backend: pytorch


In [2]:
!ls dataset2

agency-claim-user.csv	      final_nodes.csv	   news-share-user.csv_sample
agency-claim-user_sample.csv  news_content.csv	   news-share-user_sample.csv
agency_content.csv	      news.csv		   news-share-user_sample.sv
agency.csv		      news_sample.csv	   user-reply-user.csv
agency_sample.csv	      news-share-user.csv  user-reply-user_sample.csv


In [3]:
DATAPATH = 'dataset2/'
node = pd.read_csv(DATAPATH + 'final_nodes.csv')
edge = pd.read_csv(DATAPATH + 'user-reply-user.csv')
edge.drop('news_id',axis=1,inplace=True)

In [4]:
node2 = node.copy()
edge2 = edge.copy()

In [5]:
display(node.head(5))
display(edge.head(5))

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,765,766,767,label,type_0,type_1,type_2,type_3,type_4,type_5
0,1255484783884369921,0.050507,0.084058,-0.045004,0.012769,-0.31719,0.187268,0.13051,-0.259223,-0.160112,...,0.423616,0.589071,0.10133,0,0,0,0,1,0,0
1,1249534887691984897,0.050507,0.084058,-0.045004,0.012769,-0.31719,0.187268,0.13051,-0.259223,-0.160112,...,0.423616,0.589071,0.10133,0,0,0,0,1,0,0
2,1250174164268396544,0.050507,0.084058,-0.045004,0.012769,-0.31719,0.187268,0.13051,-0.259223,-0.160112,...,0.423616,0.589071,0.10133,0,0,0,0,1,0,0
3,1247680031423254529,0.050507,0.084058,-0.045004,0.012769,-0.31719,0.187268,0.13051,-0.259223,-0.160112,...,0.423616,0.589071,0.10133,0,0,0,0,1,0,0
4,1247618373883740162,0.050507,0.084058,-0.045004,0.012769,-0.31719,0.187268,0.13051,-0.259223,-0.160112,...,0.423616,0.589071,0.10133,0,0,0,0,1,0,0


Unnamed: 0,tweet_id,reply_id
0,1255484783884369921,1254913226325872642
1,1255484783884369921,1254985211265159169
2,1255484783884369921,1255484999203258370
3,1249534887691984897,1249535327829712898
4,1249534887691984897,1249536037489184769


## Node schema

In [6]:
nids = node2.pop('target')
node_labels = node2.pop('label')
node_features = node2

### reset index ( library should start index 0)

In [10]:
nids_df = nids.reset_index()
#nids_df.drop('target', axis=1, inplace=True)
func = dict(zip(nids_df.target, nids_df.index))

### Same function at reset index 

In [11]:
edge['tweet_id'] = edge['tweet_id'].apply(lambda x : func[x])
edge['reply_id'] = edge['reply_id'].apply(lambda x : func[x])

In [12]:
graph = dgl.graph((edge.tweet_id, edge.reply_id))
node_features = torch.FloatTensor(node_features.values)
node_labels = torch.LongTensor(node_labels)

In [13]:
print('Graph')
print(graph)
print('Shape of node features:', node_features.shape)
print('Shape of node labels:', node_labels.shape)

num_features = node_features.shape[1]
num_classes = (node_labels.max() + 1).item()
print('Number of classes:', num_classes)

Graph
Graph(num_nodes=143478, num_edges=122593,
      ndata_schemes={}
      edata_schemes={})
Shape of node features: torch.Size([143478, 774])
Shape of node labels: torch.Size([143478])
Number of classes: 2


### feature & label

In [14]:
graph.ndata['feat'] = node_features
graph.ndata['label'] = node_labels

### train test split

In [15]:
n_nodes = nids.shape[0]
train_mask = torch.zeros(n_nodes, dtype=torch.bool)
valid_mask = torch.zeros(n_nodes, dtype=torch.bool)
test_mask = torch.zeros(n_nodes, dtype=torch.bool)

In [16]:
n_train = int(n_nodes * 0.6)
n_val = int(n_nodes * 0.2)

In [17]:
train_mask[:n_train] = True
valid_mask[n_train:n_train + n_val] = True
test_mask[n_train + n_val:] = True

In [18]:
graph.ndata['train_mask'] = train_mask
graph.ndata['valid_mask'] = valid_mask
graph.ndata['test_mask'] = test_mask

In [19]:
in_feats = node_features.shape[1]
n_classes = 2
n_edges = graph.number_of_edges()
print("""----Data statistics------'
  #Edges %d
  #Classes %d
  #Train samples %d
  #Val samples %d
  #Test samples %d""" %
      (n_edges, n_classes,
          train_mask.int().sum().item(),
          valid_mask.int().sum().item(),
          test_mask.int().sum().item()))

----Data statistics------'
  #Edges 122593
  #Classes 2
  #Train samples 86086
  #Val samples 28695
  #Test samples 28697


In [20]:
class GCNLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        
    def forward(self, g, feature):
        with g.local_scope():
            gcn_msg = fn.copy_src(src='h', out='m')
            gcn_reduce = fn.sum(msg='m', out='h')
            g.ndata['h'] = feature
            g.update_all(gcn_msg, gcn_reduce)
            h = g.ndata['h']
            return self.linear(h)

In [21]:
# ----------- 2. create model -------------- #
# build a two-layer GCN model
class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes, dropout_rate):
        super(GCN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.dropout = nn.Dropout(dropout_rate)
        self.conv2 = GraphConv(h_feats, num_classes)
    
    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h
    
# Create the model with given dimensions 
# input layer dimension: 1433, node features
# hidden layer dimension: 16
# output layer dimension: n_classes
model = GCN(in_feats, 16, n_classes, 0.5)

In [22]:

def evaluate(g,model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(g,features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

In [23]:
def evaluate_2(g, model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        predictions = model(g, features)
        predictions = predictions[mask]
        labels = labels[mask]
        _, indices_predictions = torch.max(predictions, dim=1)
        precision = sklearn.metrics.precision_score(labels , indices_predictions, average='micro')
        recall = sklearn.metrics.recall_score(labels , indices_predictions, average='micro')
        f1_score = 2 * (precision * recall) / (precision + recall)
        return f1_score, precision, recall

In [24]:
graph = dgl.add_self_loop(graph)

In [25]:
# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(itertools.chain(model.parameters()), lr=1e-4)
loss_fcn = torch.nn.CrossEntropyLoss()
# ----------- 4. training -------------------------------- #
n_epochs=50
for epoch in range(n_epochs):
        model.train()

        # forward
        
        logits = model(graph , node_features)
        
        loss = loss_fcn(logits[train_mask], node_labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


        acc = evaluate(graph ,model, node_features, node_labels, valid_mask)
        f1_score, precision, recall = evaluate_2(graph , model, node_features, node_labels, valid_mask)
        
        
        if epoch%20==0:
            print("Epoch {:05d} | Loss {:.4f} | Accuracy {:.4f} | "
                  .format(epoch, loss.item(), acc))
            print("Epoch {:05d} | Loss {:.4f} | f1score {:.4f} | precision {:.4f} | recall {:.4f}"
                  .format(epoch, loss.item(), f1_score, precision, recall))
print()

Epoch 00000 | Loss 0.6943 | Accuracy 0.4157 | 
Epoch 00000 | Loss 0.6943 | f1score 0.4157 | precision 0.4157 | recall 0.4157
Epoch 00020 | Loss 0.6071 | Accuracy 0.9036 | 
Epoch 00020 | Loss 0.6071 | f1score 0.9036 | precision 0.9036 | recall 0.9036
Epoch 00040 | Loss 0.5354 | Accuracy 0.9901 | 
Epoch 00040 | Loss 0.5354 | f1score 0.9901 | precision 0.9901 | recall 0.9901



In [26]:
# ----------- 5. check results ------------------------ #
acc = evaluate(graph,model, node_features, node_labels, test_mask)
f1_score, precision, recall = evaluate_2(graph , model, node_features, node_labels, test_mask)
print("Test accuracy {:.2%}, Test f1_score {:.2%}, Test precision {:.2%}, Test recall {:.2%}".format(acc, f1_score, precision, recall))

Test accuracy 99.82%, Test f1_score 99.82%, Test precision 99.82%, Test recall 99.82%
