<a href="https://colab.research.google.com/github/svlataki/DataChallenge/blob/main/gnn_transformer_sentences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import csv
import re
import zipfile
from io import BytesIO
import networkx as nx
import numpy as np
import scipy.sparse as sp

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import pickle

with open('/content/gdrive/My Drive/embedding_features.pkl', 'rb') as fp:
    features = np.asarray(pickle.load(fp))

In [4]:
# Read training data
train_domains = list()
y_train = list()
with open("/content/gdrive/My Drive/train.txt", 'r') as f:
    for line in f:
        l = line.split(',')
        train_domains.append(l[0])
        y_train.append(int(l[1][:-1]))

# Read test data
test_domains = list()
with open("/content/gdrive/My Drive/test.txt", 'r') as f:
    for line in f:
        l = line.split(',')
        test_domains.append(l[0])

# Create a directed graph
G = nx.read_edgelist('/content/gdrive/My Drive/edgelist.txt', delimiter=' ', create_using=nx.DiGraph())
node_to_idx = dict()
for i, node in enumerate(G.nodes()):
    node_to_idx[node] = i

print('Number of nodes:', G.number_of_nodes())
print('Number of edges:', G.number_of_edges())

# Read textual content of webpages of domain names
text = dict()
with zipfile.ZipFile('/content/gdrive/My Drive/domains.zip', "r") as zfile:
    for filename in zfile.namelist():
        if re.search(r'\.zip$', filename) is not None:
            zfiledata = BytesIO(zfile.read(filename))
            with zipfile.ZipFile(zfiledata) as zfile2:
                text[filename[:-4]] = ''
                for name2 in zfile2.namelist():
                    file = zfile2.read(name2)
                    text[filename[:-4]] += file.decode('utf16') + ' '


idx = [node_to_idx[domain] for domain in text]

# To reduce memory 
text = None

Number of nodes: 65208
Number of edges: 1642073


In [5]:
from sklearn.model_selection import train_test_split

# Yields indices to split data into training and test sets
idx_train = [node_to_idx[node] for node in train_domains]
idx_test = [node_to_idx[node] for node in test_domains]

# Split training set into training and validation sets
idx_train, idx_val, y_train, y_val = train_test_split(idx_train, y_train, test_size=0.2)

In [6]:
def normalize_adjacency(A):
    """Normalizes adjacency matrix"""
    
    n = A.shape[0]
    A = A + sp.identity(n)
    indegs = A.dot(np.ones(n))
    inv_indegs = np.power(indegs, -1)
    D = sp.diags(inv_indegs)
    A_normalized = D.dot(A)

    return A_normalized

def sparse_to_torch_sparse(M):
    """Converts a sparse SciPy matrix to a sparse PyTorch tensor"""
    M = M.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((M.row, M.col)).astype(np.int64))
    values = torch.from_numpy(M.data)
    shape = torch.Size(M.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

In [7]:
epochs = 125
n_hidden = 100
lr = 0.01
dropout_rate = 0.5
n_class = 10

# Read data
n = G.number_of_nodes() # Number of nodes
adj = nx.adjacency_matrix(G).T
adj = normalize_adjacency(adj)

# Initialize the features of the nodes as random vectors of dimension 8
updated_features = np.zeros((n, 768))
for i in range(features.shape[0]):
    updated_features[idx[i],:] = features[i,:]

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

def sparse_to_torch_sparse(M):
    """Converts a sparse SciPy matrix to a sparse PyTorch tensor"""
    M = M.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((M.row, M.col)).astype(np.int64))
    values = torch.from_numpy(M.data)
    shape = torch.Size(M.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

class GNN(nn.Module):
    """Simple GNN model"""
    def __init__(self, n_feat, hidden_dim, n_class, dropout):
        super(GNN, self).__init__()

        self.fc1 = nn.Linear(n_feat, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x_in, adj):
        # Implement the graph neural network
        # Add 2 message passing layers followed by a fully connected layer

        h1 = self.fc1(x_in)
        h1 = self.relu(torch.mm(adj, h1))
        h1 = self.dropout(h1)

        h2 = self.fc2(h1)
        h2  = self.relu(torch.mm(adj, h2))
        h2 = self.dropout(h2)

        x = self.fc3(h2)

        return F.log_softmax(x, dim=1)

In [9]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Transform the numpy matrices/vectors to torch tensors
y_train = torch.LongTensor(y_train).to(device)
y_val = torch.LongTensor(y_val).to(device)
adj = sparse_to_torch_sparse(adj).to(device)
idx_train = torch.LongTensor(idx_train).to(device)
idx_val = torch.LongTensor(idx_val).to(device)
idx_test = torch.LongTensor(idx_test).to(device)
# Creates the model and specifies the optimizer
model = GNN(features.shape[1], n_hidden, n_class, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_function = nn.CrossEntropyLoss()

In [10]:
updated_features = torch.FloatTensor(updated_features).to(device)


In [11]:

# Train the model
best_val_loss = 100
for epoch in range(1, epochs+1):
    model.train()    
    optimizer.zero_grad()
    output = model(updated_features, adj)
    loss = loss_function(output[idx_train], y_train)
    loss.backward()
    optimizer.step()

    model.eval()
    loss_val = 0
    output = model(updated_features, adj)
    loss_val = loss_function(output[idx_val], y_val)

    # Remember best validation loss and save checkpoint
    is_best = loss_val <= best_val_loss
    best_val_loss = min(loss_val, best_val_loss)
    if is_best:
        torch.save({
            'state_dict': model.state_dict(),
            'optimizer' : optimizer.state_dict(),
        }, 'model_best.pth.tar')


    if epoch%10==0:
        print('Epoch: {:03d}, Train Loss: {:.7f}, Val Loss: {:.7f}'.format(epoch, loss, loss_val))


Epoch: 010, Train Loss: 1.6808897, Val Loss: 1.4924887
Epoch: 020, Train Loss: 1.4097511, Val Loss: 1.3100301
Epoch: 030, Train Loss: 1.2497356, Val Loss: 1.1991882
Epoch: 040, Train Loss: 1.1665410, Val Loss: 1.1812040
Epoch: 050, Train Loss: 1.1073515, Val Loss: 1.1158328
Epoch: 060, Train Loss: 1.0578264, Val Loss: 1.0705806
Epoch: 070, Train Loss: 0.9851183, Val Loss: 1.0286365
Epoch: 080, Train Loss: 0.9434975, Val Loss: 0.9948658
Epoch: 090, Train Loss: 0.8974210, Val Loss: 0.9464062
Epoch: 100, Train Loss: 0.8449539, Val Loss: 0.9348775
Epoch: 110, Train Loss: 0.7963455, Val Loss: 0.9124903
Epoch: 120, Train Loss: 0.7674673, Val Loss: 0.8799836


In [13]:
y_pred_argmax = torch.exp(output[idx_val]).detach().cpu().numpy().argmax(axis=1)
from sklearn.metrics import classification_report

print(classification_report(y_val.detach().cpu().numpy(), y_pred_argmax))

              precision    recall  f1-score   support

           0       0.77      0.86      0.81        84
           1       0.84      0.78      0.81        27
           2       0.61      0.47      0.53        36
           3       0.76      0.83      0.79        60
           4       0.83      0.62      0.71         8
           5       0.62      0.83      0.71         6
           6       0.35      0.43      0.39        14
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         5
           9       0.67      0.75      0.71         8

    accuracy                           0.72       252
   macro avg       0.55      0.56      0.55       252
weighted avg       0.70      0.72      0.71       252



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
y_pred = torch.exp(output[idx_test]).detach().cpu().numpy()

In [15]:
# Write predictions to a file
with open('sample_submission_bert_sentences.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = list()
    for i in range(10):
        lst.append('class_'+str(i))
    lst.insert(0, "domain_name")
    writer.writerow(lst)
    for i,test_host in enumerate(test_domains):
        lst = y_pred[i,:].tolist()
        lst.insert(0, test_host)
        writer.writerow(lst)