# In this notebook we will benefit from the new feature embeddings to train a GCN model

In [1]:
from IPython.display import clear_output


# !pip install tensorflow
# !pip install  spacy
# !pip install tqdm
# !pip install plotly
!pip install jupyter-black
!pip install imblearn
!pip install joblib --upgrade

clear_output()

## Data preprocessing

### Import libraries

In [1]:
import csv
import re
import numpy as np
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm

tqdm.pandas()

In [2]:
%load_ext jupyter_black

In [3]:
np.set_printoptions(precision=4)

### Important functions 

In [4]:
def softmax(x, axis=None):
    """
    Numerically stable softmax.

    Parameters
    ----------
    x : array_like
        Input scores.
    axis : int or None, optional
        Axis along which the outputs are normalised. ``None`` (the default)
        normalises over all entries of ``x`` at once.

    Returns
    -------
    numpy.ndarray
        Non-negative values with the shape of ``x`` that sum to 1 along
        ``axis``.
    """
    x = np.asarray(x, dtype=float)
    # Subtracting the maximum keeps np.exp from overflowing; the shift cancels
    # out only if the SAME shifted exponentials are used in the denominator.
    y = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return y / np.sum(y, axis=axis, keepdims=True)


softmax_vect = np.vectorize(softmax)

In [5]:
# weighted multi-class log loss
from sklearn.metrics import log_loss


def weighted_mc_log_loss(y_true, y_pred, y_pred_proba):
    """
    Print the multi-class log loss and the accuracy (in percent).

    Parameters
    ----------
    y_true : array_like
        Ground-truth class labels.
    y_pred : array_like
        Predicted class labels, compared element-wise with ``y_true``.
    y_pred_proba : array_like
        Predicted class probabilities, forwarded to ``log_loss``.

    Returns
    -------
    None
        The two metrics are only printed.

    Notes
    -----
    No sample or class weights are passed to ``log_loss`` despite the name.
    """
    # Coerce to arrays so that plain Python lists compare element-wise too.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    loss = log_loss(y_true, y_pred_proba, labels=np.unique(y_true))
    accuracy = round((y_true == y_pred).sum() / len(y_true) * 100, 2)
    # `{name = }` already prints the variable name, so no extra label is needed.
    print(f"{loss = } and {accuracy = }")

In [6]:
def submit(y_pred_proba, name=""):
    """
    Write class-probability predictions to a submission CSV file.

    The file is ``../Submissions/<name>Graph_gcn.csv``. Its header is
    ``name,class0,class1,...`` and each following row holds one entry of the
    module-level ``proteins_test`` list followed by the matching row of
    ``y_pred_proba``.

    Parameters
    ----------
    y_pred_proba : numpy.ndarray
        2-D array indexed as ``[sample, class]``; row ``i`` belongs to
        ``proteins_test[i]``.
    name : str, optional
        Prefix prepended to the output file name.
    """
    # One header column per probability column instead of a hard-coded count.
    n_classes = y_pred_proba.shape[1]
    header = ["name"] + ["class" + str(k) for k in range(n_classes)]

    # newline="" is what the csv module expects; without it every row is
    # followed by a blank line on platforms that translate "\n".
    with open(
        "../Submissions/" + name + "Graph_gcn.csv", "w", newline=""
    ) as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(header)
        for i, protein in enumerate(proteins_test):
            writer.writerow([protein] + y_pred_proba[i, :].tolist())

# using structure

In [7]:
import torch

# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [8]:
import time
import scipy.sparse as sp

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

path = "./data/"


def load_data():
    """
    Load all graphs from the text files under the module-level ``path``.

    Files read: ``graph_indicator.txt``, ``edgelist.txt``,
    ``node_attributes.txt`` and ``edge_attributes.txt``.

    Returns
    -------
    adj : list of scipy.sparse matrices
        One square adjacency block per graph, sliced from the global matrix.
    features : list of numpy.ndarray
        Rows of the node-attribute table belonging to each graph.
    edge_features : list of numpy.ndarray
        Rows of the edge-attribute table belonging to each graph.

    Notes
    -----
    The per-graph slicing assumes that the nodes of a graph occupy a
    contiguous index range, that graphs appear in ascending order of their id
    and that no edge links two different graphs — TODO confirm against the
    data files.
    """
    # One entry per node; graph_size holds the node count of each distinct graph id.
    graph_indicator = np.loadtxt(path + "graph_indicator.txt", dtype=np.int64)
    _, graph_size = np.unique(graph_indicator, return_counts=True)

    # Append the reversed copy of every edge so both directions are present.
    edges = np.loadtxt(path + "edgelist.txt", dtype=np.int64, delimiter=",")
    edges_inv = np.vstack((edges[:, 1], edges[:, 0]))
    edges = np.vstack((edges, edges_inv.T))
    # Scalar sort key built from (source, target); presumably node ids are
    # smaller than the node count, which makes this a lexicographic order.
    s = edges[:, 0] * graph_indicator.size + edges[:, 1]
    idx_sort = np.argsort(s)
    edges = edges[idx_sort, :]
    # Drop duplicate rows; idx_unique indexes into the already sorted edge array.
    edges, idx_unique = np.unique(edges, axis=0, return_index=True)
    # Global adjacency matrix over all nodes, with a 1 for every remaining edge.
    A = sp.csr_matrix(
        (np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
        shape=(graph_indicator.size, graph_indicator.size),
    )

    x = np.loadtxt(path + "node_attributes.txt", delimiter=",")
    edge_attr = np.loadtxt(path + "edge_attributes.txt", delimiter=",")
    # Duplicate the attributes for the reversed edges, then apply exactly the
    # same reordering and de-duplication that was applied to `edges`.
    edge_attr = np.vstack((edge_attr, edge_attr))
    edge_attr = edge_attr[idx_sort, :]
    edge_attr = edge_attr[idx_unique, :]

    adj = []
    features = []
    edge_features = []
    idx_n = 0  # running offset into the node axis
    idx_m = 0  # running offset into the edge-attribute rows
    for i in range(graph_size.size):
        # Diagonal block of A that covers the nodes of graph i.
        adj.append(A[idx_n : idx_n + graph_size[i], idx_n : idx_n + graph_size[i]])
        # The block's stored-entry count tells how many attribute rows belong to it.
        edge_features.append(edge_attr[idx_m : idx_m + adj[i].nnz, :])
        features.append(x[idx_n : idx_n + graph_size[i], :])
        idx_n += graph_size[i]
        idx_m += adj[i].nnz

    return adj, features, edge_features

In [9]:
def normalize_adjacency(A):
    """
    Function that normalizes an adjacency matrix

    Self-loops are added and every row is divided by its sum, i.e. the result
    is ``D^-1 (A + I)`` where ``D`` is the diagonal matrix of row sums of
    ``A + I``.

    Parameters
    ----------
    A : scipy.sparse matrix
        Square adjacency matrix. It is left untouched.

    Returns
    -------
    scipy.sparse matrix
        The row-normalized matrix.
    """
    n = A.shape[0]
    # Build a new matrix rather than using `A += ...`: the in-place form can
    # modify the caller's matrix for formats that implement in-place addition.
    A_tilde = A + sp.identity(n)
    degs = A_tilde.dot(np.ones(n))
    inv_degs = np.power(degs, -1)
    D = sp.diags(inv_degs)
    A_normalized = D.dot(A_tilde)

    return A_normalized


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """
    Function that converts a Scipy sparse matrix to a sparse Torch tensor

    Parameters
    ----------
    sparse_mx : scipy.sparse matrix
        Matrix in any SciPy sparse format; it is converted to COO first.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor of dtype float32 with the same shape and entries.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    # Stack row and column indices into the (2, nnz) int64 layout torch expects.
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)
    )
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse.FloatTensor is a deprecated legacy constructor;
    # sparse_coo_tensor is its supported replacement.
    return torch.sparse_coo_tensor(indices, values, shape)

In [10]:
# Read every graph (adjacency, node features, edge features) from disk.
adj, features, edge_features = load_data()

# Adjacency normalization is currently switched off.
# adj = [normalize_adjacency(A) for A in adj]

# Empty containers for the labelled (train) and unlabelled (test) graphs.
adj_train, features_train, edge_features_train, y_train = [], [], [], []
adj_test, features_test, edge_features_test, proteins_test = [], [], [], []

In [11]:
# Route each graph to the test split (empty label) or the training split.
with open(path + "graph_labels.txt", "r") as f:
    for i, line in enumerate(f):
        t = line.split(",")
        # strip() instead of [:-1]: the slice assumed every line ends with
        # exactly one "\n", so it chopped the last digit of the label on a
        # final line without a newline and left a "\r" on CRLF files.
        label = t[1].strip()
        if len(label) == 0:
            proteins_test.append(t[0])
            adj_test.append(adj[i])
            features_test.append(features[i])
            edge_features_test.append(edge_features[i])
        else:
            adj_train.append(adj[i])
            features_train.append(features[i])
            edge_features_train.append(edge_features[i])
            y_train.append(int(label))

In [12]:
import gc

# Drop the combined edge-feature list now that its entries have been copied
# into the train/test lists, then force a full garbage collection.
# NOTE(review): the arrays themselves stay alive through those lists.
del edge_features
gc.collect(generation=2)

6419

In [13]:
import joblib as joblib

# NOTE(review): the training-feature load is commented out, so `features_train`
# keeps whatever the label-parsing cell stored in it — confirm this is intended.
# features_train = joblib.load("new_features_3B_params.sav")
# NOTE(security): joblib.load unpickles the file and can run arbitrary code;
# only load files that come from a trusted source.
features_test = joblib.load("new_features_test_3B_params.sav")
# Column count of the first training feature matrix.
n_input = features_train[0].shape[1]

In [14]:
# from torch_geometric.data import Data

# for i in range(len(features_train)):
#     features_train[i] = Data(
#         x=torch.tensor(features_train[i]).float(),
#         edge_index=torch.tensor(adj_train[i].todense()).nonzero().t().contiguous(),
#         y=y_train[i],
#     )

In [14]:
from torch_geometric.data import Data

# Replace each raw test feature matrix, in place, by a PyG `Data` graph whose
# edge_index lists the non-zero positions of the matching adjacency matrix.
for idx, node_feats in enumerate(features_test):
    node_tensor = torch.tensor(node_feats).float()
    dense_adj = torch.tensor(adj_test[idx].todense())
    features_test[idx] = Data(
        x=node_tensor,
        edge_index=dense_adj.nonzero().t().contiguous(),
    )

In [18]:
features_train[0]

Data(x=[185, 2646], edge_index=[2, 3813], y=8)

We have 726 vector embeddings.

In [23]:
# from torch_geometric.data import Data


# def get_Data(adj_train, features_train, edge_features_train, y_train):
#     data = []
#     for i in range(len(features_train)):
#         adj_t = torch.tensor(adj_train[i].todense())
#         edge_index = adj_t.nonzero().t().contiguous()
#         x = torch.tensor(features_train[i]).float()
#         edge_attr = torch.tensor(edge_features_train[i]).float()

#         data.append(
#             Data(
#                 x=x,
#                 edge_index=edge_index,
#                 # edge_attr=edge_attr,
#                 y=y_train[i],
#             )
#         )
#     return data


# features_train = get_Data(adj_train, features_train, edge_features_train, y_train)

In [24]:
# def get_Data_pred(adj_train, features_train, edge_features_train):
#     data = []
#     for i in range(len(features_train)):
#         adj_t = torch.tensor(adj_train[i].todense())
#         edge_index = adj_t.nonzero().t().contiguous()
#         x = torch.tensor(features_train[i]).float()
#         edge_attr = torch.tensor(edge_features_train[i]).float()

#         data.append(
#             Data(
#                 x=x,
#                 edge_index=edge_index,
#                 # edge_attr=edge_attr
#             )
#         )
#     return data


# features_test = get_Data_pred(adj_test, features_test, edge_features_test)

In [16]:
# Keep the first 90% of the labelled graphs for training and hold out the rest.
split_idx = int(len(features_train) * 9 / 10)
train_dataset = features_train[:split_idx]
test_dataset = features_train[split_idx:]

print(f"Number of training graphs: {len(train_dataset)}")
print(f"Number of test graphs: {len(test_dataset)}")

Number of training graphs: 4399
Number of test graphs: 489


In [15]:
from torch_geometric.loader import DataLoader, NeighborLoader

# Mini-batch loaders. `train_loader` and `test_loader` are used by the batch
# preview and by the training loop further down, so they must be defined here
# for the notebook to run on a fresh kernel. Only the training loader shuffles;
# the other two keep the input order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)
pred_loader = DataLoader(features_test, batch_size=4, shuffle=False)

In [18]:
# show 3 first batches
for step, data in enumerate(train_loader):
    if step >= 3:
        # Stop here instead of loading and collating the rest of the dataset.
        break
    print(f"Step {step + 1}:")
    print("=======")
    print(f"Number of graphs in the current batch: {data.num_graphs}")
    print(data)
    print()

Step 1:
Number of graphs in the current batch: 4
DataBatch(x=[1024, 2646], edge_index=[2, 19210], y=[4], batch=[1024], ptr=[5])

Step 2:
Number of graphs in the current batch: 4
DataBatch(x=[495, 2646], edge_index=[2, 8561], y=[4], batch=[495], ptr=[5])

Step 3:
Number of graphs in the current batch: 4
DataBatch(x=[771, 2646], edge_index=[2, 14257], y=[4], batch=[771], ptr=[5])



# Train

In [18]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.nn import GCNConv, GATConv, GINConv, global_add_pool, SAGEConv
from torch_geometric.nn import global_mean_pool

num_classes = 18


class GNN(torch.nn.Module):
    """
    Graph-level classifier: graph convolutions, mean pooling over the nodes of
    each graph, then a linear decoder that outputs one logit per class.

    Parameters
    ----------
    model_name : {"GCN", "GAT", "GraphSAGE"}
        Convolution family. "GCN" builds two layers, the others build three.
    hidden_channels : sequence of three ints
        Layer widths. The GCN variant uses only ``hidden_channels[1]`` and
        ``hidden_channels[2]``. The decoder input width is always
        ``hidden_channels[2]``.
    num_heads : sequence of ints
        Attention heads of the first two GAT layers (GAT only).
    dropout : float
        Dropout probability applied before every convolution.
    n_classes : int
        Number of output classes.
    input_dim : int
        Width of the node feature vectors.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """

    def __init__(
        self,
        model_name="GCN",
        hidden_channels=(64, 64, 32),
        num_heads=(1, 1, 1),
        dropout=0.02,
        n_classes=num_classes,
        input_dim=2646,
    ):
        super().__init__()
        self.n_classes = n_classes
        self.dropout = dropout
        self.hidden_dim = hidden_channels[2]
        self.model_name = model_name

        if model_name == "GCN":
            # Two-layer variant: hidden_channels[0] is intentionally unused.
            self.layer1 = GCNConv(
                in_channels=input_dim, out_channels=hidden_channels[1]
            )
            self.layer2 = GCNConv(
                in_channels=hidden_channels[1], out_channels=hidden_channels[2]
            )

        elif model_name == "GAT":
            self.layer1 = GATConv(
                in_channels=input_dim,
                out_channels=hidden_channels[0],
                heads=num_heads[0],
                edge_dim=5,
            )
            self.layer2 = GATConv(
                hidden_channels[0] * num_heads[0],
                hidden_channels[1],
                heads=num_heads[1],
                edge_dim=5,
            )
            self.layer3 = GATConv(
                hidden_channels[1] * num_heads[1],
                hidden_channels[2],
                heads=1,
                edge_dim=5,
                concat=False,
            )

        elif model_name == "GraphSAGE":
            self.layer1 = SAGEConv(input_dim, hidden_channels[0], aggr="lstm")
            self.layer2 = SAGEConv(hidden_channels[0], hidden_channels[1], aggr="lstm")
            self.layer3 = SAGEConv(hidden_channels[1], hidden_channels[2], aggr="lstm")

        else:
            # Fail at construction time rather than with an AttributeError on
            # the first forward pass.
            raise ValueError(
                f"Unknown model_name {model_name!r}; "
                "expected 'GCN', 'GAT' or 'GraphSAGE'"
            )

        self.decoder = nn.Linear(hidden_channels[2], n_classes)

    def forward(self, x, edge_index, edge_attr=None, batch=None):
        """
        Compute the class logits of every graph in the batch.

        Parameters
        ----------
        x : torch.Tensor
            Node features.
        edge_index : torch.Tensor
            Edge list passed to every convolution.
        edge_attr : torch.Tensor or None, optional
            Edge features; only forwarded to the layers of the GAT variant.
        batch : torch.Tensor or None, optional
            Node-to-graph assignment handed to ``global_mean_pool``.

        Returns
        -------
        torch.Tensor
            Raw (un-normalised) logits, one row per graph.
        """
        if self.model_name == "GAT":
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.layer1(x, edge_index, edge_attr)
            x = F.elu(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.layer2(x, edge_index, edge_attr)
            x = F.elu(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.layer3(x, edge_index, edge_attr)
        else:
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.layer1(x, edge_index)
            x = F.elu(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.layer2(x, edge_index)

            # GraphSAGE defines a third layer that maps to the decoder's input
            # width; skipping it would feed hidden_channels[1] features into a
            # decoder built for hidden_channels[2]. GCN has no layer3.
            if hasattr(self, "layer3"):
                x = F.elu(x)
                x = F.dropout(x, p=self.dropout, training=self.training)
                x = self.layer3(x, edge_index)

        x = F.elu(x)
        x = global_mean_pool(x, batch)
        x = self.decoder(x)
        return x


# Build the network with its default settings, move it to the selected device
# and print its layers.
model = GNN()
model = model.to(device)
print(model)

GNN(
  (layer1): GCNConv(2646, 64)
  (layer2): GCNConv(64, 32)
  (decoder): Linear(in_features=32, out_features=18, bias=True)
)


In [20]:
# model = GCN(hidden_channels=64)

# Adam optimiser over all model parameters and cross-entropy on the model outputs.
optimizer = optim.Adam(model.parameters(), lr=0.0001)  # weight_decay=1e-4
criterion = nn.CrossEntropyLoss().to(device)


def train():
    """
    Run one optimisation epoch over the module-level ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``device``.

    Returns
    -------
    tuple
        ``(accuracy, mean_loss)``: the accuracy in percent over
        ``train_loader.dataset`` and the mean per-batch loss as a detached
        0-dim tensor.
    """
    loss_train = 0
    correct = 0
    n_batches = 0
    model.train()

    for data in train_loader:  # Iterate in batches over the training dataset.
        data = data.to(device)
        # Clear gradients first so leftovers from an interrupted run cannot
        # leak into this step.
        optimizer.zero_grad()
        # Pass the batch's edge attributes when it has any (None otherwise)
        # instead of IPython's `_`, which is just the last cell output.
        out = model(
            data.x, data.edge_index, getattr(data, "edge_attr", None), data.batch
        )  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.

        pred = out.argmax(dim=1)  # Use the class with highest probability.
        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
        # Reuse the loss already computed; detach so the autograd graph of
        # every batch is not kept alive for the whole epoch.
        loss_train += loss.detach()
        n_batches += 1

    # max(..., 1) keeps the division defined for an empty loader.
    return correct / len(train_loader.dataset) * 100, loss_train / max(n_batches, 1)


def test(loader):
    """
    Evaluate the module-level ``model`` on ``loader`` without updating it.

    Parameters
    ----------
    loader : iterable of batches
        Batches exposing ``x``, ``edge_index``, ``batch`` and ``y``; it must
        also expose ``dataset`` for the accuracy denominator.

    Returns
    -------
    tuple
        ``(accuracy, mean_loss)``: the accuracy in percent over
        ``loader.dataset`` and the mean per-batch loss as a 0-dim tensor.
    """
    model.eval()
    # Start from a tensor so the returned loss is a tensor even for an empty loader.
    loss = torch.zeros((), device=device)
    correct = 0
    n_batches = 0
    # Evaluation needs no gradients; skipping them saves memory and time.
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the training/test dataset.
            data = data.to(device)
            # Pass the batch's edge attributes when it has any (None otherwise)
            # instead of IPython's `_`, which is just the last cell output.
            out = model(
                data.x, data.edge_index, getattr(data, "edge_attr", None), data.batch
            )
            pred = out.argmax(dim=1)  # Use the class with highest probability.
            correct += int((pred == data.y).sum())  # Check against ground-truth labels.
            loss += criterion(out, data.y)
            n_batches += 1

    # Accuracy in percent and mean per-batch loss.
    return correct / len(loader.dataset) * 100, loss / max(n_batches, 1)


# Train for up to 400 epochs and checkpoint the weights whenever the loss on
# the held-out loader improves.
min_val_loss = float("inf")
for epoch in range(1, 401):
    train_acc, train_loss = train()
    # test_acc, test_loss = test(test_loader)

    # Plain Python float: easy to format and to compare across epochs.
    test_loss = float(test(test_loader)[1])
    print(
        "Epoch: {:03d}".format(epoch),
        "loss_train: {:.4f}".format(train_loss),
        "acc_train: {:.4f}".format(train_acc),
        "loss_val: {:.4f}".format(test_loss),
        # "acc_val: {:.4f}".format(test_acc),
    )
    if min_val_loss > test_loss:
        print(
            f"Validation Loss Decreased({min_val_loss:.6f}--->{test_loss:.6f}) \t Saving The Model"
        )
        min_val_loss = test_loss
        # Saving State Dict. The file name must match the one passed to
        # torch.load in the `model.load_state_dict(...)` cell; it used to be
        # spelled "..._pars_GAT.pth", so the best checkpoint was never loaded.
        torch.save(model.state_dict(), "saved_model_150m_params_GAT.pth")
        print("\n")
    else:
        print("Validation loss increased :(")

Epoch: 001 loss_train: 2.3409 acc_train: 32.4392 loss_val: 2.1596
Validation Loss Decreased(inf--->2.159626) 	 Saving The Model


Epoch: 002 loss_train: 1.8861 acc_train: 49.8522 loss_val: 1.8533
Validation Loss Decreased(2.159626--->1.853253) 	 Saving The Model


Epoch: 003 loss_train: 1.6268 acc_train: 57.0812 loss_val: 1.6481
Validation Loss Decreased(1.853253--->1.648117) 	 Saving The Model


Epoch: 004 loss_train: 1.4380 acc_train: 61.4912 loss_val: 1.4490
Validation Loss Decreased(1.648117--->1.449011) 	 Saving The Model


Epoch: 005 loss_train: 1.3078 acc_train: 64.2646 loss_val: 1.3626
Validation Loss Decreased(1.449011--->1.362644) 	 Saving The Model


Epoch: 006 loss_train: 1.2094 acc_train: 66.0150 loss_val: 1.2627
Validation Loss Decreased(1.362644--->1.262653) 	 Saving The Model


Epoch: 007 loss_train: 1.1361 acc_train: 68.3337 loss_val: 1.2205
Validation Loss Decreased(1.262653--->1.220511) 	 Saving The Model


Epoch: 008 loss_train: 1.0797 acc_train: 69.4476 loss_val: 1

KeyboardInterrupt: 

In [30]:
# Ask PyTorch to release the GPU memory it caches but no longer uses.
torch.cuda.empty_cache()


AttributeError: module 'torch.cuda' has no attribute 'reset'

In [19]:
# Restore the saved weights. map_location remaps the stored tensors onto the
# device chosen for this session, so a checkpoint written on a GPU also loads
# on a CPU-only machine.
model.load_state_dict(
    torch.load("saved_model_150m_params_GAT.pth", map_location=device)
)
model.eval()

GNN(
  (layer1): GCNConv(2646, 64)
  (layer2): GCNConv(64, 32)
  (decoder): Linear(in_features=32, out_features=18, bias=True)
)

In [20]:
softmax = nn.Softmax(dim=1)


def predict(loader):
    """
    Compute class probabilities for every graph served by ``loader``.

    Uses the module-level ``model`` and ``device``.

    Parameters
    ----------
    loader : iterable of batches
        Batches exposing ``x``, ``edge_index`` and ``batch``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_graphs, n_classes)``; each row is the softmax of
        the model output for one graph, in loader order.
    """
    model.eval()

    pred = []
    # Disable autograd here so the function also works when the caller does
    # not wrap it in torch.no_grad() (numpy() fails on tensors requiring grad).
    with torch.no_grad():
        for data in loader:  # Iterate in batches over the dataset.
            data = data.to(device)
            # Pass the batch's edge attributes when it has any (None otherwise)
            # instead of IPython's `_`, which is just the last cell output.
            out = model(
                data.x, data.edge_index, getattr(data, "edge_attr", None), data.batch
            )
            # Normalise the logits of each graph into probabilities.
            out = torch.softmax(out, dim=1)
            pred.append(out.cpu().numpy())

    return np.concatenate(pred, axis=0)

In [21]:
# Run inference on the prediction loader with gradient tracking disabled.
with torch.no_grad():
    predictions = predict(pred_loader)

In [43]:
def submit(y_pred_proba, name=""):
    """
    Write class-probability predictions to a submission CSV file.

    The file is
    ``../Submissions/<name>geometric_GCNcov_150_param__1.17_loss.csv``. Its
    header is ``name,class0,...,class17`` and each following row holds one
    entry of the module-level ``proteins_test`` list followed by the matching
    row of ``y_pred_proba``.

    Parameters
    ----------
    y_pred_proba : numpy.ndarray
        2-D array indexed as ``[sample, class]``; row ``i`` belongs to
        ``proteins_test[i]``.
    name : str, optional
        Prefix prepended to the output file name. It is accepted because the
        notebook calls ``submit(predictions, "GCN_fINal")``; with the default
        empty prefix the file name is unchanged.
    """
    header = ["name"] + ["class" + str(k) for k in range(18)]

    # newline="" is what the csv module expects; without it every row is
    # followed by a blank line on platforms that translate "\n".
    with open(
        "../Submissions/" + name + "geometric_GCNcov_150_param__1.17_loss.csv",
        "w",
        newline="",
    ) as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        writer.writerow(header)
        for i, protein in enumerate(proteins_test):
            writer.writerow([protein] + y_pred_proba[i, :].tolist())

In [22]:
submit(predictions, "GCN_fINal")

In [117]:
predictions[0].sum()

1.0