# GCN for Text Classification
PyTorch reimplementation of **Graph Convolutional Networks for Text Classification (AAAI 2019)**

# Table of Content
* [Preamble](#Preamble)
* [Data preparation](#Data-preparation)
    * [Load raw data](#Load-raw-data)
    * [Train-test split](#Train-test-split)
    * [Setup vocabulary utils](#Setup-vocabulary-utils)
    * [Load pretrained Embeddings](#Load-pretrained-Embeddings)
        * [Word2Vec Module](#Word2Vec-Module)
    * [Setup statistics utils](#Setup-statistics-utils)
    * [Build Text graph](#Build-Text-graph)
* [PyTorch Dataset class](#PyTorch-Dataset-class)
* [TextGCN Module](#TextGCN-Module)
* [Training](#Training)
    * [Prepare text graphs](#Prepare-text-graphs)
    * [Training configs](#Training-configs)
    * [Verify training accuracy](#Verify-training-accuracy)

# Preamble

In [191]:
# Preamble
import time, random
import re, string
import os, sys
import math
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from tqdm import tqdm

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
CPU = torch.device("cpu")
BATCH_SIZE = 65536

# Data preparation
The word embeddings is trained on YELP-review dataset: https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset

## Load raw data

In [192]:
# # CSV Preparation
# data_file = open("/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json")
# data = []

# # cnt = 1569264 # Size of YELP 2015 dataset
# cnt = 65536

# for line in data_file:
#     data.append(json.loads(line))
#     cnt -= 1
#     if cnt == 0:
#         break
    
# data_file.close()
# df = pd.DataFrame(data)

# print("Number of datapoints:", len(df))
# df.head()

train_ds = pd.read_csv('/kaggle/input/smolcsv/r52-train-stemmed.csv')
val_ds = pd.read_csv('/kaggle/input/smolcsv/r52-dev-stemmed.csv')
test_ds = pd.read_csv('/kaggle/input/smolcsv/r52-test-stemmed.csv')

train_ds.head()

Unnamed: 0,text,edge,intent,intent_label,label
0,bahia cocoa review shower continu week bahia c...,bahia cocoa review shower continu week bahia c...,cocoa,1,0
1,champion product approv stock split champion p...,champion product approv stock split champion p...,earn,2,1
2,comput termin system cpml complet sale comput ...,comput termin system cpml complet sale comput ...,acq,3,2
3,cobanco inc cbco year net shr ct dlr net asset...,cobanco inc cbco year net shr ct dlr net asset...,earn,2,1
4,intern inc qtr jan oper shr loss two ct profit...,intern inc qtr jan oper shr loss two ct profit...,earn,2,1


## Train-test split
Only trainset is needed for building pretrained **TextGCN**.

In [193]:
# df_size = len(df)
# idx = [x for x in range(df_size)]
# random.Random(555).shuffle(idx)

# train_num = int(df_size)

# train_idx = idx[:train_num]

# train_df = df.iloc[train_idx]

print('Size of trainset:', len(train_ds))
print('Size of valset:', len(val_ds))
print('Size of testset:', len(test_ds))

Size of trainset: 5879
Size of valset: 2568
Size of testset: 2568


## Setup vocabulary utils
Here the frequency counter, vocab object and tokenizer is defined.

Reference: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

In [194]:
# Set up Vocab
# Source: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab, build_vocab_from_iterator

from collections import Counter

tokenizer = get_tokenizer("basic_english")

def get_counter(texts):
    counter = Counter()
    for text in texts:
        counter.update(tokenizer(text))
    return counter

def get_vocab(texts):
    counter = get_counter(texts)
    vocabulary = vocab(
        counter,
        specials= ["<unk>"],
        min_freq= 1
    )
    vocabulary.set_default_index(vocabulary["<unk>"])
    return vocabulary

def text_pipeline(text, vocabulary):
    return vocabulary(tokenizer(text))

## Load pretrained Embeddings
Here the pretrained embeddings along with the grand vocabulary is prepared. However, pretrained high-level feature representation (or embeddings) does not work well with **TextGCN**. The reason seems to stem from $G$ where edge weights are based on **discrete statistical quantities** ($TF-IDF$ and $PMI$) while word embeddings assume a **continuous space** for word representation and interaction (via **continuous** function, for example, **cosine similarity**)

In [195]:
# Prepare vocab, counter
grand_vocab = torch.load('/kaggle/input/hlt-word2vec/vocab.pth')
counter = torch.load('/kaggle/input/hlt-word2vec/counter.pth')

### Word2Vec Module
The Word2Vec Negative Sampling module is taken from: https://www.kaggle.com/code/vhminh2210/negsampling-exps

In [196]:
VOCAB_SIZE = len(grand_vocab)
INIT_EMBEDDING_DIM = 200

In [197]:
# Negative sampling embedding module
class NegSamplingEmbedding(nn.Module):
    '''
    Vocab_size: V
    Embedding_size: E
    Text_length: L
    Batch_size: B
    
    Consult: https://github.com/mindspore-courses/DeepNLP-models-MindSpore/
            blob/main/notebooks/02.Skip-gram-Negative-Sampling.ipynb
    '''
    def __init__(self, vocab_size, embedding_size):
        super(NegSamplingEmbedding, self).__init__()
        self.U = nn.Embedding(vocab_size, embedding_size) # Center embedding
        self.V = nn.Embedding(vocab_size, embedding_size) # Outside embedding
        self.LogSig = nn.LogSigmoid()
        
    def forward(self, wc, wo, wk, mask_c, mask_o, check_shape= False):
        vc = self.V(wc) # Center embedding. Shape: (B, L, E)
        uo = self.U(wo) # Outside embedding. Shape: (B, L, C, E)
        uk = self.U(wk) # Random embedding. Shape: (B, L, C, E, K)
        
        if check_shape:
            B = uk.shape[0]
            L = uk.shape[1]
            C = uk.shape[2]
            K = uk.shape[3]
            E = uk.shape[4]
            print(f"Basic shapes: B = {B}; L = {L}; C = {C}; K = {K}; E = {E}")
            print('*********************************')
            print('Shape of vc:', vc.shape)
            print('Shape of uo:', uo.shape)
            print('Shape of uk:', uk.shape)
            print('*********************************')
        cmp1 = torch.einsum('blce,ble->blc', uo, vc) # Shape: (B, L, C)
        cmp2 = torch.einsum('blcke,ble->blck', uk, vc) # Shape: (B, L, C, K)
        
        cmp1 = self.LogSig(cmp1) * mask_o # Shape: (B, L, C)
        cmp2 = self.LogSig(-cmp2) # Shape: (B, L, C, K)
        cmp2 = torch.einsum('blck->blc', cmp2) * mask_o # Shape: (B, L, C)
    
        cmp1 = torch.einsum('blc->bl', cmp1) # Shape: (B, L)
        cmp2 = torch.einsum('blc->bl', cmp2) # Shape: (B, L)
        
        loss = torch.mean(cmp1 + cmp2)
        
        if check_shape:
            print('Shape of cmp1:', cmp1.shape)
            print('Shape of cmp2:', cmp2.shape)
            print('Shape of LOSS:', loss.shape)
        return -loss

In [198]:
# Prepare word embeddings
WORD2VEC_PATH = '/kaggle/input/hlt-word2vec/word2vec.pth'
word2vec = NegSamplingEmbedding(VOCAB_SIZE, INIT_EMBEDDING_DIM)
word2vec.load_state_dict(torch.load(WORD2VEC_PATH, map_location= DEVICE))
word2vec.eval()

W_e = word2vec.V.weight.detach().to(DEVICE)

## Setup statistics utils
Here the statistics metrics $TF-IDF$ and $PMI$ are defined. Words in Vocab are indexed from $1\dots N$ and documents are indexed from $1\dots V$

In [199]:
import scipy.sparse as sp # Sparse matrix utilities
from collections import defaultdict 

In [200]:
from scipy.sparse import coo_matrix

# Sparse matrix utils
def build_sparse(W_dict, shape):
    W_row, W_col, W_data = [], [], []
    for (current, nxt) in W_dict.keys():
        W_row.append(current)
        W_col.append(nxt)
        W_data.append(W_dict[(current, nxt)])
        
    return sp.csr_matrix((W_data, (W_row, W_col)), shape= shape)

def ret_zero():
    return 0

def csr2spMat(X_csr):
    '''
    Consult: https://stackoverflow.com/questions/50665141/converting-a-scipy-coo-matrix-to-pytorch-sparse-tensor
    '''
    X_coo = sp.csr_matrix.tocoo(X_csr, copy= True)
    
    values = X_coo.data
    indices = np.vstack((X_coo.row, X_coo.col))

    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = X_coo.shape

    return torch.sparse.FloatTensor(i, v, torch.Size(shape)).to(DEVICE)

In [201]:
# Statistics
def get_W(texts, window_size):
    '''
    Get the number of windows over texts.
    <texts> is expected to have type of list(torch.Tensor)
    
    Batch_size: B = len(texts)
    '''
    res = 0
    for text in texts:
        res += max(len(text) - window_size + 1, 1)
    return res


def build_W(texts, vocabulary, window_size, check= False):
    '''
    Build the co-occurence matrix W which will be represented using csr_matrix((data, (row_ind, col_ind))
    <texts> is expected to have type of list(torch.Tensor)
    <vocab> is the Vocabulary built over text
    
    Batch size: B = len(texts)
    Vocab size: V = len(vocabulary)
    Number of Node: N = B + V
    
    Output:
    -----
    W_dict (defaultdict) : Edge-list for construction of W
    W (sp.csr_matrix): Co-occurence matrix. Shape = (V, V)
    mW (np.ndarray): Occurence vector. Can be viewed as marginallized W. Shape = (V)
    '''
    B = len(texts)
    V = len(vocabulary)
    N = B + V
    mW = np.zeros((V)) # Margin W - Occurence vector
    
    W_dict = defaultdict(ret_zero) 
    
    if check:
        print('*********************************')
        print('Tokenized texts:')
    for text in texts:
        L = len(text)
        if check:
            print(text)
        for i in range(max(L - window_size + 1, 1)):
            master_range = text[i : i + window_size]
            mW[np.unique(master_range)] += 1
            for j in range(window_size):
                if i + j >= L:
                    break
                current = text[i+j]
                W_dict[(current, current)] = W_dict[(current, current)] + 1
                for nxt in master_range:
                    if current != nxt:
                        W_dict[(current, nxt)] = W_dict[(current, nxt)] + 1
                    
    W = build_sparse(W_dict, shape= (V, V))

    return W_dict, W, mW

def calc_pmi(texts, vocabulary, window_size):
    '''
    Calculate PMI
    <texts> is expected to have type of list(torch.Tensor)
    <vocab> is the Vocabulary built over text
    
    Vocab size: V
    Text length: L
    Batch size: B
    
    Output:
    -----
    pmi_dict (defaultdict) : Edge-list for construction of pmi
    pmi (sp.csr_matrix): Pairwise PMI matrix. Shape = (V, V)
    '''
    # Preparations
    nW = get_W(texts, window_size) # Number of windows
    W_dict, W, mW = build_W(texts, vocabulary, window_size) # W_ij. Shape: (V, V)
    
    V = len(vocabulary)
    pmi_dict = defaultdict(ret_zero)
    margin_p = (mW / nW).reshape(-1, 1)
    
    # Constructing PMI edge list.
    for (i, j) in W_dict.keys():
        if i == j:
            continue
        pij = W_dict[(i, j)] / nW
        pi = margin_p[i]
        pj = margin_p[j]
        
        prelog = pij / (pi * pj)
        if prelog <= 1:
            continue
        pmi_dict[(i, j)] = math.log2(prelog)
        
    pmi = build_sparse(pmi_dict, shape= (V, V))
    
    return pmi_dict, pmi
    

def calc_tf_idf(texts, vocabulary):
    '''
    Calculate TF-IDFs
    <texts> is expected to have type of list(torch.Tensor)
    <vocab> is the Vocabulary built over text
    
    Vocab size: V
    Text length: L
    Batch size: B
    
    NOTES:  For text graph constructions, words will be indexed from 0 to V-1 
            while documents will be indexed from V to V + D - 1 
    '''
    V = len(vocabulary)
    B = len(texts)
    
    tf_dict = defaultdict(ret_zero) # Term frequency - Shape: (B, V)
    idf_dict = defaultdict(ret_zero) # Inverse document frequency - Shape: (V,)
    tf_idf_dict = defaultdict(ret_zero) # TF - IDF - Shape: (B, V)
    
    idf = np.zeros((V)) # No. of documents containing token. The array is 0-based instead of V-based
    
    # Term frequency
    for i, text in enumerate(texts):
#         max_freq = 0
        for j in range(text.shape[0]):
            tf_dict[(V + i, text[j])] += 1
#             max_freq = max(max_freq, tf_dict[(V + i, text[j])])
            
            idf_dict[(V + i, text[j])] = 1
        # Normalize term frequency
        for j in range(text.shape[0]):
            tf_dict[(V + i, text[j])] /= len(text)
    
    # Inverse document frequency
    for (i, j) in idf_dict.keys():
        idf[j] += idf_dict[(i, j)]
        
    idf[0] = B # There is no unknown token. Hence, idf[<unk>] = 0
    idf = np.log2(B / idf) # Shape: (V,)
    
    # TF - IDF
    for (i, j) in tf_dict.keys():
        tf_idf_dict[(i, j)] = tf_dict[(i, j)] * idf[j]

    tf = build_sparse(tf_dict, shape= (B + V, V))
    tf_idf = build_sparse(tf_idf_dict, shape= (B + V, V))
    
    return tf_dict, tf, idf, tf_idf_dict, tf_idf

In [202]:
# Pipeline testing
raw_texts = ['A two-layer GCN can allow message', 
             'A two-layer GCN can allow message', 
             'A two-layer GCN can allow message',
             'Thus although there is no direct document-document edges in the graph',
             'Thus although there is no direct document-document edges in the graph',
             'Thus although there is no direct document-document edges in the graph',
             'In our preliminary experiment',
             'In our preliminary experiment',
             'In our preliminary experiment']
vocabulary = get_vocab(raw_texts)
print('*********************************')
print('Token list:')
print(vocabulary.get_itos())
print('*********************************')
texts = []
for text in raw_texts:
    texts.append(text_pipeline(text, vocabulary))
    
print('Vocabulary size V = ', len(vocabulary))
_, W, mW = build_W(texts, vocabulary, 5, True)
print('*********************************')
print('Shape of W:', W.shape, 'Number of stored values:', W.nnz)
print('Shape of mW:', mW.shape)
print('*********************************')
print('Co-occurence matrix W:')
print(W)
print('*********************************')
print('Margin-W:')
print(mW)
print('*********************************')
print('PMI:')
_, pmi = calc_pmi(texts, vocabulary, 5)
print('Shape of PMI', pmi.shape, 'Number of stored values:', pmi.nnz)
print(pmi)

*********************************
Token list:
['<unk>', 'a', 'two-layer', 'gcn', 'can', 'allow', 'message', 'thus', 'although', 'there', 'is', 'no', 'direct', 'document-document', 'edges', 'in', 'the', 'graph', 'our', 'preliminary', 'experiment']
*********************************
Vocabulary size V =  21
*********************************
Tokenized texts:
[1, 2, 3, 4, 5, 6]
[1, 2, 3, 4, 5, 6]
[1, 2, 3, 4, 5, 6]
[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
[15, 18, 19, 20]
[15, 18, 19, 20]
[15, 18, 19, 20]
*********************************
Shape of W: (21, 21) Number of stored values: 128
Shape of mW: (21,)
*********************************
Co-occurence matrix W:
  (1, 1)	3
  (1, 2)	3
  (1, 3)	3
  (1, 4)	3
  (1, 5)	3
  (2, 1)	3
  (2, 2)	6
  (2, 3)	6
  (2, 4)	6
  (2, 5)	6
  (2, 6)	3
  (3, 1)	3
  (3, 2)	6
  (3, 3)	6
  (3, 4)	6
  (3, 5)	6
  (3, 6)	3
  (4, 1)	3
  (4, 2)	6
  (4, 3)	6
  (4, 4)	6
  (4, 5)	6
  (4, 6)

  pmi_dict[(i, j)] = math.log2(prelog)


## Build Text graph
The adjacency matrix $A$ is designed as follow:

\begin{equation}
A = \begin{pmatrix}
PMI & TF-IDF\\
TF-IDF & 0
\end{pmatrix}
\end{equation}

In [203]:
def build_A(texts, vocabulary, window_size, nNodes):
    '''
    Calculate adjacency matrix A with shape (V+B, V+B)
    <texts> is expected to have type of list(torch.Tensor)
    <vocab> is the Vocabulary built over text
    
    Vocab size: V
    Text length: L
    Batch size: B
    Nodes: N = B + V
    
    '''
    _, _, _, tf_idf_dict, _ = calc_tf_idf(texts, vocabulary)
    pmi_dict, _ = calc_pmi(texts, vocabulary, window_size)
    
    B = len(texts)
    V = len(vocabulary)
    
    N = B + V # Number of nodes
    
    A_dict = defaultdict(ret_zero) # Adjacency matrix. Shape: (N, N)
    
    # Word-word edges, obtained by pmi_dict
    for x in pmi_dict.keys():
        A_dict[x] = pmi_dict[x]
        
    # Document-word edges, obtained by tf_idf_dict
    for x in tf_idf_dict.keys():
        assert x[0] >= V and x[1] < V # Ensure edge (i, j) has i as document index, and j as token index
        A_dict[x] = tf_idf_dict[x]
        
    # Main diagonal
    for i in range(N):
        A_dict[(i, i)] = 1
    
    if nNodes is None:
        A = build_sparse(A_dict, shape= (N, N))
    else:
        A = build_sparse(A_dict, shape= (nNodes, nNodes))
    
    return A_dict, A

def build_graph(texts, vocabulary, window_size, nNodes= None):
    '''
    Build A_tilde = D x A x D
    
    Nodes: N
    '''
    B = len(texts)
    V = len(vocabulary)
    
    N = B + V # Number of nodes
    
    A_dict, A = build_A(texts, vocabulary, window_size, nNodes= nNodes)
    
    D_dict = defaultdict(ret_zero)
    
    for (i, j) in A_dict.keys():
        D_dict[(i, i)] = D_dict[(i, i)] + 1
    for i in range(N):
        D_dict[(i, i)] = (D_dict[(i, i)] ** (-0.5))
    
    if nNodes is None:
        D = build_sparse(D_dict, shape= (N, N)) # Degree matrix
    else:
        D = build_sparse(D_dict, shape= (nNodes, nNodes))
    G = (D * A) * D # A_tilde. Shape = (N, N)
    
    return G

def build_X(vocab, grand_vocab, batch_size, pretrained= True, W_e= None, init_dim= None):
    '''
    Construct graph feature input
    <vocab>: Vocabulary constructed by batch
    <grand_vocab>: Vocabulary created by pretrained embedding
    
    Embedding size: E
    Batch size: B
    Number of nodes: N
    
    Output:
    -----
    X (torch.Tensor, pretrained= True) : Embedded feature matrix. Shape = (N, N)
    X (sp.csr_matrix, pretrained= False): Sparse one-hot feature matrix. Shape = (V, V)
    '''
    if pretrained:
        assert W_e is not None
        
        X = []
        
        wordlist = vocab.get_itos()
        E = W_e[0].shape[0]
        B = batch_size
        
        # Convert relative vocab to grand_vocab
        grand_vocab_id = [] # Shape: (V,)
        for word in wordlist:
            grand_vocab_id.append(grand_vocab[word])
        
        # Extract pretrained embeddings
        Emb = W_e[grand_vocab_id] # Shape: (V, E)
        X.append(Emb)
        
        # Document one-hot
        Doc_Emb = torch.zeros(B, E)
        Doc_Emb[0:B, 0:B] = 1
        X.append(Doc_Emb.to(DEVICE))
        
        # Concat Word embeddings and Doc (one-hot) embeddings
        X = torch.cat(X, dim= 0) # Shape: (V + B, E)
        
        return X
    
    else:
        B = batch_size
        
        # Relative vocab is used as the main vocab in training phase. 
        # Given that no pretrained embedding is given.
        wordlist = vocab.get_itos()
        V = len(wordlist)
        
        # Number of nodes
        N = B + V
        
        X_dict = defaultdict(ret_zero) # One-hot feature matrix X
        for i in range(N):
            X_dict[(i, i)] = 1
        
        X = build_sparse(X_dict, shape= (N, N))
        
        return X
        

# PyTorch Dataset class
In this section, the `YELPDataset` class is defined with custom batch function `collate_batch`.

In [204]:
# YELP Dataset
def collate_graph(batch):
    '''
    Collate batch and building text graph
    '''
    raw_texts = []
    labels = []
    mask = []
    texts = []
    for _text, _label in batch:
        raw_texts.append(_text)
        labels.append(_label)
        
    vocabulary = get_vocab(raw_texts)
    
    for _text in raw_texts:
        texts.append(np.array(text_pipeline(_text, vocabulary)))
        
    return texts, labels, vocabulary

class YELPDataset(Dataset):
    def __init__(self, df, df_sort= False):
        self.df = df
        if df_sort:
            self.sort_df_by_txt_len()
    
    def sort_df_by_txt_len(self):
        len_list = [-len(self.df.iloc[i]['text']) for i in range(len(self.df))]
        self.df = self.df.iloc[np.argsort(len_list)]
    
    def __len__(self):
        return len(self.df)

    def label_pipeline(self, x):
        return int(x)
    
    def __getitem__(self, idx):
        txt = self.df.iloc[idx]['text']
        
        label = self.label_pipeline(self.df.iloc[idx]['label'])
        
        return (txt, label)

In [205]:
# Dataset, Dataloader
trainset = YELPDataset(train_ds)
valset = YELPDataset(val_ds)
testset = YELPDataset(test_ds)

trainloader = DataLoader(trainset, batch_size= len(trainset), 
                         shuffle= True, pin_memory= True, collate_fn= collate_graph)

valloader = DataLoader(valset, batch_size= len(valset), 
                         shuffle= True, pin_memory= True, collate_fn= collate_graph)

testloader = DataLoader(testset, batch_size= len(testset), 
                         shuffle= True, pin_memory= True, collate_fn= collate_graph)

for batch in trainloader:
    X, y, vocabulary = batch
    print("Shape of Texts:", len(X), len(X[0]))
    print("Shape of Labels:", len(y))
    print("Length of Vocab:", len(vocabulary))
    GLOBAL_N = len(vocabulary) + len(y) # ONLY ONE BIG GRAPH IS NEEDED FOR TRAINING !
    print("Number of Nodes:", GLOBAL_N)
    break

Shape of Texts: 5879 39
Shape of Labels: 5879
Length of Vocab: 15277
Number of Nodes: 21156


# TextGCN Module

In [206]:
# Taks configs
WINDOW_SIZE = 20
EMBEDDING_SIZE = 200
NUM_CLASSES = 52
DROPOUT = 0.5

In [207]:
# TextGCN Module
class TextGCN(nn.Module):
    def __init__(self, graph_emb_size, num_classes, dropout= 0.5, X_emb_size= None, sparse= True):
        super(TextGCN, self).__init__()
        self.sparse = sparse
        self.E = graph_emb_size
        
        if not sparse:
            self.layer = nn.Sequential(
                nn.LazyLinear(self.E), # Lazy linear for flexible initial embedding
                nn.Dropout(p= dropout),
                nn.ReLU(),
            )
        else:
            assert X_emb_size is not None
            self.N = X_emb_size
            self.W0 = nn.Parameter(torch.randn(int(X_emb_size), graph_emb_size))
            self.afterlinear = nn.Sequential(
                nn.Dropout(p= dropout),
                nn.ReLU(),
            )
        
        self.classifier = nn.Sequential(
            nn.Linear(self.E, num_classes),
            nn.Softmax(dim= 1)
        )
    
    def forward(self, batch_size, G, X, check_shape= False):
        '''
        <G> : Graph matrix, expected to have shape (N, N) and symetric
        <X> : Initial embedding. Expected to have shape (N, E_0)
        
        NOTES: If 'sparse' is True, G, X are given in scipy.sparse.csr_matrix format
        
        Vocab size: V
        Batch size: B
        Nodes: N = B + V
        Graph embedding size: E
        Initial embedding size: E_0
        '''
        N = G.shape[0]
        E_0 = X.shape[1]
        B = batch_size
        V = N - B
        
        # First layer:
        if not self.sparse:
            L1 = self.layer(torch.matmul(G, X)) # Shape: (N, E_0)
        else:
            L1 = G * X # Shape: (N, N), type = sp.csr_matrix
            L1 = csr2spMat(L1).to(DEVICE)
            L1 = torch.sparse.mm(L1, self.W0) # Shape: (N, E), type = dense matrix
            L1 = self.afterlinear(L1)
        
        # Classifier:
        if not self.sparse:
            logits = self.classifier(torch.matmul(G, L1)) # Shape: (N, num_classes)
        else:
            newG = csr2spMat(G).to(DEVICE)
            logits = self.classifier(torch.sparse.mm(newG, L1).to(DEVICE)).to(DEVICE) # Shape: (N, num_classes)
        
        doc_logits = logits[V:, :] # Shape: (B, num_classes)
        
        if check_shape:
            print('*********************************')
            print('Shape of L1:', L1.shape)
            print('Shape of logits:', logits.shape)
            print('Shape of doc_logits:', doc_logits.shape)
        
        return logits
    
def init_weights(m):
    '''
    Kaiming initialization.
    '''
    if isinstance(m, nn.Linear) or isinstance(m, nn.Parameter):
        torch.nn.init.kaiming_normal(m.weight, nonlinearity='relu')

In [208]:
# Model initialization
model = TextGCN(EMBEDDING_SIZE, NUM_CLASSES, DROPOUT, GLOBAL_N).to(DEVICE)
model.apply(init_weights)

  torch.nn.init.kaiming_normal(m.weight, nonlinearity='relu')


TextGCN(
  (afterlinear): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): ReLU()
  )
  (classifier): Sequential(
    (0): Linear(in_features=200, out_features=52, bias=True)
    (1): Softmax(dim=1)
  )
)

# Training

## Prepare text graphs
Generating text graphs from text corpus is a considerable computational bottleneck. Hence, the graph $G$ and input $X$ should be calculated in advance. Following the original paper, $X$ is constructed using one-hot encoding over vocabulary and documents inferred from the corpus. Hence, $X\in\mathbb{R}^{N\times N}$ where $N = B + V$, with $V$ being vocabulary size and $B$ being number of documents.

Hence, during inference time, texts must be load in a batch of size $B$ and tokenize using the training vocabulary in order to maintain dimensional consistency with pretrained **TextGCN** model.

In [None]:
# Preparing text graphs

# Train graphs
G_trainlist = []
X_trainlist = []
y_trainlist = []

for i, (texts, labels, vocabulary) in enumerate(trainloader):
    # Build graph
    G = build_graph(texts, vocabulary, WINDOW_SIZE) # Graph matrix
    N = G.shape[0] # Number of nodes
    
    print('Number of nodes: N = ', N)
    torch.save(vocabulary, 'train_vocab.pth')

    # Input features.
    # X = build_X(vocabulary, grand_vocab, len(labels), W_e= W_e) # Pretrained embedding mode. Not efficient.
    # One-hot encoding mode
    X_dict = defaultdict(ret_zero)
    for i in range(N):
        X_dict[(i, i)] = 1
    X = build_sparse(X_dict, shape= (N, N))
    y = torch.tensor(labels)
    
    G_trainlist.append(G)
    X_trainlist.append(X)
    y_trainlist.append(y)
    
    break # Only ONE BIG GRAPH is needed for training

GXy_train = (G_trainlist, X_trainlist, y_trainlist)
torch.save(GXy_train, "GXy_train.pth")

# Val graphs
G_vallist = []
X_vallist = []
y_vallist = []

for i, (texts, labels, vocabulary) in enumerate(valloader):
    # Build graph
    G = build_graph(texts, vocabulary, WINDOW_SIZE, nNodes= GLOBAL_N) # Graph matrix
    N = G.shape[0] # Number of nodes
    
    print('Number of nodes: N = ', N)

    # Input features.
    # X = build_X(vocabulary, grand_vocab, len(labels), W_e= W_e) # Pretrained embedding mode. Not efficient.
    # One-hot encoding mode
    X_dict = defaultdict(ret_zero)
    for i in range(N):
        X_dict[(i, i)] = 1
    X = build_sparse(X_dict, shape= (GLOBAL_N, GLOBAL_N))
    y = torch.tensor(labels)
    
    G_vallist.append(G)
    X_vallist.append(X)
    y_vallist.append(y)
    
    break # Only ONE BIG GRAPH is needed for training

GXy_val = (G_vallist, X_vallist, y_vallist)
torch.save(GXy_val, "GXy_val.pth")

# Test graphs
G_testlist = []
X_testlist = []
y_testlist = []

for i, (texts, labels, vocabulary) in enumerate(testloader):
    # Build graph
    G = build_graph(texts, vocabulary, WINDOW_SIZE, nNodes= GLOBAL_N) # Graph matrix
    N = G.shape[0] # Number of nodes
    
    print('Number of nodes: N = ', N)

    # Input features.
    # X = build_X(vocabulary, grand_vocab, len(labels), W_e= W_e) # Pretrained embedding mode. Not efficient.
    # One-hot encoding mode
    X_dict = defaultdict(ret_zero)
    for i in range(N):
        X_dict[(i, i)] = 1
    X = build_sparse(X_dict, shape= (GLOBAL_N, GLOBAL_N))
    y = torch.tensor(labels)
    
    G_testlist.append(G)
    X_testlist.append(X)
    y_testlist.append(y)
    
    break # Only ONE BIG GRAPH is needed for training

GXy_test = (G_testlist, X_testlist, y_testlist)
torch.save(GXy_test, "GXy_test.pth")

  pmi_dict[(i, j)] = math.log2(prelog)


Number of nodes: N =  21156


In [None]:
# # Load prebuilt graph if available
# GXy_train = torch.load('/kaggle/input/gxy-r52/GXy_train.pth')
# GXy_val = torch.load('/kaggle/input/gxy-r52/GXy_val.pth')
# GXy_test = torch.load('/kaggle/input/gxy-r52/GXy_test.pth')

## Training configs

In [None]:
# Training configs
LR = 0.02

EPOCHS = 200
ITER = EPOCHS

OPTIMIZER = torch.optim.Adam(model.parameters(), lr= LR)
# OPTIMIZER = torch.optim.SGD(model.parameters(), lr= LR, momentum= 0.9, nesterov= True)

# SCHEDULER = torch.optim.lr_scheduler.CosineAnnealingLR(OPTIMIZER, T_max = ITER)
SCHEDULER = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(OPTIMIZER, T_0= 25, T_mult = 1, eta_min= 0)

LOSS_FN = nn.CrossEntropyLoss()
RECORD = 0.0

## Training procedures

In [None]:
# Train procedures
def test(GXy, B_train, model, loss_fn):
    model.eval()
    test_loss = 0
    correct = 0
    
    G_list, X_list, y_list = GXy
    L = len(G_list)
    n_samples = 0
    
    for i in tqdm(range(L)):
        G, X, y = G_list[i], X_list[i], y_list[i]
        y = y.to(DEVICE)
        batch_size = y.shape[0]
        N = G.shape[0]
        
        # Get loss and inference
        logits = model(batch_size, G, X)
#         print(logits.shape, N, N - B_train, N - B_train + batch_size)
        logits = logits[N - B_train : N - B_train + batch_size, :]
#         print(logits.shape, y.shape)
        loss = loss_fn(logits, y)
        n_samples += (batch_size)
        correct += (logits.argmax(dim= 1) == y).type(torch.float).sum().item()

        test_loss += loss
        
    test_loss /= L
    accuracy = correct / n_samples
    
    return test_loss, accuracy

def train(GXy_train, B_train, model, optimizer, scheduler, loss_fn, val_freq):
    global RECORD
    model.train()
    tloss = []
    
    G_list, X_list, y_list = GXy_train
    L = len(G_list)
    
    for i in range(L):
        G, X, y = G_list[i], X_list[i], y_list[i]
        y = y.to(DEVICE)
        batch_size = y.shape[0]
        N = G.shape[0]
        
        # Get loss
        logits = model(batch_size, G, X)
        logits = logits[N - B_train : N - B_train + batch_size, :]
        loss = loss_fn(logits, y)
        
        tloss.append(loss.cpu().detach().numpy())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
#         scheduler.step()
        
    return tloss

In [None]:
# TRAINING 
iter_loss = []
epoch_loss = []
best_acc = 0

with torch.no_grad():
    pass

for t in range(EPOCHS):
    tloss_train = train(GXy_train, len(train_ds), model, OPTIMIZER, SCHEDULER, LOSS_FN, 10)
    
    iter_loss = iter_loss + tloss_train
#     epoch_loss.append(sum(tloss) / len(tloss))
    
#     print(f'Epoch {t}: LOSS = {epoch_loss[-1]}')
    
    tloss_val = train(GXy_val, len(train_ds), model, OPTIMIZER, SCHEDULER, LOSS_FN, 10)
    
    iter_loss = iter_loss + tloss_val
    epoch_loss.append(sum(tloss_train + tloss_val) / len(tloss_train + tloss_val))
    
    print(f'Epoch {t}: LOSS = {epoch_loss[-1]}')
    
torch.save(model.state_dict(), f'TextGCN_last.pth')
    
fig, axes = plt.subplots()
axes.plot(epoch_loss, label = 'train-loss')
axes.legend()
axes.set_xlabel('Iteration')
axes.set_ylabel('Loss')
plt.show()

In [None]:
inputs = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
loss = LOSS_FN(inputs, target)
print(loss)

## Verify training accuracy

In [None]:
# GXy_train[0][0].to(CPU), GXy_train[1][0].to(CPU), GXy_train[2][0].to(CPU)
_, test_acc = test(GXy_val, len(train_ds), model, LOSS_FN)
print(f'Train accuracy: {test_acc}')