# Hyper Parameters

In [None]:
# Which edge types are added to the graph:
#   0: doc-word only, 1: doc-word + word-word, 2: doc-word + word-word + doc-doc
EDGE = 2
# Node feature type: 0 = one-hot (node index), 1 = BERT embeddings
NODE = 0
# Number of graph-convolution layers passed to the GCN model
NUM_LAYERS = 2 

# Dataset Preparation

In [None]:
# Placeholders: assign the raw train/test texts and labels here before running.
# The right-hand sides are intentionally left blank, so this cell does not run as-is.
# NOTE(review): presumably lists (sentences are concatenated with `+` below) — confirm.
original_train_sentences = 
original_labels_train = 
original_test_sentences = 
original_labels_test = 

train_size = len(original_train_sentences)
test_size = len(original_test_sentences)
# All documents, training documents first, then test documents.
sentences = original_train_sentences + original_test_sentences

# Preprocess

## Label Encoding

In [None]:
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder

# The label vocabulary is derived from the training labels only.
unique_labels = np.unique(original_labels_train)
num_class = len(unique_labels)

lEnc = LabelEncoder()
lEnc.fit(unique_labels)

# Show the class names next to the integer ids assigned to them.
print(unique_labels)
print(lEnc.transform(unique_labels))

train_labels = lEnc.transform(original_labels_train)
test_labels = lEnc.transform(original_labels_test)

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One integer label per document: training documents first, then test documents.
labels = train_labels.tolist()
labels.extend(test_labels.tolist())
labels = torch.LongTensor(labels).to(device)

['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]


## Remove Stopwords and less frequent words, tokenize sentences

In [None]:
from nltk.corpus import stopwords
from keras.preprocessing.sequence import pad_sequences
import nltk
import re

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# Words seen fewer than `remove_limit` times in the whole corpus are dropped from the vocabulary.
remove_limit = 5


def clean_str(string):
    """Normalize raw text into a lower-cased, single-space-separated token string.

    Characters other than letters, digits, ``( ) , ! ? ' ` `` are replaced by
    spaces, common English clitics ('s, 've, n't, 're, 'd, 'll) are split off
    the preceding word, and punctuation marks become stand-alone tokens.

    Args:
        string: the raw text.

    Returns:
        The cleaned, stripped, lower-cased text.
    """
    # Drop every character outside the kept set.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Split clitics off the word they are attached to.
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    # Surround punctuation with spaces so each mark is its own token.
    # The replacement text is plain (no backslash): a backslash in the
    # replacement would be copied into the output as a literal character.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    # Collapse runs of whitespace created by the substitutions above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

# Clean and split every sentence exactly once; both passes below reuse the result
# instead of re-running the regex pipeline for each sentence a second time.
cleaned_token_lists = [clean_str(sentence).split() for sentence in sentences]

# Corpus-wide raw token counts, used to remove rare words.
original_word_freq = {}
for cleaned_tokens in cleaned_token_lists:
    for word in cleaned_tokens:
        original_word_freq[word] = original_word_freq.get(word, 0) + 1

# Per-document token lists after removing stop words and rare words.
tokenize_sentences = []
# Used as an insertion-ordered set of the surviving vocabulary words.
word_list_dict = {}
for cleaned_tokens in cleaned_token_lists:
    doc_words = []
    for word in cleaned_tokens:
        if word not in stop_words and original_word_freq[word] >= remove_limit:
            doc_words.append(word)
            word_list_dict[word] = 1
    tokenize_sentences.append(doc_words)

# Vocabulary in first-seen order.
word_list = list(word_list_dict.keys())
vocab_length = len(word_list)

# word -> id (position in word_list)
word_id_map = {}
for i in range(vocab_length):
    word_id_map[word_list[i]] = i

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Graph node layout: [train docs | vocabulary words | test docs].
node_size = train_size + vocab_length + test_size

# Model input

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm

## Build Graph

In [None]:
from math import log
# Edge list in coordinate form: edge k goes from node row[k] to node col[k] with weight[k].
row = []
col = []
weight = []

### word-word: PMI

In [None]:
if EDGE >= 1:
    # Width (in tokens) of the sliding window used for co-occurrence counting.
    window_size = 20
    # Total number of sliding windows seen over the whole corpus.
    total_W = 0
    # word id -> number of windows containing that word
    word_occurrence = {}
    # (smaller id, larger id) -> number of windows containing both words
    word_pair_occurrence = {}

    def ordered_word_pair(a, b):
        """Return the two ids in ascending order so an unordered pair has a single key."""
        if a > b:
            return b, a
        else:
            return a, b

    def update_word_and_word_pair_occurrence(q):
        """Count one window `q` (a list of word ids) in the occurrence tables.

        Each distinct word and each distinct unordered word pair in the window
        is counted once, regardless of how many times it repeats in the window.
        """
        unique_q = list(set(q))
        for i in unique_q:
            # dict.get supplies the default count instead of a bare `except:`,
            # which would also hide unrelated errors.
            word_occurrence[i] = word_occurrence.get(i, 0) + 1
        for i in range(len(unique_q)):
            for j in range(i+1, len(unique_q)):
                pair = ordered_word_pair(unique_q[i], unique_q[j])
                word_pair_occurrence[pair] = word_pair_occurrence.get(pair, 0) + 1


    for ind in tqdm(range(train_size+test_size)):
        words = tokenize_sentences[ind]

        q = []
        # push the first (window_size) words into a queue
        for i in range(min(window_size, len(words))):
            q += [word_id_map[words[i]]]
        # update the total number of the sliding windows
        total_W += 1
        # update the number of sliding windows that contain each word and word pair
        update_word_and_word_pair_occurrence(q)

        now_next_word_index = window_size
        # pop the first word out and let the next word in, keep doing this until the end of the document
        while now_next_word_index<len(words):
            q.pop(0)
            q += [word_id_map[words[now_next_word_index]]]
            now_next_word_index += 1
            # update the total number of the sliding windows
            total_W += 1
            # update the number of sliding windows that contain each word and word pair
            update_word_and_word_pair_occurrence(q)

    # Turn the counts into PMI-weighted word-word edges (only positive PMI is kept).
    for word_pair in word_pair_occurrence:
        i = word_pair[0]
        j = word_pair[1]
        count = word_pair_occurrence[word_pair]
        word_freq_i = word_occurrence[i]
        word_freq_j = word_occurrence[j]
        # PMI = log( p(i, j) / (p(i) * p(j)) ) with probabilities estimated over windows.
        pmi = log((count * total_W) / (word_freq_i * word_freq_j))
        if pmi <=0:
            continue
        # Word nodes are placed after the training-document nodes; add both directions.
        row.append(train_size + i)
        col.append(train_size + j)
        weight.append(pmi)
        row.append(train_size + j)
        col.append(train_size + i)
        weight.append(pmi)


HBox(children=(FloatProgress(value=0.0, max=18846.0), HTML(value='')))




### doc-word: TF-IDF

In [None]:
#get each word appears in which document
# Inverted index: for every vocabulary word, the documents that contain it.
word_doc_list = {vocab_word: [] for vocab_word in word_list}
for doc_index, doc_tokens in enumerate(tokenize_sentences):
    # A document is listed once per word, however often the word repeats in it.
    for token in set(doc_tokens):
        word_doc_list[token].append(doc_index)

# Document frequency: number of documents containing each word.
word_doc_freq = {
    vocab_word: len(containing_docs)
    for vocab_word, containing_docs in word_doc_list.items()
}

# Term frequency, keyed by the string "<doc index>,<word id>".
doc_word_freq = {}
for doc_index, doc_tokens in enumerate(tokenize_sentences):
    for token in doc_tokens:
        pair_key = str(doc_index) + ',' + str(word_id_map[token])
        doc_word_freq[pair_key] = doc_word_freq.get(pair_key, 0) + 1

In [None]:
# Add one TF-IDF weighted doc -> word edge per distinct word of every document.
total_docs = len(tokenize_sentences)
for doc_index, doc_tokens in enumerate(tokenize_sentences):
    # Test documents are placed after the vocabulary nodes in the graph.
    doc_node = doc_index if doc_index < train_size else doc_index + vocab_length
    already_linked = set()
    for token in doc_tokens:
        if token in already_linked:
            continue
        already_linked.add(token)
        token_id = word_id_map[token]
        term_count = doc_word_freq[str(doc_index) + ',' + str(token_id)]
        inverse_doc_freq = log(1.0 * total_docs / word_doc_freq[word_list[token_id]])
        row.append(doc_node)
        col.append(train_size + token_id)
        weight.append(term_count * inverse_doc_freq)

### doc-doc: Jaccard

In [None]:
import nltk

def _doc_node_index(doc_index):
    """Map a position in `tokenize_sentences` to its graph node id.

    Training documents keep their index; test documents are shifted past the
    vocabulary nodes, which sit between the two document groups.
    """
    return doc_index if doc_index < train_size else doc_index + vocab_length


if EDGE>=2:
    tokenize_sentences_set = [set(s) for s in tokenize_sentences]
    # Only document pairs whose Jaccard similarity exceeds this value get an edge.
    jaccard_threshold = 0.2
    for i in tqdm(range(len(tokenize_sentences))):
        # A document with no tokens shares nothing with any other document, and
        # nltk.jaccard_distance divides by the union size, which is zero when
        # both documents are empty -- so empty documents are skipped.
        if not tokenize_sentences_set[i]:
            continue
        node_i = _doc_node_index(i)
        for j in range(i+1, len(tokenize_sentences)):
            if not tokenize_sentences_set[j]:
                continue
            jaccard_w = 1 - nltk.jaccard_distance(tokenize_sentences_set[i], tokenize_sentences_set[j])
            if jaccard_w > jaccard_threshold:
                node_j = _doc_node_index(j)
                # Add the edge in both directions.
                row.append(node_i)
                col.append(node_j)
                weight.append(jaccard_w)
                row.append(node_j)
                col.append(node_i)
                weight.append(jaccard_w)

HBox(children=(FloatProgress(value=0.0, max=18846.0), HTML(value='')))




### Adjacency matrix

In [None]:
import scipy.sparse as sp
# Assemble the sparse node_size x node_size weighted adjacency matrix from the edge lists.
adj = sp.csr_matrix((weight, (row, col)), shape=(node_size, node_size))

# Make it symmetric: wherever the transposed entry is larger, take that entry instead.
transpose_is_larger = adj.T > adj
adj = adj + adj.T.multiply(transpose_is_larger) - adj.multiply(transpose_is_larger)

In [None]:
def normalize_adj(adj):
    """Symmetrically normalize an adjacency matrix.

    Every entry is scaled by the inverse square roots of the row sums of its
    row and column nodes; rows whose sum is zero get a scaling factor of 0.

    Returns:
        A tuple (normalized matrix in COO format, flat array of the
        per-node inverse-square-root row sums).
    """
    coo = sp.coo_matrix(adj)
    degree = np.array(coo.sum(1))
    inv_sqrt_degree = np.power(degree, -0.5).flatten()
    # A zero row sum produces inf; treat such nodes as having no scaling weight.
    inv_sqrt_degree[np.isinf(inv_sqrt_degree)] = 0.
    scaling = sp.diags(inv_sqrt_degree)
    column_scaled = coo.dot(scaling)
    normalized = column_scaled.transpose().dot(scaling)
    return normalized.tocoo(), inv_sqrt_degree
    
# Add self-loops (identity) and normalize; norm_item holds the per-node inverse-sqrt row sums.
# Note: this overwrites `adj`, so re-running the cell normalizes an already-normalized matrix.
adj, norm_item = normalize_adj(adj + sp.eye(adj.shape[0]))


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a float32 torch sparse COO tensor.

    Args:
        sparse_mx: any scipy sparse matrix (it is converted to COO format).

    Returns:
        A torch sparse COO tensor with the same shape and non-zero entries,
        moved to the notebook-level `device`.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    # Stack row and column indices into the 2 x nnz int64 layout torch expects.
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor constructor.
    return torch.sparse_coo_tensor(indices, values, shape).to(device)

# Replace the scipy matrix with its torch sparse tensor counterpart (overwrites `adj`).
adj = sparse_mx_to_torch_sparse_tensor(adj)

## Features

In [None]:
# Build the node feature tensor, in node order [train docs | words | test docs].
# NOTE(review): as called in this notebook, GCN.forward runs its first layer with
# feature_less=True, so `features` is passed to the model but never multiplied in —
# confirm whether the BERT features are meant to be used.
if NODE == 0:
    # One-hot case: only the node indices 0..node_size-1 are stored.
    features = np.arange(node_size)
    features = torch.FloatTensor(features).to(device)
else:
    !pip install flair

    from flair.embeddings import TransformerDocumentEmbeddings, TransformerWordEmbeddings
    from flair.data import Sentence
    doc_embedding = TransformerDocumentEmbeddings('bert-base-uncased', fine_tune=False)
    word_embedding = TransformerWordEmbeddings('bert-base-uncased', layers='-1',subtoken_pooling="mean")

    # One embedding per document, in document order.
    sent_embs = []
    # word text -> embedding, reduced over all occurrences by element-wise minimum.
    word_embs = {}

    for ind in tqdm(range(train_size+test_size)):
        sent = tokenize_sentences[ind]
        # Only the first 512 tokens of each document are embedded.
        sentence = Sentence(" ".join(sent[:512]),use_tokenizer=False)
        doc_embedding.embed(sentence)
        sent_embs.append(sentence.get_embedding().tolist())
        words = Sentence(" ".join(sent[:512]),use_tokenizer=False)
        word_embedding.embed(words)
        for token in words:
            word = token.text
            embedding = token.embedding.tolist()
            if word not in word_embs:
                word_embs[word] = embedding
            else:
                word_embs[word] = np.minimum(word_embs[word], embedding)

    # NOTE(review): this lookup raises KeyError for a vocabulary word that never occurs
    # within the first 512 tokens of any document — confirm this cannot happen.
    word_embs_list = []
    for word in word_list:
        word_embs_list.append(word_embs[word])

    # Same node order as the adjacency matrix: train docs, then words, then test docs.
    features = sent_embs[:train_size] + word_embs_list + sent_embs[train_size:]

    import scipy.sparse as sp
    def preprocess_features(features):
        """Row-normalize a sparse feature matrix: each row is divided by its sum (zero-sum rows become all zeros)."""
        rowsum = np.array(features.sum(1))
        r_inv = np.power(rowsum, -1).flatten()
        # A zero row sum gives inf; use 0 so that row is zeroed instead.
        r_inv[np.isinf(r_inv)] = 0.
        r_mat_inv = sp.diags(r_inv)
        features = r_mat_inv.dot(features)
        return features

    features = preprocess_features(sp.csr_matrix(features)).todense()
    features = torch.FloatTensor(features).to(device)

# Model

## GCN Layer

In [None]:
import math

import torch

from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


class GraphConvolution(Module):
    """
    One graph-convolution layer in the spirit of https://arxiv.org/abs/1609.02907.

    Computes ``activation(adj @ (dropout(input) @ weight) + bias)``. In
    feature-less mode the input is ignored and dropout is applied to the
    weight matrix itself, i.e. ``activation(adj @ dropout(weight) + bias)``.
    """

    def __init__(self, in_features, out_features,  drop_out = 0, activation=None, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Allocated uninitialized; filled in by reset_parameters below.
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.zeros(1, out_features))
        self.reset_parameters(in_features, out_features)
        self.dropout = torch.nn.Dropout(drop_out)
        self.activation =  activation

    def reset_parameters(self,in_features, out_features):
        """Fill the weight uniformly in [-limit, limit] with limit = sqrt(6 / (fan_in + fan_out))."""
        limit = math.sqrt(6.0 / (in_features + out_features))
        self.weight.data.uniform_(-limit, limit)

    def forward(self, input, adj, feature_less = False):
        """Apply the layer.

        Args:
            input: dense node features; ignored when `feature_less` is True.
            adj: sparse adjacency matrix multiplied from the left.
            feature_less: use the weight matrix directly in place of input @ weight.
        """
        if feature_less:
            support = self.dropout(self.weight)
        else:
            support = torch.mm(self.dropout(input), self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            output = output + self.bias
        if self.activation is None:
            return output
        return self.activation(output)

    def __repr__(self):
        return '{} ({} -> {})'.format(
            self.__class__.__name__, str(self.in_features), str(self.out_features))

## GCN Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class GCN(nn.Module):
    """Stack of GraphConvolution layers for node classification.

    Args:
        nfeat: input dimension of the first layer (the number of nodes when the
            first layer runs feature-less).
        nhid: hidden dimension shared by all intermediate layers.
        nclass: number of output classes.
        dropout: dropout probability used in every layer.
        n_layers: number of graph-convolution layers; values below 2 give a
            single layer mapping straight to the class scores.
        feature_less: when True (the default, matching the previous behaviour)
            the first layer ignores `x` and uses its weight matrix directly,
            which is equivalent to one-hot node features. Pass False to make
            the first layer consume the dense features given to `forward`.
    """

    def __init__(self, nfeat, nhid, nclass, dropout, n_layers = 2, feature_less = True):
        super(GCN, self).__init__()
        self.n_layers = n_layers
        self.feature_less = feature_less
        self.gc_list = []
        if n_layers >= 2:
            self.gc1 = GraphConvolution(nfeat, nhid, dropout, activation = nn.ReLU())
            # n_layers - 2 hidden-to-hidden layers between the first and the final layer.
            self.gc_list = nn.ModuleList([GraphConvolution(nhid, nhid, dropout, activation = nn.ReLU()) for _ in range(self.n_layers-2)])
            # Final layer has no activation: it returns raw class scores.
            self.gcf = GraphConvolution(nhid, nclass, dropout)
        else:
            self.gc1 = GraphConvolution(nfeat, nclass, dropout)

    def forward(self, x, adj):
        """Return per-node class scores for features `x` and sparse adjacency `adj`."""
        x = self.gc1(x, adj, feature_less = self.feature_less)
        if self.n_layers>=2:
            for hidden_layer in self.gc_list:
                x = hidden_layer(x,adj)
            x = self.gcf(x,adj)
        return x

In [None]:
def cal_accuracy(predictions,labels):
    """Return the fraction of samples whose arg-max prediction equals the label.

    Args:
        predictions: tensor of per-class scores; the arg-max is taken over the last dimension.
        labels: tensor of integer class ids, one per prediction row.

    Returns:
        Accuracy as a Python float; 0.0 when there are no predictions
        (instead of raising ZeroDivisionError).
    """
    predicted = torch.argmax(predictions,-1).cpu()
    expected = labels.cpu()
    total = len(predicted)
    if total == 0:
        return 0.0
    # Element-wise comparison on tensors replaces the per-item Python loop.
    correct = int((predicted == expected).sum().item())
    return correct/total

# Training

## Initialize model

In [None]:
import torch.optim as optim

# Training hyper-parameters.
HIDDEN_DIM = 200        # width of the hidden GCN layers
DROP_OUT = 0.5          # dropout probability inside every GCN layer
LR = 0.02               # Adam learning rate
WEIGHT_DECAY = 0        # Adam weight decay
EARLY_STOPPING = 10     # number of recent epochs inspected by the early-stopping check
NUM_EPOCHS = 200        # maximum number of training epochs


criterion = nn.CrossEntropyLoss()

# nfeat is the number of graph nodes: the first layer's weight has one row per node.
model = GCN(nfeat=node_size, nhid=HIDDEN_DIM, nclass=num_class, dropout=DROP_OUT,n_layers=NUM_LAYERS).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

## Training and Validating

In [None]:
def generate_train_val(train_pro=0.9):
    """Randomly split the training documents into train and validation node indices.

    Args:
        train_pro: fraction of the training documents kept for training.

    Returns:
        (idx_train, idx_val, idx_test): a sorted numpy array of training
        indices, an ascending list of the remaining training-document indices,
        and the range of test-document node ids (which follow the vocabulary nodes).
    """
    real_train_size = int(train_pro*train_size)

    idx_train = np.random.choice(train_size, real_train_size,replace=False)
    idx_train.sort()

    # Everything among the training documents that was not sampled is validation.
    chosen = set(idx_train.tolist())
    idx_val = [doc for doc in range(train_size) if doc not in chosen]

    idx_test = range(train_size+vocab_length, node_size)
    return idx_train, idx_val, idx_test

# Draw the train/validation split (random: no seed is set, so it differs between runs).
idx_train, idx_val, idx_test = generate_train_val()

In [None]:
import time

def train_model(show_result = True):
    """Train the notebook-level `model` full-batch with early stopping on validation loss.

    Uses the notebook-level `model`, `optimizer`, `criterion`, `features`,
    `adj`, `labels`, `idx_train` and `idx_val`. Runs at most NUM_EPOCHS epochs
    and stops early once the best validation loss of the last EARLY_STOPPING
    epochs is worse than the best validation loss seen before them.

    Args:
        show_result: print per-epoch metrics when True.
    """
    val_loss = []
    for epoch in range(NUM_EPOCHS):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        output= model(features, adj)
        loss_train = criterion(output[idx_train], labels[idx_train])
        acc_train = cal_accuracy(output[idx_train], labels[idx_train])
        loss_train.backward()
        optimizer.step()

        # Validation only needs values, not gradients: skip building the autograd graph.
        model.eval()
        with torch.no_grad():
            output = model(features, adj)
            loss_val = criterion(output[idx_val], labels[idx_val])
        val_loss.append(loss_val.item())
        acc_val = cal_accuracy(output[idx_val], labels[idx_val])
        if show_result:
            print(  'Epoch: {:04d}'.format(epoch+1),
                    'loss_train: {:.4f}'.format(loss_train.item()),
                    'acc_train: {:.4f}'.format(acc_train),
                    'loss_val: {:.4f}'.format(loss_val.item()),
                    'acc_val: {:.4f}'.format(acc_val),
                    'time: {:.4f}s'.format(time.time() - t))
        
        # Stop when none of the last EARLY_STOPPING epochs beat the earlier best.
        if epoch > EARLY_STOPPING and np.min(val_loss[-EARLY_STOPPING:]) > np.min(val_loss[:-EARLY_STOPPING]) :
            if show_result:
                print("Early Stopping...")
            break

# Train the model created above, printing the metrics of every epoch.
train_model()

Epoch: 0001 loss_train: 2.9957 acc_train: 0.0433 loss_val: 2.9918 acc_val: 0.0486 time: 0.8628s
Epoch: 0002 loss_train: 2.9898 acc_train: 0.0544 loss_val: 3.0036 acc_val: 0.0574 time: 0.8527s
Epoch: 0003 loss_train: 2.9891 acc_train: 0.0630 loss_val: 2.9875 acc_val: 0.0530 time: 0.8524s
Epoch: 0004 loss_train: 2.9820 acc_train: 0.0583 loss_val: 2.9810 acc_val: 0.0813 time: 0.8523s
Epoch: 0005 loss_train: 2.9776 acc_train: 0.0968 loss_val: 2.9727 acc_val: 0.1042 time: 0.8525s
Epoch: 0006 loss_train: 2.9683 acc_train: 0.1246 loss_val: 2.9600 acc_val: 0.0707 time: 0.8521s
Epoch: 0007 loss_train: 2.9538 acc_train: 0.0788 loss_val: 2.9439 acc_val: 0.1051 time: 0.8531s
Epoch: 0008 loss_train: 2.9339 acc_train: 0.1110 loss_val: 2.9186 acc_val: 0.1387 time: 0.8525s
Epoch: 0009 loss_train: 2.9060 acc_train: 0.1395 loss_val: 2.8799 acc_val: 0.1263 time: 0.8520s
Epoch: 0010 loss_train: 2.8639 acc_train: 0.1370 loss_val: 2.8235 acc_val: 0.2217 time: 0.8523s
Epoch: 0011 loss_train: 2.8060 acc_train

## Evaluation

In [None]:
from sklearn.metrics import f1_score, accuracy_score
def test():
    model.eval()
    output = model(features, adj)
    predictions = torch.argmax(output[idx_test],-1).cpu().tolist()
    acc = accuracy_score(test_labels,predictions)
    f11 = f1_score(test_labels,predictions, average='macro')
    f12 = f1_score(test_labels,predictions, average = 'weighted')
    return acc, f11, f12

# Prints (accuracy, macro F1, weighted F1) on the test documents.
print(test())

# Test 10 times

In [None]:
# Per-run test metrics. They must be initialised here: without these lists the
# loop fails with NameError on a fresh kernel.
test_acc_list = []
test_f11_list = []
test_f12_list = []

for t in range(10):
    # Fresh model, optimizer and train/validation split for every run;
    # train_model() and test() read these notebook-level names.
    model = GCN(nfeat=node_size, nhid=HIDDEN_DIM, nclass=num_class, dropout=DROP_OUT,n_layers=NUM_LAYERS).to(device)
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    idx_train, idx_val, idx_test = generate_train_val()
    train_model(show_result=False)
    acc, f11, f12 = test()
    test_acc_list.append(acc)
    test_f11_list.append(f11)
    test_f12_list.append(f12)


# Report the mean of each metric over the runs.
print("Accuracy:",np.round(np.mean(test_acc_list),4))
print("Macro F1:",np.round(np.mean(test_f11_list),4))
print("Weighted F1:",np.round(np.mean(test_f12_list),4))