# **Document Classification using Hierarchical Attention Networks**

# Importing Libraries

In [None]:
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Embedding,Input,Bidirectional,TimeDistributed,Activation,Lambda,Multiply,Dropout
from keras.layers import LSTM,GRU
import keras.backend as K
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import re
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.utils.vis_utils import plot_model
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# IMDB Dataset

In [None]:
# Location of the tab-separated file holding the labelled training reviews.
data_path ='../input/nlp-project/labeledTrainData.tsv'
# Load it into a DataFrame; later cells read its `review` and `sentiment` columns.
data_train = pd.read_csv(data_path, sep='\t')

# Exploratory Data Analysis

In [None]:
# Display the shape of the loaded DataFrame as a quick sanity check.
data_train.shape

# Data Pre-processing

# Cleaning

In [None]:
def clean_str(string):
    """
    Normalise a raw review string before tokenization.

    Every backslash, apostrophe and double-quote character is removed,
    surrounding whitespace is trimmed and the result is lower-cased.
    """
    # One character class removes all three unwanted characters in a single pass.
    without_quotes = re.sub(r"[\\'\"]", "", string)
    return without_quotes.strip().lower()

# Tokenization

In [None]:
# BeautifulSoup is used below to strip HTML markup; it was never imported
# anywhere in the notebook, which made this cell fail with a NameError.
from bs4 import BeautifulSoup
from nltk import tokenize

reviews = []  # per review: list of sentence strings
labels = []   # per review: sentiment label, same order as `reviews`
texts = []    # per review: cleaned full text, used later to fit the tokenizer

# Walk both columns in lockstep rather than indexing by label, so the loop
# also works when the DataFrame index is not a plain 0..n-1 range.
for raw_review, sentiment in zip(data_train.review, data_train.sentiment):
    # Drop HTML markup first, then normalise quotes, backslashes and case.
    text = clean_str(BeautifulSoup(raw_review, 'lxml').get_text())
    texts.append(text)
    # Split the cleaned review into sentences for the hierarchical model.
    reviews.append(tokenize.sent_tokenize(text))
    labels.append(sentiment)

# Visualizing the pre-processed data

In [None]:
# Display the sentence list of one review (index 45) to eyeball the result of
# cleaning and sentence splitting.
reviews[45]

In [None]:
# Fixed sizes of the (review, sentence, word) input tensor and the split ratio.
MAX_SENT_LENGTH = 100  # Words kept per sentence; extra words are dropped, missing ones stay 0
MAX_SENTS = 15    # Sentences kept per document; extra sentences are dropped
MAX_NB_WORDS = 20000  # Vocabulary cap given to the Tokenizer and used as the Embedding input size
VALIDATION_SPLIT = 0.2  # Fraction of the samples held out for validation

In [None]:
# Fit the vocabulary on the cleaned full-text reviews.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# data[i, j, k] is the vocabulary id of the k-th kept word of sentence j of
# review i. Positions that receive no word keep the initial value 0.
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(reviews):
    # Only the first MAX_SENTS sentences fit into the tensor.
    for j, sent in enumerate(sentences[:MAX_SENTS]):
        wordTokens = text_to_word_sequence(sent)
        k = 0
        for word in wordTokens:
            if k >= MAX_SENT_LENGTH:
                # Sentence slot is full; the remaining words cannot be stored.
                break
            # `.get` avoids a KeyError if sentence-level tokenization yields a
            # token the tokenizer never indexed; such tokens are skipped, just
            # like words whose id is outside the MAX_NB_WORDS vocabulary cap.
            word_id = word_index.get(word)
            if word_id is not None and word_id < MAX_NB_WORDS:
                data[i, j, k] = word_id
                k += 1

print('Total %s unique tokens.' % len(word_index))

# One-hot encode the sentiment labels for the softmax output layer.
labels = to_categorical(np.asarray(labels))
print('Shape of data:', data.shape)
print('Shape of label tensor:', labels.shape)

# Training and Validation Sets

In [None]:
# Shuffle samples and labels with one shared permutation so pairs stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split at an explicit non-negative index. Slicing with
# `-nb_validation_samples` is wrong when that count is 0 (tiny datasets):
# `data[:-0]` is empty and `data[-0:]` is the whole array, which would put
# every sample into validation and none into training.
nb_train_samples = data.shape[0] - nb_validation_samples

x_train = data[:nb_train_samples]
y_train = labels[:nb_train_samples]
x_val = data[nb_train_samples:]
y_val = labels[nb_train_samples:]

In [None]:
# The labels are one-hot rows, so summing over axis 0 gives the number of
# samples per class; print it for the training split, then the validation split.
print('train and validation shapes in data')
for one_hot_labels in (y_train, y_val):
    print(one_hot_labels.sum(axis=0))

# Defining the Sentence and Word Embeddings

# Building a model

In [None]:
#---------Word Embedding-----------
# Hyper-parameters of the word-level encoder (words -> sentence vector).

hid_dim = 64  # Units of the GRU wrapped by the word-level Bidirectional layer
embed_size = 128 # Size of the embedding vector learned for each word
att_dim = 32  # Width of the tanh dense layer that feeds the word attention scores
drop =0.5 # Dropout rate. NOTE(review): not referenced by the model cells in this notebook - confirm whether it is still needed
out_size=2 # Number of units of the final softmax output layer (one per class)

#---------Sentence Embedding-----------
# Hyper-parameters of the sentence-level encoder (sentence vectors -> document vector).

hid_dim2 = 64  # Units of the GRU wrapped by the sentence-level Bidirectional layer
att_dim2 = 32  # Width of the tanh dense layer that feeds the sentence attention scores

# Defining the Model Architecture

In [None]:
# Word-level encoder: turns one sentence (MAX_SENT_LENGTH word ids) into a
# single attention-weighted sentence vector.
# NOTE: layers are built in a fixed order; auto-generated layer names depend on
# it, so keep the statement order when editing.
inputs =Input(shape=(MAX_SENT_LENGTH,))
emb= Embedding(MAX_NB_WORDS , embed_size)(inputs)  # Word id -> embed_size-dimensional vector

state = Bidirectional(GRU(hid_dim, return_sequences=True))(emb)  # Bidirectional GRU; return_sequences keeps one state per word position

#-------  Word Attention  ----

# Hidden representation of every word state (tanh dense layer applied per position).
u_it = TimeDistributed(Dense(att_dim, activation='tanh'),name='T1')(state)
# One unnormalised attention score per word position.
score = TimeDistributed(Dense(1),name='T2')(u_it)
# Drop the trailing size-1 axis: reshape to (batch, MAX_SENT_LENGTH).
score_ = Lambda(lambda x: K.reshape(x, (K.shape(x)[0], MAX_SENT_LENGTH)))(score)
# Normalise the scores across the words of the sentence.
alpha=Activation('softmax')(score_)
# Add a trailing axis back so the weights can be multiplied with `state`.
alpha_ = Lambda(lambda x: K.expand_dims(x))(alpha)
# Scale every word state by its attention weight ...
alphahs=Multiply(name='attention_mul')([alpha_,state])
# ... and sum over the word axis to obtain the sentence vector.
s = Lambda(lambda x: K.sum(x, axis=1))(alphahs)

# Reusable sub-model: word ids of one sentence -> sentence vector.
sent_Encoder = Model(inputs, s)

# Defining Model Parameters 

In [None]:
# Document-level model: applies the sentence encoder to each of the MAX_SENTS
# sentences, attends over the resulting sentence vectors and classifies.
# NOTE: the attention-visualisation cell reads `model.layers[6]` by position,
# so adding, removing or reordering layers here changes what that cell reads.
doc_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH))
# Run the word-level encoder on every sentence of the document.
encoded_sent = TimeDistributed(sent_Encoder)(doc_input)

# Bidirectional GRU over the sequence of sentence vectors; one state per sentence.
state2= Bidirectional(GRU(hid_dim2, return_sequences=True))(encoded_sent)

#-------  Sentence Attention  ----

# NOTE(review): the names 'T1', 'T2' and 'attention_mul' are also used inside
# `sent_Encoder`; confirm this causes no clash when saving/loading weights.
# Hidden representation of every sentence state (tanh dense layer applied per sentence).
u_it2 = TimeDistributed(Dense(att_dim2, activation='tanh'),name='T1')(state2)
# One unnormalised attention score per sentence.
score2 = TimeDistributed(Dense(1),name='T2')(u_it2)
# Drop the trailing size-1 axis: reshape to (batch, MAX_SENTS).
score_2 = Lambda(lambda x: K.reshape(x, (K.shape(x)[0], MAX_SENTS)))(score2)
# Normalise the scores across the sentences of the document.
alpha2=Activation('softmax')(score_2)
# Add a trailing axis back so the weights can be multiplied with `state2`.
alpha_2 = Lambda(lambda x: K.expand_dims(x))(alpha2)

# Scale every sentence state by its attention weight ...
alphahs2=Multiply(name='attention_mul')([alpha_2,state2])
# ... and sum over the sentence axis to obtain the document vector.
s2 = Lambda(lambda x: K.sum(x, axis=1))(alphahs2)

# Softmax classifier on top of the document vector.
preds = Dense(out_size, activation='softmax')(s2)
model = Model(doc_input, preds)

# Visualising the Model

In [None]:
# Print the layer-by-layer summary of the document-level model.
model.summary()

In [None]:
# Render the layer graph to model_plot.png, including tensor shapes and layer names.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

# Hyper-parameters for the Model Training

In [None]:
# Adam optimizer with a fixed learning rate of 1e-4.
# `decay` is intentionally not passed: 0.0 (no decay) is already the default
# on legacy optimizers, and newer Keras optimizers reject the legacy `decay`
# keyword outright, so omitting it keeps the behaviour and the compatibility.
adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

# Two-class softmax output with one-hot labels -> categorical cross-entropy.
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Training schedule passed to `model.fit`.
epochs = 2  # Number of full passes over the training set
batch_size = 32  # Number of samples per gradient update

In [None]:
# Train on the training split and evaluate on the held-out validation split
# after every epoch.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    shuffle=True,
    validation_data=(x_val, y_val),
)

# Attention Visualization

In [None]:
# Reverse lookup table (vocabulary id -> word) for turning encoded sentences
# back into text.
id_to_word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
# 0 is the fill value of the zero-initialised input tensor (no word at that
# position); render it as an empty string.
id_to_word[0] = ''

In [None]:
# Index of the validation sample to inspect.
num = 3

print('Meta data of the article')
# Rebuild the text of every sentence that has at least one non-zero word id;
# all-zero rows are empty sentence slots and are skipped.
L = [
    ' '.join([id_to_word[word_id] for word_id in sentence])
    for sentence in x_val[num]
    if np.sum(sentence) > 0
]
print(L)
print('Number of articles:', len(L))

# Attention Model Output

In [None]:
# attention weight
# Backend function mapping the document input to the output of layer 6.
# NOTE(review): the layer is addressed by position; with the architecture built
# in this notebook index 6 should be the sentence-level softmax - confirm with
# model.summary() whenever the model definition changes.
get_Attention = K.function([model.layers[0].input],
                           [model.layers[6].output])
# Feed a batch of one document and unwrap: first output, first batch element.
attention = get_Attention([x_val[num:num+1]])[0][0]
print(attention.shape)
# The original format string had no `{}` placeholder, so the computed index was
# silently discarded. +1 turns the 0-based arg-max into a 1-based position.
print('attention weight Maximum Index Articles {}'.format(np.argmax(attention)+1))

plt.plot(attention)
plt.show()

In [None]:
# Sanity check: the attention weights should sum to 1
np.sum(attention)

In [None]:
# Attention weight of each sentence, drawn as a one-row heat map.
# One label per non-empty sentence of the inspected document.
xtick = ['sentence{}'.format(i) for i,_ in enumerate(L)]
plt.figure(figsize=(5, 1))
print(xtick)
# np.newaxis turns the 1-D weight vector into the 2-D grid pcolormesh needs.
# (The closing parenthesis was missing here, which made the cell a SyntaxError.)
plt.pcolormesh(attention[np.newaxis,:])
# Alternative rendering; needs seaborn imported as `sns`, which this notebook does not do:
#sns.heatmap(attention[np.newaxis,:],cmap="Blues",xticklabels=xtick,yticklabels=False)
plt.show()