# **Sentence Classification using Hierarchical Attention Networks**

# Importing Libraries

In [2]:
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Embedding,Input,Bidirectional,TimeDistributed,Activation,Lambda,Multiply,Dropout
from keras.layers import LSTM,GRU
import keras.backend as K
from tensorflow.keras.optimizers import Adam
from keras.datasets import imdb
import numpy as np
import matplotlib.pyplot as plt
from keras.utils.vis_utils import plot_model
import os
# Select the GPU used for training. GPU "3" is only a fallback: setdefault keeps
# any CUDA_VISIBLE_DEVICES value already exported by the launcher/shell, so the
# notebook does not silently override it on machines with a different GPU layout.
# NOTE(review): this runs after the keras/tensorflow imports above; confirm that
# no CUDA context has been created yet at this point.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "3")

# Hyper-parameters for the Model Training

In [3]:
# Data / batching settings shared by the cells below
max_features = 20000  # vocabulary cap: passed as num_words to imdb.load_data and as the Embedding input size
maxlen = 80           # every review is padded or truncated to this many tokens
batch_size = 32       # number of samples per gradient update

# IMDB Dataset (Already cleaned)
# This dataset contains 50,000 IMDB movie reviews (25,000 for training and 25,000 for testing), each labeled by sentiment (positive/negative). The reviews have been preprocessed, and each review is encoded as a list of word indexes (integers).

In [4]:
# Load the IMDB reviews through Keras, restricting the vocabulary to
# `max_features` word indexes, then report how many reviews each split holds.
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
for _split, _label in ((x_train, 'train sequences'), (x_test, 'test sequences')):
    print(len(_split), _label)

# Data Preprocessing: Padding Sequences

In [5]:
# Force every review to exactly `maxlen` tokens so each split becomes a
# (samples, maxlen) integer matrix. No padding/truncating arguments are passed,
# so the pad_sequences defaults apply.
print('Pad sequences (samples x time)')
x_train, x_test = (
    sequence.pad_sequences(reviews, maxlen=maxlen)
    for reviews in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Defining Model Parameters 

In [6]:
hid_dim = 64      # GRU units per direction
embed_size = 128  # dimensionality of each word embedding
att_dim = 32      # width of the dense layer used to compute the attention scores
drop = 0.5        # dropout rate applied before the classifier
out_size = 1      # number of output units (one sigmoid unit)

# Defining the Model Architecture

In [7]:
# Token ids -> dense word vectors.
inputs = Input(shape=(maxlen,))
emb = Embedding(input_dim=max_features, output_dim=embed_size)(inputs)

# Bidirectional GRU that returns one hidden state per time step.
state = Bidirectional(GRU(units=hid_dim, return_sequences=True))(emb)

# ---- Word attention ----
# NOTE(review): the Embedding is built without mask_zero and the softmax below
# runs over all `maxlen` positions, so padded positions also receive attention
# weight -- confirm this is acceptable.

# Per-step hidden representation, then one unnormalised score per step.
u_it = TimeDistributed(Dense(units=att_dim, activation='tanh'), name='T1')(state)
score = TimeDistributed(Dense(units=1), name='T2')(u_it)

# Flatten the scores to (batch, maxlen) and normalise them over the time axis.
score_ = Lambda(lambda t: K.reshape(t, (-1, maxlen)))(score)
alpha = Activation('softmax')(score_)

# Restore a trailing axis so the weights broadcast against the hidden states.
alpha_ = Lambda(lambda t: K.expand_dims(t, axis=-1))(alpha)

# Attention-weighted sum of the hidden states over the time axis.
alphahs = Multiply(name='attention_mul')([alpha_, state])
s = Lambda(lambda t: K.sum(t, axis=1))(alphahs)

# Classifier head: dropout followed by a sigmoid output layer.
dropout = Dropout(rate=drop)(s)
output = Dense(out_size, activation='sigmoid')(dropout)

model = Model(inputs=inputs, outputs=output)

# Visualising the Model

In [8]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [9]:
# Render the layer graph to model_plot.png, annotated with layer names and shapes.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

# Hyper-parameters for the Model Training

In [10]:
# Adam optimizer with a small step size. `learning_rate` replaces the deprecated
# `lr` alias, and the legacy `decay` argument (it was 0.0, i.e. disabled) is
# dropped because newer tf.keras optimizers no longer accept it.
adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

# The model ends in a sigmoid unit, so train with binary cross-entropy and
# report accuracy.
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [11]:
epochs = 2       # number of passes over the training data
batch_size = 32  # number of samples per gradient update

In [12]:
# Train the model. The test split is passed as validation data, so the
# validation metrics are computed on x_test / y_test.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    shuffle=True,
    validation_data=(x_test, y_test),
)

In [13]:
# Offset added to every index from imdb.get_word_index(), leaving the low ids
# free for the special tokens assigned below.
INDEX_FROM = 3

# word -> id, shifted by INDEX_FROM, plus the special tokens.
word_to_id = {
    word: rank + INDEX_FROM
    for word, rank in imdb.get_word_index().items()
}
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Inverse lookup (id -> word) used to decode reviews back into text.
id_to_word = {token_id: word for word, token_id in word_to_id.items()}

# Attention Visualization

In [14]:
# Index of the test review to inspect.
num = 12

# Decode the padded id sequence back into tokens.
orig_sen = [id_to_word[token_id] for token_id in x_test[num]]

# Show the decoded review followed by its ground-truth label, one item per line.
print('Meta data of article', ' '.join(orig_sen), 'Correct label', y_test[num], sep='\n')

In [15]:
# Attention weights for the selected review.
# The weights are the output of the model's softmax Activation layer. Look the
# layer up by type instead of a hard-coded position so this cell keeps working
# if layers are added or removed before it.
attention_layer = next(layer for layer in model.layers if isinstance(layer, Activation))
get_Attention = K.function([model.layers[0].input],
                           [attention_layer.output])

# Feed a batch of one review: the first [0] picks the single requested output,
# the second [0] picks the single sample in the batch.
attention = get_Attention([x_test[num:num+1]])[0][0]
print('attention shape:', attention.shape)

# One weight per token position.
plt.plot(attention)
plt.xlabel('token position')
plt.ylabel('attention weight')
plt.show()

In [16]:
# Sanity check: the attention weights come from a softmax, so they should sum to 1
np.sum(attention)

In [33]:
# Raw values, for reference alongside the figure.
print(orig_sen)
print(attention[np.newaxis,:])

# Draw the weights as a vertical strip, one row per token, so the tall (1 x 20)
# figure matches the data layout and every weight is labelled with its word.
fig, ax = plt.subplots(figsize=(1, 20))
ax.pcolormesh(attention[:, np.newaxis], cmap='Blues')
ax.set_yticks(np.arange(len(orig_sen)) + 0.5)  # centre each label on its cell
ax.set_yticklabels(orig_sen)
ax.invert_yaxis()  # first token at the top, reading downwards
ax.set_xticks([])  # the single column carries no x information
plt.show()

In [1]:
# Report the installed TensorFlow version. The package does not install a
# `tensorflow` shell command, so the version is read from Python instead.
import tensorflow as tf
print(tf.__version__)