In [1]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Uttam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('amazon_cells_labelled.txt',error_bad_lines=False,sep='\t',header=None,names=['text','review'])

In [3]:
df.head(10)

Unnamed: 0,text,review
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up...,0
6,If you have several dozen or several hundred c...,0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


In [4]:
df.shape

(1000, 2)

In [5]:
# Cleaning the texts
import re
from tqdm import tqdm
corpus = []
for i in tqdm(range(0, df.shape[0])):
    review = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:06<00:00, 148.98it/s]


In [6]:
corpus[:5]

['way plug us unless go convert',
 'good case excel valu',
 'great jawbon',
 'tie charger convers last minut major problem',
 'mic great']

In [7]:
corpus_sent_lenght = []
for i in range(len(corpus)):
    sen_length = len(corpus[i].split())
    corpus_sent_lenght.append(sen_length)  

In [8]:
corpus_sent_lenght[:5]

[6, 4, 2, 7, 2]

In [9]:
vocab_length  =sum(corpus_sent_lenght)
vocab_length

5180

* The text_to_sequences() method takes the corpus and converts it to sequences, i.e. each sentence becomes one vector. The elements of the vectors are the unique integers corresponding to each unique word in the vocabulary

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
t=Tokenizer()
t.fit_on_texts(corpus)
text_matrix=t.texts_to_sequences(corpus)

In [11]:
text_matrix[:5]

[[140, 55, 445, 317, 80, 644],
 [4, 14, 20, 197],
 [3, 318],
 [645, 35, 198, 56, 141, 446, 23],
 [251, 3]]

*  typically sentences are of different lengths. We should make them equal by zero padding. We have used a ‘post padding’ technique here, i.e. zeros will be added at the end of the vectors

In [12]:
max_length = 32

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
text_pad = pad_sequences(text_matrix, maxlen=max_length, padding='post')

In [14]:
text_pad[:5]

array([[140,  55, 445, 317,  80, 644,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0],
       [  4,  14,  20, 197,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0],
       [  3, 318,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0],
       [645,  35, 198,  56, 141, 446,  23,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0],
       [251,   3,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0]])

In [15]:
y= df['review']
y.head()

0    0
1    1
2    1
3    0
4    1
Name: review, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test ,y_train, y_test= train_test_split(text_pad,y ,train_size= 0.7, random_state = 42)

In [17]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(700, 32)
(300, 32)
(700,)
(300,)


In [18]:
from tensorflow.keras.layers import Embedding

##### * Input_dim: 

This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0-10, then the size of the vocabulary would be 11 words.

    Here we have input_dim = 5180+1

##### * output_dim: 

This is the size of the vector space in which words will be embedded. It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger. Test different values for your problem.

    Herewe have defined output_dim a= 32

##### * input_length: 

lenght of the maximum document. which is stored in max_length variable in our case.we have 32 

    Here we have defined as input_length = max_length = 32

In [19]:
from tensorflow.keras.layers import Dense, LSTM, Activation, Input
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers

In [20]:
vocab_length, max_length 

(5180, 32)

* we have used an Embedding layer followed by an LSTM layer. The embedding layer takes the 32-dimensional vectors, each of which corresponds to a sentence, and subsequently outputs (32,32) dimensional matrices i.e., it creates a 32-dimensional vector corresponding to each word. This embedding is also learnt during model training.

* Then we add an LSTM layer with 100 number of neurons. As it is a simple encoder-decoder model, we don’t want each hidden state of the encoder LSTM. We just want to have the last hidden state of the encoder LSTM and we can do it by setting ‘return_sequences’= False in the Keras LSTM function.But in Keras itself the default value of this parameters is False. So, no action is required.

* The output now becomes 100-dimensional vectors i.e. the hidden states of the LSTM are 100 dimensional. This is passed to a feedforward or Dense layer with ‘sigmoid’ activation. The model is trained using Adam optimizer with binary cross-entropy loss.

In [21]:
inputs1=Input(shape=(32,))
x1=Embedding(input_dim=vocab_length+1,output_dim=32,
             input_length=max_length,embeddings_regularizer= regularizers.l2(.001))(inputs1)
x1=LSTM(100,dropout=0.3,recurrent_dropout=0.2)(x1)
outputs1=Dense(1,activation='sigmoid')(x1)

model1=Model(inputs1,outputs1)

In [22]:
model1.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 32, 32)            165792    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 219,093
Trainable params: 219,093
Non-trainable params: 0
_________________________________________________________________


In [23]:
adam = optimizers.Adam(lr = 0.001)
model1.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])

In [24]:
X_train.shape ,y_train.shape

((700, 32), (700,))

In [25]:
model1.fit(x=X_train,y=y_train,batch_size=32,epochs=10,verbose=1)

Train on 700 samples
Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x25d619126c8>

### Building attention model 

* To implement this, we will use the default Layer class in Keras. We will define a class named Attention as a derived class of the Layer class.
* We need to define four functions as per the Keras custom layer generation rule. These are build(),call (), compute_output_shape() and get_config().

In [26]:
def build(self,input_shape):
        self.W=self.add_weight(name="att_weight",shape=(input_shape[-1],1),initializer="normal")
        self.b=self.add_weight(name="att_bias",shape=(input_shape[1],1),initializer="zeros")        
        super(attention, self).build(input_shape)

* Inside call (), we will write the main logic of Attention. We simply must create a Multi-Layer Perceptron (MLP). Therefore, we will take the dot product of weights and inputs followed by the addition of bias terms. After that, we apply a ‘tanh’ followed by a softmax layer. This softmax gives the alignment scores. Its dimension will be the number of hidden states in the LSTM, i.e., 32 in this case. Taking its dot product along with the hidden states will provide the context vector:

In [27]:
def call(self,x):
        et=K.squeeze(K.tanh(K.dot(x,self.W)+self.b),axis=-1)
        at=K.softmax(et)
        at=K.expand_dims(at,axis=-1)
        output=x*at
        return K.sum(output,axis=1)

In [28]:
from tensorflow.keras.layers import Layer
import tensorflow.keras.backend as K

In [29]:
class attention(Layer):
    def __init__(self,**kwargs):
        super(attention,self).__init__(**kwargs)

    def build(self,input_shape):
        self.W=self.add_weight(name="att_weight",shape=(input_shape[-1],1),initializer="normal")
        self.b=self.add_weight(name="att_bias",shape=(input_shape[1],1),initializer="zeros")        
        super(attention, self).build(input_shape)

    def call(self,x):
        et=K.squeeze(K.tanh(K.dot(x,self.W)+self.b),axis=-1)
        at=K.softmax(et)
        at=K.expand_dims(at,axis=-1)
        output=x*at
        return K.sum(output,axis=1)

    def compute_output_shape(self,input_shape):
        return (input_shape[0],input_shape[-1])

    def get_config(self):
        return super(attention,self).get_config()

In [30]:
inputs2=Input(shape=(32,))
x2=Embedding(input_dim=vocab_length+1,output_dim=32,
             input_length=max_length,embeddings_regularizer= regularizers.l2(.001))(inputs2)

att_in=LSTM(100,return_sequences=True,dropout=0.3,recurrent_dropout=0.2)(x2)
att_out=attention()(att_in)
outputs2=Dense(1,activation='sigmoid',trainable=True)(att_out)
model2=Model(inputs2,outputs2)
model2.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 32)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 32, 32)            165792    
_________________________________________________________________
lstm_1 (LSTM)                (None, 32, 100)           53200     
_________________________________________________________________
attention (attention)        (None, 100)               132       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 219,225
Trainable params: 219,225
Non-trainable params: 0
_________________________________________________________________


In [31]:
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model2.fit(x=X_train,y=y_train,batch_size=32,epochs=10,verbose=1,shuffle=True,validation_split=0.2)

Train on 560 samples, validate on 140 samples
Epoch 1/10


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x25e0f156ec8>

Resorce: https://www.analyticsvidhya.com/blog/2019/11/comprehensive-guide-attention-mechanism-deep-learning/