In [1]:
import numpy as np
import pandas as pd
import nltk

from sklearn.preprocessing import OneHotEncoder as oneHot
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.layers import BatchNormalization, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy as cce
from tensorflow.keras.activations import relu, softmax
from tensorflow.keras.initializers import he_uniform, glorot_uniform
from tensorflow.keras.metrics import AUC
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2#NORMALIZATION
from sklearn.metrics import classification_report

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
#Read the train and test datasets with column names as target and text
train= pd.read_csv('/content/bank_train.csv',
                       names= ["text", "target"])

test= pd.read_csv('/content/bank_test.csv',
                       names= ["text", "target"])

In [3]:
train.shape

(79, 2)

In [4]:
test.shape

(21, 2)

In [5]:
train.head(10)

Unnamed: 0,text,target
0,I APPLIED FOR A CREDIT CARD LAST MONTH BUT I D...,0
1,I OPENED A NEW ACCOUNT IN YOUR BANK BUT WHEN I...,1
2,I APPLIED FOR DEBIT OR CREDIT CARD WHEN I OPEN...,0
3,TILL NOW I DID NOT GET ANY DEBIT OR CREDIT CAR...,0
4,I GAVE AN APPLICATION FOR LINKING UNIQUE NUMBE...,1
5,I WANT TO CHANGE THE MOBILE NUMBER THAT WAS PR...,1
6,I LOST MY DEBIT OR CREDIT CARD YESTERDAY . SO ...,0
7,PLEASE BLOCK MY DEBIT OR CREDIT CARD BECAUSE I...,0
8,I GAVE AN APPLICATION FOR LINKING MY MOBILE NU...,1
9,I GAVE AN APPLICATION FOR LINKING MY MOBILE NU...,1


In [6]:
test.head()

Unnamed: 0,text,target
0,DURING WITHDRAWING MONEY FROM ACCOUNT USING DE...,0
1,THE ADDRESS MENTIONED ON MY ACCOUNT IS NOT COR...,1
2,THE PIN YOU ENTERED IS WRONG MESSAGE WAS SHOWN...,0
3,I WANT TO CHANGE MY DEBIT CARD PIN BUT NOT ABL...,0
4,I AM NOT ABLE TO CHANGE MY DEBIT CARD SECURITY...,0


In [7]:
train['target'].value_counts()

2    29
1    26
0    24
Name: target, dtype: int64

# Preprocessing of data

In [8]:
# one hot encoding
encode_target= oneHot().fit(np.array(train.target).reshape(-1,1))

In [9]:
train_target_encoded= encode_target.transform(np.array(train.target).reshape(-1,1)).toarray()
test_target_encoded= encode_target.transform(np.array(test.target).reshape(-1,1)).toarray()

In [10]:
train_target_encoded

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0

In [11]:
test_target_encoded

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [12]:
#convert into lowercase
train["text"]= train.text.map(lambda l: l.lower())
test["text"]= test.text.map(lambda l: l.lower())

In [13]:
train.head()

Unnamed: 0,text,target
0,i applied for a credit card last month but i d...,0
1,i opened a new account in your bank but when i...,1
2,i applied for debit or credit card when i open...,0
3,till now i did not get any debit or credit car...,0
4,i gave an application for linking unique numbe...,1


In [14]:
test.head()

Unnamed: 0,text,target
0,during withdrawing money from account using de...,0
1,the address mentioned on my account is not cor...,1
2,the pin you entered is wrong message was shown...,0
3,i want to change my debit card pin but not abl...,0
4,i am not able to change my debit card security...,0


In [15]:
#tokenize the words
train["text"]= train.text.map(word_tokenize)
test["text"]= test.text.map(word_tokenize)

In [16]:
train["text"]

0     [i, applied, for, a, credit, card, last, month...
1     [i, opened, a, new, account, in, your, bank, b...
2     [i, applied, for, debit, or, credit, card, whe...
3     [till, now, i, did, not, get, any, debit, or, ...
4     [i, gave, an, application, for, linking, uniqu...
                            ...                        
74    [i, want, to, transfer, money, from, my, accou...
75    [i, want, to, transfer, money, from, my, accou...
76    [i, made, a, transaction, with, my, debit, car...
77    [i, made, a, transaction, on, irctc, using, my...
78    [i, applied, for, emi, on, an, online, shoppin...
Name: text, Length: 79, dtype: object

In [17]:
#Remove stopwords
def clean_data_rm_stop(strings, stop_list):
    sw= [str for str in strings if str not in stop_list]
    return sw

stop_words= stopwords.words("english")
rm_punc_stop= list(set(punctuation))+ stop_words #Remove punctuation and stop words

train["text"]= train.text.map(lambda dataframe: clean_data_rm_stop(dataframe, rm_punc_stop))
test["text"]= test.text.map(lambda dataframe: clean_data_rm_stop(dataframe, rm_punc_stop))

In [18]:
#Stemming and Normalization
def normalize(text):
    return " ".join(text)

#We use PorterStemmer function from nltk.stem library.
stem_func= PorterStemmer()

train["text"]= train.text.map(lambda s: [stem_func.stem(x) for x in s])
train["text"]= train.text.apply(normalize)

test["text"]= test.text.map(lambda s: [stem_func.stem(x) for x in s])
test["text"]= test.text.apply(normalize)

In [19]:
train.head()

Unnamed: 0,text,target
0,appli credit card last month get one till alth...,0
1,open new account bank got passbook found middl...,1
2,appli debit credit card open account bank last...,0
3,till get debit credit card appli last month bank,0
4,gave applic link uniqu number bank account las...,1


In [20]:
#convert text into sequence
num_words=10000
text_tokenizer= Tokenizer(num_words)
text_tokenizer.fit_on_texts(train.text) #fit_on_texts - creates the vocabulary index based on word frequency.

tokenized_train_data= text_tokenizer.texts_to_sequences(train.text) #Converting texts to sequences
tokenized_test_data= text_tokenizer.texts_to_sequences(test.text)

In [21]:
tokenized_train_data #output is related to the index of the corpus

[[8, 15, 2, 12, 18, 24, 39, 9, 68, 131, 132, 52, 8, 15, 2],
 [53, 31, 1, 11, 40, 54, 41, 133, 42, 69, 70],
 [8, 4, 15, 2, 53, 1, 11, 12, 18, 24, 9],
 [9, 24, 4, 15, 2, 8, 12, 18, 11],
 [16, 10, 17, 25, 7, 11, 1, 12, 18, 9, 55],
 [14, 56, 19, 7, 134, 17, 1, 14, 87, 31, 7],
 [88, 4, 15, 2, 71, 14, 89, 4, 15, 2, 72, 29, 90],
 [73, 89, 4, 15, 2, 88, 4, 15, 2, 71, 72, 29, 90],
 [16, 10, 17, 19, 7, 1, 12, 57, 9, 24, 29, 74],
 [16, 10, 17, 19, 7, 1, 12, 18, 24, 135, 72, 1, 75, 7],
 [3, 30, 43, 91, 92, 32],
 [3, 39, 32, 43, 93, 94, 32],
 [95, 33, 1, 26, 96, 20, 21, 3],
 [136, 20, 1, 137, 27, 138, 97, 76],
 [6, 77, 1],
 [139, 77, 6, 1, 44, 21, 3, 5],
 [11, 1, 30, 140, 77, 76, 5],
 [141, 6, 34, 142, 6, 143, 78, 76, 144, 1],
 [145, 74, 3, 20],
 [6, 146, 98, 147, 148, 99, 3, 20],
 [8, 31, 1, 11, 40, 54, 41, 42, 79, 69, 70],
 [35, 22, 5, 4, 15, 2, 45, 58, 59, 2, 100],
 [5, 4, 2, 46, 5, 20, 45, 58, 59, 2, 100],
 [8, 31, 1, 11, 40, 54, 41, 149, 150, 69, 70, 54],
 [21, 151, 5, 3, 44],
 [14,
  8,
  101

In [22]:
#pad the sequence
train_data= pad_sequences(tokenized_train_data, maxlen= 20, padding= "pre")
test_data= pad_sequences(tokenized_test_data, maxlen= 20, padding= "pre")

In [23]:
train_data

array([[  0,   0,   0, ...,   8,  15,   2],
       [  0,   0,   0, ...,  42,  69,  70],
       [  0,   0,   0, ...,  18,  24,   9],
       ...,
       [  0,   0,   0, ...,   6,  23,   1],
       [  0,   0,   0, ...,   6, 198,   1],
       [  0,   0,   0, ...,  59,  38,  51]], dtype=int32)

In [24]:
train_data.shape

(79, 20)

In [25]:
#Let's build a 3 dim array. The dimensions are samples, steps and unique words.
def transform_matrix(data, tokenizer):
    output_shape_mat= [data.shape[0],
                  data.shape[1],
                  tokenizer.word_index.keys().__len__()]
    results_data= np.zeros(output_shape_mat)

    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            results_data[i, j, data[i,j]-1]= 1

    return results_data

trans_matrix_train= transform_matrix(train_data, text_tokenizer)
trans_matrix_test= transform_matrix(test_data, text_tokenizer)

In [26]:
trans_matrix_train.shape

(79, 20, 199)

In [27]:
trans_matrix_test.shape

(21, 20, 199)

# Build the LSTM Model

In [28]:
class lstm_model_class(object):
    def build_lstm_model(self,input_dimensions, op_shape, num_steps, dropout_rate, kernel_reg, bias_reg):
      ip_layer= Input(shape= (num_steps, input_dimensions)) #Define embedded layer with shape as number of steps and input dimensions. Note that both these are input variables to the model.

      lstm_model= LSTM(units= num_steps)(ip_layer) #Make the LSTM layer with number of steps as memory units
      dense_layer_1= Dense(op_shape, kernel_initializer= he_uniform(), #he_uniform draws samples in uniform distribution with -inf to +inf as range.
                   bias_initializer= "zeros",
                   kernel_regularizer= l2(l= kernel_reg),
                   bias_regularizer= l2(l= bias_reg))(lstm_model) # Create the  Dense layer which is the regular deeply connected layer
      int_layer= BatchNormalization()(dense_layer_1) #Normalize and scale activations of the dense layer with BatchNormalization function
      int_layer= relu(int_layer) #This applies the rectified linear unit activation function
      int_layer= Dropout(rate= dropout_rate)(int_layer) #Dropout is used to define Dropout layer that sets input units to 0 with a frequency. Here it is dropout_rate
      output_1= Dense(op_shape, kernel_initializer= glorot_uniform(), #glorot_uniform draws samples in uniform distribution with stddev = sqrt(2 / (fan_in + fan_out)) fan_in is num of units in weight tensor and fan_out is num of output units
             bias_initializer= "zeros",
             kernel_regularizer= l2(l= kernel_reg),
             bias_regularizer= l2(l= bias_reg))(dense_layer_1) # Create another dense layer which is the output of the model.
      output_1= BatchNormalization()(output_1) #Normalize and scale activations of the dense layer with BatchNormalization function
      final_output= softmax(output_1, axis= 1)

      loss_func= cce() # Since it is a multi-class classification problem, categorical crossentropy(cce) is used as the loss function
      perf_metrics= AUC() #our performance metric will be area under the curve
      optimizer= Adam() #we shall use Adam optimizer as our optimizer
      self.final_model= Model(inputs= [ip_layer], outputs= [final_output]) #Build the model with input and output layers
      self.final_model.compile(optimizer= optimizer, loss= loss_func, metrics= [perf_metrics]) #Compiling the keras model

    def train_lstm_model(self,x, y, valid_split, ep):
      self.final_model.fit(x, y, validation_split= valid_split, epochs= ep) #Create the train model


    def predict_lstm_model(self,x):
      return self.final_model.predict(x)    #Create the predict model


In [29]:
steps= trans_matrix_train.shape[1] #Define the number of steps is usually the number of steps in the train data.
input_dim= trans_matrix_train.shape[2] #Input dimension. Number of unique words in the train data
output_shape= train_target_encoded.shape[1] #Output shape. Usually the same number as the number of classes in the target variable. Here we have 8.
final_model= lstm_model_class()
final_model.build_lstm_model(input_dimensions= input_dim,
                  op_shape= output_shape,
                  num_steps= steps,
                  dropout_rate= 0.5, # Meaning 1 in 2 inputs will be randomly executed.
                  bias_reg= 0.3, # Reduce the bias in the model
                  kernel_reg= 0.3) #Reduce the weights excluding bias.



In [30]:
final_model.train_lstm_model(trans_matrix_train, train_target_encoded,
           0.2, 60)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [31]:
pred_train= encode_target.inverse_transform(final_model.predict_lstm_model(trans_matrix_train))
print(classification_report(train.target, pred_train))

              precision    recall  f1-score   support

           0       1.00      0.54      0.70        24
           1       0.70      1.00      0.83        26
           2       1.00      1.00      1.00        29

    accuracy                           0.86        79
   macro avg       0.90      0.85      0.84        79
weighted avg       0.90      0.86      0.85        79



In [32]:
pred_test= encode_target.inverse_transform(final_model.predict_lstm_model(trans_matrix_test))
print(classification_report(test.target, pred_test))

              precision    recall  f1-score   support

           0       1.00      0.33      0.50         9
           1       0.50      1.00      0.67         5
           2       0.88      1.00      0.93         7

    accuracy                           0.71        21
   macro avg       0.79      0.78      0.70        21
weighted avg       0.84      0.71      0.68        21

