# <a name="Part4_6">4.6. LSTM with Glove Embedding Model</a>

## 1. Import and analyse the data set.

In [None]:
import pandas as pd # read data file, data processing
import numpy as np # linear algebra
import matplotlib.pyplot as plt # plotting graph for EDA , Metrics analysis
%matplotlib inline
import seaborn as sns # plotting graph for EDA , Metrics analysis

from sklearn.pipeline import Pipeline

In [None]:
pip install googletrans==4.0.0-rc1



In [None]:
import unicodedata  # Replace accented encoding characters 
from googletrans import Translator # translate given text to English text
import re # Text pre-processing


### Load the data 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
import os

py_file_location = "/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/"
sys.path.append(os.path.abspath(py_file_location))



In [None]:
# User defined file for doing input preprocess for Production inputs
from InputTransformer import InputTransformer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Input data files has been processed for 
# 1. carriage return characters like '_x000D_' and \n 
# 2. Accented encoding character like äº§å“æ‰€åœ¨ä»“åº“å‡ºé”™ã€ , è¿žæŽ¥åŽè‡ªåŠ¨æ–­å¼€ï¼Œæ
# 3. Translation of words in non english language especially German, Italian, French
# Above 3 steps are done separately and output from these steps are used for further processing in Part 2
# 4. Update of Assigment group - fewer data groups , grouped to Group others
# 5. Pre-process for having only English data after translation, removal of spaces 
# 6. Treatment of Null values
# Above step 4,5,6 are done in part2 and processed data is stored in input_data_trans_preprocess.csv

data_dir = "/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/"
data_file_name='input_data_trans_preprocess.csv'
#data_file_name='input_data.xlsx'
data_file_path = data_dir+data_file_name
data_file_path

'/content/drive/MyDrive/AIML/projects/Capstone-NLP-Ticketing/input_data_trans_preprocess.csv'

In [None]:
#df_data = pd.read_excel(data_file_path)
df_data = pd.read_csv(data_file_path)

In [None]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8467 entries, 0 to 8466
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Short description       8467 non-null   object
 1   Description             8467 non-null   object
 2   Caller                  8467 non-null   object
 3   Assignment group        8467 non-null   object
 4   orig_desc               8466 non-null   object
 5   orig_short_desc         8459 non-null   object
 6   Lang                    8467 non-null   object
 7   Translated_ShortDesc    8450 non-null   object
 8   Translated_Description  8467 non-null   object
 9   orig_assign_group       8467 non-null   object
dtypes: object(10)
memory usage: 661.6+ KB


**Feature with both description - Merging both Description and Short description**

In [None]:
df_data.head(5)

Unnamed: 0,Short description,Description,Caller,Assignment group,orig_desc,orig_short_desc,Lang,Translated_ShortDesc,Translated_Description,orig_assign_group
0,login issue,verified user details employee manager name ch...,spxjnwir pjlcoqds,GRP_0,-verified user details.(employee# & manager na...,login issue,en,login issue,-verified user details.(employee# & manager na...,GRP_0
1,outlook,received from hmjdrvpb komuaywn gmail com hell...,hmjdrvpb komuaywn,GRP_0,_x000D_\n_x000D_\nreceived from: hmjdrvpb.komu...,outlook,en,outlook,received from: hmjdrvpb.komuaywn@gmail.com...,GRP_0
2,cant log in to vpn,received from eylqgodm ybqkwiam gmail com hi i...,eylqgodm ybqkwiam,GRP_0,_x000D_\n_x000D_\nreceived from: eylqgodm.ybqk...,cant log in to vpn,en,cant log in to vpn,received from: eylqgodm.ybqkwiam@gmail.com...,GRP_0
3,unable to access hr tool page,unable to access hr tool page,xbkucsvz gcpydteq,GRP_0,unable to access hr_tool page,unable to access hr_tool page,en,unable to access hr_tool page,unable to access hr_tool page,GRP_0
4,skype error,skype error,owlgqjme qhcozdfx,GRP_0,skype error,skype error,no,skype error,skype error,GRP_0


In [None]:
# Import stop words list from NLTK
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # Import stop words
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Remove Stop words and do Lemmatization of features
def preprocess_vocab(df_column):
    stop_words=set(stopwords.words('english'))
    #stem=PorterStemmer()
    lem=WordNetLemmatizer()
    for tickets in df_column:
      words=[w for w in word_tokenize(tickets) if (w not in stop_words)]
      words=[lem.lemmatize(w) for w in words if len(w)>2]

    df_column_upd = df_column.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    
    return  df_column_upd

## Train a Glove Embedding - LSTM Model - Input Feature - Description + Short Description

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

def calc_metrics(actual,predicted):
  print('Accuracy score: ', round(accuracy_score(actual, predicted),2))
  print("precision_weighted:", round(precision_score(actual, predicted,average='weighted', zero_division=1),2))
  print("recall_weighted:", round(recall_score(actual, predicted,average='weighted', zero_division=1 ),2))
  print("f1_weighted:", round(f1_score(actual, predicted,average='weighted', zero_division=1 ),2))
  print("Classification Report:")
  print(classification_report(actual, predicted,zero_division=1))

In [None]:
# Create a new column by merging both description field and use this for model training and prediction
df_data['Desc_All'] = df_data['Short description'] + ' '+ df_data['Description']

In [None]:
feature_name = "Desc_All"
# Removal of stop words and Lemmatization ; Other preprocessing like remove whitespace character, lower, transalation all done in earlier steps of EDA
X = preprocess_vocab(df_data[feature_name])
# Convert Target variable to categorical value using label encoding
y = df_data['Assignment group'].values

### <span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"><a name="Part2_step3">3. Get characteristic of each ticket</a>

In [None]:
# Find max/min number of words in the Overall Description
Num_of_words=X.str.split().map(lambda x: len(x))

print("Max number of words in a Ticket", Num_of_words.max())
print("Min number of words in a Ticket", Num_of_words.min()) 
print("Mean number of words in a Ticket", round(Num_of_words.mean(),2)) 
print("Number of words in each Ticket", Num_of_words)

Max number of words in a Ticket 1264
Min number of words in a Ticket 0
Mean number of words in a Ticket 24.84
Number of words in each Ticket 0       24
1       20
2       13
3       10
4        4
        ..
8462    18
8463     6
8464    12
8465    12
8466    12
Name: Desc_All, Length: 8467, dtype: int64


In [None]:
# Total number of words / unique words in dataset 
all_words = ''
for words in X :
  all_words += ' '.join([word for word in words.split()])
print("Total Number of words in Tickets" , len(all_words.split()))
all_words_unique = set(all_words.split())
print("Vocabulary Size - Total Number of unique words in headlines" ,len(all_words_unique))

Total Number of words in Tickets 201824
Vocabulary Size - Total Number of unique words in headlines 20469


### <span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"><a name="Part2_step4">4. Define parameters</a>

In [None]:
# define parameters to be used in various stage of processing the data

# Training sequences for a max vocabulary size of 25,000.
max_features = 20000  #top_freq_words 

# Max Number of words to be used in each Ticket
max_words = 100

pad_type = 'post' 
trunc_type = 'post'

Split the Train, val and prod test data

In [None]:
from sklearn.model_selection import train_test_split

# Split data into Train, and Test - Test data would be used for testing the model 
X_train, X_prod, y_train, y_prod = train_test_split(X, y, test_size=0.05, random_state=0, stratify=y)
print('Prod Shape', X_prod.shape )
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, stratify=y)
print('Train shape', len(X_train))
print('Val shape', len(X_val))


Prod Shape (424,)
Train shape 6350
Val shape 2117


### <span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"><a name="Part2_step5">5. Get indices for words</a>

In [None]:
# tokenize sentences
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X_train)

In [None]:
# Vectorize the text
# Before training, we need to map strings to a numerical representation. 
# Create two lookup tables: one mapping words to numbers, and another for numbers to words.
word_idx = tokenizer.word_index #Last is the key
idx_word = tokenizer.index_word


In [None]:
# Number of words in the train dataset used for indexing
word_counts = tokenizer.word_counts
num_words = len(word_counts)

In [None]:
for i_word  in word_counts:
  print(i_word,word_counts[i_word] )
  break

unable 1139


In [None]:
num_words

12168

### <span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"><a name="Part2_step6">6. Create features and labels</a>

In [None]:
# get word index and vectors for all sentences
X_token_train = tokenizer.texts_to_sequences(X_train)
X_token_val =  tokenizer.texts_to_sequences(X_val)

In [None]:
print(X_train[0:5])
print(X_token_train[0:5])

7660              unable load outlook unable load outlook
3477    urgent help required crm mobile app loading cr...
3734    telephone defect gigaset extension battery las...
7278    connect vpn name pfzxecbo ptygkvzl language br...
5979                        login help hub login help hub
Name: Desc_All, dtype: object
[[15, 408, 27, 15, 408, 27], [221, 33, 205, 61, 188, 218, 522, 61, 188, 218, 329, 2280, 62, 498, 4556, 417], [162, 1296, 2950, 1184, 630, 339, 392, 373, 2047, 306, 7153, 418], [80, 52, 40, 1297, 1298, 136, 167, 97, 120, 174, 31, 1297, 1298, 8, 2, 85, 71, 162, 108, 80, 52], [29, 33, 345, 29, 33, 345]]


In [None]:
# Convert Target variable to categorical value using label encoding

from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical

le = preprocessing.LabelEncoder()
le.fit(y)
num_classes = len(le.classes_)
y_train_mdl_lbl_enc = le.transform(y_train)
y_train_mdl_cat = to_categorical(y_train_mdl_lbl_enc, num_classes)
y_val_mdl_lbl_enc = le.transform(y_val)
y_val_mdl_cat = to_categorical(y_val_mdl_lbl_enc, num_classes)


##### Features & Labels - print a sentence and its sequence and label

In [None]:
print(X_train[1])
print(y_train[1])
print(X_token_train[1])
print(y_train_mdl_cat[1])

outlook received hmjdrvpb komuaywn gmail com hello team meetings skype meetings etc appearing outlook calendar somebody please advise correct kind
GRP_0
[221, 33, 205, 61, 188, 218, 522, 61, 188, 218, 329, 2280, 62, 498, 4556, 417]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Features
X_train = pad_sequences(X_token_train, maxlen=max_words, padding = pad_type, truncating = trunc_type)
X_val = pad_sequences(X_token_val, maxlen=max_words, padding = pad_type, truncating = trunc_type)

In [None]:
print(X_train[1])

[ 221   33  205   61  188  218  522   61  188  218  329 2280   62  498
 4556  417    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [None]:
print("Input tensor shape")
print(X_train.shape)
print(y_train.shape)

Input tensor shape
(6350, 100)
(6350,)


In [None]:
print(X_val.shape)
print(y_val.shape)

(2117, 100)
(2117,)


In [None]:
y_train_mdl_cat.shape

(6350, 50)

### <span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"><a name="Part2_step7">7. Get vocabulary size</a>

In [None]:
# Vocabulary Size 
print("Vocabulary Size", len(word_counts))

Vocabulary Size 12168


In [None]:
vocab_size = np.max(np.hstack(X_train))
print(vocab_size, 'vocab size')

12168 vocab size


In [None]:
print("Few words in the vocabulary --> ", idx_word[1] , " " , idx_word[10000], " " , idx_word[vocab_size])


Few words in the vocabulary -->  job   srujan   pavan


### <span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"><a name="Part2_step8">8. Create a weight matrix using GloVe embeddings</a>

In [None]:
 # read all the words from the glove2vec database and create a index of words and corresponding word vector embedding
glove_file_name = 'DataSet - glove.6B.50d.txt'
glove_file_path = data_dir+glove_file_name

EMBEDDING_DIM = 50 
embeddings_index = {}
f = open(glove_file_path)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [None]:
# create corresponding word vector embedding
embedding_matrix = np.zeros((len(word_idx) + 1, EMBEDDING_DIM))
for word, i in word_idx.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    # words not found in embedding index will be all-zeros.
    embedding_matrix[i] = embedding_vector

In [None]:
len(word_idx)

12168

### <span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"><a name="Part2_step9">9. Define and compile a Bidirectional LSTM model using GLoVe Embedding</a>

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
embedding_layer = Embedding(input_dim=len(word_idx) + 1,
                                output_dim=EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=max_words,
                                trainable=True)

In [None]:
#Defining Neural Network
def LSTM_Model1():
  model = Sequential()
  #Embeddidng layer
  model.add(embedding_layer)
  #Bidirectional LSTM 
  model.add(Bidirectional(LSTM(units=128 , recurrent_dropout = 0.5 , dropout = 0.5)))
  # add the prediction layer
  model.add(Dense(num_classes, activation='softmax'))

  model.compile(optimizer=Adam(learning_rate = 0.01), loss='categorical_crossentropy', metrics=['acc'])

  model.summary()

  return model

In [None]:
modelLSTMGlove = LSTM_Model1()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           608450    
                                                                 
 bidirectional (Bidirectiona  (None, 256)              183296    
 l)                                                              
                                                                 
 dense (Dense)               (None, 50)                12850     
                                                                 
Total params: 804,596
Trainable params: 804,596
Non-trainable params: 0
_________________________________________________________________


### <span style="font-family: Arial; font-weight:bold;font-size:1em;color:#00b3e5;"><a name="Part2_step10">10. Fit the model and check the validation accuracy</a>

In [None]:
Batch_size=128
Epochs=20

# Use earlystopping
callback = EarlyStopping(monitor='val_acc', patience=2, min_delta=0.01)

In [None]:
#Model 1 fit & evaluate
history1 = modelLSTMGlove.fit(X_train, y_train_mdl_cat, batch_size = Batch_size, 
                    validation_data = (X_val,y_val_mdl_cat),
                    epochs = Epochs, callbacks=[callback])



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


In [None]:
# Evaluation of the model
scores_LSTM_Model1 = modelLSTMGlove.evaluate(X_val, y_val_mdl_cat, verbose=0)
accuracy_LSTM_Model1 = scores_LSTM_Model1[1]*100
print("Validation Accuracy: %.2f%%" % (accuracy_LSTM_Model1))

Validation Accuracy: 64.10%


In [None]:
predictions = modelLSTMGlove.predict(X_val)

In [None]:
# Convert output value from categorical to encoder value
len_val = len(y_val_mdl_cat)
actual = np.zeros(len_val)
pred = np.zeros(len_val)
for i in range(0, len_val):
  actual[i] = np.argmax(y_val_mdl_cat[i])
  pred[i] = np.argmax(predictions[i])

In [None]:
calc_metrics(actual,pred)

Accuracy score:  0.64
precision_weighted: 0.64
recall_weighted: 0.64
f1_weighted: 0.62
Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.87      0.84       994
         1.0       0.50      0.25      0.33         8
         2.0       0.38      0.49      0.42        35
         3.0       0.40      0.25      0.31         8
         4.0       0.63      0.58      0.60        64
         5.0       0.47      0.39      0.42        36
         6.0       0.73      0.27      0.39        30
         7.0       0.17      0.20      0.18        10
         8.0       0.40      0.67      0.50        21
         9.0       0.91      1.00      0.95        20
        10.0       0.50      0.18      0.27        22
        11.0       0.32      0.20      0.25        54
        12.0       0.51      0.42      0.46        60
        13.0       0.00      0.00      0.00         9
        14.0       1.00      0.00      0.00         7
        15.0       0.00  