<a href="https://colab.research.google.com/github/veer064/NLU_Intent_Classifier/blob/master/Intent_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <center><b>NLU Intent Classifier</b></center>

NLU Intent Classifier is a machine learning model that classifies user queries into one of a fixed set of predefined intents by learning patterns in the query strings.

### Import module and packages

In [None]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

### Loading Data

In [None]:
def load_dataset(filename):
  """Load the intent dataset from a header-less, two-column CSV file.

  Parameters
  ----------
  filename : str or path-like
      CSV file read with latin1 encoding; its two columns are named
      "Sentence" and "Intent".

  Returns
  -------
  tuple
      (intent, unique_intent, sentences): the "Intent" column as a Series,
      the distinct intents as a list in order of first appearance, and the
      "Sentence" column as a list.
  """
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
  print(df.head())
  intent = df["Intent"]
  # list(set(...)) orders strings differently on every kernel start (hash
  # randomisation), so class labels could not be matched reliably against a
  # model reloaded from disk. First-appearance order is deterministic.
  unique_intent = intent.drop_duplicates().tolist()
  sentences = list(df["Sentence"])

  return (intent, unique_intent, sentences)

In [None]:
# Read the dataset into the three module-level collections used by every later cell.
# NOTE(review): the path points under /content/drive — presumably Google Drive must be
# mounted before this cell runs; confirm.
intent, unique_intent, sentences = load_dataset("/content/drive/My Drive/Colab Notebooks/Dataset.csv")

FileNotFoundError: ignored

#### Unique Intents

In [None]:
# Report how many distinct intents were found, then display the list itself.
print(f'No. of. Unique intents: {len(unique_intent)}\n')
unique_intent

#### Sample Sentences

In [None]:
# Display the first 11 raw sentences.
sentences[:11]

## Text Cleaning

#### Downloading 'punkt' and 'stopwords' from nltk

In [None]:
# Fetch the NLTK resources needed by the text-cleaning step.
nltk.download("stopwords")
nltk.download("punkt")
# Recent NLTK releases make word_tokenize load the "punkt_tab" resource instead of
# "punkt"; request it as well so tokenization works on both old and new versions.
nltk.download("punkt_tab")

#### Defining stemmer

In [None]:
# Lancaster stemmer instance.
# NOTE(review): no other visible cell references `stemmer` — confirm whether stemming is still intended.
stemmer = LancasterStemmer()

#### Cleaning

- Takes the sentences and cleans them by replacing every character that is not a letter (of either case), a digit or a space — i.e. punctuation and special characters — with a white space.

- Then performs tokenization (sentence to words) and converts all alphabetic characters to lower case.

- Returns the resulting list of words for each sentence in sentences.



In [None]:
def cleaning(sentences):
  """Normalise and tokenize every sentence.

  Each sentence has all characters other than ASCII letters, digits and
  spaces replaced by a space, is split with NLTK's word_tokenize, and has
  its tokens lower-cased. No stemming is applied.

  Returns a list with one list of lower-case tokens per input sentence.
  """
  non_alnum = re.compile(r'[^ a-z A-Z 0-9]')
  return [
    [token.lower() for token in word_tokenize(non_alnum.sub(" ", sentence))]
    for sentence in sentences
  ]

In [None]:
# Tokenize the whole corpus, then sanity-check the sentence count and the first five results.
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:5])  

## Encoding

### Input Encoding
- Using Tokenizer class of Keras to convert these words into indexes so that we can use them as input.


In [None]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'):
  """Build a Keras Tokenizer and fit its vocabulary on `words`.

  Parameters
  ----------
  words : list
      Texts (or lists of tokens) used to build the word index.
  filters : str
      Characters the Tokenizer strips from string inputs. The backslash is
      written as an explicit escape: the previous "\\]" spelling relied on an
      invalid escape sequence, which newer Python versions warn about. The
      resulting string is unchanged.

  Returns
  -------
  Tokenizer
      The fitted tokenizer.
  """
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [None]:
def max_length(words):
  """Return the number of items in the longest element of `words`."""
  return max(len(entry) for entry in words)

In [None]:
# Fit the input tokenizer; index 0 is reserved for padding, hence the +1.
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
# Length of the longest tokenized sentence. It is computed inline because later
# cells read the name `max_length` as an integer: assigning the result of the
# max_length() helper to that same name replaced the function, so re-running
# this cell failed with "'int' object is not callable".
max_length = len(max(cleaned_words, key = len))
print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

In [None]:
def encoding_doc(token, words):
  """Convert `words` to integer sequences using the fitted tokenizer `token`."""
  sequences = token.texts_to_sequences(words)
  return sequences

In [None]:
# Replace every token of every cleaned sentence by its vocabulary index.
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

### Padding

Using padding to make them of equal length so that they can be used in the model.

In [None]:
def padding_doc(encoded_doc, max_length):
  """Pad the integer sequences at the end ("post") to a common length of `max_length`."""
  padded = pad_sequences(encoded_doc, padding = "post", maxlen = max_length)
  return padded

In [None]:
# Pad every encoded sentence to the length of the longest one.
padded_doc = padding_doc(encoded_doc, max_length)

In [None]:
# Display the first five padded sequences.
padded_doc[:5]

In [None]:
# Print the shape of the padded input array.
print("Shape of padded docs = ",padded_doc.shape)

### Output Encoding

- As with the inputs, the outputs are encoded with the Tokenizer class, but with a different filter: our outputs (intents) contain '.' and '_', so with the default filters we would not get the expected outputs.

In [None]:
# Tokenizer for the intent labels: '.' and '_' are left out of the filter so they
# stay part of the label. The backslash is written as an explicit "\\" escape; the
# previous "\]" was an invalid escape sequence (same string, but newer Python
# versions warn about it).
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\\]^`{|}~')

In [None]:
# Display the label -> index mapping learned by the output tokenizer.
output_tokenizer.word_index

In [None]:
# Encode the intent of every row with the output tokenizer.
encoded_output = encoding_doc(output_tokenizer, intent)

In [None]:
# Reshape to a single column, i.e. (number of rows, 1), the 2-D layout passed to one_hot() below.
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [None]:
# Display the shape of the encoded label column.
encoded_output.shape

## One-Hot Encoding

After getting the indexes of the 21 intents, it's time for One-Hot Encoding, so that they can be used to train the model.

In [None]:
def one_hot(encode):
  """One-hot encode `encode` and return the result as a dense array.

  `encode` is passed straight to OneHotEncoder.fit_transform.
  """
  # scikit-learn renamed the `sparse` argument to `sparse_output` and later
  # removed the old name; try the current spelling first and fall back for
  # older installations.
  try:
    encoder = OneHotEncoder(sparse_output = False)
  except TypeError:
    encoder = OneHotEncoder(sparse = False)
  return encoder.fit_transform(encode)

In [None]:
# One-hot encode the intent index column to obtain the training targets.
output_one_hot = one_hot(encoded_output)

In [None]:
# Display the shape of the one-hot target matrix.
output_one_hot.shape

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. The split is seeded so that a
# fresh "Restart & Run All" reproduces the same train/validation partition.
train_X, val_X, train_Y, val_Y = train_test_split(
    padded_doc, output_one_hot, shuffle = True, test_size = 0.2, random_state = 42)


In [None]:
# Print the shapes of the training and validation inputs and targets.
print(f"Shape of train_X = {train_X.shape} and train_Y = {train_Y.shape}")
print(f"Shape of val_X = {val_X.shape} and val_Y = {val_Y.shape}")

## Model Building

In [None]:
def create_model(vocab_size, max_length, num_classes = 21):
  """Build the (uncompiled) bidirectional-LSTM intent classifier.

  Parameters
  ----------
  vocab_size : int
      Size of the input vocabulary (rows of the embedding matrix).
  max_length : int
      Length of the padded input sequences.
  num_classes : int, optional
      Number of output intents; defaults to 21, the value that was
      previously hard-coded in the output layer.

  Returns
  -------
  Sequential
      Embedding -> Bidirectional(LSTM) -> Dense(relu) -> Dropout -> Dense(softmax).
  """
  model = Sequential()
  # NOTE(review): the embedding is frozen (trainable = False) and no weights are
  # supplied here, so it keeps its initial values — confirm this is intended.
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(num_classes, activation = "softmax"))

  return model

In [None]:
# Build the network for the fitted vocabulary size and padded sequence length.
model = create_model(vocab_size, max_length)

# Categorical cross-entropy pairs with the one-hot targets and the softmax output layer.
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

In [None]:
# Checkpoint file: with save_best_only and mode='min' it is only written when val_loss improves.
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Train for 100 epochs, evaluating on the validation split after every epoch.
hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

In [None]:
 # Replace the in-memory model with the one stored in model.h5, the file the training checkpoint writes to.
 model = load_model("model.h5")

In [None]:
def predictions(text):
  """Return the model's class probabilities for a single query string.

  Parameters
  ----------
  text : str
      Raw user query.

  Returns
  -------
  numpy.ndarray
      Output of model.predict for the one padded input row, i.e. the softmax
      probabilities with one row for the query.
  """
  # Reuse the training-time cleaning so queries are normalised identically.
  test_word = cleaning([text])[0]
  print(test_word)

  # Encode the token list as ONE sequence, exactly like the training
  # sentences; tokens missing from the vocabulary are dropped by the tokenizer.
  test_ls = word_tokenizer.texts_to_sequences([test_word])

  x = padding_doc(test_ls, max_length)

  # predict_proba is no longer available on current Keras models; with a
  # softmax output layer predict() returns the same probabilities.
  pred = model.predict(x)

  return pred

In [None]:
def get_final_output(pred, classes):
  """Print every class with its confidence, highest confidence first.

  Parameters
  ----------
  pred : numpy.ndarray
      2-D array of scores; only the first row is used.
  classes : sequence
      Class labels, positionally aligned with the columns of `pred`.
  """
  scores = pred[0]
  # Indices that order the scores from largest to smallest.
  order = np.argsort(-scores)
  ranked_labels = np.array(classes)[order]
  ranked_scores = scores[order]

  for label, score in zip(ranked_labels, ranked_scores):
    print("%s has confidence = %s" % (label, score))

In [None]:
# Example query: predict its class probabilities and print the ranked intents.
text = "Can you help me?"
pred = predictions(text)
get_final_output(pred, unique_intent)