# Natural language processing Chatbot application using NLTK for text classification


In this NLP AI application, I built the core conversational engine for a chatbot. I used the popular NLTK text classification library to achieve this.

In [3]:
# import the necessary packages
import re
import os
import csv
import nltk
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.classify import SklearnClassifier
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
import pandas as pd

In [4]:
## Get multiple outputs in the same cell: with "all", the value of every
## top-level expression in a cell is displayed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Ignore all warnings
## NOTE(review): the first filter below already ignores every warning category,
## so the DeprecationWarning-specific filter looks redundant -- confirm before
## removing. Blanket suppression can also hide real problems in nltk/numpy.
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [5]:
## Display all rows and columns of a dataframe instead of a truncated version
## (None removes pandas' limit on how many columns / rows are rendered).
## NOTE(review): `display` is imported here but no call to it is visible in
## this notebook -- confirm whether it is still needed.
from IPython.display import display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Preprocess

In [14]:
# Shared text-normalization helpers, created once at notebook level.
# NOTE: preprocess() builds its own local RegexpTokenizer with the same
# pattern, so this module-level `tokenizer` is not read by any function
# visible in this notebook.
tokenizer = RegexpTokenizer(r'\w+')
# extract_feature() applies the stemmer first and then lemmatizes each stem.
lmtzr = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

In [11]:
def preprocess(sentence):
    """Lower-case, tokenize and remove English stop words from a sentence.

    Args:
        sentence: Raw input text (str).

    Returns:
        list: The lower-cased word tokens, in their original order, that are
        not in NLTK's English stop-word list.
    """
    # Tokenize on runs of word characters so punctuation is discarded.
    word_tokenizer = RegexpTokenizer(r'\w+')
    tokens = word_tokenizer.tokenize(sentence.lower())
    # Fetch the stop-word list once per call and keep it in a set. The
    # previous version called stopwords.words('english') inside the
    # comprehension, repeating that work (plus a linear list scan) per token.
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

In [12]:
def extract_tagged(sentences):
    """Keep only the words whose part-of-speech tag is on the allow-list.

    Args:
        sentences: Iterable of ``(word, tag)`` pairs, such as the output of
            ``nltk.pos_tag``.

    Returns:
        list: The words, in input order, tagged as one of NN, VBN, NNS, VBP,
        RB, VBZ, VBG, PRP or JJ.
    """
    wanted_tags = ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ')
    return [word for word, tag in sentences if tag in wanted_tags]

## Putting it all together

In [15]:
def extract_feature(text):
    """Turn raw text into a list of normalized feature words.

    Steps: preprocess() the text, POS-tag the remaining tokens, keep the
    words selected by extract_tagged(), then stem each word with the Snowball
    stemmer and lemmatize the resulting stem with the WordNet lemmatizer.

    Args:
        text: Raw input sentence.

    Returns:
        list: One normalized token per retained word, in input order.
    """
    tagged_words = nltk.pos_tag(preprocess(text))
    normalized = []
    for word in extract_tagged(tagged_words):
        # Stem first, then lemmatize the stem.
        normalized.append(lmtzr.lemmatize(stemmer.stem(word)))
    return normalized

## Implementing bag of words

In simple terms, a bag of words is a collection of words that represents a sentence, disregarding the order in which the words appear.

In [16]:
def word_feats(words):
    """Build a bag-of-words feature dict that maps every given word to True."""
    return {word: True for word in words}

## Parsing the whole document

In [17]:
def extract_feature_from_doc(data):
    """Build classifier samples, a flat corpus and the reply table from rows.

    Args:
        data: Iterable of ``(text, category, answer)`` triples.

    Returns:
        tuple: ``(result, corpus, answers)`` where

        * ``result`` is a list of ``(feature_dict, category)`` pairs,
        * ``corpus`` is a flat list of every feature word, in row order,
        * ``answers`` maps each category to the chat bot's response; when a
          category occurs on several rows, the last row's answer is kept.
    """
    result = []
    corpus = []
    # The responses of the chat bot
    answers = {}
    for text, category, answer in data:
        features = extract_feature(text)
        # Grow the flat corpus in place. Collecting per-row lists and joining
        # them with sum(corpus, []) re-copies the accumulated list for every
        # row, which is quadratic in the corpus size.
        corpus.extend(features)
        result.append((word_feats(features), category))
        answers[category] = answer

    return (result, corpus, answers)

In [18]:
def get_content(filename):
    """Read the pipe-delimited training file and return its well-formed rows.

    Args:
        filename: Path of a text file whose records are separated by ``|``.

    Returns:
        list: One list of three strings per record. Rows with any other
        number of fields (including blank lines) are skipped.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires for file objects passed to csv.reader; without
    # it, newlines embedded in quoted fields are altered.
    with open(filename, 'r', newline='') as content_file:
        rows = csv.reader(content_file, delimiter='|')
        return [row for row in rows if len(row) == 3]

In [19]:
# Training file, given as a relative path; get_content() parses it as
# '|'-delimited rows and keeps only the rows with exactly three fields.
filename = 'leaves.txt'
data = get_content(filename)

In [20]:
data[:5]

[['Hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi hello',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi ',
  'Greetings',
  'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi', 'Greetings', 'Hello. I am Dexter. I will serve your leave enquiries.'],
 ['hi', 'Greetings', 'Hello. I am Dexter. I will serve your leave enquiries.']]

In [21]:
features_data, corpus, answers = extract_feature_from_doc(data)

In [22]:
print(features_data[50])

({'mani': True, 'option': True, 'leav': True}, 'Utilized-Optional-Leaves')


In [25]:
corpus[:5]

['hello', 'hi', 'hello', 'hi', 'hi']

In [26]:
answers

{'Greetings': 'Hello. I am Dexter. I will serve your leave enquiries.',
 'Morning': 'Good Morning. I am Dexter. I will serve your leave enquiries.',
 'Afternoon': 'Good afternoon. I am Dexter. I will serve your leave enquiries.',
 'Evening': 'Good evening. I am Dexter. I will serve your leave enquiries.',
 'Goodbye': 'Good night. Take care.',
 'Opening': "I'm fine! Thank you. How can I help you?",
 'Help': 'How can I help you?',
 'No-Help': 'Ok sir/madam. No problem. Have a nice day.',
 'Closing': "It's glad to know that I have been helpful. Have a good day!",
 'Leaves-Type': 'Currently I know about two: annual and optional leaves.',
 'Default-Utilized-Annual-Leaves': 'You have used 12 annual leaves.',
 'Utilized-Annual-Leaves': 'You have taken 12 annual leaves.',
 'Utilized-Optional-Leaves': 'You have taken 1 optional leaves.',
 'Default-Balance-Annual-Leaves': 'You have 25 annual leaves left.',
 'Balance-Annual-Leaves': 'You have 25 annual leaves remaining.',
 'Balance-Optional-Leave

# Train a model using these features

In [27]:
## split data into train and test sets
split_ratio = 0.8

In [28]:
def split_dataset(data, split_ratio, seed=None):
    """Shuffle the samples and split them into training and test portions.

    Args:
        data: Sequence of samples. It is copied before shuffling, so the
            caller's object keeps its original order.
        split_ratio: Fraction of the samples that goes to the training
            portion; the resulting count is truncated to an integer.
        seed: Optional seed for a private random generator, giving a
            repeatable split. When None (the default) the module-level
            ``random`` state is used.

    Returns:
        tuple: ``(train, test)`` lists that together hold every sample once.
    """
    # Work on a copy: shuffling `data` itself silently reordered the caller's
    # list, so re-running the split cell kept changing the source data.
    shuffled = list(data)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)
    train_split = int(len(shuffled) * split_ratio)
    return shuffled[:train_split], shuffled[train_split:]

In [29]:
training_data, test_data = split_dataset(features_data, split_ratio)

In [30]:
training_data

[({'mani': True, 'leav': True, 'remain': True},
  'Default-Balance-Annual-Leaves'),
 ({'mani': True, 'option': True, 'leav': True}, 'Balance-Optional-Leaves'),
 ({'option': True, 'leav': True, 'count': True, 'taken': True},
  'Utilized-Optional-Leaves'),
 ({'great': True}, 'Closing'),
 ({'number': True, 'option': True, 'leav': True, 'taken': True},
  'Utilized-Optional-Leaves'),
 ({'mani': True, 'option': True, 'leav': True}, 'Balance-Optional-Leaves'),
 ({'annual': True, 'leav': True}, 'Balance-Annual-Leaves'),
 ({'annual': True, 'leav': True, 'count': True, 'taken': True},
  'Utilized-Annual-Leaves'),
 ({'mani': True, 'carri': True, 'forward': True, 'leav': True}, 'CF'),
 ({'number': True, 'option': True, 'leav': True, 'remain': True},
  'Balance-Optional-Leaves'),
 ({'option': True, 'leav': True, 'balanc': True}, 'Balance-Optional-Leaves'),
 ({'mani': True, 'leav': True, 'taken': True},
  'Default-Utilized-Annual-Leaves'),
 ({'good': True, 'night': True}, 'Goodbye'),
 ({'hey': True}

In [31]:
# save the data
# Persist both splits to disk; the load cell further down reads them back
# from 'training_data.npy' and 'test_data.npy'.
np.save('training_data', training_data)
np.save('test_data', test_data)

## Classification using Decision tree

In [33]:
# Reload the splits saved earlier. The samples hold Python objects (feature
# dicts), so pickle support must be enabled for the load to succeed.
# Pass allow_pickle per call instead of monkey-patching np.load: the patch
# stayed active for every later np.load call and raised TypeError (duplicate
# 'allow_pickle' keyword) when this cell was run a second time.
# NOTE: unpickling can execute arbitrary code -- only load files that this
# notebook wrote itself.
training_data = np.load('training_data.npy', allow_pickle=True)
test_data = np.load('test_data.npy', allow_pickle=True)

In [34]:
def train_using_decision_tree(training_data, test_data):
    """Train an NLTK decision tree and report its accuracy on both splits.

    The training-set and test-set accuracies are printed as they are computed.

    Args:
        training_data: Labeled feature sets used to fit the classifier.
        test_data: Labeled feature sets used only for evaluation.

    Returns:
        tuple: ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``.
    """
    tree = nltk.classify.DecisionTreeClassifier.train(
        training_data,
        entropy_cutoff=0.6,
        support_cutoff=6,
    )

    train_acc = nltk.classify.accuracy(tree, training_data)
    print('training set accuracy: ', train_acc)

    test_acc = nltk.classify.accuracy(tree, test_data)
    print('test set accuracy: ', test_acc)

    return tree, type(tree).__name__, test_acc, train_acc

In [35]:
dtclassifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

training set accuracy:  0.9298245614035088
test set accuracy:  0.7241379310344828


## Classification using Naive Bayes

In [36]:
def train_using_naive_bayes(training_data, test_data):
    """Train an NLTK Naive Bayes classifier and measure its accuracy.

    Args:
        training_data: Labeled feature sets used to fit the classifier.
        test_data: Labeled feature sets used only for evaluation.

    Returns:
        tuple: ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``.
    """
    model = nltk.NaiveBayesClassifier.train(training_data)
    train_acc = nltk.classify.accuracy(model, training_data)
    test_acc = nltk.classify.accuracy(model, test_data)
    return model, type(model).__name__, test_acc, train_acc

In [37]:
classifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_naive_bayes(training_data, test_data)
print(training_set_accuracy)
print(test_set_accuracy)
print(len(classifier.most_informative_features()))
classifier.show_most_informative_features()

0.8596491228070176
0.7931034482758621
70
Most Informative Features
                    leav = None           Greeti : Balanc =     11.2 : 1.0
                    mani = True           Defaul : Balanc =      9.4 : 1.0
                   taken = None           Balanc : Utiliz =      5.1 : 1.0
                 alreadi = True           Defaul : Utiliz =      4.6 : 1.0
                    help = True             Help : Closin =      3.9 : 1.0
                   count = True           Utiliz : CF     =      3.4 : 1.0
                   carri = None           Utiliz : CF     =      3.0 : 1.0
                   thank = None           Utiliz : Closin =      2.7 : 1.0
                  remain = None           Utiliz : Balanc =      2.6 : 1.0
                    take = True           Defaul : Balanc =      2.2 : 1.0


In [38]:
classifier.classify(({'mani': True, 'option': True, 'leav': True}))

'Utilized-Optional-Leaves'

In [39]:
extract_feature("hello")

['hello']

In [40]:
word_feats(extract_feature("hello"))

{'hello': True}

In [41]:
input_sentence = "how many balanced leaves do I have?"
classifier.classify(word_feats(extract_feature(input_sentence)))

'Utilized-Optional-Leaves'

In [42]:
def reply(input_sentence):
    """Return the stored answer for the category predicted for a sentence.

    The sentence is converted to bag-of-words features, classified with the
    notebook-level decision-tree classifier (``dtclassifier``), and the
    predicted category is looked up in the notebook-level ``answers`` dict.
    """
    features = word_feats(extract_feature(input_sentence))
    predicted_category = dtclassifier.classify(features)
    return answers[predicted_category]

In [43]:
reply('Hi')

'Hello. I am Dexter. I will serve your leave enquiries.'

In [44]:
reply('How many annual leaves do I have left?')

'You have 25 annual leaves remaining.'

In [45]:
reply('How many leaves have I taken?')

'You have used 12 annual leaves.'

In [46]:
reply('Thanks!')

"It's glad to know that I have been helpful. Have a good day!"

# Conclusion:

Once a model has been developed using an algorithm that gives acceptable accuracy, it can be called from any chatbot UI framework.