# Creating a Chatbot from Scratch using Python and Scikit-learn

### Applications
- FAQ bots
- Recommendations
- Airports
- Taxi Bookings
- Hotel Bookings
    

### Chatbot architecture
- Humans (No engineering involved)
- Rule-based (Regular Expressions)
- Predictive (Retrieval Based)
- Generative

### Training - Finding Intents
- where is my hotel
    - where is my hotel
    - hotel location/
    - how do i get to the hotel?
- when is checkout_time
    - when is the checkout time?
    - when do i need to check out?

### Import the libraries

In [1]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB

### Create training phrases

In [2]:
training_phrases = {
    "help-me": ' '.join(["I have a problem",
                         "Hey i need some answers",
                         "Can you help me with this?",
                         "I need help",
                         "Please help me"
                        ]),
    "alcohol-addiction": ' '.join(["I am addicted to alcohol",
                                   "I love alcohol daily",
                                   "I am an alcoholic"
                                  ]),
    "depression-problem": ' '.join(["I am depressed",
                                   "I am lonely",
                                   "I dont have friends",
                                   "I am alone",
                                   "I am always sad",
                                   "Why am I sad all the time?"
                                   ]),
    "greeting": ' '.join(["Hi",
                         "Hey there",
                          "Hola",
                          "Hi How are you doing?"
                         ])
}

In [3]:
training_phrases

{'help-me': 'I have a problem Hey i need some answers Can you help me with this? I need help Please help me',
 'alcohol-addiction': 'I am addicted to alcohol I love alcohol daily I am an alcoholic',
 'depression-problem': 'I am depressed I am lonely I dont have friends I am alone I am always sad Why am I sad all the time?',
 'greeting': 'Hi Hey there Hola Hi How are you doing?'}

In [4]:
training_documents = list(training_phrases.values())
labels = list(training_phrases.keys())

In [5]:
training_documents

['I have a problem Hey i need some answers Can you help me with this? I need help Please help me',
 'I am addicted to alcohol I love alcohol daily I am an alcoholic',
 'I am depressed I am lonely I dont have friends I am alone I am always sad Why am I sad all the time?',
 'Hi Hey there Hola Hi How are you doing?']

### Tokenization

In [6]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = tuple(set(stopwords.words('english'))) 

word_tokens = []
for sent in training_documents:
    word_tokens.append(word_tokenize(sent))

print(word_tokens)

filtered_sentence = [w for w in word_tokens if not w in stop_words]

print(word_tokens)
print(filtered_sentence)

[['I', 'have', 'a', 'problem', 'Hey', 'i', 'need', 'some', 'answers', 'Can', 'you', 'help', 'me', 'with', 'this', '?', 'I', 'need', 'help', 'Please', 'help', 'me'], ['I', 'am', 'addicted', 'to', 'alcohol', 'I', 'love', 'alcohol', 'daily', 'I', 'am', 'an', 'alcoholic'], ['I', 'am', 'depressed', 'I', 'am', 'lonely', 'I', 'dont', 'have', 'friends', 'I', 'am', 'alone', 'I', 'am', 'always', 'sad', 'Why', 'am', 'I', 'sad', 'all', 'the', 'time', '?'], ['Hi', 'Hey', 'there', 'Hola', 'Hi', 'How', 'are', 'you', 'doing', '?']]
[['I', 'have', 'a', 'problem', 'Hey', 'i', 'need', 'some', 'answers', 'Can', 'you', 'help', 'me', 'with', 'this', '?', 'I', 'need', 'help', 'Please', 'help', 'me'], ['I', 'am', 'addicted', 'to', 'alcohol', 'I', 'love', 'alcohol', 'daily', 'I', 'am', 'an', 'alcoholic'], ['I', 'am', 'depressed', 'I', 'am', 'lonely', 'I', 'dont', 'have', 'friends', 'I', 'am', 'alone', 'I', 'am', 'always', 'sad', 'Why', 'am', 'I', 'sad', 'all', 'the', 'time', '?'], ['Hi', 'Hey', 'there', 'Hola'

### Create Bag of Words Model

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(training_documents)
print(X)
vectorizer.get_feature_names()

  (0, 16)	1
  (0, 27)	1
  (0, 18)	1
  (0, 25)	2
  (0, 29)	1
  (0, 8)	1
  (0, 10)	1
  (0, 37)	1
  (0, 17)	3
  (0, 24)	2
  (0, 36)	1
  (0, 32)	1
  (0, 26)	1
  (1, 6)	2
  (1, 0)	1
  (1, 34)	1
  (1, 1)	2
  (1, 23)	1
  (1, 11)	1
  (1, 7)	1
  (1, 2)	1
  (2, 16)	1
  (2, 6)	5
  (2, 12)	1
  (2, 22)	1
  (2, 14)	1
  (2, 15)	1
  (2, 4)	1
  (2, 5)	1
  (2, 28)	2
  (2, 35)	1
  (2, 3)	1
  (2, 30)	1
  (2, 33)	1
  (3, 18)	1
  (3, 37)	1
  (3, 19)	2
  (3, 31)	1
  (3, 20)	1
  (3, 21)	1
  (3, 9)	1
  (3, 13)	1


['addicted',
 'alcohol',
 'alcoholic',
 'all',
 'alone',
 'always',
 'am',
 'an',
 'answers',
 'are',
 'can',
 'daily',
 'depressed',
 'doing',
 'dont',
 'friends',
 'have',
 'help',
 'hey',
 'hi',
 'hola',
 'how',
 'lonely',
 'love',
 'me',
 'need',
 'please',
 'problem',
 'sad',
 'some',
 'the',
 'there',
 'this',
 'time',
 'to',
 'why',
 'with',
 'you']

### Multinomial NB Classifier

In [8]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(X, labels)

MultinomialNB()

In [9]:
raw_queries = ["I love everything related to C2H50H"]
queries = vectorizer.transform(raw_queries)
predictions = classifier.predict(queries)
predictions

array(['alcohol-addiction'], dtype='<U18')

In [10]:
def predict(raw_queries):
    queries = vectorizer.transform(raw_queries)
    return classifier.predict(queries)

predicted = predict(["I am very much sad", "can we talk?", "Can you help me?", "i take wine everyday and i cant live without it"])
expected = ["depression-problem", "help-me", "help-me", "alcohol-addiction"]

In [11]:
predicted

array(['depression-problem', 'help-me', 'help-me', 'alcohol-addiction'],
      dtype='<U18')

In [12]:
evaluation = precision_recall_fscore_support(expected, predicted)
evaluation

(array([1., 1., 1.]),
 array([1., 1., 1.]),
 array([1., 1., 1.]),
 array([1, 1, 2]))

In [13]:
metrics = {}
(metrics['p'], metrics['r'], metrics['f1'], _) = evaluation
metrics

{'p': array([1., 1., 1.]), 'r': array([1., 1., 1.]), 'f1': array([1., 1., 1.])}

### Challenges / Questions to be answered
- Return the answer
- Exclude unimportant words("stop words")
- Handle synonyms (e.g. "lobby" = "front desk")
- Handle typos
- Return "Unknown"
- Handle a entity/parameter ("set my check out time to 3 PM")

### Returning the Answer

In [14]:
responses = {
    "depression-response": "Hi, please do not worry. I am here to help you.",
    "help-response": "Hi there, yes ofcourse. I am at your service. How can I help you today?",
    "alcohol-response": "Good to know that. Now I can work with you in making you better.",
    "greeting-response": "Hi there my friend. How are you today?"
}

In [15]:
responses['alcohol-response']

'Good to know that. Now I can work with you in making you better.'

In [16]:
predicted = predict(["i take wine everyday and i cant live without it"])
# expected = ["alcohol-addiction"]
predicted

array(['alcohol-addiction'], dtype='<U18')

In [17]:
def send_response(raw_queries):
    predicted = predict(raw_queries)
    print(predicted[0])
    if predicted[0] == "alcohol-addiction":
        return(responses["alcohol-response"])
    else:
        return "You are not an alcoholic!"

In [18]:
bot_response = send_response(["i take wine everyday and i cant live without it"])
print(bot_response)

alcohol-addiction
Good to know that. Now I can work with you in making you better.


### Stop words

In [19]:
from nltk.corpus import stopwords

In [22]:
stop_words = tuple(set(stopwords.words('english'))) 

for sent in training_documents:
    word_tokens = word_tokenize(sent)
    
filtered_sentence = [w for w in word_tokens if not w in stop_words] 
filtered_sentence

['Hi', 'Hey', 'Hola', 'Hi', 'How', '?']

### Typos - Edit distance, Phonetics

In [23]:
tokens = ['problem', 'Hey', 'need', 'some', 'answers', 'help', 'me', 'with', 'this', 'Please', 'addicted', 'alcohol', 'love', 'daily', 'alcoholic', 'depressed', 'lonely', 'dont', 'friends', 'alone', 'always', 'sad', 'Why', 'all', 'time', 'Hi', 'Hey', 'there', 'Hola', 'Hi', 'How', 'are', 'you', 'doing', '?']

In [24]:
from difflib import get_close_matches
def spell_checker(token):
    spelling_error_flag = False
    corrected_word = ''
    if len(get_close_matches(token, tokens, n=1, cutoff=0.80)) > 0:
        corrected_word = get_close_matches(token, tokens, n=1, cutoff=0.80)[0]
        spelling_error_flag = True
        return corrected_word, spelling_error_flag
    else:
        return corrected_word, spelling_error_flag

In [25]:
corrected_word, flag = spell_checker('lone')
corrected_word

'alone'

### Synonyms

In [26]:
from nltk.corpus import wordnet
syns = wordnet.synsets("alone")
print(syns)

[Synset('alone.s.01'), Synset('alone.s.02'), Synset('alone.s.03'), Synset('alone.s.04'), Synset('entirely.r.02'), Synset('alone.r.02')]
