<a href="https://colab.research.google.com/github/vt-ai-ml-club/repo/blob/master/Fall2020/Chatbot/Chatbot_with_VSM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Building a chatbot using a VSM

Reference: https://medium.com/analytics-vidhya/building-a-simple-chatbot-in-python-using-nltk-7c8c8215ac6e

In [None]:
# Download the chatbot corpus as plain text.
import requests
url = 'https://raw.githubusercontent.com/vt-ai-ml-club/repo/master/Fall2020/Chatbot/chatbot_corpus.txt'
# A timeout keeps the cell from hanging forever on a stalled connection.
http_response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of treating an error page as the corpus.
http_response.raise_for_status()
corpus = http_response.text

In [None]:
# One list entry per corpus line; trailing whitespace is stripped first so the
# final newline does not produce an empty last entry.
data = corpus.rstrip().split(sep='\n')
data[0:2]  # preview the first two lines

In [None]:
# Map each question to the list of every response recorded for it.
question_to_responses = {}

# `data` alternates question, response, question, response, ...
# Zipping the even- and odd-indexed slices pairs them up; a trailing question
# with no response line is dropped instead of raising IndexError on data[i+1].
for question, response in zip(data[0::2], data[1::2]):
    question_to_responses.setdefault(question, []).append(response)

# Dict keys are unique and keep insertion order, so this list is in corpus order
# and identical on every run (list(set(...)) order changes between kernels).
question_list = list(question_to_responses)
questions = set(question_list)   # kept for any later cell that still uses the set
print(question_list[-5:])

In [None]:
import string
import nltk     # natural language toolkit (popular Python module for NLP)

# Fetch the NLTK resources the tokenizer/lemmatizer below depend on.
nltk.download('punkt')      # Punkt tokenizer models used by nltk.word_tokenize
nltk.download('punkt_tab')  # same models in the format newer NLTK releases (>= 3.8.2) load
nltk.download('wordnet')    # WordNet dictionary used by WordNetLemmatizer

lemmatizer = nltk.stem.WordNetLemmatizer()

# Translation table for str.translate(): every ASCII punctuation code point maps to None (i.e. is deleted).
remove_punctuation_table = str.maketrans('', '', string.punctuation)

def normal_lemma_tokenizer(text):
    """Convert raw text into a list of lemmatized, lower-case word tokens.

    The text is lower-cased and passed through ``remove_punctuation_table``,
    split into words with ``nltk.word_tokenize``, and each word is then
    reduced to its base form by ``lemmatizer.lemmatize``.
    """
    # Normalize: lower-case, then drop the characters listed in the translation table
    cleaned_text = text.lower().translate(remove_punctuation_table)

    # Tokenize and lemmatize word by word
    lemmas = []
    for word in nltk.word_tokenize(cleaned_text):
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [None]:
import random

def select_random_response(question):
    """Pick one of the responses stored for ``question``, uniformly at random.

    Raises ``KeyError`` if ``question`` is not a key of ``question_to_responses``.
    """
    candidate_responses = question_to_responses[question]
    return random.choice(candidate_responses)

### Term Frequency-Inverse Document Frequency (TF-IDF)
>**Term Frequency**: scores the frequency of a word in the current document.

$$ TF(w,d) = \frac{\text{# of times w appears in d}}{\text{total # of words in d}} $$

>**Inverse Document Frequency**: scores how rare the word is across all documents.
>+ A word with a *low* IDF score is a *common* word.
>+ A word with a *high* IDF score is an *uncommon* word.

$$ IDF(w,D) = \log{(\frac{\text{total # of documents}}{\text{# of documents containing w}})} $$



### Word Similarity 
>**Cosine similarity**: a measure of similarity between two vectors. In our case, it measures the similarity between two questions.

$$ cos(u,v) = \frac{u \cdot v}{\left\lVert u \right\rVert \left\lVert v \right\rVert} $$


<img src="https://github.com/vt-ai-ml/fall2019-meetings/raw/master/data/chatbot_cosine_image.png" align="left" style="width:250px;height:250px;">

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def respond(user_question):
    """Return the chatbot's reply to ``user_question``.

    The known questions and the user's question are mapped together into a
    TF-IDF vector space (each word is a dimension). The known question with the
    highest cosine similarity to the user's question is chosen and one of its
    responses is returned at random. If the best similarity is 0, a fixed
    fallback message is returned instead.

    Relies on the notebook-level names ``question_list``,
    ``tf_idf_vector_space`` and ``select_random_response``.
    """
    # Work on a fresh list instead of append()/remove() on the global:
    # list.remove() deletes the *first* equal entry, so a user question that
    # duplicates a known one would reorder the corpus, and an exception raised
    # below would leave the user's question in the corpus for good.
    documents = question_list + [user_question]

    # Map the known questions and the user's question into TF-IDF vector space
    tf_idf_matrix = tf_idf_vector_space.fit_transform(documents)

    # Cosine similarity of the user's vector (last row) against every known question
    user_question_tf_idf = tf_idf_matrix[-1]
    similarity_score = cosine_similarity(user_question_tf_idf, tf_idf_matrix[:-1])

    # Most similar known question and its score
    highest_tf_idf_idx = similarity_score.argmax()
    highest_tf_idf = similarity_score.max()

    if highest_tf_idf == 0:  # no similarity with any known question
        return "I don't understand what you're saying."
    return select_random_response(question_list[highest_tf_idf_idx])

In [None]:
# Vectorizer shared by respond(); it tokenizes with the lemmatizing tokenizer defined above.
tf_idf_vector_space = TfidfVectorizer(tokenizer=normal_lemma_tokenizer)

# The original literal ended in `\n\"`, which escaped the closing quote and made the cell a SyntaxError.
print("Chatbot is on! Type 'exit' to turn off the chatbot.\n")
while True:
    # Normalize the input: lower-case, drop punctuation, and trim surrounding
    # whitespace so that e.g. "Exit. " still ends the chat.
    user_question = input().lower().translate(remove_punctuation_table).strip()

    if user_question == 'exit':
        print('Chatbot is now off!')
        break

    print("BOT: ", respond(user_question), '\n')

### Questions to ask your chatbot
* How is everything going?
* Tell me a joke
* What is a chat robot?
* What can you eat?