https://www.analyticsvidhya.com/blog/2018/01/faq-chatbots-the-future-of-information-searching/
http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/

In [1]:
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import stop_words
from sklearn.metrics.pairwise import cosine_similarity

# Show full cell text in DataFrame output (comment out to restore truncation)
pd.set_option('display.max_colwidth', 0)

In [2]:
# Load the FAQ entries and the test questions with their expected FAQ matches.
# keep_default_na=False: presumably so blank FAQ cells load as '' rather than NaN — confirm.
faq = pd.read_csv('../data/interim/faq-text-separated.csv', keep_default_na=False)
# NOTE(review): test is read with default NaN handling, unlike faq — confirm this is intended.
test = pd.read_csv('../data/interim/test-questions.csv')

In [3]:
test.head()

Unnamed: 0,test_question,match_question
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?"
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?


In [6]:
def text_process(mess):
    """
    Tokenize a string into lowercase words, dropping punctuation and stopwords.

    Every character found in ``string.punctuation`` is replaced by a space
    (so a hyphenated word yields two tokens), the result is split on
    whitespace, each token is lowercased, and tokens that appear in
    ``stop_words.ENGLISH_STOP_WORDS`` are discarded.

    Returns the remaining tokens as a list, in their original order.
    """
    # Map each punctuation character to a single space in one pass.
    punct_to_space = str.maketrans({symbol: ' ' for symbol in string.punctuation})
    spaced = mess.translate(punct_to_space)

    # TODO: consider lemmatizing the tokens here.
    tokens = []
    for raw in spaced.split():
        lowered = raw.lower()
        if lowered not in stop_words.ENGLISH_STOP_WORDS:
            tokens.append(lowered)
    return tokens

In [4]:
# Bag-of-words model: learn the vocabulary from faq.question and encode
# those same questions as count vectors in one step.
# To tokenize with the custom analyzer instead, construct the vectorizer as
# CountVectorizer(analyzer=text_process).
bow_transformer = CountVectorizer()
q_bow = bow_transformer.fit_transform(faq.question)

# TF-IDF model: fit on the questions' BOW counts and re-weight those counts.
tfidf_transformer = TfidfTransformer()
q_tfidf = tfidf_transformer.fit_transform(q_bow)

# Example test question
# tq = 'Are there grizzly bears here?'

In [5]:
def max_sim_skl(tq):
    """
    Find the FAQ question closest to ``tq`` by TF-IDF cosine similarity.

    ``tq`` is a single question string. It is vectorized with the
    module-level ``bow_transformer`` and ``tfidf_transformer`` and compared
    against the rows of ``q_tfidf``.

    Returns an ``(index, similarity)`` pair: the position of the best
    matching row of ``q_tfidf`` and its cosine similarity to ``tq``.
    """
    # Vectorize the single query: raw counts first, then TF-IDF weighting.
    query_vec = tfidf_transformer.transform(bow_transformer.transform([tq]))

    # Flatten the 1 x n similarity matrix into one score per FAQ question.
    scores = cosine_similarity(query_vec, q_tfidf).ravel()

    best = scores.argmax()
    return best, scores.max()

In [9]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix = tfidf_vectorizer.fit_transform(faq.question)
# print(tfidf_matrix.shape)

In [11]:
import spacy
# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is used below to parse
# the query and the FAQ text.
# NOTE(review): per spaCy's docs the `sm` packages ship without static word
# vectors, which affects Doc.similarity — confirm this model is the intended one.
nlp = spacy.load('en_core_web_sm')

def max_sim_spacy(q, docs):
    """
    Find the document in ``docs`` that is most similar to the query ``q``.

    q: the user query as a plain string; it is parsed with ``nlp`` here.
    docs: an iterable of spaCy documents to compare the query against.

    Returns an ``(index, similarity)`` pair for the first document with the
    highest similarity. If ``docs`` is empty or no document scores above 0,
    the result is ``(0, 0)``. No threshold is applied and no answer text is
    looked up here; that is left to the caller.
    """
    q = nlp(q)
    max_i = 0
    max_s = 0
    for i, d in enumerate(docs):
        # Compute the similarity once per document rather than twice.
        s = d.similarity(q)
        # Strict '>' keeps the earliest document when scores tie.
        if s > max_s:
            max_s = s
            max_i = i

    return max_i, max_s
 
# Parse every FAQ question and answer with the spaCy pipeline up front.
q_docs = list(map(nlp, faq.question))
a_docs = list(map(nlp, faq.answer))

In [6]:
def respond(row):
    """
    Match one test question against the FAQ and record the outcome.

    row: one row of the ``test`` frame; its ``test_question`` and
        ``match_question`` fields are read.

    Returns a copy of ``row`` with three extra fields:
        sim_question   -- the FAQ question picked by ``max_sim_skl``
        max_similarity -- its similarity score, rounded to 2 decimals
        success        -- True when sim_question equals match_question
    """
    # Work on a copy: pandas does not support apply() callbacks that mutate
    # the object they are handed.
    row = row.copy()

    query = row['test_question'].strip()
    index, sim = max_sim_skl(query)

    row['sim_question'] = faq.question.iloc[index]
    # faq.answer.iloc[index] holds the matching answer, should it be wanted
    # in the output as well.
    row['max_similarity'] = round(sim, 2)
    row['success'] = row['sim_question'] == row['match_question']
    return row

In [7]:
# Run respond() on every row of test; each returned row carries the added
# sim_question, max_similarity and success fields.
pred = test.apply(respond, axis=1) 

In [8]:
#Without custom analyzer
pred

Unnamed: 0,test_question,match_question,sim_question,max_similarity,success
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets,Dogs on Open Space and Mountain Parks,0.31,False
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues,Dog Parks and Dog Swimming,0.3,False
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?,Why are there lions in Boulder?,0.77,False
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?",Where can I find Boulder's bicycle traffic laws?,0.29,False
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?,Housing Fund for Affordable Housing Providers,0.46,False
5,my water pipes froze and now they are leaking. how do i turn off the water??,Water System Maintenance,How can I prevent and thaw frozen water pipes?,0.42,False
6,"Hello, There are a group of Gambel Oak Trees with Tree ID #'s 38820-38825 that I would like to be pruned up in order to keep them away from the Rec Center wall and to keep them in good, trimmed health. I also would like Tree ID 38825 to be pruned away from the American Flag so it will not come in contact with the tree, from the nearby flag pole. Thank you very much and if you have any questions or concerns, feel free to give me a call.",Public Tree Issues,Do I have to reserve a shelter in order to use it or the grill?,0.25,False
7,There are constantly dogs off leash in the children playgrounds of columbine school. Even though the playgrounds are fenced and have a sign stating dogs should not go in. You can see some dog owners do not even pick their dog shit.,Dogs on Open Space and Mountain Parks,Dog Parks and Dog Swimming,0.26,False
8,"I parked in the garage on 11th and Walnut on Sunday 12/23 starting at 5 PM and left at 12:35 AM that Monday 12/24. I was charged $1.25 but it should have still been free since charged parking doesnâ€™t start till 7 AM on Mondays , I park here all the time and I am confused as to why it says I owed $1.25. Is there a glitch in the system ?",Contact Parking Services,Is it free to park at pay stations and meters on holidays?,0.36,False
9,Where do I apply for building permits?,How do I get a building permit?,Do I need a building permit?,0.42,False


In [22]:
pred

Unnamed: 0,test_question,match_question,sim_question,max_similarity,success
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets,Who should I call when I see a bear?,0.44,False
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues,Dog Parks and Dog Swimming,0.4,False
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?,Why are there lions in Boulder?,0.56,False
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?",Is there someone I can report this to?,0.53,False
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?,Housing Fund for Affordable Housing Providers,0.54,False
5,my water pipes froze and now they are leaking. how do i turn off the water??,Water System Maintenance,How can I prevent and thaw frozen water pipes?,0.55,False
6,"Hello, There are a group of Gambel Oak Trees with Tree ID #'s 38820-38825 that I would like to be pruned up in order to keep them away from the Rec Center wall and to keep them in good, trimmed health. I also would like Tree ID 38825 to be pruned away from the American Flag so it will not come in contact with the tree, from the nearby flag pole. Thank you very much and if you have any questions or concerns, feel free to give me a call.",Public Tree Issues,Public Tree Issues,0.37,True
7,There are constantly dogs off leash in the children playgrounds of columbine school. Even though the playgrounds are fenced and have a sign stating dogs should not go in. You can see some dog owners do not even pick their dog shit.,Dogs on Open Space and Mountain Parks,Dog Parks and Dog Swimming,0.45,False
8,"I parked in the garage on 11th and Walnut on Sunday 12/23 starting at 5 PM and left at 12:35 AM that Monday 12/24. I was charged $1.25 but it should have still been free since charged parking doesnâ€™t start till 7 AM on Mondays , I park here all the time and I am confused as to why it says I owed $1.25. Is there a glitch in the system ?",Contact Parking Services,"If there is still time left on my kiosk receipt, can I park in another area?",0.32,False
9,Where do I apply for building permits?,How do I get a building permit?,Parking Permits,0.49,False


In [24]:
# Join each FAQ question with its answer, separated by a single space.
# NOTE(review): x is only displayed and measured in the visible cells — confirm its intended use.
x = faq.question + ' ' + faq.answer

In [25]:
x

0      Affordable Homeownership The City of Boulder is committed to making housing in our community available to a variety of people, including those who could not otherwise purchase a home in Boulder's housing market. The Division of Housing offers opportunities for homeownership to those with low and moderate incomes - from down payment assistance on market rate homes to homes that must be sold at affordable prices.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [21]:
len(x)

1376