https://www.analyticsvidhya.com/blog/2018/01/faq-chatbots-the-future-of-information-searching/
http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
http://nlp.town/blog/sentence-similarity/
https://spacy.io/usage/training

In [1]:
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import stop_words
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KDTree
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer instance; lem() calls it for every token.
wordnet_lemmatizer = WordNetLemmatizer()

# Show full cell text in DataFrame output instead of truncating long strings
# (comment this line out to restore pandas' default truncation).
# NOTE(review): newer pandas documents None as the "no limit" value -- confirm 0 still behaves as intended.
pd.set_option('display.max_colwidth', 0)

In [33]:
# FAQ entries; the cells below read its `question` and `answer` columns.
# keep_default_na=False keeps empty cells as '' instead of NaN, so the string
# concatenation used to build the corpus never yields NaN.
faq = pd.read_csv('../data/interim/faq-text-separated.csv', keep_default_na=False)
# Test queries: `test_question` is the user text and `match_question` is the
# FAQ question it is expected to be matched with.
test = pd.read_csv('../data/interim/test-questions.csv')

In [34]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,test_question,match_question
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?"
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?


# Text Processing

In [78]:
def lem(words):
    """Return a new list holding the WordNet lemma of every token in ``words``."""
    return [wordnet_lemmatizer.lemmatize(token) for token in words]

def text_process(mess):
    """Tokenize the string ``mess`` into cleaned, lemmatized words.

    Each punctuation character is replaced by a space, the text is split on
    whitespace, tokens are lower-cased, English stop words are dropped and the
    remaining tokens are lemmatized.  Returns a list of strings.
    """
    # Map every punctuation character to a space in a single pass.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    no_punct = mess.translate(punct_to_space)

    # Lower-case each token, then keep only those that are not stop words.
    lowered = (token.lower() for token in no_punct.split())
    kept = [token for token in lowered if token not in stop_words.ENGLISH_STOP_WORDS]

    # Reduce the surviving tokens to their lemmas.
    return lem(kept)

In [6]:
# Build the corpus: each document is one FAQ question followed by its answer.
corpus = faq.question + ' ' + faq.answer

# Learn the bag-of-words vocabulary from the corpus (tokenized by text_process)
# and vectorize the corpus in the same pass, so every document is analyzed
# (and lemmatized) only once.
# Note: despite the q_ prefix, q_bow / q_tfidf cover question + answer text.
bow_transformer = CountVectorizer(analyzer=text_process)
q_bow = bow_transformer.fit_transform(corpus)

# Fit the TF-IDF weighting on the corpus counts and apply it to them.
tfidf_transformer = TfidfTransformer()
q_tfidf = tfidf_transformer.fit_transform(q_bow)

# QnA Maker
(The data used by QnA Maker at this time is a slightly less clean version.)<br>
6 successes

# Semantic Similarity with spaCy
1 success

In [11]:
# This is not the full code, so isn't operational in this notebook.
# NOTE(review): en_core_web_sm is the small English model; spaCy's docs say the
# small models ship without word vectors, which may explain the weak similarity
# result reported above -- consider a md/lg model and confirm.
import spacy
nlp = spacy.load('en_core_web_sm')

def max_sim_spacy(q, docs):
    """Find the document in ``docs`` that is most similar to the query ``q``.

    Args:
        q: Query text; it is parsed with the module-level ``nlp`` pipeline.
        docs: Iterable of already-parsed spaCy documents to compare against.

    Returns:
        Tuple ``(index, similarity)`` of the best match.  If no document
        scores above 0 (including when ``docs`` is empty) the result is
        ``(0, 0)``.
    """
    q = nlp(q)
    max_i = 0
    max_s = 0
    for i, d in enumerate(docs):
        # Evaluate the similarity once per document instead of twice.
        s = d.similarity(q)
        if s > max_s:
            max_s = s
            max_i = i

    return max_i, max_s
 
# Parse every FAQ question and answer with spaCy up front, e.g. for use as the
# `docs` argument of max_sim_spacy.
q_docs = [nlp(entry) for entry in faq.question]
a_docs = [nlp(entry) for entry in faq.answer]      

# Cosine Similarity
7 successes<br>

In [7]:
def max_sim_skl(tq):
    """Return (index, similarity) of the FAQ corpus entry closest to ``tq``.

    ``tq`` is a raw query string.  It is vectorized with the fitted
    ``bow_transformer`` and ``tfidf_transformer`` and compared, by cosine
    similarity, against every row of ``q_tfidf``.
    """
    # Raw text -> term counts -> TF-IDF weights, using the fitted transformers.
    query_vec = tfidf_transformer.transform(bow_transformer.transform([tq]))

    # One similarity score per FAQ entry, flattened to a 1-D array.
    scores = cosine_similarity(query_vec, q_tfidf).ravel()

    return np.argmax(scores), scores.max()

In [7]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix = tfidf_vectorizer.fit_transform(faq.question)
# print(tfidf_matrix.shape)

In [8]:
def respond(row):
    """Annotate a test ``row`` with its best cosine-similarity FAQ match.

    Adds three entries to ``row`` and returns it: ``sim_question`` (the matched
    FAQ question), ``max_similarity`` (the score rounded to two decimals) and
    ``success`` (whether the match equals ``row.match_question``).
    """
    best_idx, best_sim = max_sim_skl(row.test_question.strip())
    matched_question = faq.question.iloc[best_idx]

    row['sim_question'] = matched_question
    # Optionally attach the matching answer as well:
    # row['info'] = faq.answer.iloc[best_idx]
    row['max_similarity'] = round(best_sim, 2)
    row['success'] = matched_question == row.match_question
    return row

In [38]:
# Run the cosine-similarity matcher on every test question (axis=1 passes one row at a time).
test.apply(respond, axis=1) 

Unnamed: 0,test_question,match_question,sim_question,max_similarity,success
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets,What does CPW do about bears in town? When are they relocated or killed?,0.27,False
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues,Dog Parks and Dog Swimming,0.4,False
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?,Do we have grizzly bears in Colorado?,0.39,True
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?","Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?",0.5,True
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?,Housing Fund for Affordable Housing Providers,0.33,False
5,my water pipes froze and now they are leaking. how do i turn off the water??,How can I prevent and thaw frozen water pipes?,How can I prevent and thaw frozen water pipes?,0.52,True
6,"Hello, There are a group of Gambel Oak Trees with Tree ID #'s 38820-38825 that I would like to be pruned up in order to keep them away from the Rec Center wall and to keep them in good, trimmed health. I also would like Tree ID 38825 to be pruned away from the American Flag so it will not come in contact with the tree, from the nearby flag pole. Thank you very much and if you have any questions or concerns, feel free to give me a call.",Public Tree Issues,Public Tree Issues,0.47,True
7,There are constantly dogs off leash in the children playgrounds of columbine school. Even though the playgrounds are fenced and have a sign stating dogs should not go in. You can see some dog owners do not even pick their dog shit.,Dogs on Open Space and Mountain Parks,Dog Parks and Dog Swimming,0.56,False
8,"I parked in the garage on 11th and Walnut on Sunday 12/23 starting at 5 PM and left at 12:35 AM that Monday 12/24. I was charged $1.25 but it should have still been free since charged parking doesnâ€™t start till 7 AM on Mondays , I park here all the time and I am confused as to why it says I owed $1.25. Is there a glitch in the system ?",Contact Parking Services,"Parking Information - Hours, Rates and Holidays",0.23,False
9,Where do I apply for building permits?,How do I get a building permit?,Do I need a building permit?,0.65,False


In [41]:
# Count the test questions whose best cosine-similarity match is the expected FAQ question.
print('Successes: ', test.apply(respond, axis=1).success.sum())

Successes:  7


# KD Trees Nearest Neighbor
7 successes with euclidean distance in the run below (an earlier note recorded 6)<br>
3 successes with cosine_similarity matrix (if I've implemented it right)<br>
Other results below:

In [31]:
# Number of successful matches on the test questions for each KDTree distance
# metric name.
# NOTE(review): presumably recorded by hand from re-running the cells below
# with each metric -- confirm the figures are still current.
dist_metric_success = {
    'euclidean': 7,
    'l2': 7,
    'minkowski': 7,
    'p': 7,
    'manhattan': 0,
    'cityblock': 0,
    'l1': 0,
    'chebyshev': 5,
    'infinity': 5}

In [74]:
# Index the dense TF-IDF matrix of the FAQ corpus for nearest-neighbour lookup.
# NOTE(review): 'minkowski' with scikit-learn's default p=2 should be the
# Euclidean distance -- confirm.
tree = KDTree(q_tfidf.toarray(), metric='minkowski')
# Alternative experiment: index the rows of the pairwise cosine-similarity matrix instead.
# tree = KDTree(cosine_similarity(q_tfidf, q_tfidf))

In [25]:
def kd_sim(query):
    """Return (index, distance) of the FAQ corpus entry nearest to ``query``.

    Args:
        query: Raw query string.

    Returns:
        Tuple ``(index, distance)`` where ``index`` is the row of the nearest
        TF-IDF vector stored in ``tree`` and ``distance`` is its distance from
        the query under the tree's metric.  This is a distance, not a
        similarity: smaller means a closer match.
    """
    # Raw text -> term counts -> TF-IDF weights, using the fitted transformers.
    tq_bow = bow_transformer.transform([query])
    tq_tfidf = tfidf_transformer.transform(tq_bow)

    # The query is not itself a point in the tree, so the first neighbour is
    # already the best match and a single neighbour (k=1) is all that is needed.
    nearest_dist, nearest_ind = tree.query(tq_tfidf.toarray(), k=1)
    # Alternative experiment (pairs with a tree built on the cosine-similarity matrix):
    # nearest_dist, nearest_ind = tree.query(cosine_similarity(tq_tfidf, q_tfidf), k=1)

    return nearest_ind[0][0], nearest_dist[0][0]

In [12]:
def kd_respond(row):
    """Annotate a test ``row`` with its nearest FAQ entry from the KD-tree.

    Adds three entries to ``row`` and returns it: ``near_question`` (the
    nearest FAQ question), ``nearest_distance`` (the distance rounded to two
    decimals) and ``success`` (whether the match equals ``row.match_question``).
    """
    nearest_idx, nearest_dist = kd_sim(row.test_question.strip())
    nearest_question = faq.question.iloc[nearest_idx]

    row['near_question'] = nearest_question
    # Optionally attach the matching answer as well:
    # row['info'] = faq.answer.iloc[nearest_idx]
    row['nearest_distance'] = round(nearest_dist, 2)
    row['success'] = nearest_question == row.match_question
    return row

In [75]:
# Run the KD-tree nearest-neighbour matcher on every test question (axis=1 passes one row at a time).
test.apply(kd_respond, axis=1)

Unnamed: 0,test_question,match_question,near_question,nearest_distance,success
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets,What does CPW do about bears in town? When are they relocated or killed?,1.21,False
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues,Dog Parks and Dog Swimming,1.1,False
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?,Do we have grizzly bears in Colorado?,1.1,True
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?","Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?",1.0,True
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?,Housing Fund for Affordable Housing Providers,1.16,False
5,my water pipes froze and now they are leaking. how do i turn off the water??,How can I prevent and thaw frozen water pipes?,How can I prevent and thaw frozen water pipes?,0.98,True
6,"Hello, There are a group of Gambel Oak Trees with Tree ID #'s 38820-38825 that I would like to be pruned up in order to keep them away from the Rec Center wall and to keep them in good, trimmed health. I also would like Tree ID 38825 to be pruned away from the American Flag so it will not come in contact with the tree, from the nearby flag pole. Thank you very much and if you have any questions or concerns, feel free to give me a call.",Public Tree Issues,Public Tree Issues,1.03,True
7,There are constantly dogs off leash in the children playgrounds of columbine school. Even though the playgrounds are fenced and have a sign stating dogs should not go in. You can see some dog owners do not even pick their dog shit.,Dogs on Open Space and Mountain Parks,Dog Parks and Dog Swimming,0.94,False
8,"I parked in the garage on 11th and Walnut on Sunday 12/23 starting at 5 PM and left at 12:35 AM that Monday 12/24. I was charged $1.25 but it should have still been free since charged parking doesnâ€™t start till 7 AM on Mondays , I park here all the time and I am confused as to why it says I owed $1.25. Is there a glitch in the system ?",Contact Parking Services,"Parking Information - Hours, Rates and Holidays",1.24,False
9,Where do I apply for building permits?,How do I get a building permit?,Do I need a building permit?,0.83,False


In [76]:
# Count the test questions whose nearest KD-tree neighbour is the expected FAQ question.
print('Successes: ', test.apply(kd_respond, axis=1).success.sum())

Successes:  7


# Soft Cosine Similarity... 
https://www.machinelearningplus.com/nlp/cosine-similarity/