https://www.analyticsvidhya.com/blog/2018/01/faq-chatbots-the-future-of-information-searching/
http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
http://nlp.town/blog/sentence-similarity/
https://spacy.io/usage/training

In [1]:
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KDTree

# Uncomment to see full cell text
pd.set_option('display.max_colwidth', 0)

In [2]:
# Import local libraries
import sys
sys.path.append('../src')

import processing

In [3]:
faq = pd.read_csv('../data/interim/faq-text-separated.csv', keep_default_na=False)
test = pd.read_csv('../data/test/test-questions.csv')
results = pd.DataFrame()

## Load test set for topic matching

In [20]:
test = pd.read_excel('../../../Inquire Boulder request data- detailed open and closed - for research purposes.xlsx')
test = test[['Description', 'Topic']]
test = test.rename(index=str, columns={"Description": "test_question", "Topic": "match_topic"})

# QnA Maker
(The data used by QnA Maker for this metric is a somewhat unclean version.)<br>
6 successes

# Google Dialogflow
Using the built-in Dialogflow ML in 'hybrid' mode with threshold 0.3 (default settings)<br>
4 successes

# Semantic Similarity with spaCy
1 success

In [None]:
# This is not the full code, so isn't operational in this notebook.
import spacy
# A larger model should be used for better accuracy
nlp = spacy.load('en_core_web_sm')

def max_sim_spacy(q, docs):
    """Returns (index, similarity) of argument q's most similar match in argument docs, all spaCy documents."""
    q = nlp(q)
    max_i = 0
    max_s = 0
    ms = []
    for i, d in enumerate(docs):
        if d.similarity(q) > max_s:
            max_s = d.similarity(q)
            max_i = i
    
    return max_i, max_s    
 
q_docs = [nlp(entry) for entry in faq.question]
a_docs = [nlp(entry) for entry in faq.answer]      

# Text Processing

In [4]:
import string
from sklearn.feature_extraction import stop_words
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [5]:
def test_lem(words):
    """Returns list of lemmas from argument list of words."""
    wordnet_lemmatizer = WordNetLemmatizer()
    lem_sentence=[]
    for word in words:
        lem_sentence.append(wordnet_lemmatizer.lemmatize(word))
    return lem_sentence

def test_text_process(mess):
    """Returns list of tokenized lemmas in argument string mess, with stopwords, punctuation removed."""
    clean = [char if char not in string.punctuation else ' ' for char in mess]
    clean = ''.join(clean)
    clean = [word.lower() for word in clean.split() if word.lower() not in stop_words.ENGLISH_STOP_WORDS] 
    clean = test_lem(clean)
    return clean

In [6]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


# defines a custom vectorizer class
class CustomVectorizer(CountVectorizer): 
    
    # overwrite the build_analyzer method, allowing one to
    # create a custom analyzer for the vectorizer
    def build_analyzer(self):
        
        # load stop words using CountVectorizer's built in method
        stop_words = self.get_stop_words()
        
        # create the analyzer that will be returned by this method
        def analyser(mess):
            # Remove punctuation, lowercase all letters            
            clean = mess.replace('-','')
            clean = ''.join([char if char not in string.punctuation or '0123456789' else ' ' for char in clean])
            clean = [word.lower() for word in clean.split()] 
        
            wordnet_lemmatizer = WordNetLemmatizer()
            lem_sentence=[]
            for word in clean:
                lem_sentence.append(wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word)))
            
            # use CountVectorizer's _word_ngrams built in method
            # to remove stop words and extract n-grams
            return(self._word_ngrams(lem_sentence, stop_words))
        return(analyser)

In [7]:
# Create corpus by joining columns
features = ['Topic', 'Category', 'Department', 'question', 'answer']
corpus = ''
for f in features:
    corpus += faq[f] + ' '

# BOW and TFIDF Vectorization

In [8]:
count_vect = CustomVectorizer(ngram_range=(1, 2), stop_words='english')
bow_transformer = count_vect.fit(corpus)
# Tranform corpus itself into BOW
bow = bow_transformer.transform(corpus)

# Create TFIDF transformer based on faq.question's BOW
tfidf_transformer = TfidfTransformer().fit(bow)
# Transform faq.question's BOW into TFIDF
tfidf = tfidf_transformer.transform(bow)

In [9]:
#### Print info about text features
def wm2df(wm, feat_names):
    
    # create an index for each row
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(wm)]
    df = pd.DataFrame(data=wm.toarray(), index=doc_names,
                      columns=feat_names)
    return(df)
  
# wm2df(bow, count_vect.get_feature_names())

In [10]:
print(len(count_vect.vocabulary_))
# print(count_vect.vocabulary_)

39784


In [11]:
print(bow_transformer.transform(['bear mountain ']))

  (0, 5275)	1
  (0, 5317)	1
  (0, 23356)	1


# Cosine Similarity

In [12]:
def max_sim_skl(tq):
    """Returns (index, similarity value) of string argument q's most similar match in FAQ, determined by cosine similarity."""
    # Transform test question into BOW using BOW transformer (based on faq.question) 
    tq_bow = bow_transformer.transform([tq])
    # Transform test question's BOW into TFID
    tq_tfidf = tfidf_transformer.transform(tq_bow)

    sims = np.transpose(cosine_similarity(tq_tfidf, tfidf))

    max_s = sims.max()
    max_i = np.argmax(sims)
    
    return max_i, max_s 

In [13]:
# Form cosine similarity matrix
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix = tfidf_vectorizer.fit_transform(faq.question)
# print(tfidf_matrix.shape)

In [27]:
def cosine_respond(row):
    """Returns argument row with added columns to match questions in FAQ."""
    query = row.test_question.strip()

    index, sim = max_sim_skl(query)

    if 'match_question' in row.keys():
        match = faq.question.iloc[index]
        response = faq.answer.iloc[index]
                
        row['sim_question'] = match
        row['question_success'] = row.sim_question == row.match_question
#         if 'service request' in response.lower():
#             response += '\n\nIf you would like to submit a service request, please visit https://user.govoutreach.com/boulder/faq.php?cmd=shell or call 303-441-3388.'   
#         row['info'] = response    

    if 'match_topic' in row.keys():
        row['sim_topic'] = faq.Topic.iloc[index]
        row['topic_success'] = row.sim_topic == row.match_topic
    
    row['max_similarity'] = round(sim, 2)
    return row 

In [28]:
t = test.apply(cosine_respond, axis=1)

In [None]:
# Test Topics
results['cosine_similarity'] = t.topic_success

In [29]:
# Test Questions
results['cosine_similarity'] = t.question_success

In [30]:
print('Successes: ', sum(results.cosine_similarity), '/', len(results), '=', round(sum(results.cosine_similarity) / len(results), 2))
t

Successes:  6 / 20 = 0.3


Unnamed: 0,test_question,match_question,sim_question,question_success,max_similarity
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets,What does CPW do about bears in town? When are they relocated or killed?,False,0.19
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues,Dog Parks and Dog Swimming?,False,0.32
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?,Do we have grizzly bears in Colorado?,True,0.23
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?","Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?",True,0.29
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?,Can I have a cosigner?,False,0.28
5,my water pipes froze and now they are leaking. how do i turn off the water??,How can I prevent and thaw frozen water pipes?,How can I prevent and thaw frozen water pipes?,True,0.41
6,"Hello, There are a group of Gambel Oak Trees with Tree ID #'s 38820-38825 that I would like to be pruned up in order to keep them away from the Rec Center wall and to keep them in good, trimmed health. I also would like Tree ID 38825 to be pruned away from the American Flag so it will not come in contact with the tree, from the nearby flag pole. Thank you very much and if you have any questions or concerns, feel free to give me a call.",Public Tree Issues,Public Tree Issues?,False,0.3
7,There are constantly dogs off leash in the children playgrounds of columbine school. Even though the playgrounds are fenced and have a sign stating dogs should not go in. You can see some dog owners do not even pick their dog shit.,Dogs on Open Space and Mountain Parks,Dog Parks and Dog Swimming?,False,0.52
8,"I parked in the garage on 11th and Walnut on Sunday 12/23 starting at 5 PM and left at 12:35 AM that Monday 12/24. I was charged $1.25 but it should have still been free since charged parking doesnâ€™t start till 7 AM on Mondays , I park here all the time and I am confused as to why it says I owed $1.25. Is there a glitch in the system ?",Contact Parking Services,"If there is still time left on my kiosk receipt, can I park in another area?",False,0.09
9,Where do I apply for building permits?,How do I get a building permit?,How do I get a building permit?,True,0.15


# Soft Cosine Similarity... 
https://www.machinelearningplus.com/nlp/cosine-similarity/

In [46]:
# Insert here

# KD Trees Nearest Neighbor
7 successes with euclidean distance<br>
3 successes with cosine_similarity matrix (if I've implemented it right)<br>
Other results below:

In [42]:
dist_metric_success = {
    'euclidean': 7,
    'l2': 7,
    'minkowski': 7,
    'p': 7,
    'manhattan': 0,
    'cityblock': 0,
    'l1': 0,
    'chebyshev': 5,
    'infinity': 5}

In [43]:
tree = KDTree(tfidf.toarray(), metric='euclidean')
# tree = KDTree(cosine_similarity(tfidf, tfidf))

In [44]:
def kd_sim(query):
    """Returns (index, similarity value) of string argument q's most similar match in FAQ, determined by cosine similarity, determined by nearest neighor in a KDTree of TFIDF vectors."""
    # Transform test question into BOW using BOW transformer (based on faq.question) 
    tq_bow = bow_transformer.transform([query])
    # Transform test question's BOW into TFIDF
    tq_tfidf = tfidf_transformer.transform(tq_bow)

    nearest_dist, nearest_ind = tree.query(tq_tfidf.toarray(), k=2)  # k=2 nearest neighbors where k1 = identity
#     nearest_dist, nearest_ind = tree.query(cosine_similarity(tq_tfidf, tfidf), k=2)  # k=2 nearest neighbors where k1 = identity
        
    return nearest_ind[0][0], nearest_dist[0][0]

In [45]:
def kd_respond(row):
    """Returns argument row with added columns to match questions in FAQ."""
    query = row.test_question.strip()

    index, dis = kd_sim(query)

    row['near_question'] = faq.question.iloc[index]
    row['nearest_distance'] = round(dis, 2)
    row['success'] = row.near_question == row.match_question
    return row   

In [46]:
results['kd_tree'] = test.apply(kd_respond, axis=1).success
print('Successes: ', sum(results.kd_tree))
test.apply(kd_respond, axis=1)

Successes:  7


Unnamed: 0,test_question,match_question,near_question,nearest_distance,success
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets,What does CPW do about bears in town? When are they relocated or killed?,1.21,False
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues,Dog Parks and Dog Swimming,1.1,False
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?,Do we have grizzly bears in Colorado?,1.13,True
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?","Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?",1.01,True
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?,How long does it take to become income-certified?,1.15,True
5,my water pipes froze and now they are leaking. how do i turn off the water??,How can I prevent and thaw frozen water pipes?,How can I prevent and thaw frozen water pipes?,0.97,True
6,"Hello, There are a group of Gambel Oak Trees with Tree ID #'s 38820-38825 that I would like to be pruned up in order to keep them away from the Rec Center wall and to keep them in good, trimmed health. I also would like Tree ID 38825 to be pruned away from the American Flag so it will not come in contact with the tree, from the nearby flag pole. Thank you very much and if you have any questions or concerns, feel free to give me a call.",Public Tree Issues,Public Tree Issues,1.04,True
7,There are constantly dogs off leash in the children playgrounds of columbine school. Even though the playgrounds are fenced and have a sign stating dogs should not go in. You can see some dog owners do not even pick their dog shit.,Dogs on Open Space and Mountain Parks,Dog Parks and Dog Swimming,0.94,False
8,"I parked in the garage on 11th and Walnut on Sunday 12/23 starting at 5 PM and left at 12:35 AM that Monday 12/24. I was charged $1.25 but it should have still been free since charged parking doesnâ€™t start till 7 AM on Mondays , I park here all the time and I am confused as to why it says I owed $1.25. Is there a glitch in the system ?",Contact Parking Services,"Parking Information - Hours, Rates and Holidays",1.26,False
9,Where do I apply for building permits?,How do I get a building permit?,How do I get a building permit?,0.91,True


# Multinomial Naive-Bayes

In [33]:
from sklearn.naive_bayes import MultinomialNB
faq_model = MultinomialNB().fit(tfidf, faq['question'])

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=processing.text_process, ngram_range=(1, 1))),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [35]:
pipeline.fit(corpus,faq['question'])

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x1a1263eb70>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 4), preprocessor=None...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [36]:
predictions = pipeline.predict(test.test_question)

In [37]:
results['multinomial_nb'] = [True if x == test.match_question[i] else False for i, x in enumerate(predictions)]
print('Successes: ', sum(results.multinomial_nb))
print(classification_report(predictions, test.match_question))

Successes:  0
                                                                                                                                                                                                                                                                                                                                                                       precision    recall  f1-score   support

                                                                                                                                                                                                                                                                                                                        Can I sell my product from a cart or vehicle?       0.00      0.00      0.00        10
                                                                                                                                                                                           

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# Random Forests of Decision Trees

In [None]:
# Insert here?

# Pretrained Word2Vec Model

from gensim.models import Word2Vec, KeyedVectors
pretrainedpath = "/Users/willscott/Documents/Gensim-Word2Vec-models/GoogleNews-vectors-negative300.bin"
#Load W2V model. This will take some time, but it is a one time effort! 
%time w2v_model = KeyedVectors.load_word2vec_format(pretrainedpath, binary=True)
print('done loading Word2Vec')
print(len(w2v_model.vocab)) #Number of words in the vocabulary. 

#Let us examine the model by knowing what the most similar words are, for a given word!
%time w2v_model.most_similar('boulder')

w2v_model['computer']

#What if I am looking for a word that is not in this vocabulary?
w2v_model['practicalnlp']

# Training Custom Word2Vec Model

In [5]:
from gensim.models import Word2Vec
from sklearn.feature_extraction import stop_words

In [6]:
def w2v_text_process(mess):
    """Returns list of the cleaned text in argument string mess, with stopwords, punctuation removed and tokens lemmatized."""
    clean = [char if char not in string.punctuation else ' ' for char in mess]
    clean = ''.join(clean)
    clean = [word.lower() for word in clean.split() if word.lower() not in stop_words.ENGLISH_STOP_WORDS] 
    return clean

In [7]:
# Creating a feature vector by averaging all embeddings for all sentences
def embedding_feats(list_of_lists):
    DIMENSION = 300
    zero_vector = np.zeros(DIMENSION)
    feats = []
    for list_of_tokens in list_of_lists:
        feat_for_this = zero_vector
        count_for_this = 0
        for token in list_of_tokens:
            if token in our_model:
                feat_for_this += our_model[token]
                count_for_this += 1
        feats.append(feat_for_this/count_for_this) 
    return feats

In [8]:
def max_sim_w2v(tq):
    """Returns (index, similarity value) of string argument q's most similar match in FAQ, determined by cosine similarity."""
    query = [w2v_text_process(tq)]
    query = embedding_feats(query)
 
    sims = np.transpose(cosine_similarity(query, corpus_vectors))

    max_s = sims.max()
    max_i = np.argmax(sims)
    
    return max_i, max_s 

In [9]:
def cosine_respond_w2v(row):
    """Returns argument row with new info columns to match with questions in FAQ."""
    query = row.test_question.strip()

    index, sim = max_sim_w2v(query)

    row['sim_question'] = faq.question.iloc[index]
    row['max_similarity'] = round(sim, 2)
    row['success'] = row.sim_question == row.match_question
    row['info'] = faq.answer.iloc[index]
    return row   

In [10]:
# Create corpus by joining columns
features = ['Topic', 'Category', 'Department', 'question', 'answer']
corpus = ''
for f in features:
    corpus += faq[f] + ' '
    
corpus = [w2v_text_process(x) for x in corpus]

In [11]:
#Build the model, by selecting the parameters. 
our_model = Word2Vec(corpus, size=300, window=5, min_count=1, workers=4)
#Save the model
our_model.save("../models/tempmodel.w2v")
#Inspect the model by looking for the most similar words for a test word. 
# [print(x) for x in our_model.wv.most_similar('dogs', topn=5)]

In [12]:
corpus_vectors = embedding_feats(corpus)

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [19]:
results['w2v'] = test.apply(cosine_respond_w2v, axis=1).success
print('Successes: ', sum(results.w2v))
test.apply(cosine_respond_w2v, axis=1)

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Successes:  1


Unnamed: 0,test_question,match_question,sim_question,max_similarity,success,info
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets,Will CPW pay for property damage caused by bears?,1.0,False,"In some cases, the CPW will reimburse agricultural producers for damage caused by big game, including bears, to their agricultural product or means of production. The reimbursement process is governed by state statutes and regulations and does not apply to personal property, including fences, garages, cars, or most other objects that a bear might damage within the Boulder city limits."
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues,Will CPW pay for property damage caused by bears?,1.0,False,"In some cases, the CPW will reimburse agricultural producers for damage caused by big game, including bears, to their agricultural product or means of production. The reimbursement process is governed by state statutes and regulations and does not apply to personal property, including fences, garages, cars, or most other objects that a bear might damage within the Boulder city limits."
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?,What if I can't pay my fine by the court date or the due date? Can I get an extension?,1.0,False,"Fines and costs are due on the date that you either plead guilty or are found guilty after trial. In rare cases, the court will grant brief extensions of the payment due date. After the court has imposed your fine, you may complete an application for a stay of your payment obligation. Your application will be reviewed by the judge or the court's staff. The court charges a $25 stay fee."
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?","I have received a citation, how do I appeal?",1.0,False,"Under section 6‐3‐12, you have a right to a hearing to determine whether the assessment of a civil penalty comports with the requirements and standards in chapter 6-3-12, B.R.C, 1981. A request for a hearing must be received by the Boulder Municipal Court no later than 10 calendar days from the date of this letter. If you do not request a hearing within 10 days, the decision of the city to assess a civil penalty and to certify any unpaid charges to the Boulder County Treasurer will become final. The form for requesting a hearing may be found on the Boulder Municipal website ( http://tinyurl.com/civil-onlineform ) or at Request for Quasi-Judicial Review and may be submitted to the attention of ""Civil Settings Request for Quasi-Judicial Review"" and mailed to P.O. Box 8015, Boulder, CO 80302, fax to 303-441-4233, emailed to settings@bouldercolorado.gov, or hand delivered at 1777 6th Street Boulder, CO 80302 between the hours of 8:00 am - 4:30 pm. If questions arise while completing the form, please call the Boulder Municipal Court at 720-564-2327 and ask to speak to the Civil Settings Clerk. ""Do Not Pay this citation until after the hearing process is completed."""
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?,How long does it take to become income-certified?,1.0,True,Generally up to two weeks.
5,my water pipes froze and now they are leaking. how do i turn off the water??,How can I prevent and thaw frozen water pipes?,Boulder Office of Emergency Management?,1.0,False,"The Boulder Office of Emergency Management (OEM) has emergency management responsibilities for both the City of Boulder and Boulder County. The Boulder OEM enables effective preparation for, efficient response to, and effective recovery from emergencies and disasters in order to save lives, reduce human suffering, protect resources and develop a more resilient community."
6,"Hello, There are a group of Gambel Oak Trees with Tree ID #'s 38820-38825 that I would like to be pruned up in order to keep them away from the Rec Center wall and to keep them in good, trimmed health. I also would like Tree ID 38825 to be pruned away from the American Flag so it will not come in contact with the tree, from the nearby flag pole. Thank you very much and if you have any questions or concerns, feel free to give me a call.",Public Tree Issues,Can I seal my record?,1.0,False,"You may have the right to seal the record of conviction of this/these offense(s) if you comply with the applicable provisions of §24-72-308.9 CRS. Municipal court staff cannot provide legal advice. If you have questions, you may wish to seek legal counsel. For more information on Senate Bill 13-123 click here."
7,There are constantly dogs off leash in the children playgrounds of columbine school. Even though the playgrounds are fenced and have a sign stating dogs should not go in. You can see some dog owners do not even pick their dog shit.,Dogs on Open Space and Mountain Parks,How is annual income calculated?,1.0,False,"Annual income is defined as the anticipated total income for the next 12-month period received from all sources by each member (over the age of 18) of the household. It is assumed that today's circumstances will continue for the next 12 months. The applicant is required to verify this by either submitting a letter from his/her employer stating his/her annual wage or by completing an employer verification form. To obtain this form, call the Housing Division at 303-441-3157 ext. 2."
8,"I parked in the garage on 11th and Walnut on Sunday 12/23 starting at 5 PM and left at 12:35 AM that Monday 12/24. I was charged $1.25 but it should have still been free since charged parking doesnâ€™t start till 7 AM on Mondays , I park here all the time and I am confused as to why it says I owed $1.25. Is there a glitch in the system ?",Contact Parking Services,"I have received a citation, how do I appeal?",1.0,False,"Under section 6‐3‐12, you have a right to a hearing to determine whether the assessment of a civil penalty comports with the requirements and standards in chapter 6-3-12, B.R.C, 1981. A request for a hearing must be received by the Boulder Municipal Court no later than 10 calendar days from the date of this letter. If you do not request a hearing within 10 days, the decision of the city to assess a civil penalty and to certify any unpaid charges to the Boulder County Treasurer will become final. The form for requesting a hearing may be found on the Boulder Municipal website ( http://tinyurl.com/civil-onlineform ) or at Request for Quasi-Judicial Review and may be submitted to the attention of ""Civil Settings Request for Quasi-Judicial Review"" and mailed to P.O. Box 8015, Boulder, CO 80302, fax to 303-441-4233, emailed to settings@bouldercolorado.gov, or hand delivered at 1777 6th Street Boulder, CO 80302 between the hours of 8:00 am - 4:30 pm. If questions arise while completing the form, please call the Boulder Municipal Court at 720-564-2327 and ask to speak to the Civil Settings Clerk. ""Do Not Pay this citation until after the hearing process is completed."""
9,Where do I apply for building permits?,How do I get a building permit?,Can I sell my product from a cart or vehicle?,1.0,False,Visit the Mobile Food Vehicles page for details.


# Doc2Vec with GenSim
Make sure you have a C compiler before installing Gensim, to use the optimized doc2vec routines - 70x speedup compared to plain NumPy implementation, https://rare-technologies.com/parallelizing-word2vec-in-python/

In [6]:
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk import word_tokenize
from nltk.stem.porter import *

In [7]:
def d2v_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Lemmatizes all words
    4. Returns a string of the cleaned text
    """
    # Check characters to see if they are in punctuation
    clean = [char if char not in string.punctuation else ' ' for char in mess]
    # Join the characters again to form the string.
    clean = ''.join(clean)
    # Now just remove any stopwords
    clean = [word.lower() for word in clean.split() if word.lower() not in stop_words.ENGLISH_STOP_WORDS]
    # Lemmatize
    clean = lem(clean)
    return ' '.join(clean)
 
def default_clean(text, stem=True, stemmer = PorterStemmer()):
    '''
        Removes default bad characters, Removes stopwords and does stemming
    '''
    # Check characters to see if they are in punctuation
    text = [char if char not in string.punctuation else ' ' for char in text]
    # Join the characters again to form the string.
    text = ''.join(text)
    # Now just remove any stopwords
    text = [word.lower() for word in text.split() if word.lower() not in stop_words.ENGLISH_STOP_WORDS]
    text = ' '.join(text)
    
    if stem:
        text_stemmed = [wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(text) if len(word) > 3]
#         text_stemmed = [stemmer.stem(word) for word in word_tokenize(text) if len(word) > 3]
    else:
        text_stemmed = [word for word in word_tokenize(text) if len(word) > 3]
    
    text = ' '.join(text_stemmed)
    return text

In [8]:
# Add Dept, Category, Topic as LABELS???
class TaggedDocumentIterator(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield TaggedDocument(words=doc.split(), tags=[self.labels_list[idx]])

In [9]:
def test_d2v(test_sample):
    """Passes parameter test_sample into Doc2Vec model and returns the most similar entry in training set."""
    #Clean the document using the utility functions used in train phase
    test_sample = default_clean(test_sample)

    #Convert the sample document into a list and use the infer_vector method to get a vector representation for it
    new_doc_words = test_sample.split()
    new_doc_vec = model.infer_vector(new_doc_words)

    #use the most_similar utility to find the most similar documents.
    return model.docvecs.most_similar(positive=[new_doc_vec])

In [10]:
def respond_d2v(row):
    """Passes parameter row of test dataframe to Doc2Vec prediction function and returns row with results added."""
    query = row.test_question.strip()

    sims = test_d2v(query)
    
    best = sims[0][0]

    row['sim_question'] = best
    row['similarity'] = round(sims[0][1], 2)
    row['success'] = (best == row.match_question)
    return row   

## Train on Answer, test with Question

In [11]:
sample = faq[['question', 'answer']]
sample = sample.sample(frac=1).reset_index(drop=True)
print ('The shape of the input data frame: {}'.format(sample.shape))

The shape of the input data frame: (688, 2)


In [12]:
sample['answer'] = sample['answer'].apply(default_clean)

In [13]:
docLabels = list(sample['question'])
data = list(sample['answer'])
sentences = TaggedDocumentIterator(data, docLabels)

In [32]:
# This is a simple gridsearch-like process to automate parameter testing. 
runs = 100 
res = pd.DataFrame(columns=['params', 'avg successes'])
e = 42
for i in range(5):
    v_s = 50
    for j in range(10):
        m_c = 0
        for k in range(3):
            model = Doc2Vec(vector_size=v_s, min_count=m_c, epochs=e)
            model.build_vocab(sentences)
            model.train(sentences,total_examples=model.corpus_count, epochs=model.epochs)

            avg = 0
            for i in range(runs):
                t = test.apply(respond_d2v, axis=1)
                avg += sum(t.success)

            res = res.append({'params' : 'vector_size={}, min_count={}, epochs={}'.format(v_s, m_c, e), 'avg successes' : avg / runs}, ignore_index=True)

            m_c += 1
            print('mc')
        v_s += 50
        print('vs')
    e += 10
    print('e')

mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e


In [31]:
res.tail()

Unnamed: 0,params,avg successes
95,"vector_size=440, min_count=1, epochs=42",0.97
96,"vector_size=470, min_count=0, epochs=42",1.22
97,"vector_size=470, min_count=1, epochs=42",0.95
98,"vector_size=500, min_count=0, epochs=42",1.18
99,"vector_size=500, min_count=1, epochs=42",1.17


In [30]:
res[res['avg successes'] == max(res['avg successes'])]

Unnamed: 0,params,avg successes
23,"vector_size=260, min_count=1, epochs=12",2.63


In [64]:
# Avg num of sucessses (out of 20) : model parameters

# no stemming
# 2.50 : model = Doc2Vec(vector_size=100, min_count=1, epochs=100)
# 2.70 : model = Doc2Vec(vector_size=100, min_count=0, epochs=100)
# 1.27 : model = Doc2Vec(vector_size=100, min_count=1, epochs=25)
# 1.23 : model = Doc2Vec(vector_size=100, min_count=0 epochs=25)
# 1.18 : model = Doc2Vec( min_count=0, epochs=25)

# with stemming
# 1.74 : model = Doc2Vec(vector_size=100, min_count=1, epochs=100)
# 1.21 : model = Doc2Vec(vector_size=100, min_count=0, epochs=100)
# 1.28 : model = Doc2Vec(vector_size=100, min_count=1, epochs=25)
# 1.15 : model = Doc2Vec(vector_size=100, min_count=0, epochs=25)

# with lemming
# 0.58 : model = Doc2Vec(vector_size=100, min_count=1, epochs=100)
# 1.04 : model = Doc2Vec(vector_size=100, min_count=0, epochs=100)
# 0.93 : model = Doc2Vec(vector_size=100, min_count=1, epochs=25)
# 0.99 : model = Doc2Vec(vector_size=100, min_count=0, epochs=25)
# 1.54 : vector_size=130, min_count=14, epochs=10
# 2.14 : vector_size=230, min_count=0, epochs=10
# 2.63 : vector_size=260, min_count=1, epochs=12
# Tested up to 500, 2, 42

In [62]:
# Individual model testing
model = Doc2Vec(vector_size=100, min_count=0, epochs=25)
model.build_vocab(sentences)
model.train(sentences,total_examples=model.corpus_count, epochs=model.epochs)

runs = 1000 
avg = 0
for i in range(runs):
    t = test.apply(respond_d2v, axis=1)
    avg += sum(t.success)
print('Avg. Successes: ', avg / runs)

In [24]:
# Store the model to mmap-able files
model.save('../models/model_answer.doc2vec')
# Load the model
model = Doc2Vec.load('../models/model_answer.doc2vec')

## Train on Question+Answer, test with Question

In [33]:
sample = pd.DataFrame(corpus, columns=['qna']) 
sample['question'] = faq['question']
sample['answer'] = faq['answer']
sample = sample.sample(frac=1).reset_index(drop=True)
print ('The shape of the input data frame: {}'.format(sample.shape))

The shape of the input data frame: (688, 3)


In [34]:
sample['qna'] = sample['qna'].apply(default_clean)

In [35]:
docLabels = list(sample['question'])
data = list(sample['qna'])
sentences = TaggedDocumentIterator(data, docLabels)

In [40]:
# This is a simple gridsearch-like process to automate parameter testing. 
runs = 100 
res = pd.DataFrame(columns=['params', 'avg successes'])
e = 10
for i in range(5):
    v_s = 50
    for j in range(10):
        m_c = 0
        for k in range(3):
            model = Doc2Vec(vector_size=v_s, min_count=m_c, epochs=e)
            model.build_vocab(sentences)
            model.train(sentences,total_examples=model.corpus_count, epochs=model.epochs)

            avg = 0
            for i in range(runs):
                t = test.apply(respond_d2v, axis=1)
                avg += sum(t.success)

            res = res.append({'params' : 'vector_size={}, min_count={}, epochs={}'.format(v_s, m_c, e), 'avg successes' : avg / runs}, ignore_index=True)

            m_c += 1
            print('mc')
        v_s += 50
        print('vs')
    e += 5
    print('e')

mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e


In [41]:
res.tail()

Unnamed: 0,params,avg successes
145,"vector_size=450, min_count=1, epochs=30",0.99
146,"vector_size=450, min_count=2, epochs=30",0.72
147,"vector_size=500, min_count=0, epochs=30",1.06
148,"vector_size=500, min_count=1, epochs=30",1.01
149,"vector_size=500, min_count=2, epochs=30",0.65


In [42]:
res[res['avg successes'] == max(res['avg successes'])]

Unnamed: 0,params,avg successes
13,"vector_size=250, min_count=1, epochs=10",2.34


In [116]:
# Individual model testing
model = Doc2Vec(vector_size=100, min_count=0, epochs=100)
model.build_vocab(sentences)
model.train(sentences,total_examples=model.corpus_count, epochs=model.epochs)

runs = 100 
avg = 0
for i in range(runs):
    t = test.apply(respond_d2v, axis=1)
    avg += sum(t.success)
print('Avg. Successes: ', avg / runs)

In [103]:
# with tutourial cleaning, no stemming, taggedDocument
#  model = Doc2Vec(vector_size=100, min_count=0, epochs=100)
# 0-2 model = Doc2Vec(vector_size=100, min_count=1, epochs=30)

# with lemming
# 0-2 model = Doc2Vec(vector_size=100, min_count=1, epochs=25)
# 0-2 model = Doc2Vec(vector_size=100, min_count=1, epochs=30)
# 3.07 : vector_size=100, min_count=1, epochs=82

In [312]:
# Store the model to mmap-able files
model.save('../models/model_qna.doc2vec')
# Load the model
model = Doc2Vec.load('../models/model_qna.doc2vec')

# Analyze Similarity Comparisons

In [15]:
grid = pd.DataFrame(index=results.columns, columns=['Matched'])
for c in results.columns:
    grid.Matched[c] = sum(results[c])
grid

Unnamed: 0,Matched
cosine_similarity,6
kd_tree,6
