https://www.analyticsvidhya.com/blog/2018/01/faq-chatbots-the-future-of-information-searching/
http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/
http://nlp.town/blog/sentence-similarity/
https://spacy.io/usage/training

In [2]:
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KDTree

# Uncomment to see full cell text
pd.set_option('display.max_colwidth', 0)

In [3]:
# Import local libraries
import sys
sys.path.append('../src')

import processing

In [22]:
faq = pd.read_csv('../data/interim/faq-text-separated.csv', keep_default_na=False)
test = pd.read_csv('../data/test/test-questions.csv')

In [23]:
test.head()

Unnamed: 0,test_question,match_question
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?"
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?


# Text Processing

In [6]:
# Create corpus by joining questions and answers
features = ['Topic', 'Category', 'Department', 'question', 'answer']
corpus = ''
for f in features:
    corpus += faq[f] + ' '

# Create BOW tranformer based on corpus
bow_transformer = CountVectorizer(analyzer=processing.text_process).fit(corpus)
# Tranform corpus itself into BOW
bow = bow_transformer.transform(corpus)

# Create TFIDF transformer based on faq.question's BOW
tfidf_transformer = TfidfTransformer().fit(bow)
# Transform faq.question's BOW into TFIDF
tfidf = tfidf_transformer.transform(bow)

results = pd.DataFrame()

# QnA Maker
(The data used by QnA Maker at this time is a slightly less clean version.)<br>
6 successes

# Google Dialogflow
Using the built-in Dialogflow ML in 'hybrid' mode with threshold 0.3 (default settings)<br>
4 successes

# Semantic Similarity with spaCy
1 success

In [None]:
# This is not the full code, so isn't operational in this notebook.
import spacy
# A larger model should be used for better accuracy
nlp = spacy.load('en_core_web_sm')

def max_sim_spacy(q, docs):
    """Returns (index, similarity) of argument q's most similar match in argument docs, all spaCy documents."""
    q = nlp(q)
    max_i = 0
    max_s = 0
    ms = []
    for i, d in enumerate(docs):
        if d.similarity(q) > max_s:
            max_s = d.similarity(q)
            max_i = i
    
    return max_i, max_s    
 
q_docs = [nlp(entry) for entry in faq.question]
a_docs = [nlp(entry) for entry in faq.answer]      

# Cosine Similarity
7 successes<br>

In [5]:
def max_sim_skl(tq):
    """Returns (index, similarity value) of string argument q's most similar match in FAQ, determined by cosine similarity."""
    # Transform test question into BOW using BOW transformer (based on faq.question) 
    tq_bow = bow_transformer.transform([tq])
    # Transform test question's BOW into TFIDF
    tq_tfidf = tfidf_transformer.transform(tq_bow)
    
    sims = np.transpose(cosine_similarity(tq_tfidf, tfidf))

    max_s = sims.max()
    max_i = np.argmax(sims)
    
    return max_i, max_s 

In [75]:
# Form cosine similarity matrix
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix = tfidf_vectorizer.fit_transform(faq.question)
# print(tfidf_matrix.shape)

In [6]:
def cosine_respond(row):
    """Returns argument row with added columns to match questions in FAQ."""
    query = row.test_question.strip()

    index, sim = max_sim_skl(query)

    row['sim_question'] = faq.question.iloc[index]
#     row['info'] = faq.answer.iloc[index]
    row['max_similarity'] = round(sim, 2)
    row['success'] = row.sim_question == row.match_question
    return row   

In [7]:
results['cosine_similarity'] = test.apply(cosine_respond, axis=1).success
print('Successes: ', sum(results.cosine_similarity))
test.apply(cosine_respond, axis=1)

Successes:  6


Unnamed: 0,test_question,match_question,sim_question,max_similarity,success
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets,What does CPW do about bears in town? When are they relocated or killed?,0.27,False
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues,Dog Parks and Dog Swimming,0.39,False
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?,Do we have grizzly bears in Colorado?,0.37,True
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?","Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?",0.49,True
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?,How long does it take to become income-certified?,0.34,True
5,my water pipes froze and now they are leaking. how do i turn off the water??,How can I prevent and thaw frozen water pipes?,How can I prevent and thaw frozen water pipes?,0.51,True
6,"Hello, There are a group of Gambel Oak Trees with Tree ID #'s 38820-38825 that I would like to be pruned up in order to keep them away from the Rec Center wall and to keep them in good, trimmed health. I also would like Tree ID 38825 to be pruned away from the American Flag so it will not come in contact with the tree, from the nearby flag pole. Thank you very much and if you have any questions or concerns, feel free to give me a call.",Public Tree Issues,Public Tree Issues,0.47,True
7,There are constantly dogs off leash in the children playgrounds of columbine school. Even though the playgrounds are fenced and have a sign stating dogs should not go in. You can see some dog owners do not even pick their dog shit.,Dogs on Open Space and Mountain Parks,Dog Parks and Dog Swimming,0.55,False
8,"I parked in the garage on 11th and Walnut on Sunday 12/23 starting at 5 PM and left at 12:35 AM that Monday 12/24. I was charged $1.25 but it should have still been free since charged parking doesnâ€™t start till 7 AM on Mondays , I park here all the time and I am confused as to why it says I owed $1.25. Is there a glitch in the system ?",Contact Parking Services,"Parking Information - Hours, Rates and Holidays",0.21,False
9,Where do I apply for building permits?,How do I get a building permit?,How do I get a building permit?,0.59,False


# Soft Cosine Similarity... 
https://www.machinelearningplus.com/nlp/cosine-similarity/

In [46]:
# Insert here

# Multinomial Naive-Bayes

In [7]:
from sklearn.naive_bayes import MultinomialNB
faq_model = MultinomialNB().fit(tfidf, faq['question'])

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=processing.text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [9]:
pipeline.fit(corpus,faq['question'])

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x1a16285bf8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [10]:
predictions = pipeline.predict(test.test_question)

In [18]:
results['multinomial_nb'] = [True if x == test.match_question[i] else False for i, x in enumerate(predictions)]
print('Successes: ', sum(results.multinomial_nb))
print(classification_report(predictions, test.match_question))

Successes:  0
                                                                                                                                                                                                                                                                                                                                                                               precision    recall  f1-score   support

                                                                                                                                                                                                                                                                                                                                                     Composting and Recycling       0.00      0.00      0.00         0
                                                                                                                                                                           

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# Random Forests of Decision Trees

In [None]:
# Insert here?

# KD Trees Nearest Neighbor
7 successes with euclidean distance<br>
3 successes with cosine_similarity matrix (if I've implemented it right)<br>
Other results below:

In [78]:
dist_metric_success = {
    'euclidean': 7,
    'l2': 7,
    'minkowski': 7,
    'p': 7,
    'manhattan': 0,
    'cityblock': 0,
    'l1': 0,
    'chebyshev': 5,
    'infinity': 5}

In [79]:
tree = KDTree(q_tfidf.toarray(), metric='euclidean')
# tree = KDTree(cosine_similarity(tfidf, tfidf))

In [80]:
def kd_sim(query):
    """Returns (index, similarity value) of string argument q's most similar match in FAQ, determined by cosine similarity, determined by nearest neighor in a KDTree of TFIDF vectors."""
    # Transform test question into BOW using BOW transformer (based on faq.question) 
    tq_bow = bow_transformer.transform([query])
    # Transform test question's BOW into TFIDF
    tq_tfidf = tfidf_transformer.transform(tq_bow)

    nearest_dist, nearest_ind = tree.query(tq_tfidf.toarray(), k=2)  # k=2 nearest neighbors where k1 = identity
#     nearest_dist, nearest_ind = tree.query(cosine_similarity(tq_tfidf, tfidf), k=2)  # k=2 nearest neighbors where k1 = identity
        
    return nearest_ind[0][0], nearest_dist[0][0]

In [81]:
def kd_respond(row):
    """Returns argument row with added columns to match questions in FAQ."""
    query = row.test_question.strip()

    index, dis = kd_sim(query)

    row['near_question'] = faq.question.iloc[index]
    row['nearest_distance'] = round(dis, 2)
    row['success'] = row.near_question == row.match_question
    return row   

In [82]:
results['kd_tree'] = test.apply(kd_respond, axis=1).success
print('Successes: ', sum(results.kd_tree))
test.apply(kd_respond, axis=1)

Successes:  6


Unnamed: 0,test_question,match_question,near_question,nearest_distance,success
0,"I live on the corner of Bear Mountain Drive and Scrub Oak Circle, and would like the City enforce the speed limit on Bear Mountain Drive. Despite a limit of 25 mph, I routinely observe vehicles traveling well above that speed in both directions (especially in the mornings and early evening). There are a ton of kids in this neighborhood, as well as a blind curve in the road between several crosswalks. It baffles me to see so much enforcement on Lehigh (with the regular presence of a photo van) and only the very occasional patrol car on Bear Mountain Drive. It would be great if the photo van or other officers could regularly make an appearance on Bear Mountain.",Speeding on Residential Streets,What does CPW do about bears in town? When are they relocated or killed?,1.21,False
1,Can you please mow the grass in the park. It is becoming difficult to find the dog poop and dog owners are just leaving it in the grass.,Park Maintenance Issues,Dog Parks and Dog Swimming,1.11,False
2,Are there grizzlies in Boulder?,Do we have grizzly bears in Colorado?,Do we have grizzly bears in Colorado?,1.12,True
3,Where do I report being hit by a bicycle?,"Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?","Have you had a close call with a bicycle, pedestrian or motorist? For example: Were you in a crosswalk (on foot, bike, skateboard) and a car almost hit you? Were you riding your bike on the right side of the road and a car almost hit you? Did you bike through a red light and a car almost hit you? Were you walking on the sidewalk and a bike almost hit you?",1.01,True
4,How much time do I have to wait for my income certification for affordable housing?,How long does it take to become income-certified?,How long does it take to become income-certified?,1.15,True
5,my water pipes froze and now they are leaking. how do i turn off the water??,How can I prevent and thaw frozen water pipes?,How can I prevent and thaw frozen water pipes?,0.99,True
6,"Hello, There are a group of Gambel Oak Trees with Tree ID #'s 38820-38825 that I would like to be pruned up in order to keep them away from the Rec Center wall and to keep them in good, trimmed health. I also would like Tree ID 38825 to be pruned away from the American Flag so it will not come in contact with the tree, from the nearby flag pole. Thank you very much and if you have any questions or concerns, feel free to give me a call.",Public Tree Issues,Public Tree Issues,1.03,True
7,There are constantly dogs off leash in the children playgrounds of columbine school. Even though the playgrounds are fenced and have a sign stating dogs should not go in. You can see some dog owners do not even pick their dog shit.,Dogs on Open Space and Mountain Parks,Dog Parks and Dog Swimming,0.95,False
8,"I parked in the garage on 11th and Walnut on Sunday 12/23 starting at 5 PM and left at 12:35 AM that Monday 12/24. I was charged $1.25 but it should have still been free since charged parking doesnâ€™t start till 7 AM on Mondays , I park here all the time and I am confused as to why it says I owed $1.25. Is there a glitch in the system ?",Contact Parking Services,"Parking Information - Hours, Rates and Holidays",1.26,False
9,Where do I apply for building permits?,How do I get a building permit?,How do I get a building permit?,0.9,False


# Doc2Vec with GenSim
Make sure you have a C compiler before installing Gensim, to use the optimized doc2vec routines (70x speedup compared to plain NumPy implementation, https://rare-technologies.com/parallelizing-word2vec-in-python/).

In [6]:
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk import word_tokenize
from nltk.stem.porter import *

In [7]:
def d2v_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Lemmatizes all words
    4. Returns a string of the cleaned text
    """
    # Check characters to see if they are in punctuation
    clean = [char if char not in string.punctuation else ' ' for char in mess]
    # Join the characters again to form the string.
    clean = ''.join(clean)
    # Now just remove any stopwords
    clean = [word.lower() for word in clean.split() if word.lower() not in stop_words.ENGLISH_STOP_WORDS]
    # Lemmatize
    clean = lem(clean)
    return ' '.join(clean)
 
def default_clean(text, stem=True, stemmer = PorterStemmer()):
    '''
        Removes default bad characters, Removes stopwords and does stemming
    '''
    # Check characters to see if they are in punctuation
    text = [char if char not in string.punctuation else ' ' for char in text]
    # Join the characters again to form the string.
    text = ''.join(text)
    # Now just remove any stopwords
    text = [word.lower() for word in text.split() if word.lower() not in stop_words.ENGLISH_STOP_WORDS]
    text = ' '.join(text)
    
    if stem:
        text_stemmed = [wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(text) if len(word) > 3]
#         text_stemmed = [stemmer.stem(word) for word in word_tokenize(text) if len(word) > 3]
    else:
        text_stemmed = [word for word in word_tokenize(text) if len(word) > 3]
    
    text = ' '.join(text_stemmed)
    return text

In [8]:
# Add Dept, Category, Topic as LABELS???
class TaggedDocumentIterator(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield TaggedDocument(words=doc.split(), tags=[self.labels_list[idx]])

In [9]:
def test_d2v(test_sample):
    """Passes parameter test_sample into Doc2Vec model and returns the most similar entry in training set."""
    #Clean the document using the utility functions used in train phase
    test_sample = default_clean(test_sample)

    #Convert the sample document into a list and use the infer_vector method to get a vector representation for it
    new_doc_words = test_sample.split()
    new_doc_vec = model.infer_vector(new_doc_words)

    #use the most_similar utility to find the most similar documents.
    return model.docvecs.most_similar(positive=[new_doc_vec])

In [10]:
def respond_d2v(row):
    """Passes parameter row of test dataframe to Doc2Vec prediction function and returns row with results added."""
    query = row.test_question.strip()

    sims = test_d2v(query)
    
    best = sims[0][0]

    row['sim_question'] = best
    row['similarity'] = round(sims[0][1], 2)
    row['success'] = (best == row.match_question)
    return row   

## Train on Answer, test with Question

In [11]:
sample = faq[['question', 'answer']]
sample = sample.sample(frac=1).reset_index(drop=True)
print ('The shape of the input data frame: {}'.format(sample.shape))

The shape of the input data frame: (688, 2)


In [12]:
sample['answer'] = sample['answer'].apply(default_clean)

In [13]:
docLabels = list(sample['question'])
data = list(sample['answer'])
sentences = TaggedDocumentIterator(data, docLabels)

In [32]:
# This is a simple gridsearch-like process to automate parameter testing. 
runs = 100 
res = pd.DataFrame(columns=['params', 'avg successes'])
e = 42
for i in range(5):
    v_s = 50
    for j in range(10):
        m_c = 0
        for k in range(3):
            model = Doc2Vec(vector_size=v_s, min_count=m_c, epochs=e)
            model.build_vocab(sentences)
            model.train(sentences,total_examples=model.corpus_count, epochs=model.epochs)

            avg = 0
            for i in range(runs):
                t = test.apply(respond_d2v, axis=1)
                avg += sum(t.success)

            res = res.append({'params' : 'vector_size={}, min_count={}, epochs={}'.format(v_s, m_c, e), 'avg successes' : avg / runs}, ignore_index=True)

            m_c += 1
            print('mc')
        v_s += 50
        print('vs')
    e += 10
    print('e')

mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e


In [31]:
res.tail()

Unnamed: 0,params,avg successes
95,"vector_size=440, min_count=1, epochs=42",0.97
96,"vector_size=470, min_count=0, epochs=42",1.22
97,"vector_size=470, min_count=1, epochs=42",0.95
98,"vector_size=500, min_count=0, epochs=42",1.18
99,"vector_size=500, min_count=1, epochs=42",1.17


In [30]:
res[res['avg successes'] == max(res['avg successes'])]

Unnamed: 0,params,avg successes
23,"vector_size=260, min_count=1, epochs=12",2.63


In [64]:
# Avg num of sucessses (out of 20) : model parameters

# no stemming
# 2.50 : model = Doc2Vec(vector_size=100, min_count=1, epochs=100)
# 2.70 : model = Doc2Vec(vector_size=100, min_count=0, epochs=100)
# 1.27 : model = Doc2Vec(vector_size=100, min_count=1, epochs=25)
# 1.23 : model = Doc2Vec(vector_size=100, min_count=0 epochs=25)
# 1.18 : model = Doc2Vec( min_count=0, epochs=25)

# with stemming
# 1.74 : model = Doc2Vec(vector_size=100, min_count=1, epochs=100)
# 1.21 : model = Doc2Vec(vector_size=100, min_count=0, epochs=100)
# 1.28 : model = Doc2Vec(vector_size=100, min_count=1, epochs=25)
# 1.15 : model = Doc2Vec(vector_size=100, min_count=0, epochs=25)

# with lemming
# 0.58 : model = Doc2Vec(vector_size=100, min_count=1, epochs=100)
# 1.04 : model = Doc2Vec(vector_size=100, min_count=0, epochs=100)
# 0.93 : model = Doc2Vec(vector_size=100, min_count=1, epochs=25)
# 0.99 : model = Doc2Vec(vector_size=100, min_count=0, epochs=25)
# 1.54 : vector_size=130, min_count=14, epochs=10
# 2.14 : vector_size=230, min_count=0, epochs=10
# 2.63 : vector_size=260, min_count=1, epochs=12
# Tested up to 500, 2, 42

In [62]:
# Individual model testing
model = Doc2Vec(vector_size=100, min_count=0, epochs=25)
model.build_vocab(sentences)
model.train(sentences,total_examples=model.corpus_count, epochs=model.epochs)

runs = 1000 
avg = 0
for i in range(runs):
    t = test.apply(respond_d2v, axis=1)
    avg += sum(t.success)
print('Avg. Successes: ', avg / runs)

In [24]:
# Store the model to mmap-able files
model.save('../models/model_answer.doc2vec')
# Load the model
model = Doc2Vec.load('../models/model_answer.doc2vec')

## Train on Question+Answer, test with Question

In [33]:
sample = pd.DataFrame(corpus, columns=['qna']) 
sample['question'] = faq['question']
sample['answer'] = faq['answer']
sample = sample.sample(frac=1).reset_index(drop=True)
print ('The shape of the input data frame: {}'.format(sample.shape))

The shape of the input data frame: (688, 3)


In [34]:
sample['qna'] = sample['qna'].apply(default_clean)

In [35]:
docLabels = list(sample['question'])
data = list(sample['qna'])
sentences = TaggedDocumentIterator(data, docLabels)

In [40]:
# This is a simple gridsearch-like process to automate parameter testing. 
runs = 100 
res = pd.DataFrame(columns=['params', 'avg successes'])
e = 10
for i in range(5):
    v_s = 50
    for j in range(10):
        m_c = 0
        for k in range(3):
            model = Doc2Vec(vector_size=v_s, min_count=m_c, epochs=e)
            model.build_vocab(sentences)
            model.train(sentences,total_examples=model.corpus_count, epochs=model.epochs)

            avg = 0
            for i in range(runs):
                t = test.apply(respond_d2v, axis=1)
                avg += sum(t.success)

            res = res.append({'params' : 'vector_size={}, min_count={}, epochs={}'.format(v_s, m_c, e), 'avg successes' : avg / runs}, ignore_index=True)

            m_c += 1
            print('mc')
        v_s += 50
        print('vs')
    e += 5
    print('e')

mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
mc
mc
mc
vs
e


In [41]:
res.tail()

Unnamed: 0,params,avg successes
145,"vector_size=450, min_count=1, epochs=30",0.99
146,"vector_size=450, min_count=2, epochs=30",0.72
147,"vector_size=500, min_count=0, epochs=30",1.06
148,"vector_size=500, min_count=1, epochs=30",1.01
149,"vector_size=500, min_count=2, epochs=30",0.65


In [42]:
res[res['avg successes'] == max(res['avg successes'])]

Unnamed: 0,params,avg successes
13,"vector_size=250, min_count=1, epochs=10",2.34


In [116]:
# Individual model testing
model = Doc2Vec(vector_size=100, min_count=0, epochs=100)
model.build_vocab(sentences)
model.train(sentences,total_examples=model.corpus_count, epochs=model.epochs)

runs = 100 
avg = 0
for i in range(runs):
    t = test.apply(respond_d2v, axis=1)
    avg += sum(t.success)
print('Avg. Successes: ', avg / runs)

In [103]:
# with tutourial cleaning, no stemming, taggedDocument
#  model = Doc2Vec(vector_size=100, min_count=0, epochs=100)
# 0-2 model = Doc2Vec(vector_size=100, min_count=1, epochs=30)

# with lemming
# 0-2 model = Doc2Vec(vector_size=100, min_count=1, epochs=25)
# 0-2 model = Doc2Vec(vector_size=100, min_count=1, epochs=30)
# 3.07 : vector_size=100, min_count=1, epochs=82

In [312]:
# Store the model to mmap-able files
model.save('../models/model_qna.doc2vec')
# Load the model
model = Doc2Vec.load('../models/model_qna.doc2vec')

# Analyze Similarity Comparisons

In [15]:
grid = pd.DataFrame(index=results.columns, columns=['Matched'])
for c in results.columns:
    grid.Matched[c] = sum(results[c])
grid

Unnamed: 0,Matched
cosine_similarity,6
kd_tree,6
