In [4]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from nltk.tokenize import word_tokenize 
import gensim
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from bs4 import BeautifulSoup
nltk.download('punkt')
ps = PorterStemmer()

[nltk_data] Downloading package punkt to /home/zachmacke/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def prep_data():
    """
    Loads the data and reads it into dataframes. 
    
    Returns:
        questions, answers (pandas Dataframe): question and answers in seperate dataframes
        
    """
    data0 = pd.read_csv('python_questions0.csv')
    data1 = pd.read_csv('python_questions1.csv')
    data2 = pd.read_csv('python_questions2.csv')
    data3 = pd.read_csv('python_questions3.csv')

    data = pd.concat([data0,data1,data2,data3], ignore_index=False)
    
    questions = data[['question_id','question_title']].drop_duplicates('question_id')
    answers = data[['parent_id','answer','answer_score']] 
    
    return (questions,answers)

In [6]:
df_questions, df_answers = prep_data()

In [7]:
def tag_questions(df):
    """Tokenize, tag, and stem sentences
    
    Args:
        df (pandas Dataframe): pre-tagged dataframe to iterate through
    
    Returns:
        tagged_sents (list): a list of tokenized sentances
    
    """
    taged_sents = []

    for sent in range(df_questions.shape[0]):

        title = df_questions.iloc[sent]['question_title']
        q_id = df_questions.iloc[sent]['question_id']

        words = [ps.stem(word) for word in nltk.word_tokenize(title.lower())]

        documents = gensim.models.doc2vec.TaggedDocument(words, [str(q_id)])

        taged_sents.append(documents)
        
        if sent % 10000 == 0:
            print(sent)
        
    return (taged_sents)

In [79]:
tagged_docs = tag_questions(df_questions)

0


KeyboardInterrupt: 

In [47]:
def train_model():
    """
    Train the model on a certain set of values and save the model for later 
    """
    vec_size = 100
    alpha = 0.025

    print_every_n = 1

    model = Doc2Vec(vector_size=vec_size,dm=0, alpha=alpha, min_alpha=alpha, min_count=0, workers = 4)
    model.build_vocab(tagged_docs)

    for epoch in range(40):

        if epoch % print_every_n == 0:       
            print('iteration {0}'.format(epoch))

        model.train(tagged_docs,
                total_examples=len(tagged_docs),
                epochs=1)

        if model.alpha > 0.001:
            model.alpha -= 0.0002
            model.min_alpha = model.alpha

    model.save('trained_model')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39


In [8]:
model = Doc2Vec.load('trained_model')

In [9]:
# reset the index of our dataframes 
df_questions_qid_index = df_questions.set_index('question_id') 
df_answers_parent_index = df_answers.set_index('parent_id') 

In [None]:
def find_similar(user_input):
    """
    Finds the most similar vectors to those in our training set
    
    Args:
        user_input (string): the user's question
        
    Returns
        ques[:9] (list splice): the top 9 closest matching questions to the user input
    """
    
    ques = [] 
    s = set()
    
    for i in range(20):
        tokens = user_input.split()
        stem_token = [ps.stem(word) for word in tokens]
        
        new_vector = model.infer_vector(stem_token,steps=4000)
        
        sims = model.docvecs.most_similar([new_vector])
        for i in sims:
            if i[0] not in s:
                ques.append(((df_questions_qid_index.loc[int(i[0])]['question_title']),i[0],i[1]))
                s.add(i[0])
                
    ques.sort(key=lambda x: x[1])            
    
    return(ques[:9])

In [None]:
def start_bot():
   """
   Initialize the chatbot
   """ 
    while(True):

        print("\nPlease input question:\n")
        question = input()
        if question == 'q':
            break

        answers= find_similar(question)

        def get_answer():
            """
            Create the user interface and use the parent id each question to get the corresponding asnwer
            """
            print("\nWhich answer would you like to see?  ") 
            print('-'*35) 

            for x in range(len(answers)):
                print(str(x + 1) + '. ' + str(answers[x][0]))
            print('-'*35) 

            print("Input a number (1-9): ")
            chosen_answer = (int(input()) - 1)

            print('\nHere is your answer: ')
            print('-'*35) 
            ans = df_answers[df_answers['parent_id'] == int(answers[chosen_answer][1])]
            max_score = ans['answer_score'].idxmax()
            tagged_answer = (ans.loc[max_score]['answer'])
            clean_answer = BeautifulSoup(tagged_answer).get_text()
            print(clean_answer)

        get_answer()

        while(True):
            print("Would you list to see the list again? (y/n): ")
            y_n = input()
            if (y_n == 'y'):
                get_answer()
            else:
                break

In [None]:
start_bot()


Please input question:

list comprehensions in python

Which answer would you like to see?  
-----------------------------------
1. List comprehensions in python
2. Cross-list comprehension in Python
3. List comprehension python
4. Python list append
5. Capture-and-yield in a list comprehension
6. List comprehension with if-condition
7. List Comprehension in Python
8. List Comprehension Syntax
9. Python list of lists
-----------------------------------
Input a number (1-9): 
3

Here is your answer: 
-----------------------------------
A list comprehension is used to take an existing sequence and perform some function and/or filter to it, resulting in a new list. So, in this case a list comprehension is not appropriate since you don't have a starting sequence. An example with a while loop:

numbers = []
x=input()
while x != 1:
  numbers.append(x)
  if x % 2 == 0: x /= 2
  else: x = 3 * x + 1

Would you list to see the list again? (y/n): 
y

Which answer would you like to see?  
-------