In [1]:
import pandas as pd 
import numpy as np 
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df1 = pd.read_csv('S08_question_answer_pairs.txt', sep='\t')
df2 = pd.read_csv('S09_question_answer_pairs.txt', sep='\t')
df3 = pd.read_csv('S10_question_answer_pairs.txt', sep='\t', encoding = 'ISO-8859-1')

In [3]:
df1.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4


In [4]:
all_data = df1.append([df2, df3])
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3998 entries, 0 to 1457
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ArticleTitle              3998 non-null   object
 1   Question                  3961 non-null   object
 2   Answer                    3422 non-null   object
 3   DifficultyFromQuestioner  3043 non-null   object
 4   DifficultyFromAnswerer    3418 non-null   object
 5   ArticleFile               3996 non-null   object
dtypes: object(6)
memory usage: 218.6+ KB


In [5]:
all_data['Question'] = all_data['ArticleTitle'].str.replace('_', ' ') + ' ' + all_data['Question']
all_data = all_data[['Question', 'Answer']]
all_data.shape


(3998, 2)

In [6]:
all_data.head(10)["Question"]

0    Abraham Lincoln Was Abraham Lincoln the sixtee...
1    Abraham Lincoln Was Abraham Lincoln the sixtee...
2    Abraham Lincoln Did Lincoln sign the National ...
3    Abraham Lincoln Did Lincoln sign the National ...
4     Abraham Lincoln Did his mother die of pneumonia?
5     Abraham Lincoln Did his mother die of pneumonia?
6    Abraham Lincoln How many long was Lincoln's fo...
7    Abraham Lincoln How many long was Lincoln's fo...
8    Abraham Lincoln When did Lincoln begin his pol...
9    Abraham Lincoln When did Lincoln begin his pol...
Name: Question, dtype: object

In [7]:
all_data = all_data.drop_duplicates(subset='Question')
all_data.head(10)

Unnamed: 0,Question,Answer
0,Abraham Lincoln Was Abraham Lincoln the sixtee...,yes
2,Abraham Lincoln Did Lincoln sign the National ...,yes
4,Abraham Lincoln Did his mother die of pneumonia?,no
6,Abraham Lincoln How many long was Lincoln's fo...,18 months
8,Abraham Lincoln When did Lincoln begin his pol...,1832
10,Abraham Lincoln What did The Legal Tender Act ...,"the United States Note, the first paper curren..."
12,Abraham Lincoln Who suggested Lincoln grow a b...,11-year-old Grace Bedell
14,Abraham Lincoln When did the Gettysburg addres...,1776
16,Abraham Lincoln Did Lincoln beat John C. Breck...,yes
18,Abraham Lincoln Was Abraham Lincoln the first ...,No


In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [9]:
stopwords_list = stopwords.words('english')

lemmatizer = WordNetLemmatizer()

def my_tokenizer(doc):
    words = word_tokenize(doc)
    
    pos_tags = pos_tag(words)
    
    non_stopwords = [w for w in pos_tags if not w[0].lower() in stopwords_list]
    
    non_punctuation = [w for w in non_stopwords if not w[0] in string.punctuation]
    
    lemmas = []
    for w in non_punctuation:
        if w[1].startswith('J'):
            pos = wordnet.ADJ
        elif w[1].startswith('V'):
            pos = wordnet.VERB
        elif w[1].startswith('N'):
            pos = wordnet.NOUN
        elif w[1].startswith('R'):
            pos = wordnet.ADV
        else:
            pos = wordnet.NOUN
        
        lemmas.append(lemmatizer.lemmatize(w[0], pos))

    return lemmas

In [10]:
all_data = all_data.dropna()
all_data.shape

(2188, 2)

In [11]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=my_tokenizer)
tfidf_matrix = tfidf_vectorizer.fit_transform(tuple(all_data['Question']))
print(tfidf_matrix.shape)

(2188, 3550)


In [12]:
def ask_question(question):
    query_vect = tfidf_vectorizer.transform([question])
    similarity = cosine_similarity(query_vect, tfidf_matrix)
    max_similarity = np.argmax(similarity, axis=None)
    
    print('Your question:', question)
    print('Closest question found:', all_data.iloc[max_similarity]['Question'])
    print('Similarity: {:.2%}'.format(similarity[0, max_similarity]))
    print('Answer:', all_data.iloc[max_similarity]['Answer'])

In [13]:
ask_question('When Abraham Lincoln started his political career')

Your question: When Abraham Lincoln started his political career
Closest question found: Abraham Lincoln Did Lincoln start his political career in 1832?
Similarity: 88.14%
Answer: Yes


In [20]:
!pip install PySimpleGUI




Collecting PySimpleGUI
  Downloading PySimpleGUI-4.60.4-py3-none-any.whl (509 kB)
Installing collected packages: PySimpleGUI
Successfully installed PySimpleGUI-4.60.4


In [23]:
import PySimpleGUI as sg

sg.theme('DarkAmber')   # Add a touch of color

# All the stuff inside your window.

layout = [  [sg.Text('Ask a question')],
            [sg.InputText()],

            [sg.Button('Ok'), sg.Button('Cancel')] ]

# Create the Window
window = sg.Window('Ask a question', layout)
# Event Loop to process "events" and get the "values" of the inputs
while True:
    event, values = window.read()
    def ask_question(question):
        query_vect = tfidf_vectorizer.transform([question])
        similarity = cosine_similarity(query_vect, tfidf_matrix)
        max_similarity = np.argmax(similarity, axis=None)
    
        print('Your question:', question)
        print('Closest question found:', all_data.iloc[max_similarity]['Question'])
        print('Similarity: {:.2%}'.format(similarity[0, max_similarity]))
        print('Answer:', all_data.iloc[max_similarity]['Answer'])
    if event == sg.WIN_CLOSED or event == 'Cancel': # if user closes window or clicks cancel
        break
    if event == 'Ok':
        ask_question(values[0])
window.close()

Your question: When Abraham Lincoln started his political career
Closest question found: Abraham Lincoln Did Lincoln start his political career in 1832?
Similarity: 88.14%
Answer: Yes
