In [1]:
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
with open('TrainData.txt') as f:
    lines = f.readlines()

In [3]:
questions = []
answers = []

In [4]:
for l in lines:
    as_list = l.split("\t")
    questions.append(as_list[1].replace("\n",""))
    answers.append(as_list[0])

In [5]:
questions = questions[1:]

In [16]:
questions[1:5]

["that one. the one that's all black.",
 "i got it from macy's.",
 'when does it start?',
 "what's the matter with green eyes?"]

In [6]:
answers = answers[1:]

In [7]:
df = pd.DataFrame({'Questions':questions,'Answers':answers})
df.head()

Unnamed: 0,Questions,Answers
0,The weather is great isn't it?,Yes. It's absolutely beautiful today.
1,that one. the one that's all black.,"yes, i like that one, too."
2,i got it from macy's.,it's really nice.
3,when does it start?,at 8:00 p.m.
4,what's the matter with green eyes?,"nothing, except my favorite color is blue."


In [10]:
df.shape

(5495, 2)

In [8]:
all_data = df.drop_duplicates(subset='Questions')
all_data.head(10)

Unnamed: 0,Questions,Answers
0,The weather is great isn't it?,Yes. It's absolutely beautiful today.
1,that one. the one that's all black.,"yes, i like that one, too."
2,i got it from macy's.,it's really nice.
3,when does it start?,at 8:00 p.m.
4,what's the matter with green eyes?,"nothing, except my favorite color is blue."
5,have you ever read a book,I have read just about everything in Project ...
6,what is bioinformatics,a fancy name for applied computer science in ...
7,Can I try it on?,"Sure, the changing rooms are over there."
8,Which account are you making this withdrawal f...,I want it taken from my checking account.
9,that's the truth.,and you get to play with a lot of dogs.


In [9]:
all_data.shape

(4875, 2)

In [11]:
all_data = all_data.dropna()
all_data.shape

(4875, 2)

In [12]:
stopwords_list = stopwords.words('english')

lemmatizer = WordNetLemmatizer()

def my_tokenizer(doc):
    words = word_tokenize(doc)
    
    pos_tags = pos_tag(words)
    
    non_stopwords = [w for w in pos_tags if not w[0].lower() in stopwords_list]
    
    non_punctuation = [w for w in non_stopwords if not w[0] in string.punctuation]
    
    lemmas = []
    for w in non_punctuation:
        if w[1].startswith('J'):
            pos = wordnet.ADJ
        elif w[1].startswith('V'):
            pos = wordnet.VERB
        elif w[1].startswith('N'):
            pos = wordnet.NOUN
        elif w[1].startswith('R'):
            pos = wordnet.ADV
        else:
            pos = wordnet.NOUN
        
        lemmas.append(lemmatizer.lemmatize(w[0], pos))

    return lemmas

In [14]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=my_tokenizer)
tfidf_matrix = tfidf_vectorizer.fit_transform(tuple(all_data['Questions']))
print(tfidf_matrix.shape)

(4875, 2334)


In [15]:
def ask_question(question):
    query_vect = tfidf_vectorizer.transform([question])
    similarity = cosine_similarity(query_vect, tfidf_matrix)
    max_similarity = np.argmax(similarity, axis=None)
    
    print('Your question:', question)
    print('Closest question found:', all_data.iloc[max_similarity]['Questions'])
    print('Similarity: {:.2%}'.format(similarity[0, max_similarity]))
    print('Answer:', all_data.iloc[max_similarity]['Answers'])

In [35]:
def ask_question_modified(question):
    query_vect = tfidf_vectorizer.transform([question])
    similarity = cosine_similarity(query_vect, tfidf_matrix)
    max_similarity = np.argmax(similarity, axis=None)
    return all_data.iloc[max_similarity]['Answers']

In [17]:
ask_question("i got it from macy's.")

Your question: i got it from macy's.
Closest question found: i got it from macy's.
Similarity: 100.00%
Answer: it's really nice.


In [36]:
ask_question_modified("i got it from macy's.")

"it's really nice."

In [18]:
test_df = pd.read_csv('TestData.csv')
test_df.head()

Unnamed: 0,I.D.,Question
0,QN_1,"i'll give you a speech like that, too."
1,QN_2,"i know, you're absolutely right."
2,QN_3,i liked it.
3,QN_4,the baby was eight pounds six ounces.
4,QN_5,I was sold a wireless service unavailable in m...


In [37]:
result = []
for i in test_df['Question']:
    result.append(ask_question_modified(i))


In [38]:
result

['do you think anyone will come to my funeral?',
 'i wish it would cool off one day.',
 'Oh, yes, I really like it.',
 "that's good to hear.",
 'I see it here,we charged you $5 extra a month.',
 'my waist is bigger than it was.',
 ' hal is the famous artificial intelligence from "2001".',
 'How does she act? ',
 "that's a good question. maybe it's not old age.",
 'how many invitations has she given out?',
 'nothing really.',
 'Yes, I am very excited. I could not forget the picnic I had yesterday.',
 'The weather is hot.',
 'they were on sale for $80.',
 'that tasted so good.',
 'It was really exciting. It was more enjoyable to me because I had never been to Garo Pahar.',
 ' Europe',
 'the next four years will be good years.',
 "i'm doing well. how about you?",
 'I agree.',
 ' A secret organization believed by some to be in control of all governments through a worldwide conspiracy.',
 "that's hard to take.",
 'i love boiled peanuts.',
 'use a tissue next time.',
 'then stop eating the b

In [39]:
len(result)

543

In [46]:
submission2 = pd.DataFrame(columns=['I.D.','Answer'])

In [47]:
test_df.shape

(543, 2)

In [48]:
submission2['I.D.'] = test_df['I.D.']
submission2['Answer'] = result


In [49]:
submission2.head()

Unnamed: 0,I.D.,Answer
0,QN_1,do you think anyone will come to my funeral?
1,QN_2,i wish it would cool off one day.
2,QN_3,"Oh, yes, I really like it."
3,QN_4,that's good to hear.
4,QN_5,"I see it here,we charged you $5 extra a month."


In [50]:
submission2.to_csv('SushmaD_2.csv',index=False)

In [60]:
def ask_question_modified2(question):
    query_vect = tfidf_vectorizer.transform([question])
    similarity = cosine_similarity(query_vect, tfidf_matrix)
    max_similarity = np.argmax(similarity, axis=None)
    return [all_data.iloc[max_similarity]['Answers'],similarity[0, max_similarity]]

In [72]:
#result2 = []
score = []
for i in test_df['Question']:
    #result2.append(ask_question_modified2(i))
    score.append(ask_question_modified2(i))

In [73]:
score

[['do you think anyone will come to my funeral?', 1.0000000000000004],
 ['i wish it would cool off one day.', 1.0],
 ['Oh, yes, I really like it.', 1.0],
 ["that's good to hear.", 0.5881111320332378],
 ['I see it here,we charged you $5 extra a month.', 1.0],
 ['my waist is bigger than it was.', 1.0],
 [' hal is the famous artificial intelligence from "2001".', 1.0],
 ['How does she act? ', 1.0000000000000002],
 ["that's a good question. maybe it's not old age.", 1.0],
 ['how many invitations has she given out?', 1.0],
 ['nothing really.', 1.0],
 ['Yes, I am very excited. I could not forget the picnic I had yesterday.',
  1.0],
 ['The weather is hot.', 1.0],
 ['they were on sale for $80.', 1.0],
 ['that tasted so good.', 1.0],
 ['It was really exciting. It was more enjoyable to me because I had never been to Garo Pahar.',
  1.0000000000000002],
 [' Europe', 1.0],
 ['the next four years will be good years.', 1.0000000000000002],
 ["i'm doing well. how about you?", 1.0],
 ['I agree.', 1.0

In [77]:
len(score)

543

In [90]:
score[0][0]

'do you think anyone will come to my funeral?'

In [91]:
answer = []
similarity = []
for i,j in score:
    answer.append(i)
    similarity.append(j)

In [92]:
answer

['do you think anyone will come to my funeral?',
 'i wish it would cool off one day.',
 'Oh, yes, I really like it.',
 "that's good to hear.",
 'I see it here,we charged you $5 extra a month.',
 'my waist is bigger than it was.',
 ' hal is the famous artificial intelligence from "2001".',
 'How does she act? ',
 "that's a good question. maybe it's not old age.",
 'how many invitations has she given out?',
 'nothing really.',
 'Yes, I am very excited. I could not forget the picnic I had yesterday.',
 'The weather is hot.',
 'they were on sale for $80.',
 'that tasted so good.',
 'It was really exciting. It was more enjoyable to me because I had never been to Garo Pahar.',
 ' Europe',
 'the next four years will be good years.',
 "i'm doing well. how about you?",
 'I agree.',
 ' A secret organization believed by some to be in control of all governments through a worldwide conspiracy.',
 "that's hard to take.",
 'i love boiled peanuts.',
 'use a tissue next time.',
 'then stop eating the b

In [93]:
results = pd.DataFrame(columns=['ID','Answer','Score'])

In [75]:
results

Unnamed: 0,ID,Answer,Score


In [94]:
results['ID'] = test_df['I.D.']
results['Answer'] = answer
results['Score'] = similarity

In [101]:
not_right = results[(results['Score'] < 1) | (results['Score'] > 1)]

In [102]:
len(not_right)

180