In [1]:
import pandas as pd
import nltk 
import numpy as np
import re
from nltk.stem import wordnet # to perform lemmitization
from sklearn.feature_extraction.text import CountVectorizer # to perform bow
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf
from nltk import pos_tag # for parts of speech
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from nltk import word_tokenize # to create tokens
from nltk.corpus import stopwords # for stop words

In [2]:
df=pd.read_csv('/Users/swatiraman/Downloads/NLP Hackathon/Hackathon File/TrainData.txt',sep='\t')
df.head(20)

Unnamed: 0,Answer,Question
0,Yes. It's absolutely beautiful today.,The weather is great isn't it?
1,"yes, i like that one, too.",that one. the one that's all black.
2,it's really nice.,i got it from macy's.
3,at 8:00 p.m.,when does it start?
4,"nothing, except my favorite color is blue.",what's the matter with green eyes?
5,I have read just about everything in Project ...,have you ever read a book
6,a fancy name for applied computer science in ...,what is bioinformatics
7,"Sure, the changing rooms are over there.",Can I try it on?
8,I want it taken from my checking account.,Which account are you making this withdrawal f...
9,and you get to play with a lot of dogs.,that's the truth.


In [3]:
df.shape

(5495, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5495 entries, 0 to 5494
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Answer    5495 non-null   object
 1   Question  5495 non-null   object
dtypes: object(2)
memory usage: 86.0+ KB


In [5]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [6]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/swatiraman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# function that performs text normalization steps

def text_normalization(text):
    text=str(text).lower() # text to lower case
    spl_char_text=re.sub(r'[^ a-z]','',text) # removing special characters
    tokens=nltk.word_tokenize(spl_char_text) # word tokenizing
    lema=wordnet.WordNetLemmatizer() # intializing lemmatization
    tags_list=pos_tag(tokens,tagset=None) # parts of speech
    lema_words=[]   # empty list 
    for token,pos_token in tags_list:
        if pos_token.startswith('V'):  # Verb
            pos_val='v'
        elif pos_token.startswith('J'): # Adjective
            pos_val='a'
        elif pos_token.startswith('R'): # Adverb
            pos_val='r'
        else:
            pos_val='n' # Noun
        lema_token=lema.lemmatize(token,pos_val) # performing lemmatization
        lema_words.append(lema_token) # appending the lemmatized token into a list
    
    return " ".join(lema_words) # returns the lemmatized tokens as a sentence 

In [8]:
df['lemmatized_text']=df['Question'].apply(text_normalization) # applying the fuction to the dataset to get clean text
df.tail(15)

Unnamed: 0,Answer,Question,lemmatized_text
5480,a blind date is a date with someone you don't ...,what does it mean?,what do it mean
5481,I'm not.,Don't pull my chain.,dont pull my chain
5482,where's the checkbook? i'm ready to rent it wi...,and there are only six units in the whole buil...,and there be only six unit in the whole building
5483,english.,what is your major?,what be your major
5484,A conpiracy run by a very closely knit group ...,what is the illuminatti,what be the illuminatti
5485,what do you mean?,which would you prefer?,which would you prefer
5486,what do you get when you cross a port and fr...,Tell me a joke,tell me a joke
5487,It's going to be $300.,How much would you like to deposit?,how much would you like to deposit
5488,see an eye doctor.,the book is open.,the book be open
5489,How much were they?,They're very comfortable.,theyre very comfortable


In [9]:
# using tf-idf

tfidf=TfidfVectorizer() # intializing tf-id 
x_tfidf=tfidf.fit_transform(df['lemmatized_text']).toarray() # transforming the data into array


In [10]:
# returns all the unique word from data with a score of that word

df_tfidf=pd.DataFrame(x_tfidf,columns=tfidf.get_feature_names()) 
df_tfidf.head()

Unnamed: 0,able,about,abroad,absent,absolutely,absorbed,accept,accident,accord,account,...,your,youre,yours,yourself,youve,youyoure,yuck,yyou,zip,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# defining a function that returns response to query using tf-idf

def chat_tfidf(text):
    lemma=text_normalization(text) # calling the function to perform text normalization
    tf=tfidf.transform([lemma]).toarray() # applying tf-idf
    cos=1-pairwise_distances(df_tfidf,tf,metric='cosine') # applying cosine similarity
    index_value=cos.argmax() # getting index value 
    return df['Answer'].loc[index_value]

In [12]:
test_df = pd.read_csv('/Users/swatiraman/Downloads/NLP Hackathon/Hackathon File/TestData.csv')
test_df.head()

Unnamed: 0,I.D.,Question
0,QN_1,"i'll give you a speech like that, too."
1,QN_2,"i know, you're absolutely right."
2,QN_3,i liked it.
3,QN_4,the baby was eight pounds six ounces.
4,QN_5,I was sold a wireless service unavailable in m...


In [13]:
answer = []
for text in test_df['Question']:
    #print(text,':',chat_tfidf(text))
    answer.append(chat_tfidf(text))

In [14]:
test_df['answer']=answer

In [15]:
#test_df.drop('Answer',axis=1,inplace=True)

In [16]:
test_df.head(20)

Unnamed: 0,I.D.,Question,answer
0,QN_1,"i'll give you a speech like that, too.",do you think anyone will come to my funeral?
1,QN_2,"i know, you're absolutely right.",i wish it would cool off one day.
2,QN_3,i liked it.,"i'll give you a speech like that, too."
3,QN_4,the baby was eight pounds six ounces.,that's good to hear.
4,QN_5,I was sold a wireless service unavailable in m...,"I see it here,we charged you $5 extra a month."
5,QN_6,maybe four or five pounds?,my waist is bigger than it was.
6,QN_7,do you know hal,hal is the famous artificial intelligence fro...
7,QN_8,"Yeah,actually a lot of them are.",How does she act?
8,QN_9,"if it's old age, why don't both of your hands ...",that's a good question. maybe it's not old age.
9,QN_10,it's supposed to start at about eight.,how many invitations has she given out?


In [17]:
test_df[['I.D.','answer']]

Unnamed: 0,I.D.,answer
0,QN_1,do you think anyone will come to my funeral?
1,QN_2,i wish it would cool off one day.
2,QN_3,"i'll give you a speech like that, too."
3,QN_4,that's good to hear.
4,QN_5,"I see it here,we charged you $5 extra a month."
...,...,...
538,QN_539,people who live in hawaii are lucky.
539,QN_540,What is Baseball
540,QN_541,"okay, i'll take it out front right now."
541,QN_542,you may be right.


test_df[['I.D.','answer']].to_csv('Swati_Raman_1.csv', index=False, encoding='utf-8')

In [19]:
import pandas as pd
#data = pd.read_csv('qa.csv')
data=pd.read_csv('/Users/swatiraman/Downloads/NLP Hackathon/Hackathon File/TrainData.txt',sep='\t')
# this function is used to get printable results
def getResults(questions, fn):
    def getResult(q):
        answer, score, prediction = fn(q)
        return [q, prediction, answer, score]
    return pd.DataFrame(list(map(getResult, questions)), columns=["Q", "Prediction", "A", "Score"])


In [21]:
pip install Levenshtein

Collecting Levenshtein
  Downloading Levenshtein-0.16.0-cp38-cp38-macosx_10_9_x86_64.whl (92 kB)
     |████████████████████████████████| 92 kB 806 kB/s            
[?25hCollecting rapidfuzz<1.9,>=1.8.2
  Downloading rapidfuzz-1.8.3-cp38-cp38-macosx_10_9_x86_64.whl (661 kB)
     |████████████████████████████████| 661 kB 364 kB/s            
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.16.0 rapidfuzz-1.8.3
Note: you may need to restart the kernel to use updated packages.


In [25]:
test_data = pd.read_csv('/Users/swatiraman/Downloads/NLP Hackathon/Hackathon File/TestData.csv')
final_data = pd.DataFrame(columns=["Q", "Prediction", "A", "Score"])
from Levenshtein import ratio
def getApproximateAnswer(q):
    max_score = 0
    answer = ""
    prediction = ""
    for idx, row in data.iterrows():
        score = ratio(row["Question"], q)
        if score >= 0.9: # I'm sure, stop here
            return row["Answer"], score, row["Question"]
        elif score > max_score: # I'm unsure, continue
            max_score = score
            answer = row["Answer"]
            prediction = row["Question"]
    if max_score > 0.3: # threshold is lowered
        return answer, max_score, prediction
    return "Sorry, I didn't get you.", max_score, prediction
final_data = getResults(test_data['Question'], getApproximateAnswer2)

In [30]:
final_data.to_csv('final_data.csv', index=False, encoding='utf-8')

In [31]:
final_data.tail(20)

Unnamed: 0,Q,Prediction,A,Score
523,"well, hold on a little longer.","well, hold on a little longer.",i think i'm going to explode.,1.0
524,winter is great. i wish it didn't get so cold ...,winter is great. i wish it didn't get so cold ...,i would rather deal with the winter than the s...,1.0
525,it only took about two hours.,it only took about two hours.,did you take pictures at the world war ii monu...,1.0
526,what do you need for school?,what do you need for school?,i need pencils.,1.0
527,what is the illuminatti,what is the illuminati,A secret organization believed by some to be ...,0.978723
528,what does it mean?,what does it mean?,a blind date is a date with someone you don't ...,1.0
529,Don't pull my chain.,Don't pull my chain.,I'm not.,1.0
530,and there are only six units in the whole buil...,and there are only six units in the whole buil...,where's the checkbook? i'm ready to rent it wi...,1.0
531,what is your major?,What is your major?,I'm majoring in philosophy.,0.947368
532,what is the illuminatti,what is the illuminati,A secret organization believed by some to be ...,0.978723
