In [4]:
#import necessary libraries
import random
import string # to process standard python strings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import pyttsx3
import difflib
import nltk
from nltk.stem import WordNetLemmatizer
#nltk.download('popular', quiet=True) # for downloading packages

# Load the knowledge-base text; undecodable bytes are dropped (errors='ignore')
# and everything is lower-cased so later matching is case-insensitive.
with open('rac-data.txt', 'r', encoding='utf8', errors='ignore') as fin:
    raw = fin.read()
raw = raw.lower()

# Tokenisation
sent_tokens = nltk.sent_tokenize(raw)  # the corpus as a list of sentences
word_tokens = nltk.word_tokenize(raw)  # the corpus as a list of words

# Preprocessing: one shared lemmatizer instance is reused for every call.
lemmer = WordNetLemmatizer()


def LemTokens(tokens):
    """Return a list holding the WordNet lemma of each token in *tokens*.

    Each token is passed to ``lemmer.lemmatize`` with its default arguments;
    the order of the input is preserved.
    """
    return list(map(lemmer.lemmatize, tokens))

# Translation table mapping the code point of every ASCII punctuation
# character to None, so that str.translate() deletes those characters.
remove_punct_dict = str.maketrans('', '', string.punctuation)


def LemNormalize(text):
    """Normalise *text* into a list of lemmatized word tokens.

    The text is lower-cased, stripped of ASCII punctuation, split into words
    with ``nltk.word_tokenize`` and finally lemmatized through ``LemTokens``.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    tokens = nltk.word_tokenize(cleaned)
    return LemTokens(tokens)


# chatbot voice
def speak(message):
    """Read *message* aloud through a pyttsx3 text-to-speech engine.

    The message is converted to text, queued on the engine with ``say`` and
    then processed by ``runAndWait``.
    """
    tts_engine = pyttsx3.init()
    spoken_text = f'{message}'
    tts_engine.say(spoken_text)
    tts_engine.runAndWait()
    

# Greeting keyword matching: an input word found in GREETING_INPUTS is
# answered with a random entry from GREETING_RESPONSES.
GREETING_INPUTS = (
    "hello",
    "hi",
    "greetings",
    "sup",
    "what's up",
    "hey",
    "hii",
    "hiii",
)
GREETING_RESPONSES = [
    "hi",
    "hey",
    "hi there",
    "hello",
    "I am glad! You are talking to me",
]


# Vocabulary used by the spell checker: every lemmatized corpus word, the
# greeting keywords and the small-talk keywords handled by the dialogue loop.
words = [
    *LemNormalize(raw),
    *GREETING_INPUTS,
    'bye',
    'thanks',
    'thank you',
    'thank',
]
                  
def word_check(s, vocabulary=None):
    """Replace unknown words in *s* with their closest vocabulary match.

    Each whitespace-separated word is case-folded and looked up in the
    vocabulary. A word that is not present is replaced by the best candidate
    from ``difflib.get_close_matches``; when difflib finds no candidate the
    word is kept unchanged instead of raising ``IndexError``. Replacement is
    done per word, so correcting one word never rewrites a substring of
    another, and the whitespace between words is preserved.

    Args:
        s: The sentence to correct.
        vocabulary: Collection of known words. Defaults to the module-level
            ``words`` list.

    Returns:
        The sentence with unknown words substituted where a close match
        exists.
    """
    import re  # local import: the file's import block does not provide `re`

    if vocabulary is None:
        vocabulary = words

    def _correct(match):
        token = match.group(0)
        folded = token.casefold()
        if folded in vocabulary:
            return token
        # n=1: only the single best suggestion is ever used.
        suggestions = difflib.get_close_matches(folded, vocabulary, n=1)
        return suggestions[0] if suggestions else token

    return re.sub(r"\S+", _correct, s)

# Characters stripped by remove_punctuations(). The backslash is written as an
# explicit escape; the previous literal relied on the invalid escape "\,".
_PUNCTUATIONS = frozenset("!()-[]{};:'\"\\,<>./?@#$%^&*_~")


def remove_punctuations(s):
    """Return *s* with every character listed in ``_PUNCTUATIONS`` removed.

    Characters outside that set (for example ``+``, ``=``, ``|`` and the
    backtick) are kept, as are letters, digits and whitespace.
    """
    return "".join(char for char in s if char not in _PUNCTUATIONS)

def remove_stopwords(text):
    """Return *text* with English stop words removed.

    The text is split with ``nltk.word_tokenize``; tokens found in NLTK's
    English stop-word list are dropped and the remaining tokens are joined
    with single spaces. An input made up only of stop words yields ``""``.
    """
    stop = set(stopwords.words('english'))
    # Named `tokens` so the module-level `word_tokens` list is not shadowed.
    tokens = nltk.word_tokenize(text)
    return " ".join(token for token in tokens if token not in stop)

def greeting(sentence):
    """Return a random greeting reply if *sentence* contains a greeting word.

    The sentence is split on whitespace and each word is compared, in lower
    case, against ``GREETING_INPUTS``. Returns ``None`` when no word matches.
    """
    has_greeting = any(
        token.lower() in GREETING_INPUTS for token in sentence.split()
    )
    if has_greeting:
        return random.choice(GREETING_RESPONSES)
    return None

# Generating answer
def response(user_input):
    """Return the corpus sentence most similar to *user_input*.

    The query is appended to the module-level ``sent_tokens`` list, all
    sentences are vectorised with TF-IDF (tokenised by ``LemNormalize``,
    English stop words removed), and the cosine similarity between the query
    and every sentence is computed.

    Side effect: *user_input* is left as the last element of ``sent_tokens``;
    the caller is responsible for removing it again.

    Returns:
        The best-matching corpus sentence, or an apology string when the
        corpus is empty or no sentence shares any term with the query.
    """
    not_understood = "I am sorry! I don't understand you."

    sent_tokens.append(user_input)

    word_vectorizer = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    all_word_vectors = word_vectorizer.fit_transform(sent_tokens)

    # Similarity of the query (last row) against every row, itself included.
    similarities = cosine_similarity(all_word_vectors[-1], all_word_vectors).flatten()

    # Drop the query's own entry explicitly. Taking the second-highest score
    # instead can select the query itself when a corpus sentence ties with it.
    corpus_scores = similarities[:-1]
    if corpus_scores.size == 0:
        return not_understood

    idx = corpus_scores.argmax()
    if corpus_scores[idx] == 0:
        return not_understood
    return sent_tokens[idx]



# Interactive dialogue loop: read a line, clean it up, then answer with a
# farewell, a courtesy reply, a greeting or a retrieved corpus sentence.
continue_dialogue = True
print("Hello. I am BOT, how can i help you?")
speak("Hello. I am BOT, how can i help you?")


while continue_dialogue:
    user_input = input()
    user_input = user_input.lower()
    text = word_check(user_input)
    text = remove_punctuations(text)
    text = remove_stopwords(text)
    print(text)
    if text == 'bye':
        continue_dialogue = False
        print("BOT: Goodbye.")
        speak("goodbye")
    # 'thank' is accepted as well: remove_stopwords() presumably strips "you"
    # (assuming it is in NLTK's English stop-word list), so "thank you"
    # arrives here as "thank".
    elif text in ('thanks', 'thank you', 'thank'):
        print("BOT: You are welcome..")
        speak(" You are welcome")
    else:
        # Call greeting() once; it returns None when no greeting word is found.
        tmp = greeting(text)
        if tmp is not None:
            print("BOT: " + tmp)
            speak(tmp)
        else:
            print("BOT: ", end="")
            temp = response(text)
            print(temp)
            speak(temp)
            # response() appended the query as the last element of
            # sent_tokens; pop exactly that entry so an identical corpus
            # sentence earlier in the list is never removed instead.
            sent_tokens.pop()

Hello. I am BOT, how can i help you?
hiii
iii
BOT: iii.


  'stop_words.' % sorted(inconsistent))


hii
hii
BOT: I am glad! You are talking to me
bye
bye
BOT: Goodbye.
