In [9]:
import nltk

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [11]:
from nltk.corpus import stopwords

In [12]:
from nltk.stem import PorterStemmer

In [13]:
from nltk.tokenize import PunktSentenceTokenizer

In [3]:
# The NLTK Downloads we need
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('nps_chat')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package nps_chat to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\nps_chat.zip.


True

In [1]:
import string
import random

In [2]:
import sklearn

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
import pandas as pd

In [6]:
from termcolor import colored

In [7]:
# Golobal Constants
GREETING_INPUTS = ("hello", "hi")
GREETING_RESPONCES = ["hi", "hey", "*nods*", "hi there", "Talkin' to me?"]
FILE_NAME = "canada_faq.txt"

In [14]:
# Global Variables
lem = nltk.stem.WordNetLemmatizer()
remove_punctuation = dict((ord(punct), None) for punct in string.punctuation)

In [15]:
# Functions

In [16]:
def fetch_features(chat):
    # fetch_features() transforms a chat into a classifier friendly format
    
    features = {}
    for word in nltk.word_tokenize(chat):
        features['contains({})'.format(word.lower())] = True
    return features

In [17]:
def lemmatise(tokens):
    # This method performs the lemmatization on the words
    return [lem.lemmatize(token) for token in tokens]

In [18]:
def tokenise(text):
    # This method tokenizes the words
    return lemmatise(nltk.word_tokenize(text.lower().translate(remove_punctuation)))

In [19]:
def greet(sentence):
    # This method responces with a standard a bot recognizes
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONCES)

In [38]:
def match(user_response):
    resp      =''
    q_list.append(user_response)
    TfidfVec  = TfidfVectorizer(tokenizer=tokenise, stop_words='english')
    tfidf     = TfidfVec.fit_transform(q_list)
    vals      = cosine_similarity(tfidf[-1], tfidf)
    idx       = vals.argsort()[0][-2]
    flat      = vals.flatten()
    flat.sort()
    req_tfidf = flat[-2]
    if(req_tfidf==0):
        resp = resp+"Sorry! I don't know the answer to this. Would you like to try again? Type Ciao to exit"
        return resp
    else:
        resp_ids = qa_dict[idx]
        resp_str = ''
        s_id = resp_ids[0]
        end = resp_ids[1]
        while s_id<end :
            resp_str = resp_str + " " + sent_tokens[s_id]
            s_id+=1
        resp = resp+resp_str
        return resp

### Training the Classifier

In [21]:
# Training the classifier

# Fetching the chat corpus
chats = nltk.corpus.nps_chat.xml_posts()[:10000]

# Extract the features from chat
featuresets = [(fetch_features(chat.text), chat.get('class')) for chat in chats]

In [22]:
# splitting into test and train sets
size = int(len(featuresets) * 0.1) # 10%
train_set, test_set = featuresets[size:], featuresets[:size]  # 90 training and 10 testing

In [23]:
classifier = nltk.MaxentClassifier.train(train_set)
# for NaiveBayesClassifier
#classifier = nltk.NaiveBayesClassifier.train(train_set) 

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -2.70805        0.050
             2          -1.25314        0.847
             3          -0.92168        0.881
             4          -0.75008        0.898
             5          -0.63707        0.910
             6          -0.55432        0.918
             7          -0.49069        0.924
             8          -0.44076        0.929
             9          -0.40122        0.932
            10          -0.36960        0.936
            11          -0.34394        0.940
            12          -0.32270        0.944
            13          -0.30477        0.946
            14          -0.28939        0.948
            15          -0.27600        0.950
            16          -0.26421        0.952
            17          -0.25371        0.953
            18          -0.24430        0.954
            19          -0.23580        0.956
 

In [24]:
import joblib # for saving the model

In [26]:
# Save the model
classifier_file_name = 'MaxentClassifier.sav'
joblib.dump(classifier, classifier_file_name)

['MaxentClassifier.sav']

In [27]:
# Loading the model
load_classifier = joblib.load(classifier_file_name)

In [28]:
# print the accuracy final one
print(nltk.classify.accuracy(load_classifier, test_set))

0.708


In [29]:
# Question Bank Creation

In [30]:
ques_bank = open(FILE_NAME, 'r', errors = 'ignore')

In [31]:
qb_text = ques_bank.read()
qb_text = qb_text.lower()

In [32]:
sent_tokens = nltk.sent_tokenize(qb_text) # Converts the list into sentences
word_tokens = nltk.word_tokenize(qb_text) # converts the list of words

In [33]:
qa_dict     = {} #The Dictionary to store questions and corresponding answers
q_list      = [] #List of all questions
s_count     = 0  #Sentence counter

In [34]:
#Extract questions and answers
#Answer is all the content between 2 questions [assumption]

In [35]:
while s_count < len(sent_tokens):
    result = load_classifier.classify(fetch_features(sent_tokens[s_count]))
    if("question" in result.lower()):
        next_question_id = s_count+1
        next_question = load_classifier.classify(fetch_features(sent_tokens[next_question_id]))
        
        while(not("question" in next_question.lower()) and next_question_id < len(sent_tokens) - 1):
            next_question_id += 1
            next_question = load_classifier.classify(fetch_features(sent_tokens[next_question_id]))
        
        q_list.append(sent_tokens[s_count])
        end = next_question_id
        
        if((next_question_id - s_count) > 5):
            end = s_count+5
        
        qa_dict.update({len(q_list)-1:[s_count+1, end]})
        s_count = next_question_id
    
    else: 
        s_count += 1
                

In [36]:
# Responce Fetching

In [None]:
flag = True
print(colored("NEO: \nI'm a Mona, I have all the answers, if you want to exit, type Ciao", 'blue', attrs=['bold']))
while(flag == True):
    print(colored("\nYOU: ",'red', attrs=['bold']))
    u_input = input()
    u_input = u_input.lower()
    
    if(u_input!= 'ciao'):
        if(greet(u_input)!=None):
            print(colored("\nNEO:",'blue',attrs=['bold']))
            print(greet(u_input))
        
        else: 
            print(colored("\nNEO:",'blue',attrs=['bold']))
            print(colored(match(u_input).strip().capitalize(),'blue'))
            q_list.remove(u_input)
    
    else:
        flag=False
        print(colored("\nNEO: Bye! take care..",'blue', attrs=['bold']))

[1m[34mNEO: 
I'm a Neo, I have all the answers, if you want to exit, type Ciao[0m
[1m[31m
YOU: [0m
What is application process?




[1m[34m
NEO:[0m
[34mWe consider your application non-routine if:

you asked to change your personal information, such as:
name
sex designation
date of birth
you missed a:
test
interview
hearing
we need you to submit extra documents, like:
fingerprints
residence documents
we asked you to come to another interview or hearing after you attended your interview
we also consider your citizenship application non-routine if you:

failed a test
didnâ€™t meet the language requirements during your interview
for more information
how are ircc processing times calculated?[0m
[1m[31m
YOU: [0m
Okay so what should I do now




[1m[34m
NEO:[0m
[34mSorry! i don't know the answer to this. would you like to try again? type ciao to exit[0m
[1m[31m
YOU: [0m
Okay, what is your name?




[1m[34m
NEO:[0m
[34mSorry! i don't know the answer to this. would you like to try again? type ciao to exit[0m
[1m[31m
YOU: [0m
how are ircc processing times calculated?




[1m[34m
NEO:[0m
[34mTo check the status of your application, you can:

step 1: check the processing times. step 2: check the status of your application online through the client application status service. step 3: if normal processing time for your application has passed, you may contact the call centre to verify the status of your application. find out more about improvements to our processing times and reducing the backlog.[0m
[1m[31m
YOU: [0m
