In [1]:
pip install wordcloud

Collecting wordcloud
  Downloading https://files.pythonhosted.org/packages/96/36/f712ec19c6e253d27c5cae0d45a4a465ebb5ccf0cb26887a6a3127d13769/wordcloud-1.6.0-cp37-cp37m-win_amd64.whl (153kB)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.6.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import os
import json
import re
import numpy as np
import random
import string
import pickle
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
import gensim
from gensim import corpora, models, similarities
from sklearn.metrics.pairwise import cosine_similarity

from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

In [2]:
# Folder holding the FAQ data set; later cells also write their model and JSON files here.
path = '../Data/'
faq_csv_path = path + 'faq.csv'
data = pd.read_csv(faq_csv_path)
# Number of FAQ rows that were loaded.
data.shape[0]

1764

In [3]:
# Preview the first five FAQ rows (Question, Answer, Class).
data.head(n=5)

Unnamed: 0,Question,Answer,Class
0,Do I need to enter ‘#’ after keying in my Card...,Please listen to the recorded message and foll...,security
1,What details are required when I want to perfo...,"To perform a secure IVR transaction, you will ...",security
2,How should I get the IVR Password if I hold a...,An IVR password can be requested only from the...,security
3,How do I register my Mobile number for IVR Pas...,Please call our Customer Service Centre and en...,security
4,How can I obtain an IVR Password,By Sending SMS request: Send an SMS 'PWD<space...,security


In [4]:
def pre_process(questions):
    """Clean and tokenise a collection of question strings.

    Each question is lower-cased, stripped of unwanted characters, split into
    tokens with NLTK and filtered.

    Parameters
    ----------
    questions : iterable of str
        Raw question texts (for example a ``pd.Series``).

    Returns
    -------
    pd.Series
        One list of kept tokens per input question, in input order, with a
        fresh default index.

    Notes
    -----
    Only tokens of 4 to 14 characters that are not English stop words are
    kept, so short domain terms such as "ivr" or "sms" are dropped as well.
    """
    # A set gives constant-time membership tests; the stop-word list is
    # consulted once per token.
    stop_words = set(stopwords.words("english"))

    # Replace unwanted characters with a space.
    # NOTE: inside [...] parentheses do not group, so this class keeps a-z
    # plus the literal characters "(", ")", "+" and "#"; everything else
    # (digits, punctuation, non-ASCII letters) becomes a space.
    questions = [re.sub('[^a-z(c++)(c#)]', ' ', x.lower()) for x in questions]
    # Tokenization
    questions_tokens = [nltk.word_tokenize(t) for t in questions]
    # Drop stop words and tokens outside the 4-14 character range
    questions_stop = [[t for t in tokens if (t not in stop_words) and (3 < len(t.strip()) < 15)]
                      for tokens in questions_tokens]

    return pd.Series(questions_stop)

# Tokenise every FAQ question with the shared preprocessing pipeline.
questions = data['Question']
questions_pp = pre_process(questions)

# Put the token lists next to the original question, answer and class columns.
token_columns = {
    'Question': data['Question'].tolist(),
    'Question_Tokens': questions_pp,
    'Answer': data['Answer'].tolist(),
    'Class': data['Class'].tolist(),
}
data_tokens = pd.DataFrame(token_columns)
data_tokens.head()

Unnamed: 0,Question,Question_Tokens,Answer,Class
0,Do I need to enter ‘#’ after keying in my Card...,"[need, enter, keying, card, number, card, expi...",Please listen to the recorded message and foll...,security
1,What details are required when I want to perfo...,"[details, required, want, perform, secure, tra...","To perform a secure IVR transaction, you will ...",security
2,How should I get the IVR Password if I hold a...,"[password, hold, card]",An IVR password can be requested only from the...,security
3,How do I register my Mobile number for IVR Pas...,"[register, mobile, number, password]",Please call our Customer Service Centre and en...,security
4,How can I obtain an IVR Password,"[obtain, password]",By Sending SMS request: Send an SMS 'PWD<space...,security


In [5]:
# Keep only the raw question text and record its length in characters.
data_example = data_tokens[['Question']].copy()
length = data_example['Question'].map(len)
data_example['Question_Length'] = length
data_example.head()

Unnamed: 0,Question,Question_Length
0,Do I need to enter ‘#’ after keying in my Card...,83
1,What details are required when I want to perfo...,73
2,How should I get the IVR Password if I hold a...,59
3,How do I register my Mobile number for IVR Pas...,52
4,How can I obtain an IVR Password,33


In [6]:
# Raw data: the question with index label 1 is the running example for the
# step-by-step preprocessing demonstration below.
raw_title = 'Raw Data'
example = data_example.loc[1, 'Question']
raw_result = example
raw_result

'What details are required when I want to perform a secure IVR transaction'

In [7]:
# Step 1: lower-case the example and replace every character outside the
# allowed class with a space (same pattern as pre_process).
re_title = 'Remove non-English Words'
allowed_chars_pattern = '[^a-z(c++)(c#)]'
re_result = []
for text in pd.Series(example):
    re_result.append(re.sub(allowed_chars_pattern, ' ', text.lower()))
re_result

['what details are required when i want to perform a secure ivr transaction']

In [8]:
# Tokenization: split each cleaned sentence into word tokens with NLTK.
# The label below is shown in the step summary table, so the spelling matters.
tk_title = 'Tokenization'
tk_result = [nltk.word_tokenize(t) for t in re_result]
print(tk_result)

[['what', 'details', 'are', 'required', 'when', 'i', 'want', 'to', 'perform', 'a', 'secure', 'ivr', 'transaction']]


In [9]:
# Step 3: drop English stop words and tokens shorter than 4 or longer than
# 14 characters (same filter as pre_process).
rs_title = 'Removing Stop Words'
stop_words = stopwords.words("english")
rs_result = []
for tokens in tk_result:
    kept_tokens = []
    for t in tokens:
        if t in stop_words:
            continue
        if 3 < len(t.strip()) < 15:
            kept_tokens.append(t)
    rs_result.append(kept_tokens)
rs_result

[['details', 'required', 'want', 'perform', 'secure', 'transaction']]

In [10]:
# Summary table: one row per preprocessing step with its intermediate result.
# The dict gets its own name so the FAQ DataFrame bound to `data` is not
# overwritten; earlier cells can then still be re-run.
preprocessing_steps = {'Step' : [raw_title, re_title, tk_title, rs_title],
                       'Results' : [raw_result, re_result, tk_result, rs_result]}
cols = ['Step', 'Results']
# `DataFrame.ix` was removed in pandas 1.0; passing `columns=` fixes the
# column order directly.
df = pd.DataFrame(preprocessing_steps, columns=cols)
pd.set_option('display.max_colwidth', 100)
df

Unnamed: 0,Step,Results
0,Raw Data,What details are required when I want to perform a secure IVR transaction
1,Remove non-English Words,[what details are required when i want to perform a secure ivr transaction]
2,Tokenlization,"[[what, details, are, required, when, i, want, to, perform, a, secure, ivr, transaction]]"
3,Removing Stop Words,"[[details, required, want, perform, secure, transaction]]"


In [11]:
def train_model(train_data, min_count=2, **word2vec_kwargs):
    """Train a gensim Word2Vec model on tokenised sentences.

    Parameters
    ----------
    train_data : iterable of list of str
        One token list per sentence.
    min_count : int, optional
        Words occurring fewer times than this are ignored. Defaults to 2,
        the value this notebook has always used.
    **word2vec_kwargs
        Extra keyword arguments forwarded unchanged to
        ``gensim.models.Word2Vec`` (for example ``seed`` or ``workers`` when
        a reproducible run is needed).

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    model = gensim.models.Word2Vec(train_data, min_count=min_count, **word2vec_kwargs)
    return model

In [12]:
# Menu key -> FAQ class. One Word2Vec model is trained per class.
dict_language = {'0': 'security', '1': 'loans', '2': 'accounts', '3': 'insurance', '4': 'investments', 
                 '5': 'fundstransfer', '6': 'cards'}

# Object-dtype placeholder columns; they are filled class by class below.
data_tokens['Question_Vectors'] = None
data_tokens['Average_Pooling'] = None

for value in dict_language.values():
    # Index labels of the rows belonging to this class. They are computed once
    # and reused, so the whole table is not rescanned for every class.
    class_rows = data_tokens.index[data_tokens['Class'] == value]
    questions_data = list(data_tokens.loc[class_rows, 'Question_Tokens'])

    # Train the model and save it in gensim's native format
    # (word2vec_faq_security, word2vec_faq_loans, ...).
    model_name = 'word2vec_faq_' + value
    trained_model = train_model(questions_data)
    trained_model.save(model_name)
    print('Saved %s model successfully' % model_name)

    # Also pickle the model next to the data; the chatbot cell loads this .bin file.
    word2vec_pickle_path = path + 'word2vec_faq_' + value + '.bin'
    with open(word2vec_pickle_path, 'wb') as f:
        pickle.dump(trained_model, f)

    # Load the file back so the vectors come from exactly what the chatbot
    # will read later.
    model = gensim.models.KeyedVectors.load(word2vec_pickle_path)
    # A full Word2Vec model exposes its vectors as `.wv`; plain KeyedVectors
    # are indexed directly. Indexing the model itself is deprecated in gensim.
    word_vectors = getattr(model, 'wv', model)

    # Compute the per-token vectors and their average for each question of this class.
    for i in class_rows:
        question_vectors = []
        for token in data_tokens.at[i, 'Question_Tokens']:
            try:
                question_vectors.append(word_vectors[token])
            except KeyError:
                # Token was dropped from the vocabulary (below min_count).
                continue
        # `.at` writes into the frame itself; chained `df[col][i] = ...` may
        # write to a temporary copy instead.
        data_tokens.at[i, 'Question_Vectors'] = question_vectors
        # Average pooling: element-wise mean over the token vectors
        # (an empty list when no token is in the vocabulary).
        data_tokens.at[i, 'Average_Pooling'] = list(pd.DataFrame(question_vectors).mean())

Saved word2vec_faq_security model successfully
Saved word2vec_faq_loans model successfully
Saved word2vec_faq_accounts model successfully
Saved word2vec_faq_insurance model successfully
Saved word2vec_faq_investments model successfully
Saved word2vec_faq_fundstransfer model successfully
Saved word2vec_faq_cards model successfully


In [13]:
# Collapse each token list into one space-separated string for JSON export.
# Already-joined strings are left alone so that re-running this cell does not
# insert spaces between the individual characters.
data_tokens = data_tokens.assign(
    Question_Tokens=[tokens if isinstance(tokens, str) else " ".join(tokens)
                     for tokens in data_tokens['Question_Tokens']]
)
# Length in characters of the joined token string.
length = data_tokens['Question_Tokens'].apply(len)
data_tokens = data_tokens.assign(Question_Length=length)
data_tokens.head()

Unnamed: 0,Question,Question_Tokens,Answer,Class,Question_Vectors,Average_Pooling,Question_Length
0,Do I need to enter ‘#’ after keying in my Card number/ Card expiry date/ CVV number,need enter keying card number card expiry date number,Please listen to the recorded message and follow the instructions while entering your card details.,security,"[[0.0022045795, 0.0031682255, 0.0027144125, -0.004793102, 0.00139459, -0.0030819646, -0.00293562...","[0.0006918209022842347, 0.0035093167331069707, 0.0013605110812932252, -8.842535316944123e-06, 0....",53
1,What details are required when I want to perform a secure IVR transaction,details required want perform secure transaction,"To perform a secure IVR transaction, you will need your 16-digit Card number, Card expiry date, ...",security,"[[-0.0036381332, 0.0016050931, -0.0048513315, 0.0038896555, -0.0044063693, -0.0016465527, -0.003...","[-0.0024840704863891006, -0.0013779873261228204, -0.0033064440358430147, 0.0010085380636155605, ...",48
2,How should I get the IVR Password if I hold an add-on card,password hold card,An IVR password can be requested only from the registered mobile number and will be sent to the ...,security,"[[-0.003286957, -0.0040943474, 0.002438291, 0.00062308495, -3.8724123e-05, -0.0008238838, -0.003...","[-0.0011701553594321012, 0.00024229357950389385, 0.0004502396332100034, 0.0026107802405022085, -...",18
3,How do I register my Mobile number for IVR Password,register mobile number password,Please call our Customer Service Centre and ensure that your mobile number is updated in our rec...,security,"[[-0.0019145777, 0.003052564, -0.0042188996, -0.0044614156, 0.0015650395, -0.0023413908, 0.00398...","[-0.0020582330544129945, 0.0002540424757171422, -0.00047699635615572333, -0.0023139998374972492,...",31
4,How can I obtain an IVR Password,obtain password,By Sending SMS request: Send an SMS 'PWD<space>1234' to 9717465555 or to 5676712 from your regis...,security,"[[-0.003286957, -0.0040943474, 0.002438291, 0.00062308495, -3.8724123e-05, -0.0008238838, -0.003...","[-0.0032869570422917604, -0.004094347357749939, 0.0024382909759879112, 0.0006230849539861083, -3...",15


In [14]:
# Export the enriched FAQ table as a JSON list of records.
# pandas serialises the frame to JSON text first; parsing that text back gives
# plain Python objects that json.dump can write.
records_text = data_tokens.to_json(orient='records')
data_json = json.loads(records_text)

export_path = path + 'faq_Word2Vec.json'
with open(export_path, 'w') as outfile:
    json.dump(data_json, outfile)

In [15]:
# Reload the exported FAQ records so the chatbot works from the same data that
# was written to disk. Question_Tokens is split back into a list of tokens.
faq_json_path = path + 'faq_Word2Vec.json'
try:
    with open(faq_json_path) as file:
        reader = json.load(file)

    data_tokens = pd.DataFrame({
        'Class': [row['Class'] for row in reader],
        'Question': [row['Question'] for row in reader],
        'Question_Tokens': [row['Question_Tokens'].split() for row in reader],
        'Answer': [row['Answer'] for row in reader],
        'Question_Length': [row['Question_Length'] for row in reader],
        'Question_Vectors': [row['Question_Vectors'] for row in reader],
        'Average_Pooling': [row['Average_Pooling'] for row in reader],
    })
except (OSError, ValueError, KeyError) as err:
    # Best effort: keep the in-memory frame, but report the failure instead of
    # hiding it. ValueError covers malformed JSON; KeyError a missing field.
    print('Could not reload %s (%s: %s); keeping the in-memory data_tokens.'
          % (faq_json_path, type(err).__name__, err))

data_tokens.head()

Unnamed: 0,Class,Question,Question_Tokens,Answer,Question_Length,Question_Vectors,Average_Pooling
0,security,Do I need to enter ‘#’ after keying in my Card number/ Card expiry date/ CVV number,"[need, enter, keying, card, number, card, expiry, date, number]",Please listen to the recorded message and follow the instructions while entering your card details.,53,"[[0.0022045795, 0.0031682255, 0.0027144125, -0.0047931019, 0.00139459, -0.0030819646, -0.0029356...","[0.0006918209, 0.0035093167, 0.0013605111, -8.8425e-06, 0.0010526863, -0.002093026, -0.001052257..."
1,security,What details are required when I want to perform a secure IVR transaction,"[details, required, want, perform, secure, transaction]","To perform a secure IVR transaction, you will need your 16-digit Card number, Card expiry date, ...",48,"[[-0.0036381332, 0.0016050931, -0.0048513315, 0.0038896555, -0.0044063693, -0.0016465527, -0.003...","[-0.0024840705, -0.0013779873, -0.003306444, 0.0010085381, -0.0034428573, -0.0001123129, -0.0013..."
2,security,How should I get the IVR Password if I hold an add-on card,"[password, hold, card]",An IVR password can be requested only from the registered mobile number and will be sent to the ...,18,"[[-0.003286957, -0.0040943474, 0.002438291, 0.000623085, -3.87241e-05, -0.0008238838, -0.0035217...","[-0.0011701554, 0.0002422936, 0.0004502396, 0.0026107802, -0.0010160639, -0.0019829881, -0.00397..."
3,security,How do I register my Mobile number for IVR Password,"[register, mobile, number, password]",Please call our Customer Service Centre and ensure that your mobile number is updated in our rec...,31,"[[-0.0019145777, 0.0030525641, -0.0042188996, -0.0044614156, 0.0015650395, -0.0023413908, 0.0039...","[-0.0020582331, 0.0002540425, -0.0004769964, -0.0023139998, 0.0008770661, -8.429e-06, 0.00110849..."
4,security,How can I obtain an IVR Password,"[obtain, password]",By Sending SMS request: Send an SMS 'PWD<space>1234' to 9717465555 or to 5676712 from your regis...,15,"[[-0.003286957, -0.0040943474, 0.002438291, 0.000623085, -3.87241e-05, -0.0008238838, -0.0035217...","[-0.003286957, -0.0040943474, 0.002438291, 0.000623085, -3.87241e-05, -0.0008238838, -0.00352173..."


In [16]:
# Greeting function
# Words and whole phrases that are treated as a greeting.
GREETING_INPUTS = ("hello", "hi", "greetings", "hello i need help", "good day", "hey", "i need help")
GREETING_RESPONSES = ["Good day, How may i of help?", "Hello, How can i help?", "hello", "I am glad! You are talking to me."]


def greeting(sentence):
    """Return a random greeting reply if ``sentence`` is a greeting, else None.

    The sentence counts as a greeting when, ignoring case and surrounding
    punctuation, either the whole sentence or any single word of it appears in
    ``GREETING_INPUTS``. The whole-sentence check is what lets multi-word
    entries such as "good day" match; a word-by-word scan can never match them.

    Parameters
    ----------
    sentence : str
        The user's input.

    Returns
    -------
    str or None
        One of ``GREETING_RESPONSES`` chosen at random, or None when no
        greeting was recognised.
    """
    # Lower-case and collapse runs of whitespace so phrases compare reliably.
    normalized = " ".join(sentence.lower().split())

    # Whole-sentence match, e.g. "Good day!" or "I need help".
    if normalized.strip(string.punctuation + " ") in GREETING_INPUTS:
        return random.choice(GREETING_RESPONSES)

    # Single-word match, e.g. "Hello, how do I reset my PIN?".
    for word in normalized.split():
        if word.strip(string.punctuation) in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

    return None

In [19]:
# Reply used whenever no FAQ entry can be matched to the user's question.
_NOT_UNDERSTOOD = "Apology, I do not understand. Can you rephrase?"
# Highlighted "JARVIS: " console prefix (ANSI escape codes).
_JARVIS_PREFIX = '\x1b[1;37;40m' + 'JARVIS'+'\x1b[0m'+': '
# Minimum cosine similarity for the single answer offered by Talk_To_Javris.
_CONFIDENT_SCORE = 0.3
# Minimum cosine similarity for the alternatives listed by main().
_ALTERNATIVE_SCORE = 0.0
# Number of alternatives main() lists after the best match.
_MAX_ALTERNATIVES = 5


def _average_pooled_vector(sentence, model):
    """Return the mean word vector of ``sentence`` as a list, or None if no token is known."""
    tokens_per_sentence = pre_process(pd.Series(sentence))
    tokens = tokens_per_sentence.iloc[0] if len(tokens_per_sentence) else []
    # A full Word2Vec model exposes its vectors as `.wv`; plain KeyedVectors
    # are indexed directly.
    word_vectors = getattr(model, 'wv', model)

    token_vectors = []
    for token in tokens:
        try:
            token_vectors.append(word_vectors[token])
        except KeyError:
            # Skip out-of-vocabulary words one by one, as the training cell
            # does, so one unknown word does not discard the whole question.
            continue
    if not token_vectors:
        return None
    return list(pd.DataFrame(token_vectors).mean())


def _ranked_matches(sentence, data_language, model, min_score):
    """Return ``(row_position, cosine)`` pairs with cosine >= ``min_score``, best first."""
    question_ap = _average_pooled_vector(sentence, model)
    if question_ap is None:
        return []

    matches = []
    for position, pooled in enumerate(data_language['Average_Pooling']):
        # Rows without a usable vector (missing, empty or of another size) score 0.
        comparable = (pooled is not None and hasattr(pooled, '__len__')
                      and len(pooled) == len(question_ap))
        score = float(cosine_similarity([question_ap], [pooled])[0][0]) if comparable else 0.0
        if score >= min_score:
            matches.append((position, score))
    # list.sort is stable, so equal scores keep their original row order.
    matches.sort(key=lambda match: match[1], reverse=True)
    return matches


def _report_not_understood():
    """Print the fallback reply and return the ``(message, 999)`` pair callers may inspect."""
    print(_JARVIS_PREFIX + _NOT_UNDERSTOOD)
    return _NOT_UNDERSTOOD, 999


def Talk_To_Javris(data_language, model, sentence=None):
    """Answer the user's question with the best matching FAQ entry.

    The question is embedded by average pooling its word vectors and compared
    by cosine similarity with the ``Average_Pooling`` vector of every row of
    ``data_language``. The answer of the best row scoring at least 0.3 is
    printed. The user is then asked whether it helped; on "no", ``main`` lists
    alternative questions.

    Parameters
    ----------
    data_language : pd.DataFrame
        FAQ rows of the selected class. Reads the ``Question``, ``Answer`` and
        ``Average_Pooling`` columns.
    model : gensim Word2Vec or KeyedVectors
        Word vectors of the selected class.
    sentence : str, optional
        The user's question. When omitted, the notebook-level variable
        ``sentence`` is used, which is how existing two-argument calls pass it.

    Returns
    -------
    None or tuple
        ``None`` after an answer was shown; ``(message, 999)`` when nothing
        could be matched. In that case the message is also printed.
    """
    if sentence is None:
        # Backward compatibility with callers that rely on the global variable.
        sentence = globals().get('sentence', '')

    matches = _ranked_matches(sentence, data_language, model, _CONFIDENT_SCORE)
    if not matches:
        return _report_not_understood()

    best_position, _ = matches[0]
    reply = str(data_language['Answer'].iloc[best_position])
    print(_JARVIS_PREFIX + reply)

    outcome = input("Was this answer helpful? Yes/No: ").lower().strip()
    if outcome == 'no':
        main(sentence, data_language, model)
    return None


def main(sentence, data_language, model):
    """List the next-best FAQ questions and print the answer of the one the user picks.

    Rows are ranked as in ``Talk_To_Javris`` but with a similarity threshold
    of 0. The best match is assumed to have been shown already, so up to five
    following questions are listed, numbered from 1.

    Parameters
    ----------
    sentence : str
        The user's question.
    data_language : pd.DataFrame
        FAQ rows of the selected class.
    model : gensim Word2Vec or KeyedVectors
        Word vectors of the selected class.

    Returns
    -------
    None or tuple
        ``None`` after the interaction; ``(message, 999)`` when there is no
        alternative to offer. In that case the message is also printed.
    """
    matches = _ranked_matches(sentence, data_language, model, _ALTERNATIVE_SCORE)
    # Position 0 is the best match; the alternatives follow it.
    shortlist = matches[:_MAX_ALTERNATIVES + 1]
    if len(shortlist) < 2:
        return _report_not_understood()

    arr = [str(data_language['Question'].iloc[position]) for position, _ in shortlist]
    answer = [str(data_language['Answer'].iloc[position]) for position, _ in shortlist]

    for number in range(1, len(shortlist)):
        print(number, arr[number])

    choice = input('Please enter the question number you find most relevant:')
    try:
        ques_new = int(choice)
    except ValueError:
        ques_new = -1
    # Accept only the numbers that were listed, so a typo cannot raise.
    if 1 <= ques_new < len(shortlist):
        print(_JARVIS_PREFIX + answer[ques_new])
    else:
        print(_JARVIS_PREFIX + 'Sorry, that is not one of the listed question numbers.')
    return None

In [None]:
# Interactive chatbot session: first pick an FAQ class, then answer questions
# until the user types "bye".
flag_language = True   # True while the user still has to pick a class
flag_query = True      # True while questions should be answered
dict_language = {'0': 'security', '1': 'loans', '2': 'accounts', '3': 'insurance', '4': 'investments', '5': 'fundstransfer', '6': 'cards'}

SEPARATOR = '......................................................................................'
JARVIS_INTRO_PREFIX = '\x1b[1;37;40m' + 'Jarvis' + '\x1b[0m' + ': '
JARVIS_CHAT_PREFIX = '\x1b[1;37;40m' + 'JARVIS' + '\x1b[0m' + ': '
USER_PROMPT = '\x1b[0;30;47m' + 'USER  ' + '\x1b[0m' + ':'
# Column order matters: Talk_To_Javris / main read Question and Answer by position.
LANGUAGE_COLUMNS = ['Question', 'Question_Tokens', 'Answer', 'Class', 'Question_Vectors', 'Average_Pooling']

print(SEPARATOR)
print(JARVIS_INTRO_PREFIX + 'My name is Jarvis, a Programming Language Apprentice Bot.')
print(JARVIS_INTRO_PREFIX + 'I will try my best to answer your query.')
print(JARVIS_INTRO_PREFIX + 'If you want to exit, you can type < bye >.')

while flag_language:
    print(SEPARATOR)
    print(JARVIS_INTRO_PREFIX + 'Please select which language you want to enquire, ' +
          'you can type:')
    print(JARVIS_INTRO_PREFIX + '< 0 > for security     < 1 > for loans            < 2 > for accounts')
    print(JARVIS_INTRO_PREFIX + '< 3 > for insurance    < 4 > for investments      < 5 > for fundstransfer')
    print(JARVIS_INTRO_PREFIX + '< 6 > for cards')

    print(SEPARATOR)
    sentence = input(USER_PROMPT)
    print(SEPARATOR)

    choice = sentence.strip().lower()
    if choice == 'bye':
        flag_language = False
        flag_query = False
    elif choice in dict_language:
        language = dict_language[choice]
        # Rows of the chosen class, re-indexed from 0 so that row positions
        # and index labels agree.
        data_language = (data_tokens.loc[data_tokens['Class'] == language, LANGUAGE_COLUMNS]
                         .reset_index(drop=True))

        # Read the word2vec model written by the training cell.
        # NOTE: this unpickles the file; only load model files produced by
        # this notebook, never files from an untrusted source.
        word2vec_pickle_path = path + 'word2vec_faq_' + language + '.bin'
        model = gensim.models.KeyedVectors.load(word2vec_pickle_path)

        flag_language = False
        flag_query = True
    # Any other input: show the menu again.

if flag_query:
    print(SEPARATOR)
    # Double-quoted so the apostrophe is printed; 'Let''s' is two adjacent
    # string literals and prints "Lets".
    print(JARVIS_INTRO_PREFIX + "Let's start! Please input your question now.")

while flag_query:
    print(SEPARATOR)
    # Talk_To_Javris reads the question from this notebook-level variable.
    sentence = input(USER_PROMPT)
    print(SEPARATOR)

    if sentence.strip().lower() == 'bye':
        flag_query = False
        continue

    # Call greeting() once: it picks a random reply, so a second call could
    # return a different one.
    greeting_reply = greeting(sentence.lower())
    if greeting_reply is not None:
        print(JARVIS_CHAT_PREFIX + greeting_reply)
    else:
        Talk_To_Javris(data_language, model)

print(JARVIS_CHAT_PREFIX + 'Bye! Hope that i am of help.')

......................................................................................
[1;37;40mJarvis[0m: My name is Jarvis, a Programming Language Apprentice Bot.
[1;37;40mJarvis[0m: I will try my best to answer your query.
[1;37;40mJarvis[0m: If you want to exit, you can type < bye >.
......................................................................................
[1;37;40mJarvis[0m: Please select which language you want to enquire, you can type:
[1;37;40mJarvis[0m: < 0 > for security     < 1 > for loans            < 2 > for accounts
[1;37;40mJarvis[0m: < 3 > for insurance    < 4 > for investments      < 5 > for fundstransfer
[1;37;40mJarvis[0m: < 6 > for cards
......................................................................................
USER  :0
......................................................................................
......................................................................................
[1;37;40mJarvis[0m: Lets start! Ple