In [1]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import pickle
import re

This notebook takes the chatbot's `*_backup.xlsx` file, saves its "Mappings" sheet as the mapping file (for the chatbot), and uses its "Intents" sheet to train the model. Ultimately, this notebook produces the mappings pickle file, the model pickle file and the fitted TF-IDF vectorizer pickle file.

In [2]:
## Change the data file name ##
# Name of the chatbot backup workbook; the NLP class reads it from '../data/'.
backup_filename = 'CFS_backup.xlsx'

In [3]:
class NLP():
    """TF-IDF + random-forest intent classifier for the chatbot.

    Constructing an instance reads the 'Intents' and 'Mappings' sheets from
    ``'../data/' + backup_filename`` (``backup_filename`` is a notebook-level
    variable) and immediately pickles the mappings to ``'../mapping.sav'``.

    Attributes:
        data: The 'Intents' sheet (training questions and labels).
        mappings: The 'Mappings' sheet (label -> intent / answer).
        TFIDF_vectorizer: Vectorizer fitted by ``train_tfidf``.
        TFIDF_classifier: Classifier fitted by ``train_tfidf``; ``None`` until then.
        stemmer: Porter stemmer applied to every token in ``clean_tfidf``.
        lemmatizer: WordNet lemmatizer (created but not used by the methods here).
        STOPWORDS: Lower-case words dropped from questions before stemming.
    """

    # Punctuation stripped from a question before it is tokenised.
    _PUNCTUATION = re.compile(r'[!.,()\[\]\\?$#%*]')

    def __init__(self):
        """Load the workbook, export the mappings pickle and set up the NLP tools."""
        # Stays None until train_tfidf() has fitted a model
        self.TFIDF_classifier = None

        # Open the workbook once and pull both sheets out of it
        sheets = pd.read_excel('../data/' + backup_filename,
                               sheet_name=['Intents', 'Mappings'])
        self.data = sheets['Intents']
        self.mappings = sheets['Mappings']

        # Save mapping file for the chatbot
        with open('../mapping.sav', 'wb') as f:
            pickle.dump(self.mappings, f)

        # TFIDF Model params
        self.TFIDF_vectorizer = TfidfVectorizer()
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.STOPWORDS = '''is a the of or to and in be as for not are your on this you such that from
                            by have will can be a been may there so please it should any where does no help
                            how my i'''.split()

    def clean_tfidf(self, question):
        """Normalise a question into a space-separated string of stemmed tokens.

        The text is lower-cased, stripped of punctuation, split on whitespace,
        filtered against ``self.STOPWORDS`` and stemmed with ``self.stemmer``.

        Args:
            question: The raw question text.

        Returns:
            The cleaned question, or ``''`` when ``question`` is not a string
            (e.g. a NaN read from an empty spreadsheet cell).
        """
        # Non-text input cleans to '' explicitly, instead of relying on a bare
        # `except` that would also hide genuine bugs further down.
        if not isinstance(question, str):
            return ''

        text = self._PUNCTUATION.sub('', question.lower())
        tokens = [self.stemmer.stem(word)
                  for word in text.split()
                  if word not in self.STOPWORDS]

        # TODO: spelling correction?
        return ' '.join(tokens)

    def train_tfidf(self):
        """Fit the vectorizer and classifier on the visible intents and pickle both.

        Side effects:
            * ``self.data`` is narrowed to rows with ``Visibility == 'yes'`` and
              gains a ``'Cleaned'`` column.
            * Prints the label array and the accuracy on Level 1 rows. This is
              measured on the training data itself, so it is a fit check and
              not a generalisation estimate.
            * Writes ``'../TFIDF_model.sav'`` and ``'../TFIDF_vectorizer.sav'``.
        """
        # Select only visible intents. Copy so the column added below is written
        # to an independent frame rather than to a slice of the loaded sheet.
        self.data = self.data[self.data['Visibility'] == 'yes'].copy()

        # Clean the data
        self.data['Cleaned'] = self.data['Questions'].apply(self.clean_tfidf)

        # Get X and Y
        X = self.TFIDF_vectorizer.fit_transform(self.data['Cleaned'])
        Y = np.array(self.data['Label'])
        print('Labels: ', Y)

        # Build model; only publish it on the instance once fitting has succeeded
        classifier = RandomForestClassifier(n_estimators=150, max_depth=8, random_state=42)
        classifier.fit(X, Y)
        self.TFIDF_classifier = classifier

        # Estimate current fit (on the training rows)
        y_pred = self.TFIDF_classifier.predict(X)
        L1 = (self.data['Level'] == 1).to_numpy()
        n_level1 = int(L1.sum())
        if n_level1:
            print("L1 accuracy: {}%".format(np.sum(y_pred[L1] == Y[L1]) / n_level1 * 100))
        else:
            # Avoid dividing by zero when the sheet has no Level 1 intents
            print("L1 accuracy: n/a (no Level 1 rows)")

        # pickle the model
        with open('../TFIDF_model.sav', 'wb') as f:
            pickle.dump(self.TFIDF_classifier, f)

        # pickle the TFIDF vectorizer
        with open('../TFIDF_vectorizer.sav', 'wb') as f:
            pickle.dump(self.TFIDF_vectorizer, f)

    def test_tfidf(self, test_question, top_n=3):
        """Return the most probable classes for a question.

        Args:
            test_question: Raw question text; it is cleaned with ``clean_tfidf``.
            top_n: Number of rows to return (default 3).

        Returns:
            A DataFrame with columns ``'Class'``, ``'Probabilities'`` and
            ``'Norm Prob'`` (probability divided by the highest probability),
            sorted by descending ``'Norm Prob'`` and limited to ``top_n`` rows.
            ``None`` (after printing a message) if the model is not trained.
        """
        if self.TFIDF_classifier is None:
            print("Model not trained")
            return None

        question_cleaned = self.clean_tfidf(test_question)
        X = self.TFIDF_vectorizer.transform([question_cleaned])

        # Make prediction: one probability per entry of classes_
        y_proba = self.TFIDF_classifier.predict_proba(X)[0]

        # Format output table and return the top_n rows
        results = pd.DataFrame({
            'Class': self.TFIDF_classifier.classes_,
            'Probabilities': y_proba,
            'Norm Prob': y_proba / np.max(y_proba),
        })
        results = results.sort_values('Norm Prob', ascending=False)
        return results.reset_index(drop=True).iloc[:top_n]
            

#### Create the NLP object (loads the data and saves the mapping file)

In [4]:
# Constructing NLP reads the 'Intents' and 'Mappings' sheets and writes '../mapping.sav'
x = NLP()
# Preview the label -> intent mapping table (up to the first 50 rows)
x.mappings.head(50)

Unnamed: 0.1,Unnamed: 0,Label,Intent,Out-context,In-context,Level,Visibility,Answer-formatted
0,0,0,Is GLRC-AML approval required when risk rating...,,,1,yes,<p>Where you want to lower the risk rating e.g...
1,1,1,What is tax evasion,,,1,yes,<p>Tax evasion is the illegal evasion of taxes...
2,2,2,What is CRAM,,,1,yes,<p>Customer money laundering/terrorism financi...
3,3,3,What is the approval level,,,1,yes,<p>Approval from the Division Head and concurr...
4,4,4,Difference between customer due diligence and ...,,,1,yes,<p>Enhanced due diligence measures (EDD) inclu...
5,5,5,What type of customers are considered as unacc...,,,1,yes,<p>We must not establish business relations wi...
6,6,6,What are the documents required for CDD for na...,,,1,yes,<p>You will need to collect documents that can...
7,7,7,What to do when customer apply new products or...,WhatToDoWhenCustomerApplyNewProductsOrChangePe...,,1,yes,<p>If your customer apply for additional produ...
8,8,8,Info on non-material TER,,WhatToDoWhenCustomerApplyNewProductsOrChangePe...,2,yes,<p>Non-Material trigger events review occurs w...
9,9,9,Info on Material TER,,WhatToDoWhenCustomerApplyNewProductsOrChangePe...,2,yes,<p>Material trigger events review occurs when ...


#### Train TFIDF and export model file

In [5]:
# Training the model will also save the model ('../TFIDF_model.sav') and the
# TFIDF vectorizer ('../TFIDF_vectorizer.sav') as pickle files.
# It prints the training labels and the accuracy on Level 1 intents.
x.train_tfidf()

Labels:  [ 0  0  0  0  0  0  0  0  1  1  1  1  1  2  2  2  2  2  2  2  3  3  3  3
  3  3  3  4  4  4  4  4  5  5  5  5  5  5  5  5  5  5  5  6  6  6  6  6
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  8  9 10 10 10 10 10 10 10 10
 10 10 10 10 10 11 11 11 11 11 11 11 11 12 12 12 12 12 12 12 13 13 13 13
 13 13 13 13 13 13 13 13 13 14 14 14 14 14 14 14 15 15 15 15 15 15 15 15
 15 15 16 16 16 16 16 16 16 16 16 16 17 17 17 17 17 17 17 17 17 17 18 18
 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 19 19 19 19 19 19 19 19 19
 19 19 19 19 19 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20
 20 20 20 20 20 20 20 21 21 21 21 21 21 21 21 21 21 21 21 21 22 22 22 22
 22 22 22 22 22 22 22 22 22 22 23 23 23 23 23 23 23 23 23 24 24 24 24 24
 24 24 24 24 24 24 24 24 24 24 24 24 24 24 25 25 25 25 25 25 25 25 25 25
 25 25 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 27 27 27
 27 27 27 27 27 27 27 27 27 27 28 28 28 28 28 28 28 28 28 28 28 28 28 28
 29 29 29 29 29 29 29 29 29 29 29 30 30 30

#### Test

In [7]:
# Insert a test query here
query = 'how to cancel my credit card?'

# Top predictions for the query, best first (None if the model has not been trained)
df = x.test_tfidf(query)

if df is not None:
    # Index the mappings by label once; keep the first row per label, which is
    # the row the original boolean-mask lookup with .values[0] selected.
    intent_by_label = x.mappings.drop_duplicates('Label').set_index('Label')['Intent']

    for label in df['Class']:
        # Print a placeholder instead of crashing when a predicted label has no mapping row
        print(intent_by_label.get(label, '<no mapping for label {}>'.format(label)), '\n')


When is enhanced due diligence required for my customer 

Enhanced Due Diligence 

Additional EDD measures 

