In [1]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import pickle
import re
import plotly.express as px

import os,json
from time import time
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from scipy.sparse import coo_matrix, hstack
from sklearn.model_selection import StratifiedKFold
import xlrd

This file takes the *._backup.xlsx file for the chatbot, and saves the mapping file (for the chatbot) and take the "Intents" spreadsheet for training the model. Ultimately, this notebook produces the mappings pickle file and the model pickle file.

## Kelvin's method

In [2]:
## Change the data file name ##
backup_filename = 'CFS_backup.xlsx'

In [3]:
class NLP():
    
    def __init__(self):
        
        self.TFIDF_classifier = None
        
        self.data = pd.read_excel('../data/' + backup_filename, sheet_name='Intents')
        self.mappings = pd.read_excel('../data/' + backup_filename, sheet_name='Mappings')
        
        # Save mapping file for the chatbot
        with open('../mapping.sav', 'wb') as f:
            pickle.dump(self.mappings, f)
        
        # TFIDF Model params
        self.TFIDF_vectorizer = TfidfVectorizer()
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer() 
        self.STOPWORDS = '''is a the of or to and in be as for not are your on this you such that from
                            by have will can be a been may there so please it should any where does no help
                            how my i'''.split()
    
    def clean_tfidf(self, question):
        
        try:
            question = question.lower()
            question = re.sub(r'(!|\.|,|\(|\)|\[|\]|\\|\?|\$|#|%|\*)', '', question)
            question = question.split()
            question = [w for w in question if w not in self.STOPWORDS]
            question = [self.stemmer.stem(w) for w in question]
        except:
            return ''
        
        # Spelling?
        return ' '.join(question)
    
    def train_tfidf(self):
        
        # Select only visible intents
        self.data = self.data[self.data['Visibility'] == 'yes']
        
        # Clean the data
        self.data['Cleaned'] = self.data['Questions'].apply(self.clean_tfidf)
        
        # Get X and Y
        X = self.TFIDF_vectorizer.fit_transform(self.data['Cleaned'])
        Y = np.array(self.data['Label'])
        print('Labels: ', Y)
       
        # Build model
        self.TFIDF_classifier = RandomForestClassifier(n_estimators=150, max_depth=8, random_state=42)
        self.TFIDF_classifier.fit(X,Y)
        
        # Estimate current fit
        y_pred = self.TFIDF_classifier.predict(X)
        L1 = (self.data['Level'] == 1).tolist()
        print("L1 accuracy: {}%".format(sum(y_pred[L1]==Y[L1])/sum(L1)*100))
        
        # pickle the model
        with open('../TFIDF_model.sav', 'wb') as f:
            pickle.dump(self.TFIDF_classifier, f)
            
        # pickle the TFIDF vectorizer
        with open('../TFIDF_vectorizer.sav', 'wb') as f:
            pickle.dump(self.TFIDF_vectorizer, f)
        
    def test_tfidf(self, test_question):
        
        if self.TFIDF_classifier == None:
            print("Model not trained")
            return None
        
        question_cleaned = self.clean_tfidf(test_question)
 
        X = self.TFIDF_vectorizer.transform([question_cleaned])
    
        # Make prediction
        y_proba = self.TFIDF_classifier.predict_proba(X)[0]
      
        classes = self.TFIDF_classifier.classes_
      
        # View results
        index = np.argmax(y_proba)
    
        # Format output table and return top 5
        results = pd.DataFrame()
        results['Class'] = classes
        results['Probabilities'] = y_proba
        results['Norm Prob'] = y_proba/np.max(y_proba)
        results.sort_values('Norm Prob', ascending=False, inplace=True)
        results = results.reset_index(drop=True).iloc[:3]
        
        return results
            

#### Import NLP Object

In [4]:
x = NLP()
x.mappings.head(50)

Unnamed: 0.1,Unnamed: 0,Label,Intent,Out-context,In-context,Level,Visibility,Answer-formatted
0,0,0,Is GLRC-AML approval required when risk rating...,,,1,yes,<p>Where you want to lower the risk rating e.g...
1,1,1,What is tax evasion,,,1,yes,<p>Tax evasion is the illegal evasion of taxes...
2,2,2,What is CRAM,,,1,yes,<p>Customer money laundering/terrorism financi...
3,3,3,What is the approval level,,,1,yes,<p>Approval from the Division Head and concurr...
4,4,4,Difference between customer due diligence and ...,,,1,yes,<p>Enhanced due diligence measures (EDD) inclu...
5,5,5,What type of customers are considered as unacc...,,,1,yes,<p>We must not establish business relations wi...
6,6,6,What are the documents required for CDD for na...,,,1,yes,<p>You will need to collect documents that can...
7,7,7,What to do when customer apply new products or...,WhatToDoWhenCustomerApplyNewProductsOrChangePe...,,1,yes,<p>If your customer apply for additional produ...
8,8,8,Info on non-material TER,,WhatToDoWhenCustomerApplyNewProductsOrChangePe...,2,yes,<p>Non-Material trigger events review occurs w...
9,9,9,Info on Material TER,,WhatToDoWhenCustomerApplyNewProductsOrChangePe...,2,yes,<p>Material trigger events review occurs when ...


#### Train TFIDF and export model file

In [5]:
# Training the model will also save the model (and the TFIDF vectorizer) as a pickle file
x.train_tfidf()

Labels:  [ 0  0  0  0  0  0  0  0  1  1  1  1  1  2  2  2  2  2  2  2  3  3  3  3
  3  3  3  4  4  4  4  4  5  5  5  5  5  5  5  5  5  5  5  6  6  6  6  6
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  8  9 10 10 10 10 10 10 10 10
 10 10 10 10 10 11 11 11 11 11 11 11 11 12 12 12 12 12 12 12 13 13 13 13
 13 13 13 13 13 13 13 13 13 14 14 14 14 14 14 14 15 15 15 15 15 15 15 15
 15 15 16 16 16 16 16 16 16 16 16 16 17 17 17 17 17 17 17 17 17 17 18 18
 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 19 19 19 19 19 19 19 19 19
 19 19 19 19 19 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20
 20 20 20 20 20 20 20 21 21 21 21 21 21 21 21 21 21 21 21 21 22 22 22 22
 22 22 22 22 22 22 22 22 22 22 23 23 23 23 23 23 23 23 23 24 24 24 24 24
 24 24 24 24 24 24 24 24 24 24 24 24 24 24 25 25 25 25 25 25 25 25 25 25
 25 25 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 27 27 27
 27 27 27 27 27 27 27 27 27 27 28 28 28 28 28 28 28 28 28 28 28 28 28 28
 29 29 29 29 29 29 29 29 29 29 29 30 30 30

#### Test

In [7]:
# Insert a test query here
query = 'how to cancel my credit card?'

df = x.test_tfidf(query)

for index in df['Class']:
    print(x.mappings[x.mappings['Label']==index]['Intent'].values[0], '\n')


When is enhanced due diligence required for my customer 

Enhanced Due Diligence 

Additional EDD measures 



## Tf-idf Baseline model 

In [2]:
data = pd.read_excel('bot.xlsx', sheet_name='Intents')
data = data[['Questions','Intent']]

data = data.dropna().drop_duplicates()
data['Questions'] = data['Questions'].apply(lambda x: x.lower())
data['Questions']  = data['Questions'].apply(lambda x: re.sub(r'(!|\.|,|\(|\)|\[|\]|\\|\?|\$|#|%|\*)', '', x))
data['Intent'] = data['Intent'].astype(str)

In [3]:
data.head(3)

Unnamed: 0,Questions,Intent
0,what promotions do you have,Promotions
1,what promotions are available,Promotions
2,promotions,Promotions


In [4]:
# use original data
pipeline = Pipeline([
            ('vect', TfidfVectorizer()),
            ('clf', RandomForestClassifier()),
        ])
        
parameters = {
    'vect__ngram_range': [(1,1),(1,2),(1,3)],
    'clf__n_estimators': [50,100,200,300],
    'clf__max_depth': [8,10,15,20,25,30]
}

# grid search for best estimator
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=2,scoring='accuracy')
grid_search.fit(data['Questions'], data['Intent'])

print("Best score: %0.5f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()): 
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 72 candidates, totalling 360 fits



The least populated class in y has only 1 members, which is less than n_splits=5.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:   16.1s finished


Best score: 0.82025
Best parameters set:
	clf__max_depth: 30
	clf__n_estimators: 200
	vect__ngram_range: (1, 1)


In [5]:
# Get predictions
"""
It is over-fitted because no train-test-split.
Just want to build the pipeline so that can test later after the test data comes
"""
estimator = grid_search.best_estimator_
probs = estimator.predict_proba(data['Questions'])
data['pred'] = np.argsort(probs, axis=1)[:,-1]
data['predicted_label'] = data['pred'].apply(lambda x: estimator.classes_[x])
data.drop(columns = 'pred', axis=1, inplace=True)
data

Unnamed: 0,Questions,Intent,predicted_label
0,what promotions do you have,Promotions,Promotions
1,what promotions are available,Promotions,Promotions
2,promotions,Promotions,Promotions
3,i want to see promotions,Promotions,Promotions
4,view promotions,Promotions,Promotions
...,...,...,...
416,how to update contact number,Update details,Update details
417,update mailing address,Update details,Update details
418,i need to update my email address,Update details,Update details
419,how to update email address,Update details,Update details


In [8]:
# view the most important words
vectorizer = estimator['vect']
clf = estimator['clf']

def print_top10(vectorizer, clf):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    top10 = np.argsort(clf.feature_importances_)[-30:]
    print([feature_names[j] for j in top10])

In [7]:
print_top10(vectorizer, clf)

['reprice', 'activate', 'hardware', 'change', 'application', 'talk', 'bank', 'want', 'cancel', 'atm', 'debit', 'investment', 'limit', 'token', 'how', 'personal', 'waiver', 'late', 'credit', 'for', 'statement', 'fee', 'interest', 'update', 'loan', 'to', 'account', 'promotions', 'my', 'card']


## Add spacy features and view feature importance

### Load data and some pre-processing

In [31]:
data = pd.read_excel('bot.xlsx', sheet_name='Intents')
data = data[['Questions','Intent']]

data = data.dropna().drop_duplicates()
data['Questions'] = data['Questions'].apply(lambda x: x.lower())
data['Questions']  = data['Questions'].apply(lambda x: re.sub(r'(!|\.|,|\(|\)|\[|\]|\\|\?|\$|#|%|\*)', '', x))
data['Intent'] = data['Intent'].astype(str)

data.head()

Unnamed: 0,Questions,Intent
0,what promotions do you have,Promotions
1,what promotions are available,Promotions
2,promotions,Promotions
3,i want to see promotions,Promotions
4,view promotions,Promotions


In [32]:
# load spacy model 
import spacy
nlp = spacy.load("en_core_web_sm")

### Add noun and Verb (delete stop words)

In [33]:
from spacy.lang.en.stop_words import STOP_WORDS
stop_words = list(STOP_WORDS)

In [34]:
data['noun'] = data['Questions'].apply(lambda x: list(set([token.lemma_ for token in nlp(x) if token.pos_ in ['NOUN','PROPN'] and token.text not in stop_words])))
data['verb'] = data['Questions'].apply(lambda x: list(set([token.lemma_ for token in nlp(x) if token.pos_ in ['VERB'] and token.text not in stop_words])))

### Add keywords (base on those words appear in Intent)

In [35]:
keywords = []
for intent in list(set(data['Intent'])):
    keywords.extend(intent.strip().split(' '))
keyword_list = list(set(keywords))

keyword_list = [i.lower() for i in keyword_list if i.lower() not in stop_words]
print(keyword_list)
data['key_word'] = data['Questions'].apply(lambda x: list(set([token.lemma_ for token in nlp(x) if token.text in keyword_list])))

['property', 'suspension', 'fee', 'rewards', 'accounts', 'service', 'sponsorship', 'malaysia', 'promotions', 'details', 'loans', 'activation', 'credit', 'transaction', 'nisp', 'investment', 'securities', 'loans', 'enquiry', 'hardware', 'fee', 'rebates', 'decrease', 'debit', 'customer', 'application', 'repricing', 'replacement', 'officer', 'interest', 'statement', 'waiver', 'application', 'account', 'passbook', 'redeem', 'cancelled', 'cancel', 'token', '365', 'personal', 'singapore', 'speak', 'account', 'auto', 'cancellation', 'uplift', 'increase', 'update', '360', 'late', 'card', 'loan', 'limit', 'dispute', 'statement', 'savings', 'atm', 'change', 'ocbc', 'service', 'home', 'paying', 'card', 'credit', 'rejection', 'onetoken', 'open', 'close', 'request']


### Change text to lemma (delete stop words)

In [36]:
data['lemma'] = data['Questions'].apply(lambda x:' '.join([token.lemma_ for token in nlp(x) if token.text not in stop_words]))

In [37]:
# combine spacy features
noun_cols = pd.DataFrame(data['noun'].values.tolist(),columns = ['noun_1','noun_2','noun_3','noun_4','noun_5']).reset_index(drop=True)
verb_cols = pd.DataFrame(data['verb'].values.tolist(),columns = ['verb_1','verb_2','verb_3']).reset_index(drop=True)
keyword_cols = pd.DataFrame(data['key_word'].values.tolist(),columns = ['keyword_1','keyword_2','keyword_3','keyword_4','keyword_5']).reset_index(drop=True)
data = data.reset_index(drop=True)

In [38]:
processed = pd.concat([data,noun_cols,verb_cols,keyword_cols], axis=1)
processed.drop(columns=['noun','verb','key_word'], inplace=True)

In [39]:
processed.head()

Unnamed: 0,Questions,Intent,lemma,noun_1,noun_2,noun_3,noun_4,noun_5,verb_1,verb_2,verb_3,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5
0,what promotions do you have,Promotions,promotion,promotion,,,,,,,,promotion,,,,
1,what promotions are available,Promotions,promotion available,promotion,,,,,,,,promotion,,,,
2,promotions,Promotions,promotion,promotion,,,,,,,,promotion,,,,
3,i want to see promotions,Promotions,want promotion,promotion,,,,,want,,,promotion,,,,
4,view promotions,Promotions,view promotion,view,promotion,,,,,,,promotion,,,,


### Combine Tf-idf vectors and other features

In [40]:
v = TfidfVectorizer()
x = v.fit_transform(processed['lemma'])

vocab = dict(v.vocabulary_)

In [41]:
# label encoding for spacy features
cols = ['noun_1', 'noun_2', 'noun_3', 'noun_4', 'noun_5',
       'verb_1', 'verb_2', 'verb_3', 'keyword_1', 'keyword_2', 'keyword_3',
       'keyword_4', 'keyword_5']
for col in cols:
    processed[col] = processed[col].apply(lambda x: vocab[x] if x in vocab else 0.0)  

In [42]:
# combine tf-idf vectors and spacy features 
X = hstack((x, processed[cols].values),format='csr')
X_columns=v.get_feature_names()+processed[cols].columns.tolist()

features = pd.DataFrame(X.toarray())
features.columns = X_columns

y = processed['Intent']

In [43]:
features.head()

Unnamed: 0,360,365,access,account,accout,acct,activate,activation,additional,address,...,noun_4,noun_5,verb_1,verb_2,verb_3,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,116.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,116.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,116.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,179.0,0.0,0.0,116.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,116.0,0.0,0.0,0.0,0.0


### Grid search for best parameters and visualize feature importance

In [44]:
pipeline = Pipeline([
            ('clf', RandomForestClassifier()),
        ])
        
parameters = {
    'clf__n_estimators': [350,400,450,500],
    'clf__max_depth': [20,25,30,40,50]
}

# grid search for best estimator
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=2,scoring='accuracy')
grid_search.fit(features, y)

print("Best score: %0.5f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters        .keys()): 
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 20 candidates, totalling 100 fits



The least populated class in y has only 1 members, which is less than n_splits=5.

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   11.4s finished


Best score: 0.83797
Best parameters set:
	clf__max_depth: 20
	clf__n_estimators: 500


In [45]:
# view feature importance 
estimator = grid_search.best_estimator_
clf = estimator['clf']
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1][:30]

feature_names = [X_columns[i] for i in indices]
importance = importances[indices]

feature_importance = pd.DataFrame({'feature_names':feature_names,'importance':importance })

feature_importance['type'] = feature_importance['feature_names'].apply(lambda x: 'spacy feature' if '_' in x else 'tfidf')
feature_importance.sort_values(by = 'importance',ascending = False, inplace=True)

In [46]:
fig = px.bar(feature_importance, x='feature_names', y='importance',color = 'type').update_xaxes(categoryorder = "total descending")
fig.update_layout(
    title="Feature importance of classifier",
)
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

## Classifier  - cross-validation score

In [47]:
processed.head()

Unnamed: 0,Questions,Intent,lemma,noun_1,noun_2,noun_3,noun_4,noun_5,verb_1,verb_2,verb_3,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5
0,what promotions do you have,Promotions,promotion,116.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,116.0,0.0,0.0,0.0,0.0
1,what promotions are available,Promotions,promotion available,116.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,116.0,0.0,0.0,0.0,0.0
2,promotions,Promotions,promotion,116.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,116.0,0.0,0.0,0.0,0.0
3,i want to see promotions,Promotions,want promotion,116.0,0.0,0.0,0.0,0.0,179.0,0.0,0.0,116.0,0.0,0.0,0.0,0.0
4,view promotions,Promotions,view promotion,175.0,116.0,0.0,0.0,0.0,0.0,0.0,0.0,116.0,0.0,0.0,0.0,0.0


In [49]:
feature_cols = ['lemma', 'noun_1', 'noun_2', 'noun_3', 'noun_4',
       'noun_5', 'verb_1', 'verb_2', 'verb_3', 'keyword_1', 'keyword_2',
       'keyword_3', 'keyword_4', 'keyword_5']

# StratifiedKFold coss validation 
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(processed[feature_cols], processed['Intent'])
print(skf)

cv_scores = []

for train_index, test_index in skf.split(processed[feature_cols], processed['Intent']):
    # get train, test data for each chunk 
    X_train, X_test = processed.loc[train_index,feature_cols], processed.loc[test_index,feature_cols]
    y_train, y_test = processed.loc[train_index,'Intent'], processed.loc[test_index,'Intent']
    
    v = TfidfVectorizer()
    x_train = v.fit_transform(X_train['lemma'])
    x_test = v.transform(X_test['lemma'])
    
    vocab = dict(v.vocabulary_)
    
    # combine Tf-idf vectors and other features
    cols = ['noun_1', 'noun_2', 'noun_3', 'noun_4', 'noun_5',
       'verb_1', 'verb_2', 'verb_3', 'keyword_1', 'keyword_2', 'keyword_3',
       'keyword_4', 'keyword_5']
    for col in cols:
        X_train[col] = X_train[col].apply(lambda x: vocab[x] if x in vocab else 0.0)  
        X_test[col] = X_test[col].apply(lambda x: vocab[x] if x in vocab else 0.0)  
    
    x_train_combined = hstack((x_train, X_train[cols].values),format='csr')
    x_train_combined_columns=v.get_feature_names() + X_train[cols].columns.tolist()
    
    x_test_combined = hstack((x_test, X_test[cols].values),format='csr')
    x_test_combined_columns=v.get_feature_names()  + X_test[cols].columns.tolist()
    
    x_train_combined = pd.DataFrame(x_train_combined.toarray())
    x_train_combined.columns = x_train_combined_columns
    
    x_test_combined = pd.DataFrame(x_test_combined.toarray())
    x_test_combined.columns = x_test_combined_columns
    
    # build classifier
    clf = RandomForestClassifier(max_depth=40, n_estimators=450)
    clf.fit(x_train_combined, y_train)
    score = clf.score(x_test_combined, y_test)
    cv_scores.append(score)
    
print(np.mean(np.array((cv_scores))), np.std(np.array((cv_scores))))

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)



The least populated class in y has only 1 members, which is less than n_splits=5.



0.8506329113924049 0.024545214467930773
