## Load Libraries

In [1]:
# General Import
import re
import math
import string

import numpy as np
import pandas as pd

from scipy.sparse import hstack

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_distances

import gensim.downloader as api

from nltk.tokenize import word_tokenize

import spacy
from spacy.lang.en.stop_words import STOP_WORDS


In [2]:
# Starting point
import os
import sys
from pathlib import Path

PATH_HOME = Path.home()
PATH_PROJ = Path.cwd()
PATH_DATA = PATH_PROJ

sys.path.append(str(PATH_PROJ))

## Load Data

In [3]:
# TRAIN
df_train = pd.read_csv('data2.csv')
df_train.dropna(inplace=True)
print(df_train.shape)
df_train.head(2)

(641, 3)


Unnamed: 0,Label,Intent,Questions
0,0,Statement request,i would like a copy of my statement
1,0,Statement request,please send me a copy of my statement


In [4]:
# rename dataframe
df_train = df_train.rename(columns={'Intent': 'intent', 'Questions': 'query'})
df_train = df_train[['intent', 'query']]
df_train.head(2)

Unnamed: 0,intent,query
0,Statement request,i would like a copy of my statement
1,Statement request,please send me a copy of my statement


In [5]:
# TEST
df_test = pd.read_csv('uat_data_intent.csv')
df_test.dropna(inplace=True)
print(df_test.shape)
df_test.head(2)

(128, 3)


Unnamed: 0,Question,User Clicked intent,Google-intent
0,how do i submit a dispute?,Cancel credit card transaction,Dispute status
1,I lost my card,Lost or compromised cards,Lost or compromised cards


In [6]:
df_test['correct_google'] = np.where(df_test['User Clicked intent'] == df_test['Google-intent'], 1, 0)
df_test.head()

Unnamed: 0,Question,User Clicked intent,Google-intent,correct_google
0,how do i submit a dispute?,Cancel credit card transaction,Dispute status,0
1,I lost my card,Lost or compromised cards,Lost or compromised cards,1
2,I have not received my purchases from the merc...,Cancel credit card transaction,Cancel ATM Card,0
3,i have a transaction that i did not do,Cancel credit card transaction,Cancel credit card transaction,1
4,how to terminate my card?,Cancel Credit or Debit Card,Card Cancellation,0


In [7]:
google_accuracy = sum(df_test['correct_google']) / len(df_test['correct_google'])
print(" Google NLU accuracy is {:.1%}".format(google_accuracy))

 Google NLU accuracy is 78.1%


In [8]:
# rename dataframe
df_test = df_test.rename(columns={'User Clicked intent': 'intent', 'Question': 'query'})
df_test = df_test[['intent', 'query']]
df_test.head(2)

Unnamed: 0,intent,query
0,Cancel credit card transaction,how do i submit a dispute?
1,Lost or compromised cards,I lost my card


## Utilities

In [9]:
def clean_text(text):
    """ Basic text cleaning
        
        1. lowercase
        2. remove special characters
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [10]:
def nltk_tokenize(text):
    """ tokenize text using NLTK and join back as sentence"""
    # import nltk
    # nltk.download('punkt')
    return ' '.join(word_tokenize(text))

In [11]:
# Function for spacy tokenizer

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_lg')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [12]:
def get_idf_TfidfVectorizer(sentences):
    """ Get idf dictionary by using TfidfVectorizer
    
    Args:
        sentences (list): list of input sentences (str)

    Returns:
        idf (dict): idf[word] = inverse document frequency of that word in all training queries
    """
    # use customized Spacy tokenizer
    vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer)
    vectorizer.fit(sentences)
    # TODO: normalize the idf weights
    idf = {k:vectorizer.idf_[v] for k,v in vectorizer.vocabulary_.items()}
    return idf

In [13]:
def get_sentence_vec(sentence, word2vec, idf=None):
    """ Get embedding of sentence by using word2vec embedding of words
    
    If idf is provided, the sentence is the weighted embedding by
        SUM( embedding[word] x idf[word] )
    
    Args:
        sentence (str): input sentence
        word2vec (dict): loaded word2vec model from Gensim
        idf (dict, optional): inverse document frequency of words in all queries

    Returns:
        emb (np.array): 300-dimentions embedding of sentence
    """
    words = sentence.split()
    words = [word for word in words if word in word2vec.vocab]
    
    # if no word in word2vec vocab, return 0x300 embedding
    if len(words)==0:
        return np.zeros((300,), dtype='float32')
    
    # use mean if no idf provided
    if idf is None:
        emb = word2vec[words].mean(axis=0)
    else:
        # get all idf of words, if new word is not in idf, assign 0.0 weights
        idf_series = np.array([idf.get(word, 0.0) for word in words])
        # change shape to 1 x num_of_words
        idf_series = idf_series.reshape(1, -1)
        # use matrix multiplication to get weighted word vector sum for sentence embeddings
        emb = np.matmul(idf_series, word2vec[words]).reshape(-1)
    return emb

In [14]:
def get_sentences_centre(sentences, word2vec, idf=None, num_features=300):
    """ Get sentences centre by averaging all embeddings of sentences in a list
    
    Depends on function get_sentence_vec()
    
    Args:
        sentence (list): list of input sentences (str)
        word2vec (dict): loaded word2vec model from Gensim
        idf (dict, optional): inverse document frequency of words in all queries

    Returns:
        emb (np.array): 300-dimentions embedding of sentence
    """
    # convert list of sentences to their vectors
    sentences_vec = [get_sentence_vec(sentence, word2vec, idf) for sentence in sentences]
    
    # each row in matrix is 300 dimensions embedding of a sentence
    sentences_matrix = np.vstack(sentences_vec)
    # print(sentences_matrix.shape)
    
    # average of all rows, take mean at y-axis
    sentences_centre = sentences_matrix.mean(axis=0)
    
    # result should be (300,) same as single sentence
    # print(sentences_centre.shape)
    return sentences_centre

In [15]:
def get_cluster_centre(df, intent_list, word2vec, idf=None):
    """ get intent cluster centre based on intent list and word embeddings
    
    Depends on function get_sentences_centre()
    
    Args:
        intent_list (list): List of unique intents(str)
        word2vec (dict): word embeddings dictionary 

    Returns:
        result (dict): intent cluster centres in dictionary format - {intent1:embedding1, intent2:embedding2,...}
    """ 
    result = {intent:get_sentences_centre(df[df.intent == intent]['query'].values, word2vec, idf) for intent in intent_list}
    return result

In [16]:
def get_distance_matrix(df_in, word2vec, leave_one_out=False, idf=False):
    """ Get distance for each query to every intent center
    
    Depends on function get_cluster_centre()
    
    Args:
        df_in (pd.DataFrame): input dataframe with intent and query
        word2vec (dict): word embeddings dictionary 
        leave_one_out (bool): whether leave the input query out of training
        idf (bool): whether use weighted word vectors to get sentence embedding

    Returns:
        result (pd.DataFrame): distance matrix for each query, lowest distance intent idealy should match label
    """
    df = df_in.copy()
    intent_list = df.intent.unique().tolist()
    
    if leave_one_out:
        # print("Leave one out")
        sentence_distance = []
        
        for ind in df.index:
            sentence_distance_tmp = []
            query = df.loc[ind, 'query']
            df_data = df.drop(ind)
            
            sentence_centre_dic = get_cluster_centre(df_data, intent_list, word2vec, idf)
            for intent in intent_list:
                sentence_distance_tmp.append(cosine_distances(get_sentence_vec(query, word2vec, idf).reshape(1,-1), 
                                                              sentence_centre_dic[intent].reshape(1,-1)).item())
            sentence_distance.append(sentence_distance_tmp)

        df_sentence_distance = pd.DataFrame(sentence_distance, columns=intent_list)
        df.reset_index(drop=True, inplace=True)
        result = pd.concat([df, df_sentence_distance], axis=1)
    
    else:

        sentence_centre_dic = get_cluster_centre(df, intent_list, word2vec, idf)
        # build dataframe that contains distance between each query to all intent cluster centre
        for intent in intent_list:
            # distance = cosine_similarity(sentence embedding, intent cluster centre embedding)
            df[intent] = df['query'].apply(lambda x: cosine_distances(get_sentence_vec(x, word2vec, idf).reshape(1,-1), 
                                                                      sentence_centre_dic[intent].reshape(1,-1)).item())
        result = df

    return result

In [17]:
def evaluate_distance_matrix(df_in):
    """ Evaluate distance matrix by compare closest intent center and label """
    df = df_in.copy()
    df.set_index(['intent', 'query'], inplace=True)
    df['cluster'] = df.idxmin(axis=1)
    df.reset_index(inplace=True)
    df['correct'] = (df.cluster == df.intent)
    accuracy = sum(df.correct) / len(df)
    # print("Accuracy for distance-based classification is", '{:.2%}'.format(result))
    return accuracy

In [18]:
def test_clustering_accuracy(df_in, word2vec):
    """ test accuracy based on distance of sentence to each cluster center"""
    df_result = get_distance_matrix(df_in, word2vec)
    # print(df_result.head())
    accuracy = evaluate_distance_matrix(df_result)
    return df_result, accuracy

In [19]:
# TEST
def test_idf_acc(df_in, word2vec, idf):
    df_result = get_distance_matrix(df_in, word2vec, leave_one_out=False, idf=idf)
    # print(df_result.head())
    accuracy = evaluate_distance_matrix(df_result)
    return df_result, accuracy

## Pipeline

In [20]:
# preprocessing questions
df_train['query'] = df_train['query'].apply(clean_text)
df_train['query'] = df_train['query'].apply(nltk_tokenize)
df_train['query'] = df_train['query'].apply(lambda x:' '.join([token.lemma_ for token in nlp(x) if token.lemma_ not in stop_words]))
df_train['query'] = df_train['query'].str.lower()


# preprocessing test as well
df_test['query'] = df_test['query'].apply(clean_text)
df_test['query'] = df_test['query'].apply(nltk_tokenize)
df_test['query'] = df_test['query'].apply(lambda x:' '.join([token.lemma_ for token in nlp(x) if token.lemma_ not in stop_words]))
df_test['query'] = df_test['query'].str.lower()


In [21]:
df_train.head(2)

Unnamed: 0,intent,query
0,Statement request,like copy -pron- statement
1,Statement request,send -pron- copy -pron- statement


In [22]:
df_test.head(2)

Unnamed: 0,intent,query
0,Cancel credit card transaction,submit dispute
1,Lost or compromised cards,lose -pron- card


In [23]:
intent_list = df_train.intent.unique().tolist()
intent_list[:2]

['Statement request', 'Passbook savings accounts']

In [24]:
test_intent_list = df_test.intent.unique().tolist()

In [25]:
set(intent_list) == set(test_intent_list)

False

In [26]:
for item in test_intent_list:
    if item not in intent_list:
        print(item)

In [27]:
for item in intent_list:
    if item not in test_intent_list:
        print(item)

Passbook savings accounts
Credit card statement
Debit card statement
Investment account statement
Change of billing cycle
Student Loan
Tuition fee loan
Education loan
Study loan
Cancel Fund Transfer
CRS Enquiries
Give a compliment
File a complaint
Unsuccessful card transaction
Card Renewal
Card Promotions
Open OCBC Singapore Account
Open OCBC Securities Account 
Open OCBC Malaysia Account
Open NISP Account
Request for sponsorship
Card Application
Apply for ATM card
Change credit card limit
Decrease credit card limit
Credit card application rejection


In [28]:
import warnings
warnings.filterwarnings("ignore")

In [29]:
# get idf
idf = get_idf_TfidfVectorizer(df_train['query'].tolist())

In [30]:
# TEST
try:
    word2vec
except NameError:
    word2vec = api.load("word2vec-google-news-300")  

df_result, accuracy = test_idf_acc(df_train, word2vec, idf)
print("Traing accuracy for word2vec + IDF is", '{:.2%}'.format(accuracy)) 

Traing accuracy for word2vec + IDF is 91.89%


Compare: Accuracy without IDF is ~90%

In [31]:
# get cluster centers from training set
idf = get_idf_TfidfVectorizer(df_train['query'].tolist())
dict_cluster = get_cluster_centre(df_train, intent_list, word2vec, idf)

In [32]:
def get_distance_matrix_idf(df_test, intent_list, dict_cluster, word2vec, idf):
    """ Get distance for each query to every intent center
        
    Args:
        df_test (pd.DataFrame): input test dataframe with intent and query
        intent_list (list): list of intents to loop through
        dict_cluster (dict): dictionary of cluster centres
        word2vec (dict): word embeddings dictionary
        idf (dict): idf of each words

    Returns:
        result (pd.DataFrame): distance matrix for each query, lowest distance intent idealy should match label
    """
    df = df_test.copy()
    for intent in intent_list:
        # distance = cosine_similarity(sentence embedding, intent cluster centre embedding)
        df[intent] = df['query'].apply(lambda x: cosine_distances(get_sentence_vec(x, word2vec, idf).reshape(1,-1), 
                                                                  dict_cluster[intent].reshape(1,-1)).item())
    return df

In [33]:
df_test_cluster = get_distance_matrix_idf(df_test, intent_list, dict_cluster, word2vec, idf)
df_test_cluster.head(2)

Unnamed: 0,intent,query,Statement request,Passbook savings accounts,Card statements,Credit card statement,Debit card statement,Investment account statement,Home loan account statement,360 Account interest dispute,...,Paying a cancelled credit card,How to close my account,Card dispute,Change credit card limit,Increase credit card limit,Decrease credit card limit,Credit card application rejection,Rebates,How to redeem rewards,Update details
0,Cancel credit card transaction,submit dispute,0.859452,0.956631,0.855401,0.84559,0.878371,0.865033,0.869588,0.792492,...,0.784627,0.88274,0.688009,0.862475,0.860961,0.921762,0.781087,0.864479,0.946258,0.915108
1,Lost or compromised cards,lose -pron- card,0.793683,0.757093,0.703357,0.654635,0.690902,0.74597,0.770254,0.704963,...,0.541351,0.600998,0.579094,0.529235,0.524934,0.598681,0.630182,0.737598,0.683356,0.847152


In [34]:
cluster_cols = list(df_test_cluster.columns.values)[2:]
# verify
set(intent_list) == set(cluster_cols)

True

In [35]:
def get_top_3_clusters(data, intent_list):
    data = data.copy()
    cluster_cols = intent_list.copy()

    data['clusters_top3'] = data.apply(lambda x: np.argsort(x[cluster_cols].values)[:3].tolist(), axis=1)

    intents = cluster_cols # get all tickers
    intent2index = {v: i for (i, v) in enumerate(intents)}

    data['target'] = data['intent'].apply(lambda x: intent2index[x])

    top_clusters_cols = pd.DataFrame(data['clusters_top3'].values.tolist(),columns = ['clusters_1','clusters_2','clusters_3']).reset_index(drop=True)
    data = data.reset_index(drop=True)
    data = pd.concat([data,top_clusters_cols], axis=1)

    data.drop(columns = 'clusters_top3', inplace=True)
    data.drop(columns = cluster_cols, inplace=True)
    
    # print(data.head())
    return data, intent2index

In [36]:
df_test_cluster_top_n, _ = get_top_3_clusters(df_test_cluster, cluster_cols)
df_test_cluster_top_n.head()

Unnamed: 0,intent,query,target,clusters_1,clusters_2,clusters_3
0,Cancel credit card transaction,submit dispute,17,25,69,21
1,Lost or compromised cards,lose -pron- card,43,43,53,44
2,Cancel credit card transaction,receive -pron- purchase merchant cancel -pron-,17,67,52,53
3,Cancel credit card transaction,transaction,17,17,21,39
4,Cancel Credit or Debit Card,terminate -pron- card,53,52,53,40


In [37]:
def get_accuracy(data, top=1):
    data = data.copy()
    
    assert top in (1,2,3), "top must be in (0, 1, 2)"
    
    if top == 1:
        # top 1 accuracy
        accuracy = (data[(data['clusters_1'] == data['target'])].shape[0] / data.shape[0])
    elif top == 2:
        # top 2 accuracy
        data["exists"] = data.drop(data.columns[[0,1,2,5]], 1).isin(data["target"]).any(1)
        accuracy = sum(data['exists'])/ data.shape[0]
    elif top == 3:
        # top 3 accuracy
        data["exists"] = data.drop(data.columns[[0,1,2]], 1).isin(data["target"]).any(1)
        accuracy = sum(data['exists'])/ data.shape[0]
    else:
        raise ValueError("top must be in (0, 1, 2)") 
    
    print('Accuracy for top {} clustering result is {:.1%}'.format(top, accuracy))
    return accuracy

In [38]:
get_accuracy(df_test_cluster_top_n, 1)
get_accuracy(df_test_cluster_top_n, 2)
get_accuracy(df_test_cluster_top_n, 3)

Accuracy for top 1 clustering result is 71.9%
Accuracy for top 2 clustering result is 82.8%
Accuracy for top 3 clustering result is 87.5%


0.875

### Combine with NLP features

In [39]:
df_train, intent2index = get_top_3_clusters(df_result, cluster_cols)
df_train.head(2)

Unnamed: 0,intent,query,target,clusters_1,clusters_2,clusters_3
0,Statement request,like copy -pron- statement,0,0,2,3
1,Statement request,send -pron- copy -pron- statement,0,0,2,3


In [40]:
def get_keywords(intent_list):
    """ Get list of keywords from intent """
    keywords = []
    for intent in list(set(intent_list)):
        keywords.extend(intent.strip().split(' '))
    keyword_list = list(set(keywords))
    keyword_list = [i.lower() for i in keyword_list if i.lower() not in stop_words]
    keyword_list.append('nsip')

    keyword_list_lemma = []
    text = nlp(' '.join([w for w in keyword_list]))
    for token in text:
        keyword_list_lemma.append(token.lemma_)
    return keyword_list_lemma

In [41]:
keyword_list_lemma = get_keywords(intent_list)

In [42]:
def get_nlp_features(df):
    """ Get keyword features from dataframe """
    data = df.copy()
    data['lemma'] = data['query'].apply(lambda x:' '.join([token.lemma_ for token in nlp(x) if token.lemma_ not in stop_words]))
    data['keyword'] = data['lemma'].apply(lambda x: list(set([token.lemma_ for token in nlp(x) if token.lemma_ in keyword_list_lemma])))

    data['noun'] = data['query'].apply(lambda x: list(set([token.lemma_ for token in nlp(x) if token.pos_ in ['NOUN','PROPN'] and token.lemma_ not in stop_words])))
    data['verb'] = data['query'].apply(lambda x: list(set([token.lemma_ for token in nlp(x) if token.pos_ in ['VERB'] and token.lemma_ not in stop_words])))

    data['noun'] = data['noun'].apply(lambda x: ' '.join([w for w in x]))
    data['verb'] = data['verb'].apply(lambda x: ' '.join([w for w in x]))
    data['keyword'] = data['keyword'].apply(lambda x: ' '.join([w for w in x]))
    return data

In [43]:
df_train.head(2)

Unnamed: 0,intent,query,target,clusters_1,clusters_2,clusters_3
0,Statement request,like copy -pron- statement,0,0,2,3
1,Statement request,send -pron- copy -pron- statement,0,0,2,3


In [44]:
df_train = get_nlp_features(df_train)
df_train.head(2)

Unnamed: 0,intent,query,target,clusters_1,clusters_2,clusters_3,lemma,keyword,noun,verb,adj
0,Statement request,like copy -pron- statement,0,0,2,3,like copy -pron- statement,statement,copy statement -pron-,,
1,Statement request,send -pron- copy -pron- statement,0,0,2,3,send -pron- copy -pron- statement,statement,statement -pron-,copy send,


In [45]:
df_test = get_nlp_features(df_test_cluster_top_n)
df_test.head(2)

Unnamed: 0,intent,query,target,clusters_1,clusters_2,clusters_3,lemma,keyword,noun,verb,adj
0,Cancel credit card transaction,submit dispute,17,25,69,21,submit dispute,dispute,dispute,submit,
1,Lost or compromised cards,lose -pron- card,43,43,53,44,lose -pron- card,card lose,card -pron-,lose,


In [46]:
# combine model score
countvector_cols = ['lemma', 'keyword', 'noun', 'verb']
top_clusters_cols = ['clusters_1', 'clusters_2', 'clusters_3']

feature_cols = countvector_cols + top_clusters_cols

### Random Forest

In [47]:
def get_train_test(df_train, df_test, feature_cols):
    """ split dataset, get X_train, X_test, y_train, y_test """
    X_train = df_train[feature_cols]
    # print(X_train.head(1))
    y_train = df_train['target']
    # print(y_train.head(1))
    X_test = df_test[feature_cols]
    y_test = df_test['target']
    # print(X_test.head(1))
    # print(y_test.head(1))
    return X_train, y_train, X_test, y_test

In [48]:
X_train, y_train, X_test, y_test = get_train_test(df_train, df_test, feature_cols)

In [59]:
def add_nlp_to_x(X_train, X_test):
    """ Add NLP features to input X """
    v_lemma = TfidfVectorizer()
    x_train_lemma = v_lemma.fit_transform(X_train['lemma'])
    x_test_lemma = v_lemma.transform(X_test['lemma'])
    vocab_lemma = dict(v_lemma.vocabulary_)

    v_keyword = TfidfVectorizer()
    x_train_keyword = v_keyword.fit_transform(X_train['keyword'])
    x_test_keyword = v_keyword.transform(X_test['keyword'])
    vocab_keyword = dict(v_keyword.vocabulary_)

    v_noun = TfidfVectorizer()
    x_train_noun = v_noun.fit_transform(X_train['noun'])
    x_test_noun = v_noun.transform(X_test['noun'])
    vocab_noun = dict(v_noun.vocabulary_)

    v_verb = TfidfVectorizer()
    x_train_verb = v_verb.fit_transform(X_train['verb'])
    x_test_verb = v_verb.transform(X_test['verb'])
    vocab_verb = dict(v_verb.vocabulary_)
    
    # combine all features 
    x_train_combined = hstack((x_train_lemma,x_train_keyword,x_train_noun,x_train_verb,X_train[top_clusters_cols].values),format='csr')
    x_train_combined_columns= v_lemma.get_feature_names()+v_keyword.get_feature_names()+v_noun.get_feature_names()+v_verb.get_feature_names()+top_clusters_cols

    x_test_combined = hstack((x_test_lemma,x_test_keyword,x_test_noun,x_test_verb,X_test[top_clusters_cols].values),format='csr')
    x_test_combined_columns= v_lemma.get_feature_names()+v_keyword.get_feature_names()+v_noun.get_feature_names()+v_verb.get_feature_names()+top_clusters_cols

    x_train_combined = pd.DataFrame(x_train_combined.toarray())
    x_train_combined.columns = x_train_combined_columns

    x_test_combined = pd.DataFrame(x_test_combined.toarray())
    x_test_combined.columns = x_test_combined_columns
    
    return x_train_combined, x_test_combined, v_lemma, v_keyword, v_noun, v_verb

In [60]:
x_train_combined, x_test_combined, v_lemma, v_keyword, v_noun, v_verb = add_nlp_to_x(X_train, X_test)

In [51]:
# build classifier
clf = RandomForestClassifier(max_depth=50, n_estimators=1000)
clf.fit(x_train_combined, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=50, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [52]:
probs = clf.predict_proba(x_test_combined)
best_3 = pd.DataFrame(np.argsort(probs, axis=1)[:,-3:],columns=['top3','top2','top1'])
best_3['top1'] = clf.classes_[best_3['top1']]
best_3['top2'] = clf.classes_[best_3['top2']]
best_3['top3'] = clf.classes_[best_3['top3']]

In [53]:
result = pd.concat([best_3.reset_index(drop=True),pd.DataFrame(y_test).reset_index(drop=True), X_test[feature_cols].reset_index(drop=True)], axis=1)
score_1 = result[result['top1'] == result['target']].shape[0] / result.shape[0]
score_2 = result[(result['top1'] == result['target']) | (result['top2'] == result['target'])].shape[0] / result.shape[0]
score_3 = result[(result['top1'] == result['target']) | (result['top2'] == result['target'])| (result['top3'] == result['target'])].shape[0] / result.shape[0]

In [54]:
print('Accuracy for top 1 clustering + classifier result is {:.1%}'.format(score_1))
print('Accuracy for top 2 clustering + classifier result is {:.1%}'.format(score_2))
print('Accuracy for top 3 clustering + classifier result is {:.1%}'.format(score_3))

Accuracy for top 1 clustering + classifier result is 75.8%
Accuracy for top 2 clustering + classifier result is 82.8%
Accuracy for top 3 clustering + classifier result is 85.9%


Compare: Google NLU accuracy is 78.1%

## API
load model and run on one sentence

In [56]:
import pickle

In [58]:
# save the model to disk
model_filename = 'RFClassifier.pkl'
pickle.dump(clf, open(model_filename, 'wb'))

In [61]:
# save vectorizer
pickle.dump(v_lemma, open('TFIDFVectorizer_lemma', 'wb'))
pickle.dump(v_keyword, open('TFIDFVectorizer_keyword', 'wb'))
pickle.dump(v_noun, open('TFIDFVectorizer_noun', 'wb'))
pickle.dump(v_verb, open('TFIDFVectorizer_verb', 'wb'))

In [73]:
test_query = "Please show me the current promotions"

In [74]:
df = pd.DataFrame()

In [75]:
df = pd.DataFrame(columns=['query'])
df.loc[0] = [test_query]

In [76]:
df

Unnamed: 0,query
0,Please show me the current promotions


In [79]:
# preprocessing test as well
df['query'] = df['query'].apply(clean_text)
df['query'] = df['query'].apply(nltk_tokenize)
df['query'] = df['query'].apply(lambda x:' '.join([token.lemma_ for token in nlp(x) if token.lemma_ not in stop_words]))
df['query'] = df['query'].str.lower()

In [80]:
df = get_nlp_features(df)

In [81]:
df

Unnamed: 0,query,lemma,keyword,noun,verb,adj
0,-pron- current promotion,-pron- current promotion,promotion,promotion,,current


In [82]:
df_cluster = get_distance_matrix_idf(df, intent_list, dict_cluster, word2vec, idf)

In [83]:
df_cluster

Unnamed: 0,query,lemma,keyword,noun,verb,adj,Statement request,Passbook savings accounts,Card statements,Credit card statement,...,Paying a cancelled credit card,How to close my account,Card dispute,Change credit card limit,Increase credit card limit,Decrease credit card limit,Credit card application rejection,Rebates,How to redeem rewards,Update details
0,-pron- current promotion,-pron- current promotion,promotion,promotion,,current,0.896305,0.882283,0.818481,0.787504,...,0.815096,0.925912,0.75781,0.833953,0.814946,0.893213,0.761064,0.720189,0.785271,0.913706


In [86]:
def get_top_3(data, intent_list):
    data = data.copy()
    cluster_cols = intent_list.copy()

    data['clusters_top3'] = data.apply(lambda x: np.argsort(x[cluster_cols].values)[:3].tolist(), axis=1)

    top_clusters_cols = pd.DataFrame(data['clusters_top3'].values.tolist(),columns = ['clusters_1','clusters_2','clusters_3']).reset_index(drop=True)
    data = data.reset_index(drop=True)
    data = pd.concat([data,top_clusters_cols], axis=1)

    data.drop(columns = 'clusters_top3', inplace=True)
    data.drop(columns = cluster_cols, inplace=True)
    
    # print(data.head())
    return data

In [87]:
top_3 = get_top_3(df_cluster, cluster_cols)

In [88]:
top_3

Unnamed: 0,query,lemma,keyword,noun,verb,adj,clusters_1,clusters_2,clusters_3
0,-pron- current promotion,-pron- current promotion,promotion,promotion,,current,45,46,56


In [92]:
def add_nlp(df, v_lemma, v_keyword, v_noun, v_verb, top_clusters_cols):
    """ Add NLP features to input X """
    x_test_lemma = v_lemma.transform(df['lemma'])
    x_test_keyword = v_keyword.transform(df['keyword'])
    x_test_noun = v_noun.transform(df['noun'])
    x_test_verb = v_verb.transform(df['verb'])
    
    # combine all features 
    x_test_combined = hstack((x_test_lemma,
                              x_test_keyword,
                              x_test_noun,
                              x_test_verb,
                              df[top_clusters_cols].values),format='csr')

    x_test_combined_columns = v_lemma.get_feature_names()+\
                              v_keyword.get_feature_names()+\
                              v_noun.get_feature_names()+\
                              v_verb.get_feature_names()+\
                              top_clusters_cols
    
    x_test_combined = pd.DataFrame(x_test_combined.toarray())
    x_test_combined.columns = x_test_combined_columns
    
    return x_test_combined

In [94]:
X_in = add_nlp(top_3, v_lemma, v_keyword, v_noun, v_verb, top_clusters_cols)

In [95]:
probs = clf.predict_proba(X_in)

In [98]:
best_3 = pd.DataFrame(np.argsort(probs, axis=1)[:,-3:],columns=['top3','top2','top1'])
best_3['top1'] = clf.classes_[best_3['top1']]
best_3['top2'] = clf.classes_[best_3['top2']]
best_3['top3'] = clf.classes_[best_3['top3']]

In [99]:
best_3

Unnamed: 0,top3,top2,top1
0,62,46,45


In [103]:
index2intent = {y:x for x,y in intent2index.items()}

In [106]:
def get_target_name(index, index2intent=index2intent):
    return index2intent[index]

In [108]:
best_3['top1_name'] = best_3['top1'].apply(get_target_name)
best_3['top2_name'] = best_3['top2'].apply(get_target_name)
best_3['top3_name'] = best_3['top3'].apply(get_target_name)

In [109]:
best_3

Unnamed: 0,top3,top2,top1,top1_name,top2_name,top3_name
0,62,46,45,Promotions,Card Promotions,Uplift suspension on accounts


In [113]:
top1 = best_3.at[0,'top1_name']
top2 = best_3.at[0,'top2_name']
top3 = best_3.at[0,'top3_name']

In [116]:
print(f'For sentence:\n{test_query}\n')
print(f'Top 1 prediction intent is {top1}')
print(f'Top 2 prediction intent is {top2}')
print(f'Top 3 prediction intent is {top3}')

For sentence:
Please show me the current promotions

Top 1 prediction intent is Promotions
Top 2 prediction intent is Card Promotions
Top 3 prediction intent is Uplift suspension on accounts


## Consolidate

In [None]:
def get_intent_nlp(query, classifier_intent_nlp):
    """ load classification model outside the function  
        
        return a dataframe df
        columns: pred_seq, intent_class, intent_string, pred_prob
        rows: top 3 prediciton, example for first row: 1, 0, Promotions, 0.66
    """
    
    
    return df

In [None]:
def get_intent_nlp_clustering(query, classifier_intent_nlp_clustering, word2vec):
    """ load word2vec dict outside the function
        load classification model outside the function 

        return a dataframe df
        columns: pred_seq, intent_class, intent_string, pred_prob
        rows: top 3 prediciton, example for first row: 1, 0, Promotions, 0.66
    """


    return df