## Dataset
The comma separated dataset has been minimally preprocessed with dropna.

It has only 2 columns:
* intent
    * the target for the classifier to predict
* query
    * the input of user question to the chatbot

In [3]:
# General Import
import numpy as np
import pandas as pd

In [1]:
# Starting point
import os
import sys
from pathlib import Path

PATH_HOME = Path.home()
PATH_PROJ = Path.cwd()
PATH_DATA = PATH_PROJ

sys.path.append(str(PATH_PROJ))

def get_data(path=PATH_DATA/'data.csv'):
    """ load data from csv """
    df = pd.read_csv(path, usecols=['intent', 'query'])
    df = df.dropna().drop_duplicates()
    # From EDA, the intent only have 1 example
    df = df.drop(df[df.intent == 'Late fee waiver for credit card'].index)
    df.reset_index(drop=True, inplace=True)
    return df

In [9]:
# TEST
df = get_data()
df.shape

(394, 2)

In [10]:
df.head()

Unnamed: 0,intent,query
0,Promotions,what promotions do you have?
1,Promotions,what promotions are available?
2,Promotions,promotions
3,Promotions,I want to see promotions
4,Promotions,view promotions


In [11]:
len(df['intent'].value_counts())

44

## Utilities

This section store all the utility **functions** and **classes** created by XX, WJ, Nan to faciliate the text intent classification process.

* Some specific libraries are loaded in the same cell as function
* Cells marked with **TEST** are for testing purpose only
* Docstring style guide 
    * https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html
    * http://google.github.io/styleguide/pyguide.html#384-classes


In [49]:
import re
def clean_text(text):
    """ Basic text cleaning
        
        1. lowercase
        2. remove special characters
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [14]:
from nltk.tokenize import word_tokenize
def nltk_tokenize(text):
    """ tokenize text using NLTK and join back as sentence"""
    # import nltk
    # nltk.download('punkt')
    return ' '.join(word_tokenize(text))

In [40]:
import string
import spacy

In [41]:
# Function for spacy tokenizer

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_lg')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [17]:
# TEST
print(spacy_tokenizer("hello World!"))
print(spacy_tokenizer("run runs running runner"))

['hello', 'world']
['run', 'run', 'run', 'runner']


In [18]:
# Class for spacy pipeline including tokenizer
from spacy.lang.en.stop_words import STOP_WORDS

class SpacyPipeline():
    """ Utilize Spacy Pipeline """
    def __init__(self, word_vector='en_core_web_lg'):
        self.nlp = spacy(word_vector)

    def tokenize(self, text, lemma=True, stop_words=True):
        """ Return tokenized text 
            "hello world!" -> ["hello", "world"]
        """
        self.tokens = self.nlp(text)
        
        if lemma:
            self.tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in self.tokens]
        
        if stop_words:
            self.tokens = [word for word in self.tokens if word not in STOP_WORDS and word not in string.punctuations]

        return self.tokens

    def get_word_embed(self, text):
        """ Get individual word embedding: result.shape = (num_of_words, 300)"""
        with self.nlp.disable_pipes():
            vectors = np.array([token.vector for token in self.nlp(text)])
        return vectors

    def get_doc_embed(self, doc):
        """ Get sentence embeddings based on average of word embeddings of each sentence
            
            Result.shape = (num_of_sentences, 300)
            
            Args:
                doc (pd.Series): series of sentences

            Returns:
                doc_vectors (np.array): embedding matrix of document with each row is a embedding of that sentence
        """
        with self.nlp.disable_pipes():
            doc_vectors = np.array([self.nlp(text).vector for text in doc])
        return doc_vectors

In [52]:
import math

In [53]:
def get_idf(sentences):
    """ Get idf dictionary
    
    Args:
        sentences (list): list of input sentences (str)

    Returns:
        idf (dict): idf[word] = inverse document frequency of that word in all training queries
    """
    num_of_sentences = len(sentences)
    doc_freq = {}  # data frequency    
    for sentence in sentences:
        # words = set(sentence.strip().split())
        words = spacy_tokenizer(sentence)
        for word in words:
            if word not in doc_freq:
                doc_freq[word] = 0.0
            doc_freq[word] += 1.0
    print(doc_freq)
    # to smooth and avoid zero divide, add 1 to count
    for word, count in doc_freq.items():
        print(float(num_of_sentences)/(count))
    idf = {word:math.log(float(num_of_sentences)/(count)) for word, count in doc_freq.items() }
    return idf

In [54]:
# TEST
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
test_idf = get_idf(text)
test_idf

{'quick': 1.0, 'brown': 1.0, 'fox': 2.0, 'jump': 1.0, 'lazy': 1.0, 'dog': 2.0}
3.0
3.0
1.5
3.0
3.0
1.5


{'quick': 1.0986122886681098,
 'brown': 1.0986122886681098,
 'fox': 0.4054651081081644,
 'jump': 1.0986122886681098,
 'lazy': 1.0986122886681098,
 'dog': 0.4054651081081644}

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
def get_idf_TfidfVectorizer(sentences):
    """ Get idf dictionary by using TfidfVectorizer
    
    Args:
        sentences (list): list of input sentences (str)

    Returns:
        idf (dict): idf[word] = inverse document frequency of that word in all training queries
    """
    # use customized Spacy tokenizer
    vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer)
    vectorizer.fit(sentences)
    # TODO: normalize the idf weights
    idf = {k:vectorizer.idf_[v] for k,v in vectorizer.vocabulary_.items()}
    return idf

In [44]:
# TEST
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
test_idf = get_idf_TfidfVectorizer(text)
test_idf



{'quick': 1.6931471805599454,
 'brown': 1.6931471805599454,
 'fox': 1.2876820724517808,
 'jump': 1.6931471805599454,
 'lazy': 1.6931471805599454,
 'dog': 1.2876820724517808}

## Pipeline

In [15]:
# Use the same tokenizer as WJ
def tokenize(wd): 
    return ' '.join(word_tokenize(wd))
df['query'] = df['query'].apply(tokenize)

In [16]:
def get_sentence_vec(sentence, word2vec, idf=None):
    """ Get embedding of sentence by using word2vec embedding of words
    
    If idf is provided, the sentence is the weighted embedding by
        SUM( embedding[word] x idf[word] )
    
    Args:
        sentence (str): input sentence
        word2vec (dict): loaded word2vec model from Gensim
        idf (dict, optional): inverse document frequency of words in all queries

    Returns:
        emb (np.array): 300-dimentions embedding of sentence
    """
    words = sentence.split()
    words = [word for word in words if word in word2vec.vocab]
    
    # if no word in word2vec vocab, return 0x300 embedding
    if len(words)==0:
        return np.zeros((300,), dtype='float32')
    
    # use mean if no idf provided
    if idf is None:
        emb = word2vec[words].mean(axis=0)
    else:
        # get all idf of words, if new word is not in idf, assign 0.0 weights
        idf_series = np.array([idf.get(word, 0.0) for word in words])
        # change shape to 1 x num_of_words
        idf_series = idf_series.reshape(1, -1)
        # use matrix multiplication to get weighted word vector sum for sentence embeddings
        emb = np.matmul(idf_series, word2vec[words]).reshape(-1)
    return emb

In [17]:
# TEST
import gensim.downloader as api

def test_word2vec_idf(word2vec):
    """ Test the algorithm for get_sentence_vec is giving the right output """
    # setup
    sen = "test cat and dog"
    idf = {"test":1/3, "cat":1/3, "and":0, "dog":1/3}
    
    words = sen.split()
    words = [word for word in words if word in word2vec.vocab]
    
    idf_series = np.array([idf[word] for word in words])
    print("idf shape \t(1 x number of words)\t\t - ", idf_series.reshape(1, -1).shape)
    idf_series = idf_series.reshape(1, -1)
    
    print("word2vec shape \t(number of words x 300)\t\t - ", word2vec[words].shape)
    result = np.matmul(idf_series.reshape(1, -1), word2vec[words])
    result = result.reshape(-1)
    print("result shape \t(1-dimentional: np.array)\t - ", result.shape)
    
    emb1 = word2vec[words].mean(axis=0)
    emb2 = result
    
    assert emb1.all() == emb2.all()
    # return result

try:
    word2vec
except NameError:
    word2vec = api.load("word2vec-google-news-300")  

test_word2vec_idf(word2vec)

idf shape 	(1 x number of words)		 -  (1, 3)
word2vec shape 	(number of words x 300)		 -  (3, 300)
result shape 	(1-dimentional: np.array)	 -  (300,)


In [18]:
def get_sentences_centre(sentences, word2vec, idf=None, num_features=300):
    """ Get sentences centre by averaging all embeddings of sentences in a list
    
    Depends on function get_sentence_vec()
    
    Args:
        sentence (list): list of input sentences (str)
        word2vec (dict): loaded word2vec model from Gensim
        idf (dict, optional): inverse document frequency of words in all queries

    Returns:
        emb (np.array): 300-dimentions embedding of sentence
    """
    # convert list of sentences to their vectors
    sentences_vec = [get_sentence_vec(sentence, word2vec, idf) for sentence in sentences]
    
    # each row in matrix is 300 dimensions embedding of a sentence
    sentences_matrix = np.vstack(sentences_vec)
    # print(sentences_matrix.shape)
    
    # average of all rows, take mean at y-axis
    sentences_centre = sentences_matrix.mean(axis=0)
    
    # result should be (300,) same as single sentence
    # print(sentences_centre.shape)
    return sentences_centre

In [19]:
# TEST
try:
    word2vec
except NameError:
    word2vec = api.load("word2vec-google-news-300")  

test_sentence_list = ["hello world", "test cat and dog"]
get_sentences_centre(test_sentence_list, word2vec).shape

(300,)

In [20]:
# get intent list from dataframe["intent"]
# intent_list = df.intent.unique().tolist()
def get_cluster_centre(df, intent_list, word2vec, idf=None):
    """ get intent cluster centre based on intent list and word embeddings
    
    Depends on function get_sentences_centre()
    
    Args:
        intent_list (list): List of unique intents(str)
        word2vec (dict): word embeddings dictionary 

    Returns:
        result (dict): intent cluster centres in dictionary format - {intent1:embedding1, intent2:embedding2,...}
    """ 
    result = {intent:get_sentences_centre(df[df.intent == intent]['query'].values, word2vec, idf) for intent in intent_list}
    return result

In [21]:
# TEST
data = {'intent':  ['hello', 'cat'],
        'query': ['hello world', 'test cat and dog']
        }

test_df = pd.DataFrame (data, columns = ['intent','query'])
test_intents = ['hello', 'cat']

try:
    word2vec
except NameError:
    word2vec = api.load("word2vec-google-news-300") 

result = get_cluster_centre(test_df, test_intents, word2vec)
result[test_intents[0]].shape

(300,)

In [22]:
from sklearn.metrics.pairwise import cosine_distances

In [23]:
def get_distance_matrix(df_in, word2vec, leave_one_out=False, idf_flag=False):
    """ Get distance for each query to every intent center
    
    Depends on function get_cluster_centre()
    
    Args:
        df_in (pd.DataFrame): input dataframe with intent and query
        word2vec (dict): word embeddings dictionary 
        leave_one_out (bool): whether leave the input query out of training
        idf (bool): whether use weighted word vectors to get sentence embedding

    Returns:
        result (pd.DataFrame): distance matrix for each query, lowest distance intent idealy should match label
    """
    df = df_in.copy()
    intent_list = df.intent.unique().tolist()
    
    if leave_one_out:
        # print("Leave one out")
        sentence_distance = []
        
        for ind in df.index:
            sentence_distance_tmp = []
            query = df.loc[ind, 'query']
            df_data = df.drop(ind)
            
            if idf_flag:
                idf = get_idf_TfidfVectorizer(df['query'].tolist())
            else:
                # print("No IDF")
                idf = None
            
            sentence_centre_dic = get_cluster_centre(df_data, intent_list, word2vec, idf)
            for intent in intent_list:
                sentence_distance_tmp.append(cosine_distances(get_sentence_vec(query, word2vec, idf).reshape(1,-1), 
                                                              sentence_centre_dic[intent].reshape(1,-1)).item())
            sentence_distance.append(sentence_distance_tmp)

        df_sentence_distance = pd.DataFrame(sentence_distance, columns=intent_list)
        df.reset_index(drop=True, inplace=True)
        result = pd.concat([df, df_sentence_distance], axis=1)
    
    else:
        sentence_centre_dic = get_cluster_centre(df, intent_list, word2vec)
        # build dataframe that contains distance between each query to all intent cluster centre
        for intent in intent_list:
            # distance = cosine_similarity(sentence embedding, intent cluster centre embedding)
            df[intent] = df['query'].apply(lambda x: cosine_distances(get_sentence_vec(x, word2vec).reshape(1,-1), 
                                                                      sentence_centre_dic[intent].reshape(1,-1)).item())
        result = df

    return result

In [24]:
def evaluate_distance_matrix(df_in):
    """ Evaluate distance matrix by compare closest intent center and label """
    df = df_in.copy()
    df.set_index(['intent', 'query'], inplace=True)
    df['cluster'] = df.idxmin(axis=1)
    df.reset_index(inplace=True)
    df['correct'] = (df.cluster == df.intent)
    result = sum(df.correct) / len(df)
    # print("Accuracy for distance-based classification is", '{:.2%}'.format(result))
    return result

In [25]:
# TEST
def test_clustering_accuracy(df_in, word2vec):
    """ test accuracy based on distance of sentence to each cluster center"""
    df_result = get_distance_matrix(df_in, word2vec)
    # print(df_result.head())
    accuracy = evaluate_distance_matrix(df_result)
    return accuracy

In [26]:
# TEST
try:
    word2vec
except NameError:
    word2vec = api.load("word2vec-google-news-300")  

test_result = test_clustering_accuracy(df, word2vec)
print("Accuracy for text embedding distance is", '{:.2%}'.format(test_result))

Accuracy for text embedding distance is 90.36%


In [29]:
# TEST
def test_leave_one_out_acc(df_in, word2vec):
    df_result = get_distance_matrix(df_in, word2vec, leave_one_out=True)
    # print(df_result.head())
    accuracy = evaluate_distance_matrix(df_result)
    return accuracy

In [30]:
# TEST
try:
    word2vec
except NameError:
    word2vec = api.load("word2vec-google-news-300")  

test_result = test_leave_one_out_acc(df, word2vec)
print("Accuracy for leave one out is", '{:.2%}'.format(test_result)) 

Accuracy for leave one out is 77.66%


In [34]:
# TEST
def test_idf_acc(df_in, word2vec):
    df_result = get_distance_matrix(df_in, word2vec, leave_one_out=True, idf_flag=True)
    # print(df_result.head())
    accuracy = evaluate_distance_matrix(df_result)
    return accuracy

In [31]:
import warnings
warnings.filterwarnings("ignore")

In [32]:
# further preprocessing
import spacy
nlp = spacy.load("en_core_web_sm")

from spacy.lang.en.stop_words import STOP_WORDS
stop_words = list(STOP_WORDS)

df['query'] = df['query'].apply(lambda x:' '.join([token.lemma_ for token in nlp(x) if token.lemma_ not in stop_words]))

In [42]:
# TEST
try:
    word2vec
except NameError:
    word2vec = api.load("word2vec-google-news-300")  

test_result = test_idf_acc(df, word2vec)
print("Accuracy for leave one out & use IDF is", '{:.2%}'.format(test_result)) 

Accuracy for leave one out & use IDF is 86.04%
