# Notebook Configuration

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# project .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Import Libraries

In [4]:
from pathlib import Path
import pandas as pd
import re
import numpy as np

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
import gensim.downloader as api

# Project root, resolved relative to the current user's home directory.
PATH_PROJ = Path.home().joinpath('project/intent-classification')
# Data files are read straight from the project root.
PATH_DATA = PATH_PROJ

# Processing Data

## string clean

In [5]:
# Load only the label and text columns, discarding rows where either is missing.
df = pd.read_csv(PATH_DATA / 'data.csv', usecols=['intent', 'query']).dropna()
# Remove every row of the one intent that is excluded from this experiment.
excluded_rows = df.index[df.intent == 'Late fee waiver for credit card']
df = df.drop(index=excluded_rows)

In [6]:
def string_clean(text):
    """Normalise a raw query string.

    Lower-cases ``text`` and then deletes every character that is not an
    ASCII lowercase letter, a digit or whitespace.  Punctuation and symbols
    are therefore removed, while numbers and spacing are kept.  No stemming
    or single-character filtering is applied.
    """
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

In [7]:
# Clean every query in place of the raw text.
df['query'] = df['query'].map(string_clean)

## tokenize

In [8]:
# import nltk
# nltk.download('punkt')

In [9]:
def tokenize(wd):
    """Tokenise ``wd`` with NLTK's ``word_tokenize`` and re-join the tokens with single spaces."""
    tokens = word_tokenize(wd)
    return ' '.join(tokens)

In [10]:
# Replace each cleaned query with its space-separated token string.
df['query'] = df['query'].map(tokenize)

In [11]:
# Preview the cleaned and tokenised queries.
df.head()

Unnamed: 0,intent,query
0,Promotions,what promotions do you have
1,Promotions,what promotions are available
2,Promotions,promotions
3,Promotions,i want to see promotions
4,Promotions,view promotions


# Modeling

## TFIDF

In [12]:
# vectorizer = TfidfVectorizer(lowercase=True, stop_words="english", ngram_range=(1, 3))
# tfidf = vectorizer.fit_transform(df['query'].values.astype('U'))

## word2vec 

In [13]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader.
# NOTE(review): presumably a large one-off download that is cached locally afterwards — confirm.
word2vec = api.load("word2vec-google-news-300")  

In [14]:
# request for easicredit late fee waiver

In [36]:
def get_sentence_vec(sentence, word2vec):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    sentence : str
        Text that is split on whitespace into candidate words.
    word2vec : gensim KeyedVectors-like
        Must support ``word in word2vec``, list indexing
        (``word2vec[list_of_words]`` returning one row per word) and expose
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D mean vector of length ``word2vec.vector_size``; an all-zero
        float32 vector when no word of ``sentence`` is in the vocabulary.
    """
    # Membership via ``in`` works on gensim 3.x and 4.x alike; the ``.vocab``
    # attribute used previously was removed in gensim 4.0.
    words = [word for word in sentence.split() if word in word2vec]
    if not words:
        # Size the fallback from the model instead of assuming 300 dimensions.
        return np.zeros((word2vec.vector_size,), dtype='float32')
    return word2vec[words].mean(axis=0)

def get_sentence_centre(sentence_list, word2vec, num_features):
    """Average the sentence embeddings of ``sentence_list``.

    Parameters
    ----------
    sentence_list : iterable of str
        Sentences to embed with ``get_sentence_vec``.
    word2vec : gensim KeyedVectors-like
        Word-vector model passed through to ``get_sentence_vec``.
    num_features : int
        Dimensionality of the embeddings; sizes the accumulator.

    Returns
    -------
    numpy.ndarray
        1-D mean embedding of length ``num_features``.  For an empty
        ``sentence_list`` the zero vector is returned instead of dividing by
        zero, which would yield NaNs that ``cosine_distances`` rejects.
    """
    emb = np.zeros((num_features, ), dtype='float32')
    sentence_count = 0
    # Count while iterating so any iterable (not just sized sequences) works.
    for sentence in sentence_list:
        sentence_count += 1
        emb = emb + get_sentence_vec(sentence, word2vec)
    if sentence_count == 0:
        return emb
    return emb / sentence_count

## Cluster centres from all queries (no hold-out)

In [33]:
# Distinct intent labels, in order of first appearance.
intent_list = df['intent'].unique().tolist()

In [34]:
def get_cluster_centre(intent_list, word2vec):
    """Map each intent to the centre embedding of its queries.

    Queries are taken from the notebook-level ``df``: rows are selected by
    matching the ``intent`` column, and their ``query`` texts are averaged
    with ``get_sentence_centre`` using 300 features.
    """
    return {
        intent: get_sentence_centre(df.loc[df.intent == intent, 'query'].values, word2vec, 300)
        for intent in intent_list
    }

In [37]:
# Centre embedding of every intent, computed from all of that intent's queries.
sentence_centre_dic = get_cluster_centre(intent_list, word2vec)

In [38]:
# Work on a copy so ``df`` keeps only the label and text columns.
df_result = df.copy()
# Embed each query once: the vector does not depend on the intent being scored,
# so recomputing it inside the intent loop was redundant work.
query_vecs = [get_sentence_vec(query, word2vec).reshape(1, -1) for query in df_result['query']]
for intent in intent_list:
    centre = sentence_centre_dic[intent].reshape(1, -1)
    # One column per intent: cosine distance from every query to that intent's centre.
    df_result[intent] = [cosine_distances(vec, centre).item() for vec in query_vecs]

In [39]:
# Preview the per-intent distance columns added to each query.
df_result.head()

Unnamed: 0,intent,query,Promotions,Card Promotions,Open Account,OCBC Singapore Account,OCBC Securities Account,OCBC Malaysia Account,NISP Account,Card Cancellation,...,Credit card application rejection,Rebates,How to redeem rewards,360 Account interest dispute,Statement Request,Passbook savings account statement,Credit card statement,Debit card statement,Investment account statement,Update details
0,Promotions,what promotions do you have,0.17841,0.290107,0.511844,0.557279,0.631671,0.661682,0.461206,0.559563,...,0.502403,0.406131,0.393889,0.523117,0.332899,0.526322,0.427617,0.456662,0.403752,0.450933
1,Promotions,what promotions are available,0.204849,0.340928,0.630432,0.665373,0.67779,0.717156,0.592525,0.713416,...,0.611759,0.534145,0.581964,0.56814,0.593805,0.670499,0.641263,0.658776,0.616404,0.572033
2,Promotions,promotions,0.170245,0.313105,0.865135,0.882628,0.871575,0.880556,0.865212,0.755802,...,0.776364,0.718589,0.824465,0.758087,0.901096,0.868723,0.823399,0.845818,0.838721,0.868095
3,Promotions,i want to see promotions,0.157523,0.265592,0.533925,0.559165,0.628026,0.621109,0.47666,0.537391,...,0.518104,0.427042,0.439173,0.594979,0.3368,0.537262,0.426505,0.424465,0.451911,0.480753
4,Promotions,view promotions,0.203142,0.333202,0.715974,0.735376,0.744739,0.736754,0.704219,0.722977,...,0.698798,0.63296,0.739473,0.630649,0.771172,0.755724,0.704092,0.745594,0.721117,0.751306


In [40]:
# df.to_csv(PATH_DATA / 'data2.csv')

In [41]:
# Move the label and text into the index so only distance columns remain.
df_tmp = df_result.set_index(['intent', 'query'])
# Predicted intent = the column holding the smallest distance in each row.
df_tmp['cluster'] = df_tmp.idxmin(axis=1)
df_tmp = df_tmp.reset_index()
df_tmp['correct'] = df_tmp['cluster'] == df_tmp['intent']
# Fraction of queries whose nearest centre is their own intent.
df_tmp['correct'].mean()

0.8987341772151899

## Cluster centres with leave-one-out evaluation

In [23]:
def get_cluster_centre(df, intent_list, word2vec):
    """Build an ``{intent: centre embedding}`` dict from the rows of ``df``.

    For each intent in ``intent_list`` the ``query`` values of the rows with
    that ``intent`` are averaged by ``get_sentence_centre`` (300 features).
    """
    return {
        intent: get_sentence_centre(df.loc[df.intent == intent, 'query'].values, word2vec, 300)
        for intent in intent_list
    }

In [24]:
intent_list = df.intent.unique().tolist()
# Centres over the full data set. Holding out one row only changes the centre
# of that row's own intent, so every other centre can be reused across folds
# instead of being recomputed for every row.
full_centre_dic = get_cluster_centre(df, intent_list, word2vec)
sentence_distance = []
for ind in df.index:
    query = df.loc[ind, 'query']
    own_intent = df.loc[ind, 'intent']
    df_data = df.drop(ind)
    # Recompute just the affected centre without the held-out query.
    sentence_centre_dic = dict(full_centre_dic)
    sentence_centre_dic.update(get_cluster_centre(df_data, [own_intent], word2vec))
    # Embed the held-out query once rather than once per intent.
    query_vec = get_sentence_vec(query, word2vec).reshape(1, -1)
    sentence_distance.append([
        cosine_distances(query_vec, sentence_centre_dic[intent].reshape(1, -1)).item()
        for intent in intent_list
    ])

In [25]:
# One row per held-out query (in df.index iteration order), one column per intent.
df_sentence_distance = pd.DataFrame(sentence_distance, columns=intent_list)

In [26]:
# Preview the leave-one-out distances.
df_sentence_distance.head()

Unnamed: 0,Promotions,Card Promotions,Open Account,OCBC Singapore Account,OCBC Securities Account,OCBC Malaysia Account,NISP Account,Card Cancellation,Cancel Credit or Debit Card,Cancel ATM Card,...,Credit card application rejection,Rebates,How to redeem rewards,360 Account interest dispute,Statement Request,Passbook savings account statement,Credit card statement,Debit card statement,Investment account statement,Update details
0,0.218198,0.290107,0.511844,0.557279,0.631671,0.661682,0.461206,0.559563,0.454213,0.598181,...,0.502403,0.406131,0.393889,0.523117,0.332899,0.526322,0.427617,0.456662,0.403752,0.450933
1,0.251088,0.340928,0.630432,0.665373,0.67779,0.717156,0.592525,0.713416,0.66386,0.721407,...,0.611759,0.534145,0.581964,0.56814,0.593805,0.670499,0.641263,0.658776,0.616404,0.572033
2,0.266201,0.313105,0.865135,0.882628,0.871575,0.880556,0.865212,0.755802,0.784507,0.7873,...,0.776364,0.718589,0.824465,0.758087,0.901096,0.868723,0.823399,0.845818,0.838721,0.868095
3,0.196116,0.265592,0.533925,0.559165,0.628026,0.621109,0.47666,0.537391,0.43902,0.516074,...,0.518104,0.427042,0.439173,0.594979,0.3368,0.537262,0.426505,0.424465,0.451911,0.480753
4,0.270685,0.333202,0.715974,0.735376,0.744739,0.736754,0.704219,0.722977,0.727041,0.749748,...,0.698798,0.63296,0.739473,0.630649,0.771172,0.755724,0.704092,0.745594,0.721117,0.751306


In [27]:
# Renumber rows 0..n-1 so they line up with df_sentence_distance's default index.
df = df.reset_index(drop=True)

In [28]:
# Column-wise concat aligns on the index, which is why df's index was reset first.
df_result_2 = pd.concat([df, df_sentence_distance], axis=1)

In [29]:
# Sanity check: intent and query plus one distance column per intent.
df_result_2.shape

(395, 46)

In [31]:
# Move the label and text into the index so only distance columns remain.
df_tmp = df_result_2.set_index(['intent', 'query'])
# Predicted intent = the column holding the smallest leave-one-out distance.
df_tmp['cluster'] = df_tmp.idxmin(axis=1)
df_tmp = df_tmp.reset_index()
df_tmp['correct'] = df_tmp['cluster'] == df_tmp['intent']
# Leave-one-out accuracy: share of queries assigned to their own intent.
df_tmp['correct'].mean()

0.7772151898734178

In [145]:
# Persist the leave-one-out distance table. The path is relative, so the file
# lands in the kernel's current working directory rather than under PATH_DATA.
# NOTE(review): the row index is written as an unnamed first column — confirm that is wanted.
df_result_2.to_csv('data_leave_one_out.csv')