# Notebook Configuration

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# Import Libraries

In [4]:
from pathlib import Path
import pandas as pd
import re
import numpy as np

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
import gensim.downloader as api

# Project root, resolved relative to the current user's home directory.
PATH_PROJ = Path.home().joinpath('project/intent-classification')
# Data files are read straight from the project root.
PATH_DATA = PATH_PROJ

# Processing Data

## string clean

In [5]:
# Load the intent/question pairs, normalise the column names, discard rows
# with missing values, and exclude one intent from the experiment.
df = (
    pd.read_csv(PATH_DATA / 'data.csv', usecols=['Intent', 'Questions'])
    .rename(columns={'Intent': 'intent', 'Questions': 'query'})
    .dropna()
    .pipe(lambda frame: frame.drop(
        frame.index[frame.intent == 'Late fee waiver for credit card']
    ))
)

In [6]:
def string_clean(text):
    """Lower-case ``text`` and delete every character that is not a-z, 0-9 or whitespace.

    Digits and whitespace are preserved; punctuation and all other symbols
    are removed without inserting a replacement character.

    Note: number removal, single-character removal and stemming are NOT
    performed here, only lower-casing and symbol stripping.
    """
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

In [7]:
# Normalise every query: lower-case and strip non-alphanumeric symbols.
df['query'] = df['query'].map(string_clean)

## tokenize

In [8]:
# import nltk
# nltk.download('punkt')

In [9]:
def tokenize(wd):
    """Tokenize ``wd`` with NLTK's ``word_tokenize`` and re-join the tokens with single spaces."""
    tokens = word_tokenize(wd)
    return ' '.join(tokens)

In [10]:
# Re-space every query so that tokens are separated by exactly one blank.
df['query'] = df['query'].map(tokenize)

## lemmatizer and stopwords

In [11]:
import spacy
# Load the en_core_web_sm spaCy pipeline; its token lemmas are used below.
nlp = spacy.load("en_core_web_sm")

from spacy.lang.en.stop_words import STOP_WORDS
# Materialise spaCy's English stop words as a list for membership checks.
stop_words = list(STOP_WORDS)

In [12]:
# Set membership is O(1); scanning the `stop_words` list for every token is
# linear in the number of stop words. The filtered result is identical.
_stop_word_set = set(stop_words)


def _lemmatize_drop_stopwords(text):
    """Return the space-joined lemmas of ``text``, skipping lemmas that are stop words."""
    return ' '.join(
        token.lemma_ for token in nlp(text) if token.lemma_ not in _stop_word_set
    )


df['query'] = df['query'].apply(_lemmatize_drop_stopwords)

# Modeling

## word2vec 

In [13]:
# Load the "word2vec-google-news-300" vectors through gensim's downloader API.
# The rest of the notebook treats these embeddings as 300-dimensional.
word2vec = api.load("word2vec-google-news-300")  

In [14]:
# request for easicredit late fee waiver

In [21]:
def get_sentence_vec(sentence, word2vec, idf=None):
    """Embed a sentence by combining the vectors of its in-vocabulary words.

    Parameters
    ----------
    sentence : str
        Whitespace-separated tokens.
    word2vec : gensim KeyedVectors-like
        Must support ``word in word2vec`` and ``word2vec[list_of_words]``
        (the latter returning a ``num_words x dim`` array).
    idf : dict or None
        Optional mapping ``word -> weight``. Words missing from the mapping
        get weight 0.0.

    Returns
    -------
    numpy.ndarray of shape ``(dim,)``
        * the unweighted mean of the word vectors when ``idf`` is None;
        * the weighted SUM (not mean) of the word vectors otherwise;
        * a float32 zero vector when no word of the sentence is in the
          embedding vocabulary.
    """
    # `word in word2vec` works on gensim 3.x and 4.x alike, whereas the
    # `.vocab` attribute no longer exists in gensim 4.
    words = [word for word in sentence.split() if word in word2vec]
    if not words:
        # Take the dimensionality from the model; fall back to 300 for
        # objects that do not expose `vector_size`.
        dim = getattr(word2vec, 'vector_size', 300)
        return np.zeros((dim, ), dtype='float32')

    word_vectors = word2vec[words]

    # use mean if no idf provided
    if idf is None:
        return word_vectors.mean(axis=0)

    # weights as a 1 x num_of_words row so that a single matrix product
    # yields the weighted sum of the word vectors
    weights = np.array([idf.get(word, 0.0) for word in words]).reshape(1, -1)
    return np.matmul(weights, word_vectors).reshape(-1)

In [16]:
def get_sentence_centre(sentence_list, score_list ,word2vec, num_features):
    """Average the weighted sentence embeddings of a group of sentences.

    Parameters
    ----------
    sentence_list : iterable of str
        Sentences belonging to one cluster.
    score_list : iterable of dict
        Per-sentence ``word -> weight`` mappings, paired with
        ``sentence_list`` by position and passed on as ``idf``.
    word2vec : gensim KeyedVectors-like
        Forwarded to ``get_sentence_vec``.
    num_features : int
        Dimensionality of the embeddings.

    Returns
    -------
    numpy.ndarray of shape ``(num_features,)``
        Mean of the sentence vectors, or an all-zero vector when there are no
        sentences.
    """
    emb = np.zeros((num_features, ), dtype='float32')
    sentence_count = 0
    for sentence, score in zip(sentence_list, score_list):
        sentence_count += 1
        emb = emb + get_sentence_vec(sentence, word2vec, idf=score)
    if sentence_count == 0:
        # Dividing by zero here would yield a NaN centroid; return the zero
        # vector instead so downstream distance computations stay finite.
        return emb
    return emb / sentence_count

## cluster leave one out

In [17]:
def get_cluster_centre(df, intent_list, word2vec):
    """Compute one centroid embedding per intent.

    For every intent in ``intent_list`` the rows of ``df`` with that intent
    are selected and their ``query`` / ``score`` columns are handed to
    ``get_sentence_centre`` with a fixed dimensionality of 300.

    Returns a dict ``{intent: centroid_vector}``.
    """
    def centre_of(intent):
        rows = df.loc[df.intent == intent]
        return get_sentence_centre(rows['query'].values, rows['score'].values, word2vec, 300)

    return {intent: centre_of(intent) for intent in intent_list}

In [18]:
def get_tfidf_for_words(text):
    """Map each word of ``text`` with a non-zero TF-IDF weight to that weight.

    Reads the notebook-level globals ``vectorizer`` (used to transform the
    text) and ``feature_names`` (column index -> word), so both must be
    defined before this function is called.

    Returns a dict ``{word: tfidf_weight}``.
    """
    # single dense row holding the TF-IDF weight of every vocabulary word
    weights = vectorizer.transform([text]).toarray()[0]
    # keep only the columns that actually occur in the text
    return {feature_names[col]: weights[col] for col in weights.nonzero()[0]}

In [19]:
# embed_mtx = np.array([
#     word2vec[w] if w in word2vec.vocab else [0]*300 for w in feature_names
# ])

# np.matmul( np.asarray(vectorizer.transform(df['query']).todense()), embed_mtx).shape

In [22]:
# Leave-one-out evaluation: each query is held out in turn, TF-IDF weights and
# intent centroids are rebuilt from the remaining rows, and the cosine distance
# from the held-out query to every intent centroid is recorded.
intent_list = df.intent.unique().tolist()
sentence_distance = []
for ind in df.index:
    query = df.loc[ind, 'query']
    df_data = df.drop(ind)

    ## get tf-idf score
    # Refit on every iteration so the held-out query never influences the weights.
    # `vectorizer` and `feature_names` must remain notebook-level names because
    # get_tfidf_for_words reads them as globals.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(df_data['query'])

    # feature_names is also used later to calculate word2vec vocabulary coverage.
    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # get_feature_names_out() and fall back on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Score only the training rows (the held-out row is not part of df_data).
    df_data['score'] = df_data['query'].apply(get_tfidf_for_words)

    # TF-IDF weights and weighted embedding of the held-out query; computed
    # once here because they do not depend on the intent being compared.
    query_score = get_tfidf_for_words(query)
    query_vec = get_sentence_vec(query, word2vec, query_score).reshape(1, -1)

    sentence_centre_dic = get_cluster_centre(df_data, intent_list, word2vec)
    sentence_distance.append([
        cosine_distances(query_vec, sentence_centre_dic[intent].reshape(1, -1)).item()
        for intent in intent_list
    ])

In [23]:
# One row per held-out query (in df row order), one column per intent,
# holding the cosine distance to that intent's centroid.
df_sentence_distance = pd.DataFrame(sentence_distance, columns=intent_list)

In [24]:
# Preview the first rows of the query-to-centroid distance table.
df_sentence_distance.head()

Unnamed: 0,Statement request,Passbook savings accounts,Card statements,Credit card statement,Debit card statement,Investment account statement,Home loan account statement,360 Account interest dispute,Change of billing cycle,Token Activation,...,Paying a cancelled credit card,How to close my account,Card dispute,Change credit card limit,Increase credit card limit,Decrease credit card limit,Credit card application rejection,Rebates,How to redeem rewards,Update details
0,0.239015,0.476868,0.337956,0.419239,0.569444,0.497531,0.53997,0.723792,0.782479,0.818257,...,0.717396,0.724538,0.644826,0.740849,0.791113,0.835014,0.631769,0.770484,0.854625,0.560702
1,0.267362,0.469997,0.365053,0.366122,0.528491,0.406568,0.507421,0.722103,0.795615,0.779281,...,0.626802,0.666503,0.622732,0.720149,0.724,0.784794,0.599908,0.71844,0.817008,0.487535
2,0.304271,0.54258,0.201802,0.385232,0.60666,0.449884,0.444711,0.850517,0.913493,0.994172,...,0.902475,0.845332,0.752034,0.909864,0.915016,0.92886,0.811126,0.958441,0.931494,0.705593
3,0.182261,0.443851,0.28248,0.373305,0.54749,0.458453,0.476687,0.701372,0.777772,0.831823,...,0.69659,0.704418,0.615315,0.710639,0.757346,0.810301,0.603653,0.737596,0.867568,0.544313
4,0.252563,0.577026,0.364759,0.461886,0.615006,0.519045,0.475794,0.776273,0.842217,0.854082,...,0.749669,0.802884,0.678018,0.803683,0.799772,0.862823,0.550518,0.82217,0.941049,0.634665


In [25]:
# Replace the original row labels with a fresh 0..n-1 index.
df = df.reset_index(drop=True)

In [26]:
# The distance rows were appended in df row order, so the two frames must be
# joined by position. Resetting the index here (instead of relying on a
# separate cell having been run first) keeps the alignment correct regardless
# of cell execution order.
df_result_2 = pd.concat([df.reset_index(drop=True), df_sentence_distance], axis=1)

In [27]:
# Preview intent, query and the per-intent distance features side by side.
df_result_2.head()

Unnamed: 0,intent,query,Statement request,Passbook savings accounts,Card statements,Credit card statement,Debit card statement,Investment account statement,Home loan account statement,360 Account interest dispute,...,Paying a cancelled credit card,How to close my account,Card dispute,Change credit card limit,Increase credit card limit,Decrease credit card limit,Credit card application rejection,Rebates,How to redeem rewards,Update details
0,Statement request,like copy -PRON- statement,0.239015,0.476868,0.337956,0.419239,0.569444,0.497531,0.53997,0.723792,...,0.717396,0.724538,0.644826,0.740849,0.791113,0.835014,0.631769,0.770484,0.854625,0.560702
1,Statement request,send -PRON- copy -PRON- statement,0.267362,0.469997,0.365053,0.366122,0.528491,0.406568,0.507421,0.722103,...,0.626802,0.666503,0.622732,0.720149,0.724,0.784794,0.599908,0.71844,0.817008,0.487535
2,Statement request,-PRON- statement,0.304271,0.54258,0.201802,0.385232,0.60666,0.449884,0.444711,0.850517,...,0.902475,0.845332,0.752034,0.909864,0.915016,0.92886,0.811126,0.958441,0.931494,0.705593
3,Statement request,want hard copy -PRON- statement,0.182261,0.443851,0.28248,0.373305,0.54749,0.458453,0.476687,0.701372,...,0.69659,0.704418,0.615315,0.710639,0.757346,0.810301,0.603653,0.737596,0.867568,0.544313
4,Statement request,statement request,0.252563,0.577026,0.364759,0.461886,0.615006,0.519045,0.475794,0.776273,...,0.749669,0.802884,0.678018,0.803683,0.799772,0.862823,0.550518,0.82217,0.941049,0.634665


In [28]:
# check accuracy
# With intent/query moved into the index only distance columns remain, so the
# row-wise idxmin yields the intent whose centroid is nearest to each query.
df_tmp = df_result_2.set_index(['intent', 'query'])
df_tmp = df_tmp.assign(cluster=df_tmp.idxmin(axis=1)).reset_index()
df_tmp['correct'] = df_tmp['cluster'].eq(df_tmp['intent'])
# share of queries whose nearest centroid is their labelled intent
df_tmp['correct'].mean()

0.8003120124804992

In [29]:
# output features
# Written relative to the current working directory (not PATH_DATA); the
# DataFrame index is included as the first CSV column.
df_result_2.to_csv('data_leave_one_out.csv')

In [32]:
# Column names of the exported frame: intent, query, then one column per intent.
df_result_2.columns.tolist()

['intent',
 'query',
 'Statement request',
 'Passbook savings accounts',
 'Card statements',
 'Credit card statement',
 'Debit card statement',
 'Investment account statement',
 'Home loan account statement',
 '360 Account interest dispute',
 'Change of billing cycle',
 'Token Activation',
 'Student Loan',
 'Tuition fee loan',
 'Education loan',
 'Study loan',
 'Car loan full settlement',
 'Home loan repayment',
 'Cancel Fund Transfer',
 'Cancel credit card transaction',
 'Credit Refund',
 'Account opening for foreigners',
 'Mobile Banking Issues',
 'Account Fraud',
 'Dormant Account Activation',
 'CRS Enquiries',
 'SRS Contribution',
 'Dispute status',
 'Give a compliment',
 'File a complaint',
 'Funds Transfer Status',
 'Telegraphic transfer Status',
 'Make a telegraphic transfer',
 'Unable to log into internet banking',
 'Card application status',
 'Supplementary card application',
 'Access codes for banking services',
 'Interest or Late fee waiver',
 'Annual Fee Waiver',
 'SMS Aler

## check embedding coverage

In [30]:
# check embedding coverage
# Lists TF-IDF vocabulary words that have no word2vec vector. `feature_names`
# is whatever the last leave-one-out iteration left behind.
# `x not in word2vec` works on gensim 3.x and 4.x; `.vocab` no longer exists in gensim 4.
[x for x in feature_names if x not in word2vec]

['360',
 'contribtion',
 'enquire',
 'labour',
 'malysian',
 'nisp',
 'nonsingaporean',
 'nsip',
 'ocbc',
 'onetoken',
 'resette',
 'scamme',
 'singapoeran',
 'statemnt',
 'unsuspend']

In [31]:
# Fraction of the TF-IDF vocabulary that has a word2vec vector.
# Uses `x in word2vec` (gensim 3.x and 4.x) instead of `.vocab`, which gensim 4 dropped.
print(sum(x in word2vec for x in feature_names) / len(feature_names))

0.9553571428571429
