# Pipeline without Text Clustering

In [1]:
# General Import
import re
import math
import string

import numpy as np
import pandas as pd

from scipy.sparse import hstack

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_distances

import gensim.downloader as api

from nltk.tokenize import word_tokenize

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Starting point
import os
import sys
from pathlib import Path

PATH_HOME = Path.home()
PATH_PROJ = Path.cwd()
PATH_DATA = PATH_PROJ

sys.path.append(str(PATH_PROJ))

## load data

In [3]:
# TRAIN
df_train = pd.read_csv('data2.csv')
df_train.dropna(inplace=True)
print(df_train.shape)
df_train.head(2)

(641, 3)


Unnamed: 0,Label,Intent,Questions
0,0,Statement request,i would like a copy of my statement
1,0,Statement request,please send me a copy of my statement


In [4]:
# rename dataframe
df_train = df_train.rename(columns={'Intent': 'intent', 'Questions': 'query'})
df_train = df_train[['intent', 'query']]
df_train.head(2)

Unnamed: 0,intent,query
0,Statement request,i would like a copy of my statement
1,Statement request,please send me a copy of my statement


In [5]:
# TEST
df_test = pd.read_csv('uat_data_intent.csv')
df_test.dropna(inplace=True)
print(df_test.shape)
df_test.head(2)

(128, 3)


Unnamed: 0,Question,User Clicked intent,Google-intent
0,how do i submit a dispute?,Cancel credit card transaction,Dispute status
1,I lost my card,Lost or compromised cards,Lost or compromised cards


In [6]:
df_test['correct_google'] = np.where(df_test['User Clicked intent'] == df_test['Google-intent'], 1, 0)
df_test.head()

Unnamed: 0,Question,User Clicked intent,Google-intent,correct_google
0,how do i submit a dispute?,Cancel credit card transaction,Dispute status,0
1,I lost my card,Lost or compromised cards,Lost or compromised cards,1
2,I have not received my purchases from the merc...,Cancel credit card transaction,Cancel ATM Card,0
3,i have a transaction that i did not do,Cancel credit card transaction,Cancel credit card transaction,1
4,how to terminate my card?,Cancel Credit or Debit Card,Card Cancellation,0


In [7]:
# rename dataframe
df_test = df_test.rename(columns={'User Clicked intent': 'intent', 'Question': 'query'})
df_test = df_test[['intent', 'query']]
df_test.head(2)

Unnamed: 0,intent,query
0,Cancel credit card transaction,how do i submit a dispute?
1,Lost or compromised cards,I lost my card


## Utilities

In [8]:
def clean_text(text):
    """ Basic text cleaning
        
        1. lowercase
        2. remove special characters
    """
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [9]:
def nltk_tokenize(text):
    """ tokenize text using NLTK and join back as sentence"""
    # import nltk
    # nltk.download('punkt')
    return ' '.join(word_tokenize(text))

In [10]:
# Function for spacy tokenizer

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
nlp = spacy.load('en_core_web_lg')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

## Pipeline

In [11]:
# preprocessing questions
df_train['query'] = df_train['query'].apply(clean_text)
df_train['query'] = df_train['query'].apply(nltk_tokenize)
df_train['query'] = df_train['query'].apply(lambda x:' '.join([token.lemma_ for token in nlp(x) if token.lemma_ not in stop_words]))
df_train['query'] = df_train['query'].str.lower()


# preprocessing test as well
df_test['query'] = df_test['query'].apply(clean_text)
df_test['query'] = df_test['query'].apply(nltk_tokenize)
df_test['query'] = df_test['query'].apply(lambda x:' '.join([token.lemma_ for token in nlp(x) if token.lemma_ not in stop_words]))
df_test['query'] = df_test['query'].str.lower()

In [12]:
df_train.head(2)

Unnamed: 0,intent,query
0,Statement request,like copy -pron- statement
1,Statement request,send -pron- copy -pron- statement


In [13]:
df_test.head(2)

Unnamed: 0,intent,query
0,Cancel credit card transaction,submit dispute
1,Lost or compromised cards,lose -pron- card


In [14]:
intent_list = df_train.intent.unique().tolist()
intent_list[:2]

['Statement request', 'Passbook savings accounts']

In [24]:
intents = intent_list.copy()
intent2index = {v: i for (i, v) in enumerate(intents)}
index2intent = {y:x for x,y in intent2index.items()}

In [15]:
test_intent_list = df_test.intent.unique().tolist()
set(intent_list) == set(test_intent_list)

False

In [16]:
import warnings
warnings.filterwarnings("ignore")

In [17]:
# TEST
try:
    word2vec
except NameError:
    word2vec = api.load("word2vec-google-news-300")  

In [18]:
def get_keywords(intent_list, stop_words):
    """ Get list of keywords from intent """
    keywords = []
    for intent in list(set(intent_list)):
        keywords.extend(intent.strip().split(' '))
    keyword_list = list(set(keywords))
    keyword_list = [i.lower() for i in keyword_list if i.lower() not in stop_words]
    keyword_list.append('nsip')

    keyword_list_lemma = []
    text = nlp(' '.join([w for w in keyword_list]))
    for token in text:
        keyword_list_lemma.append(token.lemma_)
    return keyword_list_lemma

In [19]:
keyword_list_lemma = get_keywords(intent_list, stop_words=STOP_WORDS)

In [20]:
def get_nlp_features(df, keyword_list_lemma):
    """ Get keyword features from dataframe """
    data = df.copy()
    data['lemma'] = data['query'].apply(lambda x:' '.join([token.lemma_ for token in nlp(x) if token.lemma_ not in stop_words]))
    data['keyword'] = data['lemma'].apply(lambda x: list(set([token.lemma_ for token in nlp(x) if token.lemma_ in keyword_list_lemma])))

    data['noun'] = data['query'].apply(lambda x: list(set([token.lemma_ for token in nlp(x) if token.pos_ in ['NOUN','PROPN'] and token.lemma_ not in stop_words])))
    data['verb'] = data['query'].apply(lambda x: list(set([token.lemma_ for token in nlp(x) if token.pos_ in ['VERB'] and token.lemma_ not in stop_words])))

    data['noun'] = data['noun'].apply(lambda x: ' '.join([w for w in x]))
    data['verb'] = data['verb'].apply(lambda x: ' '.join([w for w in x]))
    data['keyword'] = data['keyword'].apply(lambda x: ' '.join([w for w in x]))
    return data

In [25]:
df_train = get_nlp_features(df_train, keyword_list_lemma)
df_train['target'] = df_train['intent'].apply(lambda x: intent2index[x])
df_train.head(2)

Unnamed: 0,intent,query,lemma,keyword,noun,verb,target
0,Statement request,like copy -pron- statement,like copy -pron- statement,statement,statement -pron- copy,,0
1,Statement request,send -pron- copy -pron- statement,send -pron- copy -pron- statement,statement,statement -pron-,send copy,0


In [26]:
df_test = get_nlp_features(df_test, keyword_list_lemma)
df_test['target'] = df_test['intent'].apply(lambda x: intent2index[x])
df_test.head(2)

Unnamed: 0,intent,query,lemma,keyword,noun,verb,target
0,Cancel credit card transaction,submit dispute,submit dispute,dispute,dispute,submit,17
1,Lost or compromised cards,lose -pron- card,lose -pron- card,lose card,-pron- card,lose,43


In [27]:
countvector_cols = ['lemma', 'keyword', 'noun', 'verb']

In [29]:
def get_train_test(df_train, df_test, feature_cols):
    """ split dataset, get X_train, X_test, y_train, y_test """
    X_train = df_train[feature_cols]
    # print(X_train.head(1))
    y_train = df_train['target']
    # print(y_train.head(1))
    X_test = df_test[feature_cols]
    y_test = df_test['target']
    # print(X_test.head(1))
    # print(y_test.head(1))
    return X_train, y_train, X_test, y_test

In [30]:
X_train, y_train, X_test, y_test = get_train_test(df_train, df_test, feature_cols=countvector_cols)

In [31]:
def add_nlp_to_x(X_train, X_test):
    """ Add NLP features to input X """
    v_lemma = TfidfVectorizer()
    x_train_lemma = v_lemma.fit_transform(X_train['lemma'])
    x_test_lemma = v_lemma.transform(X_test['lemma'])
    vocab_lemma = dict(v_lemma.vocabulary_)

    v_keyword = TfidfVectorizer()
    x_train_keyword = v_keyword.fit_transform(X_train['keyword'])
    x_test_keyword = v_keyword.transform(X_test['keyword'])
    vocab_keyword = dict(v_keyword.vocabulary_)

    v_noun = TfidfVectorizer()
    x_train_noun = v_noun.fit_transform(X_train['noun'])
    x_test_noun = v_noun.transform(X_test['noun'])
    vocab_noun = dict(v_noun.vocabulary_)

    v_verb = TfidfVectorizer()
    x_train_verb = v_verb.fit_transform(X_train['verb'])
    x_test_verb = v_verb.transform(X_test['verb'])
    vocab_verb = dict(v_verb.vocabulary_)
    
    # combine all features 
    x_train_combined = hstack((x_train_lemma,
                               x_train_keyword,
                               x_train_noun,
                               x_train_verb),format='csr')
    x_train_combined_columns= v_lemma.get_feature_names()+\
                            v_keyword.get_feature_names()+\
                            v_noun.get_feature_names()+\
                            v_verb.get_feature_names()

    x_test_combined  = hstack((x_test_lemma, 
                               x_test_keyword, 
                               x_test_noun, 
                               x_test_verb), format='csr')
    x_test_combined_columns = v_lemma.get_feature_names()+\
                            v_keyword.get_feature_names()+\
                            v_noun.get_feature_names()+\
                            v_verb.get_feature_names()

    x_train_combined = pd.DataFrame(x_train_combined.toarray())
    x_train_combined.columns = x_train_combined_columns

    x_test_combined = pd.DataFrame(x_test_combined.toarray())
    x_test_combined.columns = x_test_combined_columns
    
    return x_train_combined, x_test_combined, v_lemma, v_keyword, v_noun, v_verb

In [32]:
x_train_combined, x_test_combined, v_lemma, v_keyword, v_noun, v_verb = add_nlp_to_x(X_train, X_test)

In [33]:
# build classifier
clf = RandomForestClassifier(max_depth=50, n_estimators=1000)
clf.fit(x_train_combined, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=50, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [34]:
probs = clf.predict_proba(x_test_combined)
best_3 = pd.DataFrame(np.argsort(probs, axis=1)[:,-3:],columns=['top3','top2','top1'])
best_3['top1'] = clf.classes_[best_3['top1']]
best_3['top2'] = clf.classes_[best_3['top2']]
best_3['top3'] = clf.classes_[best_3['top3']]

In [35]:
result = pd.concat([best_3.reset_index(drop=True),
                    pd.DataFrame(y_test).reset_index(drop=True), 
                    X_test[countvector_cols].reset_index(drop=True)], axis=1)
score_1 = result[result['top1'] == result['target']].shape[0] / result.shape[0]
score_2 = result[(result['top1'] == result['target']) | (result['top2'] == result['target'])].shape[0] / result.shape[0]
score_3 = result[(result['top1'] == result['target']) | (result['top2'] == result['target'])| (result['top3'] == result['target'])].shape[0] / result.shape[0]

In [36]:
print('Accuracy for top 1 clustering + classifier result is {:.1%}'.format(score_1))
print('Accuracy for top 2 clustering + classifier result is {:.1%}'.format(score_2))
print('Accuracy for top 3 clustering + classifier result is {:.1%}'.format(score_3))

Accuracy for top 1 clustering + classifier result is 70.3%
Accuracy for top 2 clustering + classifier result is 79.7%
Accuracy for top 3 clustering + classifier result is 85.2%


## Save vectors

In [38]:
import pickle

In [39]:
# save the model to disk
model_filename = 'RFClassifier2.pkl'
pickle.dump(clf, open(model_filename, 'wb'))

In [40]:
# save vectorizer
with open('TFIDFVectorizer_lemma2.pkl', 'wb') as f:
    pickle.dump(v_lemma, f)
with open('TFIDFVectorizer_keyword2.pkl', 'wb') as f:
    pickle.dump(v_keyword, f)
with open('TFIDFVectorizer_noun2.pkl', 'wb') as f:
    pickle.dump(v_noun, f)
with open('TFIDFVectorizer_verb2.pkl', 'wb') as f:
    pickle.dump(v_verb, f)

In [43]:
# save necessary variables
with open('intent_list2.pkl', 'wb') as f:
    pickle.dump(intent_list, f)
with open('intent2index2.pkl', 'wb') as f:
    pickle.dump(intent2index, f)
with open('keyword_list_lemma2.pkl', 'wb') as f:
    pickle.dump(keyword_list_lemma, f)