## Import

In [1]:
import re
import math
import string
import time

import numpy as np
import pandas as pd

import string
import pickle

from scipy.sparse import hstack

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_distances

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import gensim.downloader as api

from nltk.tokenize import word_tokenize

## Load

In [2]:
# !wget -P "temp" -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-04-21 02:55:32--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.89.222
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.89.222|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘temp/GoogleNews-vectors-negative300.bin.gz’


2020-04-21 02:59:43 (6.29 MB/s) - ‘temp/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [3]:
# %%time
# from gensim.models import KeyedVectors
# EMBEDDING_FILE = 'temp/GoogleNews-vectors-negative300.bin.gz'
# word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [7]:
# ASCII punctuation characters from the standard library.
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
punctuations = string.punctuation

# Load the spaCy English pipeline ('en_core_web_sm') and spaCy's English stop-word set.
# `stop_words` is the same object as the `STOP_WORDS` name imported at the top of the notebook.
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load the trained classifier.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load artifacts under temp/ that come from a trusted source.
with open("temp/RFClassifier2.pkl", 'rb') as f:
    clf2 = pickle.load(f)

# Load the four fitted vectorizers (lemma, keyword, noun, verb), one per text feature column.
with open('temp/TFIDFVectorizer_lemma2.pkl', 'rb') as f:
    v_lemma2 = pickle.load(f)
with open('temp/TFIDFVectorizer_keyword2.pkl', 'rb') as f:
    v_keyword2 = pickle.load(f)
with open('temp/TFIDFVectorizer_noun2.pkl', 'rb') as f:
    v_noun2 = pickle.load(f)
with open('temp/TFIDFVectorizer_verb2.pkl', 'rb') as f:
    v_verb2 = pickle.load(f)

# Load the intent-name -> class-index mapping (inverted later to map predictions back to names).
with open('temp/intent2index2.pkl', 'rb') as f:
    intent2index2 = pickle.load(f)
    
# Load the collection of keyword lemmas used to build the 'keyword' feature.
with open('temp/keyword_list_lemma2.pkl', 'rb') as f:
    keyword_list_lemma2 = pickle.load(f)

# Load word2vec only if the kernel does not already hold it, so re-running
# this cell skips the expensive load.
# NOTE(review): `word2vec` is not used by any cell visible here — confirm it is still required.
try:
    word2vec
except NameError:
    word2vec = api.load("word2vec-google-news-300")

## Utilities

In [3]:
# utilities
def get_nlp_features(df, keyword_list_lemma):
    """Derive lemma, keyword, noun and verb text features from `df['query']`.

    Each query is parsed with the module-level spaCy pipeline `nlp`. Tokens
    whose lemma is in the module-level `stop_words` set are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'query' column of strings. It is not modified.
    keyword_list_lemma : container of str
        Lemmas that count as keywords (membership is tested with `in`).

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with four added columns of space-joined strings:
        'lemma'   - lemmas of all non-stop-word tokens, in sentence order;
        'keyword' - unique lemmas of the re-parsed 'lemma' text that appear
                    in `keyword_list_lemma`;
        'noun'    - unique lemmas of NOUN/PROPN tokens;
        'verb'    - unique lemmas of VERB tokens.
        Unique lemmas are listed in order of first appearance.
    """
    data = df.copy()

    def _join_unique(lemmas):
        # dict.fromkeys de-duplicates while keeping first-seen order; the
        # previous list(set(...)) produced a hash-dependent, run-to-run
        # varying order in the joined string.
        return ' '.join(dict.fromkeys(lemmas))

    def _extract(query):
        # Parse the query once and reuse the parse for lemma, noun and verb
        # (it used to be parsed three separate times).
        content = [token for token in nlp(query) if token.lemma_ not in stop_words]
        lemma = ' '.join(token.lemma_ for token in content)

        # Keywords are matched on a fresh parse of the lemma string, exactly
        # as before, because lemmatizing the lemma text can differ from the
        # lemmas of the original query.
        keyword = _join_unique(token.lemma_ for token in nlp(lemma)
                               if token.lemma_ in keyword_list_lemma)
        noun = _join_unique(token.lemma_ for token in content
                            if token.pos_ in ('NOUN', 'PROPN'))
        verb = _join_unique(token.lemma_ for token in content
                            if token.pos_ == 'VERB')
        return lemma, keyword, noun, verb

    # One tuple of four strings per row; unpack into columns in the same
    # order the original code created them.
    features = data['query'].apply(_extract)
    for position, column in enumerate(('lemma', 'keyword', 'noun', 'verb')):
        data[column] = features.apply(lambda row, position=position: row[position])
    return data

def clean_text(text):
    """ Normalize raw text.

        The text is lowercased, then every character other than
        a-z, 0-9 or whitespace is removed.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)

def nltk_tokenize(text):
    """ Split text into NLTK word tokens and rejoin them with single spaces """
    # If the tokenizer data is missing, run once:
    #   import nltk
    #   nltk.download('punkt')
    tokens = word_tokenize(text)
    return ' '.join(tokens)

def _vectorizer_feature_names(vectorizer):
    """ Return the vectorizer's feature names as a plain Python list.

        Prefers `get_feature_names_out()` and falls back to the legacy
        `get_feature_names()` when the newer method is absent, so both old
        and new scikit-learn releases work.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    # get_feature_names_out returns an ndarray; convert to list so that `+`
    # concatenates the names instead of adding arrays element-wise.
    return list(getter())


def add_nlp_vec_alone(df, v_lemma, v_keyword, v_noun, v_verb):
    """ Transform NLP features to a dense feature frame using fitted vectorizers.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain the text columns 'lemma', 'keyword', 'noun' and 'verb'.
        v_lemma, v_keyword, v_noun, v_verb : fitted vectorizers
            Each must provide `transform` and either `get_feature_names_out`
            or `get_feature_names`.

        Returns
        -------
        pandas.DataFrame
            One row per row of `df`; columns are the lemma, keyword, noun and
            verb features concatenated in that order and labelled with the
            vectorizers' feature names (names may repeat across groups).
    """
    # (column in df, vectorizer) pairs, in the order the features are stacked
    pairs = (('lemma', v_lemma),
             ('keyword', v_keyword),
             ('noun', v_noun),
             ('verb', v_verb))

    # combine all features side by side
    blocks = [vectorizer.transform(df[column]) for column, vectorizer in pairs]
    x_test_combined = hstack(blocks, format='csr')

    x_test_combined_columns = []
    for _, vectorizer in pairs:
        x_test_combined_columns += _vectorizer_feature_names(vectorizer)

    return pd.DataFrame(x_test_combined.toarray(), columns=x_test_combined_columns)

def get_target_name(index, index2intent):
    """ Look up the intent name stored under `index` in `index2intent` """
    name = index2intent[index]
    return name

## Run

In [20]:
def get_intent_nlp(query):
    """ Predict the three most likely intents for a single query string.

        The query is cleaned, tokenized and lemmatized, converted to features
        with the module-level vectorizers (v_lemma2, v_keyword2, v_noun2,
        v_verb2) and scored with the module-level classifier clf2.
        Elapsed-time checkpoints ("Part1" ... "Part10") are printed on the way.

        Parameters
        ----------
        query : str
            Raw user question.

        Returns
        -------
        tuple (df, inference_time)
            df: dataframe with one row per prediction, best first
                columns: pred_seq, intent_class, intent, pred_prob
                example for first row: 1, 45, Promotions, 0.98
            inference_time: elapsed wall-clock seconds for the whole call
    """

    start = time.time()

    #%% pipeline
    # wrap the question in a one-row dataframe so the column-wise helpers apply
    df = pd.DataFrame({'query': [query]})

    print("Part1", time.time() - start)

    # preprocessing test query
    df['query'] = df['query'].apply(clean_text)
    df['query'] = df['query'].apply(nltk_tokenize)
    df['query'] = df['query'].apply(lambda x:' '.join([token.lemma_ for token in nlp(x) if token.lemma_ not in stop_words]))
    # lowercase again: lemmatization can reintroduce uppercase characters
    df['query'] = df['query'].str.lower()

    print("Part2", time.time() - start)

    # get nlp features
    df = get_nlp_features(df, keyword_list_lemma2)

    print("Part3", time.time() - start)

    X_in = add_nlp_vec_alone(df, v_lemma2, v_keyword2, v_noun2, v_verb2)

    print("Part4", time.time() - start)

    # get prediction proba, use another classifier
    # TODO: train classifier 2
    probs = clf2.predict_proba(X_in)

    print("Part5", time.time() - start)

    # column positions of the 3 highest probabilities; argsort is ascending,
    # so the last three columns are ordered third-best, second-best, best
    ind = np.argsort(probs, axis=1)[:,-3:]

    # probabilities in that same (third, second, best) order
    proba = probs[0][ind[0]]

    print("Part6", time.time() - start)

    # save predictions as dataframe
    best_3 = pd.DataFrame(ind,columns=['top3','top2','top1'])
    # Translate column positions into class labels. The labels must come from
    # clf2, the estimator that produced `probs`; the previous code referenced
    # an undefined `clf` and raised NameError on a fresh kernel.
    for rank in ('top1', 'top2', 'top3'):
        best_3[rank] = clf2.classes_[best_3[rank]]
    best_3['top3_prob'] = proba[0]
    best_3['top2_prob'] = proba[1]
    best_3['top1_prob'] = proba[2]

    print("Part7", time.time() - start)

    # get index to intent dictionary from intent2index
    index2intent = {y:x for x,y in intent2index2.items()}

    # get class name of top predictions
    for rank in ('top1', 'top2', 'top3'):
        best_3[rank + '_name'] = best_3[rank].apply(get_target_name, index2intent=index2intent)

    print("Part8", time.time() - start)

    # output prediction
    top1 = best_3.at[0,'top1_name']
    top2 = best_3.at[0,'top2_name']
    top3 = best_3.at[0,'top3_name']
    top1_prob = best_3.at[0,'top1_prob']
    top2_prob = best_3.at[0,'top2_prob']
    top3_prob = best_3.at[0,'top3_prob']

    print("Part9", time.time() - start)

    top1_class = best_3.at[0,'top1']
    top2_class = best_3.at[0,'top2']
    top3_class = best_3.at[0,'top3']

    # convert to output
    result = pd.DataFrame([
            [1, top1_class, top1, top1_prob],
            [2, top2_class, top2, top2_prob],
            [3, top3_class, top3, top3_prob]
        ], columns=['pred_seq', 'intent_class', 'intent', 'pred_prob'])

    print("Part10", time.time() - start)

    inference_time = time.time() - start
    return result, inference_time

In [21]:
# Sample query used by the following cells to exercise get_intent_nlp
test_query = "Please show me the current promotions"

In [22]:
# Run the full pipeline once; the cell displays the returned
# (predictions dataframe, inference time in seconds) tuple
get_intent_nlp(test_query)

Part1 0.007563114166259766
Part2 0.022623538970947266
Part3 0.05807757377624512
Part4 0.06486225128173828
Part5 0.17626595497131348
Part6 0.1767432689666748
Part7 0.18040680885314941
Part8 0.18406200408935547
Part9 0.18462753295898438
Part10 0.18629121780395508


(   pred_seq  intent_class             intent  pred_prob
 0         1            45         Promotions   0.980395
 1         2            46    Card Promotions   0.015000
 2         3            26  Give a compliment   0.002132,
 0.18636584281921387)

In [11]:
# Profile a single inference call. cProfile.run executes the statement string
# in the __main__ namespace, so get_intent_nlp and test_query must already be
# defined by earlier cells.
import cProfile
cProfile.run('get_intent_nlp(test_query)')

For sentence:
Please show me the current promotions

Top 1 prediction intent is Promotions with probability 98.04%
Top 2 prediction intent is Card Promotions with probability 1.50%
Top 3 prediction intent is Give a compliment with probability 0.21%
         197789 function calls (196781 primitive calls) in 0.500 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(all)
        5    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(amax)
       22    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(append)
        1    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(argsort)
        3    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(array_equal)
       46    0.000    0.000    0.001    0.000 <__array_function__ internals>:2(atleast_1d)
       30    0.000    0.000    0.001    0.000 <_

        3    0.000    0.000    0.000    0.000 base.py:1217(_get_names)
        1    0.000    0.000    0.000    0.000 base.py:1220(_set_names)
        1    0.000    0.000    0.000    0.000 base.py:1250(set_names)
       22    0.000    0.000    0.000    0.000 base.py:1383(nlevels)
       11    0.000    0.000    0.001    0.000 base.py:1656(is_unique)
       20    0.000    0.000    0.000    0.000 base.py:1670(is_integer)
        2    0.000    0.000    0.000    0.000 base.py:1679(is_object)
        3    0.000    0.000    0.000    0.000 base.py:1730(inferred_type)
        4    0.000    0.000    0.000    0.000 base.py:1737(is_all_dates)
        1    0.000    0.000    0.000    0.000 base.py:2210(_validate_sort_keyword)
       96    0.000    0.000    0.000    0.000 base.py:242(nnz)
        1    0.000    0.000    0.000    0.000 base.py:2436(difference)
      726    0.002    0.000    0.011    0.000 base.py:247(is_dtype)
        1    0.000    0.000    0.000    0.000 base.py:2581(_assert_can_do_set

        5    0.000    0.000    0.000    0.000 compressed.py:1131(sort_indices)
       18    0.000    0.000    0.001    0.000 compressed.py:1140(prune)
        4    0.000    0.000    0.000    0.000 compressed.py:127(_set_self)
       17    0.001    0.000    0.002    0.000 compressed.py:138(check_format)
       17    0.000    0.000    0.003    0.000 compressed.py:30(__init__)
        4    0.000    0.000    0.000    0.000 compressed.py:464(_mul_vector)
        4    0.000    0.000    0.002    0.001 compressed.py:491(_mul_sparse_matrix)
        4    0.000    0.000    0.003    0.001 compressed.py:586(sum)
        2    0.000    0.000    0.000    0.000 concat.py:104(<genexpr>)
        2    0.000    0.000    0.000    0.000 concat.py:105(<genexpr>)
        2    0.000    0.000    0.000    0.000 concat.py:115(__init__)
        3    0.000    0.000    0.000    0.000 concat.py:119(<genexpr>)
        3    0.000    0.000    0.000    0.000 concat.py:120(<genexpr>)
        2    0.000    0.000    0.000   

        1    0.000    0.000    0.000    0.000 fromnumeric.py:993(_argsort_dispatcher)
        1    0.000    0.000    0.000    0.000 fromnumeric.py:997(argsort)
        1    0.000    0.000    0.000    0.000 frozen.py:66(__getitem__)
        3    0.000    0.000    0.000    0.000 function.py:42(__call__)
       15    0.000    0.000    0.000    0.000 function_base.py:1137(_diff_dispatcher)
       15    0.000    0.000    0.000    0.000 function_base.py:1141(diff)
        1    0.000    0.000    0.000    0.000 function_base.py:257(iterable)
       22    0.000    0.000    0.000    0.000 function_base.py:4636(_append_dispatcher)
       22    0.000    0.000    0.001    0.000 function_base.py:4640(append)
     1000    0.012    0.000    0.023    0.000 functools.py:44(update_wrapper)
     1000    0.002    0.000    0.002    0.000 functools.py:74(wraps)
     3073    0.005    0.000    0.008    0.000 generic.py:10(_check)
        1    0.000    0.000    0.001    0.001 generic.py:1110(rename_axis)
      

        1    0.000    0.000    0.000    0.000 nanops.py:334(_wrap_results)
        1    0.000    0.000    0.000    0.000 nanops.py:396(nanany)
        1    0.000    0.000    0.001    0.001 nanops.py:460(nansum)
        3    0.000    0.000    0.000    0.000 nanops.py:55(check)
        1    0.000    0.000    0.001    0.001 nanops.py:59(_f)
        4    0.000    0.000    0.000    0.000 nanops.py:62(<genexpr>)
        5    0.000    0.000    0.000    0.000 nonproj.pyx:133(deprojectivize (wrapper))
        5    0.000    0.000    0.000    0.000 nonproj.pyx:133(deprojectivize)
        3    0.000    0.000    0.000    0.000 numeric.py:107(_shallow_copy)
       58    0.000    0.000    0.000    0.000 numeric.py:155(is_all_dates)
        4    0.000    0.000    0.000    0.000 numeric.py:159(ones)
       16    0.000    0.000    0.000    0.000 numeric.py:1786(isscalar)
        3    0.000    0.000    0.000    0.000 numeric.py:2283(_array_equal_dispatcher)
        3    0.000    0.000    0.000    0.000 n

       31    0.000    0.000    0.000    0.000 {built-in method pandas._libs.missing.checknull}
        4    0.000    0.000    0.000    0.000 {built-in method pandas._libs.missing.isnaobj}
        2    0.000    0.000    0.000    0.000 {built-in method posix.fspath}
        1    0.000    0.000    0.000    0.000 {built-in method posix.get_terminal_size}
        8    0.000    0.000    0.000    0.000 {built-in method posix.getpid}
        1    0.000    0.000    0.000    0.000 {built-in method scipy.sparse._sparsetools.coo_tocsr}
        4    0.000    0.000    0.000    0.000 {built-in method scipy.sparse._sparsetools.csc_matvec}
        1    0.000    0.000    0.000    0.000 {built-in method scipy.sparse._sparsetools.csr_has_canonical_format}
        5    0.000    0.000    0.000    0.000 {built-in method scipy.sparse._sparsetools.csr_has_sorted_indices}
        4    0.000    0.000    0.000    0.000 {built-in method scipy.sparse._sparsetools.csr_matmat_pass1}
        4    0.000    0.000    0.0