# Chatbot intent classification

## 0. Load data

In [0]:
import numpy as np
import pandas as pd

In [0]:
# Load the labelled queries and discard every row that has a missing value.
df = pd.read_csv("data.csv").dropna()

In [5]:
df.head()

Unnamed: 0,label,intent,query
0,0,Promotions,what promotions do you have?
1,0,Promotions,what promotions are available?
2,0,Promotions,promotions
3,0,Promotions,I want to see promotions
4,0,Promotions,view promotions


In [6]:
df.shape

(396, 3)

In [7]:
df.label.value_counts()

9     24
11    18
29    18
59    16
49    15
22    12
35    12
36    12
19    11
18    10
48    10
37    10
38     9
2      9
58     9
34     9
33     9
0      9
46     9
57     8
42     8
40     8
23     8
3      7
4      7
56     7
10     7
47     7
21     7
39     7
24     7
45     7
44     7
41     7
20     7
25     6
54     6
8      6
7      6
43     6
26     5
5      5
55     4
6      4
31     1
Name: label, dtype: int64

In [0]:
# Label 31 has a single query (see the value_counts output above), so drop its row.
df = df.drop(df[df.label == 31].index)

In [9]:
df.label.value_counts()

9     24
11    18
29    18
59    16
49    15
22    12
35    12
36    12
19    11
18    10
48    10
37    10
38     9
2      9
58     9
34     9
33     9
0      9
46     9
57     8
42     8
40     8
23     8
21     7
3      7
4      7
56     7
10     7
47     7
39     7
24     7
45     7
44     7
41     7
20     7
25     6
54     6
8      6
7      6
43     6
26     5
5      5
55     4
6      4
Name: label, dtype: int64

In [10]:
df.shape

(395, 3)

## 1. TF-IDF


In [0]:
import re
import math
from string import punctuation

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [12]:
import nltk
# Download NLTK's 'punkt' tokenizer data (assumed to be what word_tokenize below requires).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# def get_data(path):
#     """ load data from csv """
#     # read data
#     df = pd.read_csv(path)
#     # drop na
#     df = df.dropna()
#     # drop label with only one query
#     df = df.drop(df[df.label == 31].index)
#     return df

In [0]:
def string_clean(text):
    """Lower-case ``text`` and strip everything except a-z, 0-9 and whitespace.

    Digits and single-character words are kept, and no stemming is applied;
    only punctuation and other symbols are removed.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)

In [0]:
def build_sentence_vec(sentence, model, num_features, index2word_set, idf=None):
    """Average the word vectors of a whitespace-separated sentence.

    Only words contained in ``index2word_set`` contribute. Each contributing
    vector ``model[word]`` is multiplied by ``idf[word]`` when ``idf`` is
    given (by 1 otherwise), and the sum is divided by the number of
    contributing words. A sentence without any known word yields the zero
    vector of length ``num_features``.
    """
    total = np.zeros((num_features, ), dtype='float32')
    known_words = [token for token in sentence.split() if token in index2word_set]
    for token in known_words:
        weight = 1 if idf is None else idf[token]
        total = total + model[token] * weight
    if known_words:
        # Normalise by the word count, not by the sum of the weights.
        total = total / len(known_words)
    return total

In [0]:
def compute_idf_weights(doc_list):
    """Compute an IDF weight for every word seen in ``doc_list``.

    Each document is split on whitespace and a word is counted once per
    document. The weight is ``log(N / (1 + document_frequency))`` with ``N``
    the number of documents, so a word present in every document receives a
    negative weight.
    """
    doc_freq = {}  # word -> number of documents containing it
    for document in doc_list:
        for token in set(document.strip().split()):
            doc_freq[token] = doc_freq.get(token, 0.0) + 1.0

    n_docs = len(doc_list)
    return {
        token: math.log(float(n_docs) / (1 + freq))
        for token, freq in doc_freq.items()
    }

In [0]:
def get_data(path):
    """Load the labelled query data set from a CSV file.

    Rows containing any missing value are dropped first. Afterwards every
    label that is left with only one query is removed; in ``data.csv`` this
    is label 31, which used to be hard-coded here.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file. It must contain a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index values and order.
    """
    df = pd.read_csv(path).dropna()
    # duplicated(keep=False) flags every row whose label occurs more than once,
    # so its negation selects exactly the rows of single-query labels.
    singleton_rows = df.index[~df['label'].duplicated(keep=False)]
    return df.drop(singleton_rows)


def sentence_embedding(method='tfidf', word2vec_model=None):
    """Compute the pairwise distance (1 - cosine similarity) between all queries.

    The queries are loaded from ``data.csv`` through ``get_data``, cleaned
    with ``string_clean`` and re-joined after ``word_tokenize``. They are then
    embedded with the requested method.

    Parameters
    ----------
    method : {'tfidf', 'word2vec', 'idf-word2vec'}
        'tfidf' uses a TF-IDF matrix over 1- to 3-grams; 'word2vec' averages
        the word vectors of each query; 'idf-word2vec' does the same with
        every word vector weighted by its IDF.
    word2vec_model : optional
        Word-vector model, required for the two word2vec methods. It must
        support ``model[word]`` and expose its vocabulary as ``index_to_key``
        or ``index2word``.

    Returns
    -------
    distance
        Square matrix with one row/column per query. For 'tfidf' it is the
        ``numpy.matrix`` produced by ``todense()``; for the word2vec methods
        it is a ``numpy.ndarray``.

    Raises
    ------
    ValueError
        If ``method`` is unknown, or a word2vec method is requested without
        ``word2vec_model``.
    """
    valid_methods = ('tfidf', 'word2vec', 'idf-word2vec')
    if method not in valid_methods:
        raise ValueError(
            "Unknown method %r; expected one of %s" % (method, ", ".join(valid_methods)))
    if method != 'tfidf' and word2vec_model is None:
        raise ValueError("word2vec_model is required when method is %r" % method)

    # 1. load data
    data = get_data("data.csv").copy()

    # 2. Preprocessing
    data['query'] = data['query'].apply(string_clean)

    # 3. Tokenizing
    def tokenize(s):
        return " ".join(word_tokenize(s))
    data['query'] = data['query'].apply(tokenize)

    # 4. Get similarity matrix and distance
    # The cleaned text lives in the 'query' column; the data set has no
    # 'model_text' column, so reading it raised a KeyError.
    if method == 'tfidf':
        # Use tfidf alone to calculate similarity
        tfidf = TfidfVectorizer(lowercase=True, stop_words="english", ngram_range=(1, 3)).fit_transform(
            data['query'].values.astype('U'))

        # cosine similarity
        similarity = tfidf * tfidf.T
        return 1 - similarity.todense()

    # Use word2vec (optionally weighted by idf).
    # Newer gensim releases name the vocabulary list index_to_key, older ones index2word.
    vocabulary = getattr(word2vec_model, 'index_to_key', None)
    if vocabulary is None:
        vocabulary = word2vec_model.index2word
    index2word_set = set(vocabulary)
    # Take the dimensionality from the model when it reports one; 300 matches
    # the GoogleNews vectors used in this notebook.
    num_features = getattr(word2vec_model, 'vector_size', 300)

    idf = compute_idf_weights(data['query']) if method == 'idf-word2vec' else None
    emb = np.array([
        build_sentence_vec(query, model=word2vec_model, num_features=num_features,
                           index2word_set=index2word_set, idf=idf)
        for query in data['query']
    ])
    similarity = cosine_similarity(emb)
    return 1 - similarity

In [0]:
# Work on a copy so the cleaning steps below leave `df` untouched.
data = df.copy()

In [0]:
# Lower-case every query and strip punctuation/symbols via string_clean.
data['query'] = data['query'].apply(string_clean)

In [0]:
def tokenize(s): 
    """Tokenize ``s`` with NLTK's word_tokenize and re-join the tokens with single spaces."""
    return " ".join(word_tokenize(s))
# Replace each cleaned query by its space-joined token string.
data['query'] = data['query'].apply(tokenize)

Use TF-IDF alone to calculate the pairwise similarity between queries.

In [21]:
data['query'][0]

'what promotions do you have'

In [22]:
data['query'][1]

'what promotions are available'

In [23]:
data['query'][2]

'promotions'

In [0]:
# Sparse TF-IDF matrix over 1- to 3-grams, one row per query; English stop words are removed.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 3),
).fit_transform(data['query'].values.astype('U'))

In [25]:
tfidf.shape

(395, 1015)

In [0]:
# Sparse matrix product of every query row with every other row. TfidfVectorizer
# L2-normalises rows by default (norm='l2'), so these dot products are cosine similarities.
similarity = tfidf * tfidf.T

In [27]:
similarity.shape

(395, 395)

In [28]:
similarity.todense()[0]

matrix([[1.        , 0.4365589 , 1.        , 0.50538786, 0.44372551,
         1.        , 1.        , 0.42448213, 1.        , 0.61639195,
         0.32781829, 0.61639195, 0.35414045, 0.33082039, 0.61639195,
         0.61639195, 0.3226112 , 0.61639195, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0

In [0]:
# Turn similarity into distance; todense() makes this a dense numpy matrix (see output below).
distance = 1 - similarity.todense()

In [30]:
distance

matrix([[ 0.00000000e+00,  5.63441104e-01,  0.00000000e+00, ...,
          1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
        [ 5.63441104e-01, -2.22044605e-16,  5.63441104e-01, ...,
          1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
        [ 0.00000000e+00,  5.63441104e-01,  0.00000000e+00, ...,
          1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
        ...,
        [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
          1.11022302e-16,  1.85135611e-01,  1.85135611e-01],
        [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
          1.85135611e-01,  0.00000000e+00,  0.00000000e+00],
        [ 1.00000000e+00,  1.00000000e+00,  1.00000000e+00, ...,
          1.85135611e-01,  0.00000000e+00,  0.00000000e+00]])

## 2. Word2vec

In [31]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-03-30 01:52:01--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.142.70
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.142.70|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2020-03-30 01:52:26 (64.8 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [0]:
from gensim.models import KeyedVectors

In [33]:
# Pretrained GoogleNews vectors (~1.5G gzip archive) fetched by the wget cell above.
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz' # from above
# binary=True: the file is in the binary word2vec format, not the text one.
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [34]:
# debug
word2vec["cat"].shape

(300,)

In [0]:
# Reload the CSV through get_data so `df` gets the same cleaning as in section 0
# (rows with missing values and the single-query label removed).
df = get_data("data.csv")

In [0]:
# Fresh working copy for the word2vec pipeline; `df` itself stays unmodified.
data = df.copy()

In [0]:
# Same normalisation as in the TF-IDF section: lower-case and strip punctuation/symbols.
data['query'] = data['query'].apply(string_clean)

In [0]:
# NOTE(review): this redefines the identical `tokenize` helper from the TF-IDF section;
# the earlier definition could be reused instead.
def tokenize(s): 
    """Tokenize ``s`` with NLTK's word_tokenize and re-join the tokens with single spaces."""
    return " ".join(word_tokenize(s))
# Replace each cleaned query by its space-joined token string.
data['query'] = data['query'].apply(tokenize)

In [0]:
# Alias matching the parameter name used by sentence_embedding / build_sentence_vec.
word2vec_model = word2vec

In [0]:
# Set of the model's index2word entries, so build_sentence_vec can do fast `word in ...` checks.
index2word_set = set(word2vec_model.index2word)

In [0]:
# Unweighted word2vec sentence embedding for every cleaned query, in row order.
emb = [
    build_sentence_vec(query, model=word2vec_model, num_features=300,
                       index2word_set=index2word_set)
    for query in data['query']
]

In [0]:
# Stack the per-query vectors into one 2-D array (one row per query).
emb = np.array(emb)
# Pairwise cosine similarity between all query embeddings.
similarity = cosine_similarity(emb)
# Convert similarity to distance; float rounding can leave tiny non-zero values
# on the diagonal (see output below).
distance = 1 - similarity

In [43]:
distance

array([[ 1.1920929e-07,  2.9730374e-01,  4.9945098e-01, ...,
         4.8597598e-01,  5.9918547e-01,  4.8402524e-01],
       [ 2.9730374e-01,  0.0000000e+00,  3.8349301e-01, ...,
         6.5329713e-01,  6.7476648e-01,  6.5881020e-01],
       [ 4.9945098e-01,  3.8349301e-01, -8.3446503e-07, ...,
         8.6953247e-01,  8.5623157e-01,  8.6642522e-01],
       ...,
       [ 4.8597598e-01,  6.5329713e-01,  8.6953247e-01, ...,
         5.9604645e-08,  1.6287935e-01,  2.0581901e-02],
       [ 5.9918547e-01,  6.7476648e-01,  8.5623157e-01, ...,
         1.6287935e-01,  1.7881393e-07,  1.7406398e-01],
       [ 4.8402524e-01,  6.5881020e-01,  8.6642522e-01, ...,
         2.0581901e-02,  1.7406398e-01,  2.3841858e-07]], dtype=float32)

## 3. Cluster Centre