In [53]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Add, Dropout, Concatenate, Embedding, Bidirectional, LSTM, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from models import InferSent
import torch
import nltk
from tqdm import tqdm
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras import regularizers
import os
import datetime
np.random.seed(3252) 


# Datasets

The following data set is formed from the combination of the datasets:
- https://github.com/clinc/oos-eval ["An Evaluation Dataset for Intent Classification and Out-of-Scope Prediction" (EMNLP 2019) ]
- NLU Dataset: "Benchmarking Natural Language Understanding Services for building Conversational Agents" (https://arxiv.org/abs/1903.05566). The NLU dataset was combined by taking the answer as the text and scenario + intent as the label
- "Snips Voice Platform: an embedded Spoken Language Understanding system for private-by-design voice interfaces" (https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines) - found via the DeepPavlov website

## Other Datasets considered
- https://voice.mozilla.org/en/datasets
Conversation Corpus:
https://www.clarin.eu/resource-families/spoken-corpora
https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
https://www.aclweb.org/anthology/D18-1305.pdf (spoken corpora)
https://www.linguistics.ucsb.edu/research/santa-barbara-corpus
https://neurohive.io/en/datasets/natural-questions-new-large-scale-corpus-for-question-answering-by-google-ai/
https://arxiv.org/abs/1506.08909 - stanford multi turn dataset

However, unlike the datasets we selected, these datasets haven't been processed and labeled by intent. If the training data turns out to be lacking, we can probably run speech-to-text on them and label some of the data ourselves. These datasets are also good for testing.

## Intents 
222 different intents. View `intent_counts.csv` to see full list of intents and counts. 
There are a total of 64174 lines of text in the combined dataset.

### None Type Intent Classification
TODO: Add more oos (out-of-scope) queries so the classifier can handle bad or irrelevant input. One suggestion was to use a movie subtitles dataset.


In [4]:
# Load the combined intent dataset; later cells read its "text" and "intents" columns.
df = pd.read_json('data.json', orient='records')

In [5]:
# Number of utterances per intent, most frequent intent first.
# (Equivalent export: df.groupby(['intents']).count().to_csv('intent_counts.csv'))
intent_counts = (
    df.groupby(['intents'])
    .size()
    .to_frame('count')
    .sort_values('count', ascending=False)
)

# Each intent's share of the whole dataset, expressed as a percentage.
total_count = intent_counts["count"].sum()
intent_counts["prop"] = intent_counts["count"] / total_count * 100
# intent_counts.to_csv('intent_counts.csv')
intent_counts

Unnamed: 0_level_0,count,prop
intents,Unnamed: 1_level_1,Unnamed: 2_level_1
PlayMusic,2300,3.584565
GetWeather,2300,3.584565
BookRestaurant,2273,3.542485
SearchScreeningEvent,2259,3.520666
RateBook,2256,3.515990
...,...,...
interest_rate,150,0.233776
international_fees,150,0.233776
international_visa,150,0.233776
jump_start,150,0.233776


# Preprocessing the datasets

# CLINC150 dataset/oos-eval dataset Preprocessing
```python
# Read the CLINC150 / oos-eval dump and pick out the splits used below.
with open('oos-eval/data/data_full.json', 'r') as f:
    oos_eval_dataset = json.load(f)
    out_of_scope = oos_eval_dataset["oos_val"]
    # NOTE(review): in_scope_labels is not used anywhere in this snippet.
    in_scope_labels = oos_eval_dataset["val"]
    train = oos_eval_dataset["oos_train"]
df = pd.DataFrame(train, columns=['text', 'intents'])
# NOTE(review): DataFrame.where keeps every row and NaN-fills the rows that fail the
# condition; boolean indexing (df[df['intents'] == 'oos']) was probably intended - confirm.
df = df.where(df['intents'] == 'oos').append(pd.DataFrame(out_of_scope, columns=['text', 'intents']), ignore_index=True)
# NOTE(review): `data` is not defined in this snippet; presumably a list of records built earlier.
data.extend(list(df.T.to_dict().values()))
```

# NLU Dataset
```python
# Load the NLU benchmark CSV (';'-separated) and keep only the columns needed here.
nlu_dataset = pd.read_csv('NLU-Evaluation-Data/Collected-Original-Data/paraphrases_and_intents_26k_normalised_all.csv', delimiter=';')
nlu_dataset = nlu_dataset[["answer_normalised", "scenario", "intent", "suggested_entities"]]
# One label per row: the scenario followed by the title-cased intent.
nlu_dataset["full_intent"] = nlu_dataset["scenario"] + nlu_dataset["intent"].str.title() 
nlu_dataset = nlu_dataset[["answer_normalised", "full_intent"]]
nlu_dataset.columns = ["text", "intents"]
# Reverse the column order (now "intents", "text").
nlu_dataset = nlu_dataset.iloc[:, ::-1]
# NOTE(review): DataFrame.append was removed in pandas 2.0 (pd.concat replaces it) - confirm the pandas version in use.
df = df.append(nlu_dataset, ignore_index=True)
df.to_json('test2.json', orient='records')
```

# Training the Classifiers

### Note: Although I'm not doing any parameter tuning, I have Train, Validation, and Test datasets

In [6]:
# Hold out 10% of the rows for testing, then 10% of the remainder for validation.
# No random_state is passed, so the split depends on NumPy's global RNG state
# (seeded with np.random.seed in the import cell); re-running this cell alone reshuffles.
df_train, df_test = train_test_split(df, test_size=0.1)
df_train, df_val = train_test_split(df_train, test_size=0.1)

# Raw utterances (X_*) and their string intent labels (y_*) for each split.
X_train, y_train = df_train["text"].to_numpy(), df_train["intents"].to_numpy()
X_val, y_val = df_val["text"].to_numpy(), df_val["intents"].to_numpy()
X_test, y_test = df_test["text"].to_numpy(), df_test["intents"].to_numpy()


In [7]:
# Map intent strings to integer class ids. The encoder is fit on the training labels only,
# so every validation/test label is expected to also occur in the training split.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Number of distinct classes known to the encoder (depth of the one-hot vectors).
number_items = len(label_encoder.classes_)

def get_one_hot(arr, number_items):
    """Return the integer class ids in `arr` one-hot encoded with depth `number_items`."""
    return tf.one_hot(arr, number_items)

def get_labels_decoded(arr):
    """Map integer class ids in `arr` back to intent strings via the fitted `label_encoder`."""
    return label_encoder.inverse_transform(arr)

y_train_one_hot = get_one_hot(y_train_encoded, number_items)
y_val_one_hot = get_one_hot(y_val_encoded, number_items)
y_test_one_hot = get_one_hot(y_test_encoded, number_items)


# Classifiers
- https://www.researchgate.net/publication/301932124_Intent_Classification_of_Short-Text_on_Social_Media
- https://allennlp.org/ - [Deep contextualized word representations]. ELMo based embeddings 

## Word Embeddings
- ELMo
- BERT 
- USE
GloVe: https://github.com/maciejkula/glove-python or SpaCy package implementation
Word2Vec: gensim package implementation
FastText: https://github.com/facebookresearch/fastText or gensim package implementation
StarSpace: https://github.com/facebookresearch/StarSpace


## Sentence Embeddings
InferSent: https://github.com/facebookresearch/InferSent
Sent2Vec: https://github.com/epfml/sent2vec
Infersent performance: https://arxiv.org/pdf/1705.02364.pdf

A good analysis of the different word embeddings/sentence embeddings is https://dspace.cvut.cz/bitstream/handle/10467/77029/F3-DP-2018-Brich-Tomas-Semantic_Sentence_Similarity_for_Intent_Recognition_Task.pdf?sequence=-1&isAllowed=y#ref%3Asec_is ["Semantic Sentence Similarity for Intent Recognition Task"], which says Sent2Vec is the best on benchmarks.

Another analysis: https://arxiv.org/pdf/1806.06259.pdf ["Evaluation of sentence embeddings in downstream and linguistic probing tasks"].

According to this a combo of Elmo+InferSent seems to be the best option.

### Embedding Chosen

Infersent or Elmo Seems to be the best option:
When classifying intents, we may be given longer sentences. InferSent is known to have good performance (https://arxiv.org/abs/1705.02364), especially in how many correct results are retrieved in the top n, where n is an integer ["Evaluation of sentence embeddings in downstream and linguistic probing tasks"].

Elmo with allen should also be tested. 

## Model Chosen
According to https://arxiv.org/abs/1705.02364["Supervised Learning of Universal Sentence Representations from Natural Language Inference Data"] a BiDaf model works pretty well. Copied from https://github.com/rajatgermany/qa-nlp

Other Models tested: Random Forest, Linear Regression, CNN. Since they were easy to add and test as a baseline
 
https://arxiv.org/pdf/1207.0580.pdf - dropout after every layer

### Open Source Classifiers tested
- deeppavlov
- allennlp


In [8]:
# Attempt Deepalov
# from deeppavlov import build_model, configs

# model = build_model('sample_pavolv_config.json', download=True)  # in case of necessity to download some data
# from deeppavlov import build_model, configs
# print(configs.classifiers.keys())

In [80]:
#%%
# Load model

def generate_Infersent_model():
    """Create an InferSent (version 1) encoder with pretrained weights and a GloVe vocabulary.

    Returns:
        The InferSent model after load_state_dict, set_w2v_path and
        build_vocab_k_words(K=100000) have been applied.
    """
    version = 1
    weights_path = "encoder/infersent%s.pkl" % version
    glove_path = 'GloVe/glove.840B.300d.txt'

    encoder = InferSent({
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': version,
    })
    # NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
    encoder.load_state_dict(torch.load(weights_path))

    # Point the encoder at the word vectors, then build its vocabulary.
    encoder.set_w2v_path(glove_path)
    encoder.build_vocab_k_words(K=100000)

    return encoder

def get_doc2vec(text, model):
    """Return `model.encode(text, verbose=True)`, i.e. the embeddings of `text`."""
    return model.encode(text, verbose=True)

def generate_basic_fully_connected(input_dim=4096, num_classes=222, l2_weight=0.0001, dropout_rate=0.5):
    """Build and compile a dense softmax classifier over fixed-size input vectors.

    The network is Dense(2000) -> Dense(500) -> Dropout -> Dense(450) -> Dropout
    -> Dense(250) -> Dense(num_classes, softmax); every hidden Dense layer uses ReLU
    and an L2 kernel regularizer. The defaults reproduce the original hard-coded model.

    Args:
        input_dim: Width of each input vector.
        num_classes: Number of output classes (units in the softmax layer).
        l2_weight: L2 regularization factor applied to the hidden Dense kernels.
        dropout_rate: Drop probability of the two Dropout layers.

    Returns:
        The compiled Keras Model (Adam optimizer, categorical cross-entropy loss,
        'acc' metric). The model summary is printed as a side effect.
    """
    def hidden(units):
        # All hidden layers share the same activation and regularizer.
        return Dense(units, activation='relu', kernel_regularizer=regularizers.l2(l2_weight))

    input_1 = Input((input_dim,), dtype=tf.float32)
    x = hidden(2000)(input_1)

    x = hidden(500)(x)
    x = Dropout(rate=dropout_rate)(x)

    x = hidden(450)(x)
    x = Dropout(rate=dropout_rate)(x)

    x = hidden(250)(x)
    out = Dense(num_classes, activation='softmax')(x)
    dual_model = Model(inputs=input_1, outputs=out)

    # The metric is named 'acc', so the validation metric is reported as 'val_acc'.
    dual_model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['acc'])

    dual_model.summary()
    return dual_model


In [10]:
# Build the InferSent encoder once; later cells pass it to get_doc2vec.
model = generate_Infersent_model()

Vocab size : 100000


In [11]:
# Smoke test: encode a single sentence and display the resulting embedding array.
get_doc2vec(["Hello new world. How are you doing?"], model)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Nb words kept : 11/11 (100.0%)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Speed : 4.8 sentences/s (cpu mode, bsize=64)


array([[ 0.15559037,  0.01629045,  0.00050003, ...,  0.01910147,
        -0.03814263,  0.05127323]], dtype=float32)

In [12]:
def cosine(u, v):
    """Return the cosine similarity of the vectors `u` and `v`."""
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / norm_product

# Sanity check: print the cosine similarity between the embeddings of each word pair.
for word_pair in (["happy", "running"], ["happy", "great"]):
    res = get_doc2vec(word_pair, model)
    print(cosine(res[0], res[1]))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Nb words kept : 6/6 (100.0%)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Speed : 15.5 sentences/s (cpu mode, bsize=64)
0.4536041


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Nb words kept : 6/6 (100.0%)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Speed : 15.1 sentences/s (cpu mode, bsize=64)
0.7514287


In [13]:
def _encode_split(banner, texts):
    """Print `banner`, then return the embeddings get_doc2vec produces for `texts`."""
    print(banner)
    return get_doc2vec(texts, model)

# Embed the three splits in the same order as before: train, validation, test.
X_train_vec = _encode_split("loading X Train vec", X_train)

X_val_vec = _encode_split("loading X Val vec", X_val)

X_test_vec = _encode_split("loading X Test vec", X_test)

loading X Train vec


HBox(children=(FloatProgress(value=0.0, max=51972.0), HTML(value='')))


Nb words kept : 505444/516186 (97.9%)


HBox(children=(FloatProgress(value=0.0, max=813.0), HTML(value='')))


Speed : 133.7 sentences/s (cpu mode, bsize=64)
loading X Val vec


HBox(children=(FloatProgress(value=0.0, max=5775.0), HTML(value='')))


Nb words kept : 56131/57329 (97.9%)


HBox(children=(FloatProgress(value=0.0, max=91.0), HTML(value='')))


Speed : 126.8 sentences/s (cpu mode, bsize=64)
loading X Test vec


HBox(children=(FloatProgress(value=0.0, max=6417.0), HTML(value='')))


Nb words kept : 62789/64137 (97.9%)


HBox(children=(FloatProgress(value=0.0, max=101.0), HTML(value='')))


Speed : 122.2 sentences/s (cpu mode, bsize=64)


# Fully Connected Model

In [81]:

# TensorBoard logging: each run writes to its own timestamped sub-directory of logs_base_dir.
logs_base_dir = './logs'
os.makedirs(logs_base_dir, exist_ok=True)  # reuse the constant so the two paths cannot drift apart
logdir = os.path.join(logs_base_dir, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(logdir, histogram_freq=1)

# To inspect the logs inline (IPython interpolates {logs_base_dir} in magics):
# %load_ext tensorboard
# %tensorboard --logdir {logs_base_dir}

In [82]:
# Train the dense classifier on the sentence embeddings; stop once 'val_acc' has not
# improved for 6 consecutive epochs.
ffcc = generate_basic_fully_connected()
ffcc.fit(
    X_train_vec, y_train_one_hot,
    epochs=100, batch_size=64,
    # Keras documents validation_data as a tuple (x_val, y_val); a list can be
    # interpreted as several model inputs on some TensorFlow versions.
    validation_data=(X_val_vec, y_val_one_hot),
    callbacks=[EarlyStopping(monitor='val_acc', patience=6), tensorboard_callback],
)

Model: "model_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_23 (InputLayer)        [(None, 4096)]            0         
_________________________________________________________________
dense_123 (Dense)            (None, 2000)              8194000   
_________________________________________________________________
dense_124 (Dense)            (None, 500)               1000500   
_________________________________________________________________
dropout_54 (Dropout)         (None, 500)               0         
_________________________________________________________________
dense_125 (Dense)            (None, 450)               225450    
_________________________________________________________________
dropout_55 (Dropout)         (None, 450)               0         
_________________________________________________________________
dense_126 (Dense)            (None, 250)               112

<tensorflow.python.keras.callbacks.History at 0x161db4748>

In [28]:
def calculate_accuracy(model, X_vec, y_labels, nueral_net=True):
    """Score `model` on `X_vec` against the true (decoded) labels `y_labels`.

    Args:
        model: Object exposing `predict(X_vec)`.
        X_vec: Features passed unchanged to `model.predict`.
        y_labels: True labels, in the same decoded form that
            `get_labels_decoded` returns.
        nueral_net: When True, predictions are treated as per-class scores and
            the arg-max column is taken as the class id; when False, predictions
            are treated as class ids already.

    Returns:
        A tuple `(accuracy, classification)`: the fraction of predictions equal
        to the true label, and a DataFrame indexed by (y_true, pred) with a
        'count' column giving how often each pair occurred.
    """
    # Use the model that was passed in; previously the global `ffcc` was always
    # scored, silently ignoring the `model` argument.
    predictions = model.predict(X_vec)
    if nueral_net:
        class_ids = np.argmax(predictions, axis=1)  # one row of class scores per sample
    else:
        class_ids = predictions
    predicted_labels = get_labels_decoded(class_ids)

    pairs = list(zip(predicted_labels, y_labels))
    correct = sum(1 for pred, y_true in pairs if pred == y_true)

    # One record per sample, then count each (true label, predicted label) combination.
    classification = pd.DataFrame([{"y_true": y_true, "pred": pred} for pred, y_true in pairs])
    classification = classification.groupby(['y_true', 'pred']).size().to_frame('count')
    accuracy = correct / len(predictions)

    return accuracy, classification

## Train Accuracy

In [188]:
# Score the dense classifier on the training split.
train_accuracy, train_classifications = calculate_accuracy(
    model=ffcc, X_vec=X_train_vec, y_labels=y_train
)
print("Train Accuracy", train_accuracy)


Train Accuracy 0.8921727083814361


In [40]:
# Display the (y_true, pred) count table that calculate_accuracy returned for the training split.
train_classifications

NameError: name 'train_classifications' is not defined

## Validation Accuracy

In [29]:
# Score the dense classifier on the validation split.
validation_accuracy, validation_classifications = calculate_accuracy(
    model=ffcc, X_vec=X_val_vec, y_labels=y_val
)
print("Validation Accuracy", validation_accuracy)


Validation Accuracy 0.78995670995671


## Test Accuracy 

In [190]:
# Score the dense classifier on the held-out test split.
test_accuracy, test_classifications = calculate_accuracy(
    model=ffcc, X_vec=X_test_vec, y_labels=y_test
)
print("Test Accuracy", test_accuracy)

Test Accuracy 0.7963222689730404


# LogisticRegression 
Takes too long to run

In [212]:
# logistic_model = LogisticRegression(multi_class='multinomial', solver='newton-cg', verbose=1)
# logistic_model.fit(X_train_vec, y_train_encoded)
# TO slow


In [None]:
# predicted_labels = get_labels_decoded(logistic_model.predict(X_train_vec))
# print("Logistic regression Train Accuracy : ", metrics.accuracy_score(y_train, predicted_labels))

# BiLSTM

In [226]:
def generate_bilstm():
    """Build and compile a bidirectional-LSTM classifier with 222 softmax outputs.

    Architecture: Input(4096) -> frozen Embedding(20000, 500) -> BiLSTM(300,
    return_sequences=True) -> Flatten -> Dense(222, softmax).

    Returns:
        The compiled Keras Model (Adam optimizer, categorical cross-entropy loss,
        'accuracy' metric). The model summary is printed as a side effect.
    """
    input_1 = Input((4096,), dtype=tf.float32)

    # NOTE(review): Embedding looks up integer token ids in [0, 20000), and this layer
    # is frozen with no pretrained weights supplied. The other cells in this notebook
    # produce 4096-d float sentence vectors, which are not token ids - confirm what
    # input this model is meant to receive before training it.
    questionEmbd = Embedding(input_dim=20000,
                             mask_zero=False,
                             trainable=False, output_dim=500)(input_1)

    bidirectional = Bidirectional(LSTM(300, return_sequences=True))(questionEmbd)
    bidirectional_flatten = Flatten()(bidirectional)

    # Targets are single-label one-hot vectors, so use softmax with categorical
    # cross-entropy (as the dense model does). Sigmoid + binary cross-entropy scores
    # each of the 222 outputs independently and makes 'accuracy' a per-output binary
    # accuracy, which looks far better than the real classification accuracy.
    out = Dense(222, activation='softmax')(bidirectional_flatten)
    dual_model = Model(inputs=input_1, outputs=out)
    dual_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    dual_model.summary()
    return dual_model


In [14]:
# bilstm = generate_bilstm()
# bilstm.fit(X_train_vec, y_train_one_hot, epochs=10,batch_size=300)

In [None]:
# inverse_transform expects 1-D integer class ids, so collapse each one-hot row
# to its arg-max index before decoding back to intent strings.
get_labels_decoded(np.argmax(y_train_one_hot, axis=1))