# Install Required Libs

In [0]:
!pip install numpy==1.17.3
!pip install pandas==0.25.3
!pip install nltk==3.2.5
!pip install keras==2.2.5
!pip install tensorflow==1.15.0
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Load Train & Test Data


In [0]:
import pandas as pd
import numpy as np

# Both splits are read as tab-separated files without a header row; the two
# columns are named "text" (the utterance) and "intent" (its label string).
train, test = (
    pd.read_csv(path, sep="\t", names=["text", "intent"])
    for path in ("train.tsv", "test.tsv")
)

In [0]:
train.describe()

Unnamed: 0,text,intent
count,4634,4634
unique,4634,22
top,all flights from boston to washington dc on th...,flight
freq,1,3426


In [0]:
test.describe()

Unnamed: 0,text,intent
count,850,850
unique,850,20
top,show me ground transportation in phoenix,flight
freq,1,613


In [0]:
# Sorted, de-duplicated intent label strings of each split.
training_classes = np.unique(train['intent'])
test_classes = np.unique(test['intent'])

print("Unique intent classes in training set: {0}".format(training_classes))
print("Unique intent classes in test set: {0}".format(test_classes))

# Position-aligned lists: a label is kept when it is missing from the other
# split, otherwise its slot holds an empty string.
in_test_but_notin_train = ['' if label in training_classes else label for label in test_classes]
in_train_but_notin_test = ['' if label in test_classes else label for label in training_classes]

print(in_test_but_notin_train)
print(in_train_but_notin_test)

Unique intent classes in training set: ['abbreviation' 'aircraft' 'aircraft+flight+flight_no' 'airfare'
 'airfare+flight_time' 'airline' 'airline+flight_no' 'airport' 'capacity'
 'cheapest' 'city' 'distance' 'flight' 'flight+airfare' 'flight_no'
 'flight_time' 'ground_fare' 'ground_service' 'ground_service+ground_fare'
 'meal' 'quantity' 'restriction']
Unique intent classes in test set: ['abbreviation' 'aircraft' 'airfare' 'airfare+flight' 'airline' 'airport'
 'capacity' 'city' 'day_name' 'distance' 'flight' 'flight+airfare'
 'flight+airline' 'flight_no' 'flight_no+airline' 'flight_time'
 'ground_fare' 'ground_service' 'meal' 'quantity']
['', '', '', 'airfare+flight', '', '', '', '', 'day_name', '', '', '', 'flight+airline', '', 'flight_no+airline', '', '', '', '', '']
['', '', 'aircraft+flight+flight_no', '', 'airfare+flight_time', '', 'airline+flight_no', '', '', 'cheapest', '', '', '', '', '', '', '', '', 'ground_service+ground_fare', '', '', 'restriction']


In [0]:
# Extract and Merge all class labels
# Test set contains an additional class which is 'day_name' 
def extract_classes(arr, all_classes=None):
  """Collect the atomic intent names that occur in ``arr``.

  Compound labels such as ``'flight+airfare'`` are split on ``'+'`` and every
  component is recorded once, in order of first appearance.

  Args:
    arr: iterable of intent label strings (plain or ``'+'``-joined).
    all_classes: optional list of already known names. When given, it is
      extended in place and returned; when omitted, a fresh list is created
      for this call.

  Returns:
    The list of unique atomic intent names.
  """
  # A literal [] default would be created once and shared by every call that
  # omits the argument, so results of earlier calls would leak into later ones.
  if all_classes is None:
    all_classes = []

  for t in arr:
    # split('+') yields [t] for a label without '+', so a single loop covers
    # both the plain and the compound case.
    for c in t.split('+'):
      if c not in all_classes:
        all_classes.append(c)

  return all_classes

# Start from the names found in the training labels, then fold in any name
# that only shows up in the test labels; the result is kept as a NumPy array.
all_classes = np.array(
    extract_classes(test_classes, all_classes=extract_classes(training_classes))
)
all_classes

array(['abbreviation', 'aircraft', 'flight', 'flight_no', 'airfare',
       'flight_time', 'airline', 'airport', 'capacity', 'cheapest',
       'city', 'distance', 'ground_fare', 'ground_service', 'meal',
       'quantity', 'restriction', 'day_name'], dtype='<U14')

# Preprocess Data

In [0]:
# Convert labels to multi-hot encoded format: a compound label such as
# 'flight+airfare' sets one column per component, so a row may hold several 1s.
def encode_intent_labels(labels, classes):
  """Build a (len(labels), len(classes)) 0/1 matrix from intent label strings.

  Args:
    labels: 1-D array of label strings, plain or '+'-joined.
    classes: NumPy array of atomic class names; a name's position in this
      array is its column index.

  Returns:
    A float NumPy array with a 1 in every column named by the row's label.

  Raises:
    IndexError: if a label component does not occur in ``classes``.
  """
  encoded = np.zeros((np.shape(labels)[0], len(classes)))
  for row, intent_str in enumerate(labels):
    # split('+') returns the label itself when it has no '+'.
    for intent in intent_str.split('+'):
      encoded[row, np.where(classes == intent)[0][0]] = 1
  return encoded

train_arr = np.array(train['text'])
train_lbl_arr = np.array(train['intent'])
test_arr = np.array(test['text'])
test_lbl_arr = np.array(test['intent'])

train_lbl_encoded_arr = encode_intent_labels(train_lbl_arr, all_classes)
print(np.shape(train_lbl_encoded_arr))

test_lbl_encoded_arr = encode_intent_labels(test_lbl_arr, all_classes)
print(np.shape(test_lbl_encoded_arr))

(4634, 18)
(850, 18)


In [0]:
from nltk.tokenize import word_tokenize

def preprocess(s, return_as_tokenized = False):
  """Lower-case and tokenize a sentence, masking purely numeric tokens.

  Args:
    s: the raw sentence string.
    return_as_tokenized: when True the token list is returned; otherwise the
      tokens are joined back into one space-separated string.

  Returns:
    A list of tokens or a single string, depending on ``return_as_tokenized``.
  """
  # Every token for which str.isnumeric() holds is replaced by '#NUM#'.
  tokens = ['#NUM#' if tok.isnumeric() else tok for tok in word_tokenize(s.lower())]
  return tokens if return_as_tokenized else ' '.join(tokens)

# Normalize every sentence of both splits into a space-joined token string.
train_arr_preprocessed = list(map(preprocess, train_arr))
test_arr_preprocessed = list(map(preprocess, test_arr))
print(np.shape(train_arr_preprocessed))
print(np.shape(test_arr_preprocessed))

(4634,)
(850,)


# Model-1: TF.IDF Feature with Only Dense Layered NN

## Create TF.IDF Feature Vectors


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vectorizer on the training sentences only, then reuse the
# fitted vectorizer to transform the test sentences. Both sparse results are
# converted to dense matrices right away.
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(train_arr_preprocessed).todense()
x_test_tfidf = vectorizer.transform(test_arr_preprocessed).todense()

# Shapes of label and feature matrices, test split first.
print(np.shape(test_lbl_encoded_arr))
print(np.shape(x_test_tfidf))
print(np.shape(train_lbl_encoded_arr))
print(np.shape(x_train_tfidf))

(850, 18)
(850, 722)
(4634, 18)
(4634, 722)


## Create Dense Layered Model


In [0]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Embedding, Dropout

# Two hidden tanh layers followed by one sigmoid unit per intent class.
model = Sequential()
model.add(Dense(200, activation='tanh', input_dim=np.shape(x_train_tfidf)[1]))
model.add(Dense(200, activation='tanh'))
# The output width is taken from the label matrix so it cannot drift from the
# number of intent classes.
model.add(Dense(np.shape(train_lbl_encoded_arr)[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split is passed as validation data, so the val_*
# figures in the log are measured on the same data as the score below.
model.fit(x_train_tfidf, train_lbl_encoded_arr, epochs=150, batch_size=100, verbose=2, validation_data=(x_test_tfidf, test_lbl_encoded_arr))

# Calculate Label Prediction Accuracy
# Errors are cells where the rounded prediction differs from the target
# (missed labels and spurious labels alike); they are divided by the number of
# positive target labels, and the score is 1 minus that ratio.
pred = model.predict(x_test_tfidf)
total_label_erros = np.sum(np.abs(np.round(pred)-test_lbl_encoded_arr))
total_label_count = np.sum(test_lbl_encoded_arr)
label_error_rate = total_label_erros/total_label_count
label_accuracy = 1-label_error_rate
label_accuracy

Train on 4634 samples, validate on 850 samples
Epoch 1/150
 - 16s - loss: 0.2671 - acc: 0.9482 - val_loss: 0.0940 - val_acc: 0.9699
Epoch 2/150
 - 0s - loss: 0.0748 - acc: 0.9756 - val_loss: 0.0734 - val_acc: 0.9785
Epoch 3/150
 - 0s - loss: 0.0543 - acc: 0.9833 - val_loss: 0.0618 - val_acc: 0.9814
Epoch 4/150
 - 1s - loss: 0.0410 - acc: 0.9881 - val_loss: 0.0538 - val_acc: 0.9835
Epoch 5/150
 - 0s - loss: 0.0318 - acc: 0.9907 - val_loss: 0.0480 - val_acc: 0.9864
Epoch 6/150
 - 0s - loss: 0.0250 - acc: 0.9928 - val_loss: 0.0444 - val_acc: 0.9872
Epoch 7/150
 - 0s - loss: 0.0201 - acc: 0.9943 - val_loss: 0.0413 - val_acc: 0.9872
Epoch 8/150
 - 0s - loss: 0.0168 - acc: 0.9953 - val_loss: 0.0389 - val_acc: 0.9875
Epoch 9/150
 - 0s - loss: 0.0144 - acc: 0.9960 - val_loss: 0.0373 - val_acc: 0.9879
Epoch 10/150
 - 0s - loss: 0.0122 - acc: 0.9968 - val_loss: 0.0353 - val_acc: 0.9884
Epoch 11/150
 - 0s - loss: 0.0107 - acc: 0.9971 - val_loss: 0.0344 - val_acc: 0.9889
Epoch 12/150
 - 0s - loss:

0.8647398843930636

## Calculate Overall Label Prediction Accuracy

In [0]:
# Calculate Label Prediction Accuracy
# Errors are cells where the rounded prediction differs from the target
# (missed labels and spurious labels alike); they are divided by the number of
# positive target labels, and the score is 1 minus that ratio.
pred = model.predict(x_test_tfidf)
total_label_erros = np.sum(np.abs(np.round(pred)-test_lbl_encoded_arr))
total_label_count = np.sum(test_lbl_encoded_arr)
# The ratio is an error rate; it is named accordingly so that label_accuracy
# really holds the accuracy that the cell displays.
label_error_rate = total_label_erros/total_label_count
label_accuracy = 1-label_error_rate
label_accuracy

0.8647398843930636

## Save Model Files And TF.IDF Vectorizer

In [0]:
# Save model and tokenizer
import pickle

# Persist the trained network to an HDF5 file.
model.save("tfidf_model.h5")

# The fitted vectorizer is serialized with pickle; note that the target file
# carries a .npy extension even though it is written by pickle.dump.
with open('vectorizer.npy', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Model-2: Neural Embedding Layer with RNN


## Word Tokenization & Padding For Embedding Layer

In [0]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Collect all words and convert them to numeric format
vocabulary_size = 1000
max_sentence_len = 50
tokenizer = Tokenizer(num_words=vocabulary_size)
# The word index is built from the training sentences only.
tokenizer.fit_on_texts(train_arr_preprocessed)

# Turn each sentence into a sequence of word indices and pad it to
# max_sentence_len entries.
train_embedding_inputs = pad_sequences(
    tokenizer.texts_to_sequences(train_arr_preprocessed), maxlen=max_sentence_len)

test_embedding_inputs = pad_sequences(
    tokenizer.texts_to_sequences(test_arr_preprocessed), maxlen=max_sentence_len)

## Create RNN Model

In [0]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Embedding     

# Embedding -> LSTM -> two tanh layers -> one sigmoid unit per intent class.
model = Sequential()
model.add(Embedding(vocabulary_size, 100, input_length=max_sentence_len))
# return_sequences=False: only the last LSTM output is passed on.
model.add(LSTM(10, return_sequences=False))
model.add(Dense(200, activation='tanh'))
model.add(Dense(200, activation='tanh'))
# The output width is taken from the label matrix so it cannot drift from the
# number of intent classes.
model.add(Dense(np.shape(train_lbl_encoded_arr)[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split is passed as validation data, so the val_*
# figures in the log are measured on the data used for the final score.
model.fit(train_embedding_inputs, train_lbl_encoded_arr, epochs=150, batch_size=100, verbose=2, validation_data=(test_embedding_inputs, test_lbl_encoded_arr))

Train on 4634 samples, validate on 850 samples
Epoch 1/150
 - 19s - loss: 0.2741 - acc: 0.9616 - val_loss: 0.1137 - val_acc: 0.9699
Epoch 2/150
 - 2s - loss: 0.0932 - acc: 0.9712 - val_loss: 0.1008 - val_acc: 0.9699
Epoch 3/150
 - 2s - loss: 0.0851 - acc: 0.9719 - val_loss: 0.0891 - val_acc: 0.9741
Epoch 4/150
 - 2s - loss: 0.0728 - acc: 0.9776 - val_loss: 0.0766 - val_acc: 0.9795
Epoch 5/150
 - 2s - loss: 0.0570 - acc: 0.9821 - val_loss: 0.0703 - val_acc: 0.9808
Epoch 6/150
 - 2s - loss: 0.0471 - acc: 0.9850 - val_loss: 0.0653 - val_acc: 0.9822
Epoch 7/150
 - 2s - loss: 0.0411 - acc: 0.9880 - val_loss: 0.0601 - val_acc: 0.9849
Epoch 8/150
 - 2s - loss: 0.0348 - acc: 0.9904 - val_loss: 0.0575 - val_acc: 0.9867
Epoch 9/150
 - 2s - loss: 0.0302 - acc: 0.9921 - val_loss: 0.0543 - val_acc: 0.9878
Epoch 10/150
 - 2s - loss: 0.0260 - acc: 0.9929 - val_loss: 0.0485 - val_acc: 0.9893
Epoch 11/150
 - 2s - loss: 0.0226 - acc: 0.9934 - val_loss: 0.0497 - val_acc: 0.9889
Epoch 12/150
 - 2s - loss:

<keras.callbacks.History at 0x7fc44ceca7b8>

## Calculate Overall Label Prediction Accuracy

In [0]:
# Calculate Label Prediction Accuracy
# Errors are cells where the rounded prediction differs from the target
# (missed labels and spurious labels alike); they are divided by the number of
# positive target labels, and the score is 1 minus that ratio.
pred = model.predict(test_embedding_inputs)
total_label_erros = np.sum(np.abs(np.round(pred)-test_lbl_encoded_arr))
total_label_count = np.sum(test_lbl_encoded_arr)
# The ratio is an error rate; it is named accordingly so that label_accuracy
# really holds the accuracy that the cell displays.
label_error_rate = total_label_erros/total_label_count
label_accuracy = 1-label_error_rate
label_accuracy

0.8705202312138729

In [0]:
np.shape(test_embedding_inputs)

(850, 50)

## Save Model Files And Tokenizer

In [0]:
 Save model and tokenizer
import pickle

model.save("rnn_model.h5")

with open('rnn_tokenizer.npy', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)