In [80]:
from gensim.models import FastText
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import pandas as PD
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

In [22]:
# Shared NLTK text-normalisation helpers: the English stop-word list, a
# Lancaster stemmer, a WordNet lemmatizer and an English Snowball stemmer.
stopword = stopwords.words("english")
stemmer = LancasterStemmer()
lem = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer("english")

In [11]:
# Column labels applied to the intents CSV when it is read; the same list is
# indexed later to pull out the label and text columns.
column_names = ["intent", "query"]

In [12]:
# Load the intents CSV, labelling its columns with `column_names`.
# read_csv already returns a DataFrame, so wrapping the result in
# PD.DataFrame(...) again is unnecessary.
data = PD.read_csv("atis_intents.csv", names=column_names)

In [21]:
# Preview the first rows to confirm the columns were labelled as intended.
data.head()

Unnamed: 0,intent,query
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...


## Build a fastText model for text processing

In [18]:
# Separate the query text (model input) from the intent labels (target).
queries, intent = data[column_names[1]], data[column_names[0]]

In [44]:
# Tokenize every query into one flat list of word tokens.
#
# `token_queries` keeps the sentence split of each query (later cells read it).
# `tokens` holds one word list per query, built from ALL of its sentences:
# taking only the first sentence would silently drop the rest of a
# multi-sentence query and raise IndexError for a query with no sentences.
token_queries = [sent_tokenize(sent) for sent in queries]
tokens = [
    [word for sentence in sentences for word in word_tokenize(sentence)]
    for sentences in token_queries
]

In [55]:
# Inspect the word tokens produced for the first query.
tokens[0]

['i',
 'want',
 'to',
 'fly',
 'from',
 'boston',
 'at',
 '838',
 'am',
 'and',
 'arrive',
 'in',
 'denver',
 'at',
 '1110',
 'in',
 'the',
 'morning']

In [47]:
# Create a fastText embedding model (30-dimensional vectors, context window
# of 3, every word seen at least once is kept), register the vocabulary from
# the tokenized queries, then train for 10 epochs.
# NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` -- confirm the pinned gensim version before upgrading.
word_embed_model = FastText(size=30, window=3, min_count=1)
word_embed_model.build_vocab(sentences=tokens)
word_embed_model.train(
    sentences=tokens,
    # build_vocab() records the number of sentences it scanned here.
    total_examples=word_embed_model.corpus_count,
    epochs=10,
)

In [50]:
# Sanity-check the embedding size by looking up a single word vector.
# Index through `.wv` (the model's keyed vectors): indexing the model object
# itself is deprecated in gensim 3.x and no longer supported in gensim 4.
word_embed_model.wv["welcome"].shape

  """Entry point for launching an IPython kernel.


(30,)

In [54]:
# Rank vocabulary words by similarity to "flight" while steering away from
# "interface"; the bare name on the last line displays the ranked list.
similarities = word_embed_model.wv.most_similar(
    positive=["flight"],
    negative=["interface"],
)
similarities

[('flights', 0.5673556327819824),
 ('to', 0.42487087845802307),
 ('list', 0.42161622643470764),
 ('fly', 0.4090701639652252),
 ('go', 0.36568373441696167),
 ('san', 0.3497317135334015),
 ('me', 0.34542104601860046),
 ('right', 0.3422325849533081),
 ('k', 0.33783388137817383),
 ('from', 0.33678507804870605)]

In [71]:
# Embed every query as a (num_tokens, embedding_dim) matrix of word vectors.
#
# * Reuses the `tokens` lists built earlier instead of tokenizing each query
#   a second time.
# * Looks vectors up through `.wv`; indexing the model object directly is
#   deprecated in gensim 3.x and no longer supported in gensim 4.
# * A query with no tokens becomes an empty (0, embedding_dim) matrix rather
#   than raising, so padding can still turn it into an all-zero row block.
embedding_dim = word_embed_model.wv.vector_size
queries_vectors = [
    word_embed_model.wv[query_tokens]
    if query_tokens
    else np.zeros((0, embedding_dim), dtype="float32")
    for query_tokens in tokens
]

  after removing the cwd from sys.path.


In [72]:
# Shape of the first query's embedding matrix.
queries_vectors[0].shape

(18, 30)

In [73]:
# Cross-check for the first query: embedding-matrix shape, its row count,
# and the number of tokens it was built from.
(queries_vectors[0].shape, len(queries_vectors[0]), len(tokens[0]))

((18, 30), 18, 18)

In [74]:
# Token count of the longest tokenized query.
max_len = max(map(len, tokens))

In [77]:
# Bring every query matrix to exactly 50 rows: shorter queries are padded at
# the end, longer ones are cut at the end; values are stored as float32.
# NOTE(review): the length is hard-coded to 50 rather than taken from the
# `max_len` computed above -- confirm that 50 is the intended sequence length.
sent_vector_padded = tf.keras.preprocessing.sequence.pad_sequences(
    queries_vectors,
    maxlen=50,
    dtype="float32",
    padding="post",
    truncating="post",
)

In [78]:
# Shape of the padded query tensor.
sent_vector_padded.shape

(4978, 50, 30)

## Process targets

In [98]:
# One-hot encode the intent labels (one indicator column per distinct intent)
# and convert the result directly into a NumPy array.
classes = np.array(PD.get_dummies(data["intent"], prefix="intent"))

In [99]:
# Shape of the one-hot encoded target array.
classes.shape

(4978, 22)