# ATIS Dataset

The Airline Travel Information System (ATIS) dataset is a standard benchmark for the task of intent detection.

In [1]:
# Place the Google Drive Sharing link
file_share_link = "https://drive.google.com/file/d/1QK10QsL29ykVNIUri_cF6uCEtt-SQ4vY/view?usp=sharing"

# extract the ID of the file
file_id = file_share_link[file_share_link.find("d/")+2 : file_share_link.find("/v")]
print(file_id)
!gdown "$file_id" # Download the data from Google Drive

1QK10QsL29ykVNIUri_cF6uCEtt-SQ4vY
Downloading...
From: https://drive.google.com/uc?id=1QK10QsL29ykVNIUri_cF6uCEtt-SQ4vY
To: /content/atis.zip
100% 142k/142k [00:00<00:00, 111MB/s]


In [2]:
!unzip atis.zip

Archive:  atis.zip
  inflating: atis_intents.csv        
  inflating: atis_intents_test.csv   
  inflating: atis_intents_train.csv  


In [3]:
import os
import numpy as np
import random
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Conv1D, MaxPooling1D, Embedding, LSTM
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant

from sklearn.preprocessing import LabelEncoder

In [4]:
# The training CSV is read without a header row; name its two columns
# (intent label, utterance text) directly at read time.
train_df = pd.read_csv(
    'atis_intents_train.csv',
    header=None,
    names=['intents', 'text'],
)
train_df.head()

Unnamed: 0,intents,text
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...


In [5]:
# The test CSV is read without a header row; name its two columns
# (intent label, utterance text) directly at read time.
test_df = pd.read_csv(
    'atis_intents_test.csv',
    header=None,
    names=['intents', 'text'],
)
test_df.head()

Unnamed: 0,intents,text
0,atis_flight,i would like to find a flight from charlotte ...
1,atis_airfare,on april first i need a ticket from tacoma to...
2,atis_flight,on april first i need a flight going from pho...
3,atis_flight,i would like a flight traveling one way from ...
4,atis_flight,i would like a flight from orlando to salt la...


In [19]:
# Column dtypes and non-null counts of the training frame.
# NOTE(review): the recorded output has execution count 19 and shows `intents`
# as int64, i.e. it was produced after the label-encoding cell had run. On a
# fresh top-to-bottom run this cell executes before encoding — re-run to refresh.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4834 entries, 0 to 4833
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   intents  4834 non-null   int64 
 1   text     4834 non-null   object
dtypes: int64(1), object(1)
memory usage: 75.7+ KB


## Data Preprocessing

In [6]:
# Preprocessing / model hyperparameters.
MAX_SEQUENCE_LENGTH = 300  # length every token-id sequence is padded to (pad_sequences maxlen)
MAX_NUM_WORDS = 20000      # vocabulary cap passed to the Tokenizer
EMBEDDING_DIM = 100        # width of each embedding row; filled from glove.6B.100d.txt
VALIDATION_SPLIT = 0.3     # fraction of the training file held out by train_test_split

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling are repeatable across Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

In [7]:
# Build the vocabulary from the training utterances only; the test utterances
# are encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df['text'])

# Convert each utterance into its list of integer token ids.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['text']) for frame in (train_df, test_df)
)

# word -> integer index mapping learned from the training text.
word_index = tokenizer.word_index
print(f'length of unique words (tokens) is {len(word_index)}')

length of unique words (tokens) is 871


In [10]:
# Inspect the first two encoded utterances (lists of integer token ids).
train_sequences[:2]

[[18, 67, 1, 38, 2, 9, 68, 394, 84, 16, 78, 15, 12, 68, 511, 15, 4, 35],
 [7, 3, 26, 57, 2, 19, 1, 21, 5, 75, 35]]

In [11]:
# Preview the first rows of the training frame (labels and text).
train_df.head()

Unnamed: 0,intents,text
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...


In [12]:
# Distinct intent labels in the training set, in order of first appearance.
train_df['intents'].unique()

array(['atis_flight', 'atis_flight_time', 'atis_airfare', 'atis_aircraft',
       'atis_ground_service', 'atis_airline', 'atis_abbreviation',
       'atis_quantity'], dtype=object)

In [13]:
# Same distinct intent labels, shown as a Python set.
set(train_df['intents'])

{'atis_abbreviation',
 'atis_aircraft',
 'atis_airfare',
 'atis_airline',
 'atis_flight',
 'atis_flight_time',
 'atis_ground_service',
 'atis_quantity'}

In [14]:
# Count missing values per column of the training frame.
train_df.isnull().sum()

intents    0
text       0
dtype: int64

In [15]:
# Learn the intent-string -> integer mapping from the training labels only,
# then overwrite the `intents` column of both frames with the integer codes.
le = LabelEncoder().fit(train_df['intents'])
for frame in (train_df, test_df):
  frame['intents'] = le.transform(frame['intents'])

In [16]:
# Confirm that the training labels are now integer codes.
train_df.head()

Unnamed: 0,intents,text
0,4,i want to fly from boston at 838 am and arriv...
1,4,what flights are available from pittsburgh to...
2,5,what is the arrival time in san francisco for...
3,2,cheapest airfare from tacoma to orlando
4,2,round trip fares from pittsburgh to philadelp...


In [17]:
# Confirm that the test labels are now integer codes.
test_df.head()

Unnamed: 0,intents,text
0,4,i would like to find a flight from charlotte ...
1,2,on april first i need a ticket from tacoma to...
2,4,on april first i need a flight going from pho...
3,4,i would like a flight traveling one way from ...
4,4,i would like a flight from orlando to salt la...


In [18]:
# Check the container type returned by texts_to_sequences.
type(train_sequences)

list

In [20]:
# Longest training utterance, measured in tokens.
max(len(seq) for seq in train_sequences)

46

In [21]:
# Longest test utterance, measured in tokens.
max(len(seq) for seq in test_sequences)

30

In [22]:
# Pad every token-id sequence to a fixed length of MAX_SEQUENCE_LENGTH so the
# utterances can be stacked into one 2-D array per split.
train_valid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer labels. The width is pinned to the number of
# classes known to the fitted LabelEncoder: without num_classes,
# to_categorical infers it from the largest label present, so a split that
# lacks the highest-numbered intent would get a narrower matrix than the
# model's output layer.
num_classes = len(le.classes_)
train_valid_labels = to_categorical(train_df['intents'], num_classes=num_classes)
test_labels = to_categorical(test_df['intents'], num_classes=num_classes)

In [23]:
# Inspect the first two padded sequences (zeros are the padding value).
train_valid_data[:2]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0

In [24]:
from sklearn.model_selection import train_test_split

# Hold out VALIDATION_SPLIT of the training file. Despite their names,
# X_test / y_test are this hold-out from the training file (they are later
# passed to fit() as validation_data); the separate test file lives in
# test_data / test_labels.
X_train, X_test, y_train, y_test = train_test_split(
    train_valid_data,
    train_valid_labels,
    test_size=VALIDATION_SPLIT,
    random_state=42,
)

In [25]:
# Shapes of the training and hold-out arrays: (features, features, labels, labels).
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3383, 300), (1451, 300), (3383, 8), (1451, 8))

## Embeddings

In [27]:
# Download the pretrained GloVe embeddings
import gdown

file_share_link = "https://drive.google.com/file/d/1ag1E217mcCw_J0_dStlNB3ZeC1NdG3x1/view?usp=sharing"

# The file ID is the path segment that follows "/d/". Splitting on the
# separators (rather than slicing between str.find() offsets) fails loudly on
# an unexpected link format: a find() miss returns -1 and would silently
# produce a wrong ID.
id_marker, link_tail = file_share_link.partition("/d/")[1:]
if not id_marker:
  raise ValueError(f"Not a Google Drive file sharing link: {file_share_link}")
file_id = link_tail.split("/", 1)[0].split("?", 1)[0]
print(file_id)

# gdown.download returns the path of the saved file, which is displayed as the
# cell output.
gdown.download(
    f"https://drive.google.com/uc?export=download&confirm=pbef&id={file_id}"
)

1ag1E217mcCw_J0_dStlNB3ZeC1NdG3x1


Downloading...
From: https://drive.google.com/uc?export=download&confirm=pbef&id=1ag1E217mcCw_J0_dStlNB3ZeC1NdG3x1
To: /content/glove.6B.100d.txt.zip
100%|██████████| 138M/138M [00:03<00:00, 35.9MB/s]


'glove.6B.100d.txt.zip'

In [28]:
!unzip "/content/glove.6B.100d.txt.zip"
!rm "/content/glove.6B.100d.txt.zip"

Archive:  /content/glove.6B.100d.txt.zip
  inflating: glove.6B.100d.txt       


In [29]:
GLOVE_DIR = '/content'
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')

# Parse the GloVe text file: each line is split on whitespace, the first token
# is the word and the remaining tokens are parsed as float32 coefficients.
embeddings_index = {}
with open(glove_path, encoding="utf-8") as glove_file:
  for row in glove_file:
    tokens = row.split()
    embeddings_index[tokens[0]] = np.asarray(tokens[1:], dtype='float32')

# Row i of the matrix holds the GloVe vector of the word whose tokenizer index
# is i. The +1 leaves room for index 0, which no word in word_index uses here
# (it is the padding value).
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row_idx in word_index.items():
  if row_idx <= MAX_NUM_WORDS:
    vector = embeddings_index.get(token)
    if vector is not None:
      # words missing from the GloVe index keep their all-zero row
      embedding_matrix[row_idx] = vector

# Frozen embedding layer initialised from the GloVe matrix.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [32]:
# Size the output layer from the one-hot labels instead of hard-coding the
# number of intents, so the model stays in sync with the label encoding.
num_intents = y_train.shape[1]

# 1-D CNN text classifier on top of the frozen GloVe embedding layer:
# three Conv1D stages (the first two followed by max pooling, the last by a
# global max pool), then a dense hidden layer and a softmax over the intents.
model = Sequential([
    embedding_layer,
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(5),

    Conv1D(filters=128, kernel_size=5, activation='relu'),
    MaxPooling1D(5),

    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),

    Dense(128, activation='relu'),
    Dense(num_intents, activation='softmax')
])

In [33]:
# Multi-class setup: categorical cross-entropy against the one-hot labels,
# RMSprop optimizer, accuracy reported during training and evaluation.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 100)          87200     
                                                                 
 conv1d_3 (Conv1D)           (None, 296, 128)          64128     
                                                                 
 max_pooling1d_2 (MaxPoolin  (None, 59, 128)           0         
 g1D)                                                            
                                                                 
 conv1d_4 (Conv1D)           (None, 55, 128)           82048     
                                                                 
 max_pooling1d_3 (MaxPoolin  (None, 11, 128)           0         
 g1D)                                                            
                                                                 
 conv1d_5 (Conv1D)           (None, 7, 128)           

In [34]:
# Train the classifier. X_test / y_test are the hold-out carved from the
# training file by train_test_split, used here for per-epoch validation.
# The returned History object is kept so the per-epoch metrics can be
# inspected afterwards (history.history) instead of being shown as a bare
# object repr.
history = model.fit(X_train, y_train,
                    batch_size=128,
                    epochs=5,
                    validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x79e29d616920>

In [35]:
# Score the trained model on the separate test file; returns [loss, accuracy]
# (accuracy is the only metric configured in compile()).
model.evaluate(test_data, test_labels)



[0.38133925199508667, 0.8899999856948853]

In [39]:
# Per-utterance outputs of the softmax layer for the test file.
predicts = model.predict(test_data)



In [46]:
# Inspect the predicted output vectors for the first ten test utterances.
predicts[:10]

array([[2.4759132e-04, 4.3209218e-03, 3.2196602e-01, 2.6119836e-03,
        5.8454937e-01, 7.3551454e-02, 4.3911833e-04, 1.2313528e-02],
       [9.4746101e-06, 3.0887502e-05, 9.9598885e-01, 3.9541861e-05,
        3.0044592e-03, 6.8688625e-04, 2.5010384e-05, 2.1475353e-04],
       [4.6852941e-04, 2.5409898e-03, 6.8240654e-01, 5.3120409e-03,
        2.4969840e-01, 4.6636511e-02, 1.1192960e-03, 1.1817590e-02],
       [1.2296668e-03, 8.5962890e-03, 4.1869256e-01, 6.8516657e-03,
        4.5169103e-01, 8.5164264e-02, 2.3601991e-03, 2.5414351e-02],
       [5.2386695e-05, 1.0217371e-03, 2.9535764e-01, 1.1448173e-03,
        6.5749770e-01, 3.8684841e-02, 3.9549309e-04, 5.8454098e-03],
       [9.4638544e-06, 3.4373722e-04, 3.0811628e-02, 5.9655693e-04,
        9.4197029e-01, 2.3717092e-02, 8.8145171e-05, 2.4631559e-03],
       [5.5299670e-04, 2.8188811e-03, 4.0491053e-01, 5.7490696e-03,
        4.9818006e-01, 7.1857169e-02, 1.8704142e-03, 1.4060851e-02],
       [2.5297015e-04, 1.4582041e-03, 6.3