In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries.
# The URL is a signed link: its own query parameters carry
# X-Goog-Date=20240906T235230Z and X-Goog-Expires=259200 (seconds), so it has
# presumably expired and must be regenerated before this cell can download again.
DATA_SOURCE_MAPPING = 'atis-airlinetravelinformationsystem:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F284285%2F585165%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240906%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240906T235230Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8f74fad814d4bd1206dd27a9a02ede71b12f12200c861430db8cdcd1c353cb081d0c15b1e2cfb241db0398b7b0898b3a1a35fa83feb0d650d543c2d3ace7ec9fc0009a6a7650d7cd7f35677eacbd31b542614aa42c663a5d8a2ddf41e905ecb727d9bd9798723ad29044e7792177e27673ac2e0feefdaeef861a84bc7569d11c0ff1613f2e6056e4c8811f10ed605621663912f8d78c278c9b293389d50a6f1bb9ae2b29720559ccc8dbfa74cf1225e46bd1013a356b4c90808b3380f632c05df320e66f048f41fc9847b3aab01429de450103b2556b84810fb941cd6b59a55760c2a710da6b7d0812c54015b59c26367d7bf46c519db641df2b13b8e8ead7bb'

# Directory under which each data source is extracted.
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory created next to the input directory.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<percent-encoded URL>' entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so an entry whose URL part contains a literal
    # colon cannot break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be absent; int(None) would raise a TypeError that
            # neither handler below catches, so fall back to an empty progress bar.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name
            # and would otherwise risk reading a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # replace the imported module for every later iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading atis-airlinetravelinformationsystem, 142369 bytes compressed
Downloaded and uncompressed: atis-airlinetravelinformationsystem
Data source import complete.


# Loading the Dataset

In [None]:
# ! pip install -q opendatasets
# import opendatasets as od
# od.download('https://www.kaggle.com/datasets/hassanamin/atis-airlinetravelinformationsystem')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory, in the
# order os.walk visits them, and print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

/kaggle/input/atis-airlinetravelinformationsystem/atis_intents_test.csv
/kaggle/input/atis-airlinetravelinformationsystem/atis_intents.csv
/kaggle/input/atis-airlinetravelinformationsystem/atis_intents_train.csv


# 1. Import libraries

In [None]:
# 1. Import libraries
import pandas as pd  #for redaing  .csv dataset
import numpy as np

!pip install scikit-learn
#convert text into numerical
from sklearn.feature_extraction.text import CountVectorizer  # CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer # TF(Term Frequency) & IDF (Inverse Document Frequency)

#Label Encoding
from sklearn.preprocessing import LabelEncoder
# split the dataset into train and test data
from sklearn.model_selection import train_test_split
#naive bayes classification
from sklearn.naive_bayes import MultinomialNB
#model accuracy
from sklearn.metrics import confusion_matrix,accuracy_score

#enable table format
# from google.colab import data_table
# data_table.enable_dataframe_formatter()
#disable table format
# from google.colab import data_table
# data_table.disable_dataframe_formatter()




# 2. Read the dataset

In [None]:
# Load the ATIS intents file into a two-column frame: Intent (label) and Text.
# NOTE(review): the CSV appears to ship without a header row (the columns had
# to be renamed by hand right after loading). With the default header=0 the
# first utterance was consumed as column names and silently lost; header=None
# with explicit names keeps every row. TODO confirm against the raw file.
df = pd.read_csv(
    '/kaggle/input/atis-airlinetravelinformationsystem/atis_intents.csv',
    header=None,
    names=['Intent', 'Text'],
)
df.head()

Unnamed: 0,Intent,Text
0,atis_flight,what flights are available from pittsburgh to...
1,atis_flight_time,what is the arrival time in san francisco for...
2,atis_airfare,cheapest airfare from tacoma to orlando
3,atis_airfare,round trip fares from pittsburgh to philadelp...
4,atis_flight,i need a flight tomorrow from columbus to min...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4977 entries, 0 to 4976
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Intent  4977 non-null   object
 1   Text    4977 non-null   object
dtypes: object(2)
memory usage: 77.9+ KB


In [None]:
df.Intent.value_counts()
#len(df.Intent.value_counts())

Unnamed: 0_level_0,count
Intent,Unnamed: 1_level_1
atis_flight,3665
atis_airfare,423
atis_ground_service,255
atis_airline,157
atis_abbreviation,147
atis_aircraft,81
atis_flight_time,54
atis_quantity,51
atis_flight#atis_airfare,21
atis_airport,20


>Findings
- There are 4977 entries and 22 different types of intents.

In [None]:
# Get rid of the rows whose intent count is less than 10:
#atis_flight#atis_airfare
# atis_meal
# atis_restriction
# atis_airline#atis_flight_no
# atis_ground_service#atis_ground_fare
# atis_airfare#atis_flight_time
# atis_cheapest
# atis_aircraft#atis_flight#atis_flight_no


In [None]:
# type(df.Intent.value_counts())
df.Intent.value_counts()[0:15]

Unnamed: 0_level_0,count
Intent,Unnamed: 1_level_1
atis_flight,3665
atis_airfare,423
atis_ground_service,255
atis_airline,157
atis_abbreviation,147
atis_aircraft,81
atis_flight_time,54
atis_quantity,51
atis_flight#atis_airfare,21
atis_airport,20


In [None]:
df.Intent.value_counts()[0:15].keys()

Index(['atis_flight', 'atis_airfare', 'atis_ground_service', 'atis_airline',
       'atis_abbreviation', 'atis_aircraft', 'atis_flight_time',
       'atis_quantity', 'atis_flight#atis_airfare', 'atis_airport',
       'atis_distance', 'atis_city', 'atis_ground_fare', 'atis_capacity',
       'atis_flight_no'],
      dtype='object', name='Intent')

In [None]:
# Keep only the utterances whose intent is one of the 15 most frequent labels.
top_15_columns = df.Intent.value_counts().head(15).index

# Vectorised membership test instead of a per-row Python loop: same rows, same
# order, same ['Text', 'Intent'] column layout and a fresh 0..n-1 index, but it
# no longer relies on df's index labels matching the positional loop counter.
df_sliced = (
    df.loc[df['Intent'].isin(top_15_columns), ['Text', 'Intent']]
    .reset_index(drop=True)
)
df_sliced.head()

Unnamed: 0,Text,Intent
0,what flights are available from pittsburgh to...,atis_flight
1,what is the arrival time in san francisco for...,atis_flight_time
2,cheapest airfare from tacoma to orlando,atis_airfare
3,round trip fares from pittsburgh to philadelp...,atis_airfare
4,i need a flight tomorrow from columbus to min...,atis_flight


In [None]:
df_sliced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4959 entries, 0 to 4958
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    4959 non-null   object
 1   Intent  4959 non-null   object
dtypes: object(2)
memory usage: 77.6+ KB


# 3. Data preprocessing on X_data and Y_data, and 4. Split the dataset into X_data and Y_data
Reduce the model complexity in such a way that accuracy improves without overfitting or underfitting.



In [None]:
# Convert the unstructured utterances into a structured, numeric
# document-term matrix (bag of words) that a classical ML model can consume.
# Baseline setup: unigrams only, no stop-word removal, no frequency pruning.
cv = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=None,
    max_df=1.0,
    min_df=1,
    max_features=None,
)
X = cv.fit_transform(df_sliced['Text'])
# shape = (number of utterances, number of distinct tokens in the vocabulary)
X.shape

(4959, 859)

In [None]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
pd.DataFrame(X.toarray(),columns=cv.get_feature_names_out())

Unnamed: 0,0900,10,100,1000,1017,1020,1024,1026,1030,1039,...,worth,would,year,yes,yn,york,you,your,yyz,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4955,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4956,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4957,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Same bag-of-words model, now dropping scikit-learn's built-in English stop words.
cv_nostop = CountVectorizer(
    ngram_range=(1, 1),
    stop_words="english",  # use the built-in English stop-word list
    max_df=1.0,
    min_df=1,
    max_features=None,
)
X_nostop = cv_nostop.fit_transform(df_sliced['Text'])
# Fewer columns than the baseline matrix: the difference is the number of
# stop words that occurred in the corpus.
X_nostop.shape

(4959, 718)

In [None]:
#X_nostop.toarray()

In [None]:
#pd.DataFrame(X_nostop.toarray(),columns=cv_nostop.get_feature_names_out())

In [None]:
# Stop-word removal plus a minimum document frequency of 2.
cv_nostop_min2 = CountVectorizer(
    ngram_range=(1, 1),
    stop_words="english",  # use the built-in English stop-word list
    max_df=1.0,
    min_df=2,  # keep only terms that appear in at least 2 documents
    max_features=None,
)
X_nostop_min2 = cv_nostop_min2.fit_transform(df_sliced['Text'])
# Terms seen in a single document are gone, so the vocabulary shrinks again.
X_nostop_min2.shape

(4959, 478)

In [None]:
#X_nostop_min2.toarray()

In [None]:
#pd.DataFrame(X_nostop_min2.toarray(),columns=cv_nostop_min2.get_feature_names_out())

In [None]:
# Stop-word removal, minimum document frequency of 2, and an upper cap on
# document frequency.
cv_nostop_min2_max = CountVectorizer(
    ngram_range=(1, 1),
    stop_words="english",  # use the built-in English stop-word list
    max_df=0.6,  # drop terms that appear in more than 60% of the documents
    min_df=2,  # keep only terms that appear in at least 2 documents
    max_features=None,
)
X_nostop_min2_max = cv_nostop_min2_max.fit_transform(df_sliced['Text'])
# Compare with the min_df=2 matrix to see how many terms the 60% cap removed.
X_nostop_min2_max.shape

(4959, 478)

In [None]:
#X_nostop_min2_max.toarray()

In [None]:
#pd.DataFrame(X_nostop_min2_max.toarray(),columns=cv_nostop_min2_max.get_feature_names_out())

In [None]:
# Same pruning as before, additionally limiting the vocabulary to 250 terms.
cv_nostop_top250 = CountVectorizer(
    ngram_range=(1, 1),
    stop_words="english",  # use the built-in English stop-word list
    max_df=0.6,  # drop terms that appear in more than 60% of the documents
    min_df=2,  # keep only terms that appear in at least 2 documents
    max_features=250,  # keep the 250 most frequent remaining terms
)
X_nostop_top250 = cv_nostop_top250.fit_transform(df_sliced['Text'])
# The matrix now has exactly max_features columns.
X_nostop_top250.shape

(4959, 250)

In [None]:
#X_nostop_top250.toarray()

In [None]:
#pd.DataFrame(X_nostop_top250.toarray(),columns=cv_nostop_top250.get_feature_names_out())

In [None]:
# Unigrams AND bigrams (ngram_range=(1, 2)) after English stop-word removal;
# max_df / min_df are not passed here, so their defaults apply.
cv_nostop_bigram = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",  # use the built-in English stop-word list
    max_features=None,
)
X_nostop_bigram = cv_nostop_bigram.fit_transform(df_sliced['Text'])
# Adding bigrams makes the vocabulary grow well beyond the unigram-only matrices.
X_nostop_bigram.shape

(4959, 6015)

In [None]:
#X_nostop_bigram.toarray()

In [None]:
#pd.DataFrame(X_nostop_bigram.toarray(),columns=cv_nostop_bigram.get_feature_names_out())

In [None]:
cv_nostop_bigram.get_feature_names_out()[20:30]

array(['1024', '1024 morning', '1026', '1030', '1030 1130', '1039',
       '1039 denver', '1039 thursday', '1045', '1055'], dtype=object)

In [None]:
# TF-IDF vectorizer (covered in lecture 4): weights each term by its frequency
# in the document, scaled down by how many documents contain it.
vectorize_tfidf = TfidfVectorizer(
    stop_words="english",  # use the built-in English stop-word list
    max_df=0.98,  # drop terms that appear in more than 98% of the documents
    min_df=1,  # no lower document-frequency cut-off
    max_features=None,
)
# Learn the vocabulary / IDF weights and transform the same texts in one step.
X_tfidf = vectorize_tfidf.fit_transform(df_sliced['Text'])
X_tfidf.shape

(4959, 718)

In [None]:
pd.DataFrame(X_tfidf.toarray(),columns=vectorize_tfidf.get_feature_names_out())

Unnamed: 0,0900,10,100,1000,1017,1020,1024,1026,1030,1039,...,wish,working,world,worth,year,yes,yn,york,yyz,zone
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.534323,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4954,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4955,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4956,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4957,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Encode the intent names as integer class ids; `le` keeps the mapping so the
# ids can be translated back to intent names later.
le = LabelEncoder()
le.fit(df_sliced['Intent'])
y_data = le.transform(df_sliced['Intent'])
y_data

array([ 8, 11,  2, ...,  3,  8,  8])

# 5. Split the data into training and testing sets, and 6. Use only the training data to train the model

In [None]:
# Features: counts from cv_nostop_min2 (stop words removed, min_df=2).
# Hold out 20% of the rows with a fixed seed so the split is reproducible.
# NOTE(review): the vectorizer was fitted on all rows before this split, so the
# held-out rows also shaped the vocabulary.
X_train, X_test, y_train, y_test = train_test_split(
    X_nostop_min2,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Fit Multinomial Naive Bayes on the training part and predict the held-out part.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_test = mnb.predict(X_test)

In [None]:
y_pred_test

array([ 8,  8,  8,  8,  8,  8, 13,  8,  8,  8,  8,  8,  8,  0,  8,  8,  8,
       13,  8, 13,  0,  2,  8,  8,  2,  8,  8,  8,  8,  8,  1, 13,  8,  8,
        8,  8,  2,  8,  8,  8,  8,  8,  8,  2,  8,  2,  1,  8,  8,  2,  8,
        8,  8,  2,  2,  8,  2,  8,  8,  1,  8,  8,  8,  2,  8,  8,  8,  8,
        8,  8, 13,  8,  8,  8,  8, 13,  8,  8,  8,  3,  8,  8,  8,  8,  8,
        8, 13,  8,  8,  8, 13,  2,  8,  8,  2,  8,  8,  8,  8, 13,  8,  8,
       13,  2,  8,  8,  8,  8,  8,  8,  8,  0,  8,  3,  8,  2,  2,  8,  8,
        8,  2,  8,  8,  2,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  1,  8,
        8,  8,  8,  8,  8,  8, 11,  8,  8,  8, 13,  8,  8,  8, 13,  8,  8,
       13,  8,  8,  8,  8,  8,  8,  8,  2,  8, 13,  8,  8,  8,  8,  8,  8,
        8, 13,  8,  2,  8,  8,  8,  8,  8, 11,  8,  8, 11,  8,  8,  2,  2,
        8,  8,  8,  8,  8,  8,  8,  8,  2,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  2,  8, 11,  8,  8,  8,  8,  8,  8,  8,  8,  8,  2,  2,  8,  8,
        8,  8,  8,  8,  8

# 8. Pass the test data to the trained model to predict y_pred_test – validation metrics

In [None]:
def model_accuracy(y_test,y_pred_test,show_confusion_matrix=False):
  """Print the accuracy of a set of predictions.

  Args:
    y_test: true class labels of the evaluated samples.
    y_pred_test: predicted class labels, in the same order as ``y_test``.
    show_confusion_matrix: when True, print the confusion matrix before the
      accuracy. Defaults to False, which prints the accuracy only.

  Returns:
    None. The results are printed, not returned.
  """
  if show_confusion_matrix:
    # Built on request only; it used to be computed on every call and discarded.
    print(confusion_matrix(y_test,y_pred_test))
  print(accuracy_score(y_test,y_pred_test))

In [None]:
model_accuracy(y_test,y_pred_test)

0.8991935483870968


In [None]:
# Features: baseline counts X (no stop-word removal, no pruning).
# Same 80/20 split and seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Fit Multinomial Naive Bayes on the training part and predict the held-out part.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_test = mnb.predict(X_test)

In [None]:
model_accuracy(y_test,y_pred_test)

0.8911290322580645


In [None]:
# Features: counts with English stop words removed (X_nostop).
# Same 80/20 split and seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X_nostop,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Fit Multinomial Naive Bayes on the training part and predict the held-out part.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_test = mnb.predict(X_test)

In [None]:
model_accuracy(y_test,y_pred_test)

0.8921370967741935


In [None]:
# Features: unigram + bigram counts with stop words removed (X_nostop_bigram).
# Same 80/20 split and seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X_nostop_bigram,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Fit Multinomial Naive Bayes on the training part and predict the held-out part.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_test = mnb.predict(X_test)

In [None]:
model_accuracy(y_test,y_pred_test)

0.8981854838709677


In [None]:
# Features: TF-IDF weights (X_tfidf) instead of raw counts.
# Same 80/20 split and seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Fit Multinomial Naive Bayes on the training part and predict the held-out part.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred_test = mnb.predict(X_test)

In [None]:
model_accuracy(y_test,y_pred_test)

0.8417338709677419
