# Leitura dos Dados

Inicialmente, iremos realizar a leitura dos dados que serão utilizados para perguntas e respostas (Q&A). A intenção de Q&A é rotulada como `other`.

In [47]:
import numpy as np
import pandas as pd

In [48]:
# Load the Q&A questions. The file is read without a header row, so the two
# columns are named explicitly here.
qa_data = pd.read_csv(
    "q&a_intent_train.csv",
    header=None,
    names=["target", "text"],
)

qa_data

Unnamed: 0,target,text
0,other,What is the principle behind flight?
1,other,What are the four forces acting on an airplane?
2,other,What is the difference between IFR and VFR?
3,other,What is a black box in aviation?
4,other,What is the busiest airport in the world by pa...
...,...,...
94,other,"What is ""ACARS""?"
95,other,"What is ""Alternate Airport""?"
96,other,"at does ""pan-pan"" mean?"
97,other,"What is ""decision height"" (DH)?"


Leitura dos dados de treino e teste disponibilizados pelo ATIS dataset.

In [49]:
# Both ATIS files are read the same way: no header row, columns named explicitly.
atis_train_data = pd.read_csv(
    "atis_intents_train.csv",
    header=None,
    names=["target", "text"],
)
atis_test_data = pd.read_csv(
    "atis_intents_test.csv",
    header=None,
    names=["target", "text"],
)

# Row counts of each split
print("ATIS train dataset size is:", atis_train_data.shape[0])
print("ATIS test dataset size is:", atis_test_data.shape[0])

ATIS train dataset size is: 4834
ATIS test dataset size is: 800


Iremos construir um dataset para treino e teste considerando os dados que temos até então. Para tanto, separaremos `qa_data` em treino e teste (cerca de 20% para teste) e depois construiremos um dataset de treino e um de teste unindo as tabelas existentes.

In [50]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the Q&A rows for testing; the fixed seed makes the split repeatable.
qa_train_data, qa_test_data = train_test_split(
    qa_data,
    test_size=0.20,
    random_state=42,
)

# Row counts of each split
print("Q&A train dataset size is:", qa_train_data.shape[0])
print("Q&A test dataset size is:", qa_test_data.shape[0])

Q&A train dataset size is: 79
Q&A test dataset size is: 20


In [51]:
# Stack the Q&A training rows on top of the ATIS training rows and renumber
# the index from 0.
train_data = pd.concat(
    [qa_train_data, atis_train_data],
    axis=0,
    ignore_index=True,
)

train_data

Unnamed: 0,target,text
0,other,"What is a ""taxiway""?"
1,other,"What is a ""slot-restricted"" airport?"
2,other,"What is ""NextGen"" in U.S. aviation?"
3,other,"What does the term ""gate hold"" mean in aviation?"
4,other,"What does ""direct flight"" mean as opposed to ""..."
...,...,...
4908,atis_airfare,what is the airfare for flights from denver t...
4909,atis_flight,do you have any flights from denver to baltim...
4910,atis_airline,which airlines fly into and out of denver
4911,atis_flight,does continental fly from boston to san franc...


In [52]:
# Stack the Q&A test rows on top of the ATIS test rows and renumber the index
# from 0.
test_data = pd.concat(
    [qa_test_data, atis_test_data],
    axis=0,
    ignore_index=True,
)

test_data

Unnamed: 0,target,text
0,other,"What is ""yaw"" in aviation?"
1,other,"What is a ""deadhead"" flight?"
2,other,"What is ""Alternate Airport""?"
3,other,What is the purpose of ailerons on an aircraft?
4,other,"What is ""decision height"" (DH)?"
...,...,...
815,atis_flight,please find all the flights from cincinnati t...
816,atis_flight,find me a flight from cincinnati to any airpo...
817,atis_flight,i'd like to fly from miami to chicago on amer...
818,atis_flight,i would like to book a round trip flight from...


# SVM

## Bag of Words (TF-IDF)

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit on the training texts only, so the learned vocabulary and weights come
# exclusively from the training split.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train_data.loc[:, "text"])

In [54]:
# Vectorize the training texts with the already-fitted TF-IDF vectorizer.
X_train = tfidf_vectorizer.transform(train_data.loc[:, "text"])

X_train

<4913x964 sparse matrix of type '<class 'numpy.float64'>'
	with 51273 stored elements in Compressed Sparse Row format>

In [55]:
# Vectorize the test texts with the same fitted vectorizer (transform only, no refit).
X_test = tfidf_vectorizer.transform(test_data.loc[:, "text"])

X_test

<820x964 sparse matrix of type '<class 'numpy.float64'>'
	with 7766 stored elements in Compressed Sparse Row format>

## SVM Classifier

### Define and Train the SVM Classifier

In [56]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed seed.
svm_classifier = SVC(kernel="linear", random_state=42)

# Fit on the TF-IDF training matrix against the intent labels.
svm_classifier.fit(X=X_train, y=train_data["target"])

### Predict on the Test Data and Evaluate the Model

In [57]:
# Predicted intent for every row of the test feature matrix
y_test_pred = svm_classifier.predict(X_test)

# Score the predictions against the true test labels
print(
    "Accuracy on test data:",
    accuracy_score(y_true=test_data['target'], y_pred=y_test_pred),
)
print(
    "\nClassification Report on test data:\n",
    classification_report(y_true=test_data['target'], y_pred=y_test_pred),
)

Accuracy on test data: 0.975609756097561

Classification Report on test data:
                      precision    recall  f1-score   support

  atis_abbreviation       0.89      1.00      0.94        33
      atis_aircraft       0.67      0.89      0.76         9
       atis_airfare       0.96      0.98      0.97        48
       atis_airline       1.00      0.92      0.96        38
        atis_flight       0.99      0.98      0.99       632
   atis_flight_time       1.00      1.00      1.00         1
atis_ground_service       1.00      1.00      1.00        36
      atis_quantity       0.38      1.00      0.55         3
              other       1.00      0.75      0.86        20

           accuracy                           0.98       820
          macro avg       0.88      0.95      0.89       820
       weighted avg       0.98      0.98      0.98       820



In [58]:
def predict_intention(text, vectorizer=None, classifier=None):
    """Predict the intent label of a single utterance.

    The raw text is passed straight to the fitted vectorizer, mirroring how the
    training and test matrices were built in this notebook (no separate
    preprocessing step is applied before vectorization).

    Args:
        text: The raw utterance to classify.
        vectorizer: Fitted object exposing ``transform``. Defaults to the
            notebook-level ``tfidf_vectorizer``.
        classifier: Fitted object exposing ``predict``. Defaults to the
            notebook-level ``svm_classifier``.

    Returns:
        The predicted label for ``text`` (first element of the classifier's
        prediction for a one-item batch).
    """
    # Resolve the defaults at call time so that re-running the fitting cells
    # is picked up without redefining this function.
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if classifier is None:
        classifier = svm_classifier

    # The vectorizer expects an iterable of documents, hence the one-item list.
    text_features = vectorizer.transform([text])

    predicted = classifier.predict(text_features)
    return predicted[0]

# Example: classify a single question.
# NOTE(review): this exact question appears in the Q&A data with the label
# 'other', so that is the label one would expect back here.
example_text = "What are the four forces acting on an airplane?"
predicted_intention = predict_intention(text=example_text)
print(f"The predicted intention for '{example_text}' is '{predicted_intention}'.")


The predicted intention for 'What are the four forces acting on an airplane?' is 'atis_aircraft'.


# Word Embeddings

## Tokenization

In [80]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [91]:
# Build the tokenizer's vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data.loc[:, "text"])

In [93]:
# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
# One slot more than the number of known words (presumably for the padding id 0 — TODO confirm).
max_vocab_size = 1 + len(word_index)
# Sequence length used as `maxlen` when padding the tokenized texts.
input_lenght = 25

In [94]:
# Convert each training text to a list of word ids, then bring every list to
# the same length.
train_data_tokens = tokenizer.texts_to_sequences(train_data.loc[:, "text"])
train_input = pad_sequences(
    train_data_tokens,
    maxlen=input_lenght,
)

In [95]:
# Convert each test text to word ids with the tokenizer fitted on the training
# texts, then bring every list to the same length.
test_data_tokes = tokenizer.texts_to_sequences(test_data.loc[:, "text"])
test_input = pad_sequences(
    test_data_tokes,
    maxlen=input_lenght,
)

## Encode the Labels

In [96]:
# `preprocessing` is not imported anywhere else in the notebook, so import it
# here to keep this cell runnable on a fresh kernel.
from sklearn import preprocessing

# Map each intent name to an integer id; fitted on the training labels only.
label_transformer = preprocessing.LabelEncoder()
label_transformer.fit(train_data["target"])

In [102]:
# `to_categorical` is not imported anywhere else in the notebook, so import it
# here to keep this cell runnable on a fresh kernel.
from tensorflow.keras.utils import to_categorical

# Fix the one-hot width to the number of classes seen during fitting. Without
# this, `to_categorical` infers the width from the largest id present, so the
# train and test matrices could end up with different numbers of columns.
num_classes = len(label_transformer.classes_)

# Intent names -> integer ids -> one-hot rows.
labels = to_categorical(
    label_transformer.transform(train_data["target"]),
    num_classes=num_classes,
)
test_labels = to_categorical(
    label_transformer.transform(test_data["target"]),
    num_classes=num_classes,
)

In [103]:
# Carve a 20% validation set out of the padded training sequences.
# NOTE: this rebinds `X_train`, which earlier held the TF-IDF matrix used by the SVM cells.
X_train, X_val, y_train, y_val = train_test_split(
    train_input,
    labels,
    test_size=0.2,
    random_state=1,
)

## Create Embedding Matrix

In [104]:
from pathlib import Path

embedded_dim = 300

# Fail early with an actionable message when the pre-trained vectors are absent.
glove_path = Path('glove.42B.300d.txt')
if not glove_path.is_file():
    raise FileNotFoundError(
        f"GloVe vectors not found at '{glove_path}'. Download glove.42B.300d.txt "
        "(https://nlp.stanford.edu/projects/glove/) into the working directory."
    )

# Keep only the vectors for words the tokenizer knows: the embedding matrix
# below never looks up any other word, and holding the whole GloVe file in a
# dict needs far more memory.
embedded_index = dict()
with glove_path.open('r', encoding='utf8') as glove:
    for line in glove:
        values = line.split()
        if not values:
            continue  # tolerate blank lines
        word = values[0]
        if word in word_index:
            embedded_index[word] = np.asarray(values[1:], dtype='float32')
# No explicit close(): the `with` block already closed the file.

# Row i holds the GloVe vector of the word whose tokenizer id is i; words
# without a GloVe vector keep an all-zero row.
embedded_matrix = np.zeros((max_vocab_size, embedded_dim))
for word, row in word_index.items():
    vector = embedded_index.get(word)
    if vector is not None:
        embedded_matrix[row] = vector

FileNotFoundError: [Errno 2] No such file or directory: 'glove.42B.300d.txt'