<a href="https://colab.research.google.com/github/zegabr/pln-chatbot/blob/main/flight_dialogue_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
import matplotlib.pyplot as plt

In [20]:
from pandas import read_csv

DATASET_COLUMNS = ['Phrase', 'Intent']


def load_phrases_and_intents(url):
  """Read one dataset split and return it together with its phrase/intent arrays.

  Args:
    url: location of a two-column CSV file (phrase, intent).

  Returns:
    A tuple (dataset, phrases, intents): the de-duplicated DataFrame and two
    numpy arrays holding its 'Phrase' and 'Intent' columns, in row order.
  """
  # header=0 tells pandas to consume the file's first row as the header and
  # replace it with DATASET_COLUMNS. Previously that row was parsed as data and
  # then dropped by position with `[1:]` on every derived array.
  # NOTE(review): this assumes the first row of each CSV is a header line, as
  # the original positional slice implied -- confirm against the files.
  dataset = read_csv(url, header=0, names=DATASET_COLUMNS)
  # Keep only the first occurrence of each phrase.
  dataset = dataset.drop_duplicates(subset=['Phrase'])
  return dataset, dataset.Phrase.to_numpy(), dataset.Intent.to_numpy()


train_dataset, train_phrases, train_intents = load_phrases_and_intents(
    'https://raw.githubusercontent.com/zegabr/pln-chatbot/main/train_dataset.csv')
print(train_phrases)

test_dataset, test_phrases, test_intents = load_phrases_and_intents(
    'https://raw.githubusercontent.com/zegabr/pln-chatbot/main/test_dataset.csv')
print(test_intents)

['I need a one way flight and prefer traveling in Premium Economy class.'
 'I would like to leave next Friday.'
 'I am traveling to NYC from Seattle, WA. I prefer to travel on Delta Airlines.'
 ... 'Can you search other flights'
 "I'll be back March 12th, economy preferred"
 "That's good, thats all thanks"]
['INFORM' 'INFORM' 'INFORM' ... 'INFORM' 'REQUEST' 'SELECT']


In [21]:
# Integer class id for every dialogue-act label; ids follow the listing order (0..10).
intent_mapper = {
  label: class_id
  for class_id, label in enumerate((
    'NEGATE',
    'NEGATE_INTENT',
    'REQUEST_ALTS',
    'GOODBYE',
    'REQUEST',
    'THANK_YOU',
    'AFFIRM',
    'AFFIRM_INTENT',
    'SELECT',
    'INFORM',
    'INFORM_INTENT',
  ))
}

# Translate the string labels of each split into their integer ids.
# A label missing from intent_mapper raises KeyError.
train_intents_encoded = np.array([intent_mapper[label] for label in train_intents])
print(train_intents_encoded)
test_intents_encoded = np.array([intent_mapper[label] for label in test_intents])
print(test_intents_encoded)

[9 9 9 ... 9 9 8]
[9 9 9 ... 9 4 8]


In [22]:
# "one hot" encoding of the phrases (hand-rolled)
def get_one_hot_encoding(train_phrases, test_phrases):
  """Turn phrases into equal-length, zero-padded vectors of integer word ids.

  Despite the name, no one-hot vectors are built: each whitespace-separated
  token is replaced by an integer id. Ids start at 1 and are handed out in
  order of first appearance while scanning the train phrases followed by the
  test phrases, so the vocabulary covers both splits. Every vector is
  right-padded with 0 up to the token count of the longest phrase found in
  either split.

  Args:
    train_phrases: array of training phrases (strings).
    test_phrases: array of test phrases (strings).

  Returns:
    A pair of numpy arrays (encoded_train, encoded_test), one row per phrase.
  """
  vocabulary = {}
  longest = 0
  for sentence in np.concatenate((train_phrases, test_phrases)):
    tokens = sentence.split()
    longest = max(longest, len(tokens))
    for token in tokens:
      # An unseen token gets the next free id; 0 stays reserved for padding.
      vocabulary.setdefault(token, len(vocabulary) + 1)

  def encode(sentences):
    # Map every sentence to its word ids and pad the tail with zeros.
    rows = []
    for sentence in sentences:
      ids = [vocabulary[token] for token in sentence.split()]
      rows.append(ids + [0] * (longest - len(ids)))
    return np.array(rows)

  return encode(train_phrases), encode(test_phrases)

# Encode both splits in a single call: the function derives one word-id mapping
# and one padded length from the two inputs together.
train_phrases_encoded, test_phrases_encoded = get_one_hot_encoding(train_phrases, test_phrases)
# Bare last expression so the notebook shows the encoded training matrix as the cell output.
train_phrases_encoded

array([[  1,   2,   3, ...,   0,   0,   0],
       [  1,  14,  15, ...,   0,   0,   0],
       [  1,  20,   9, ...,   0,   0,   0],
       ...,
       [ 42,  43, 321, ...,   0,   0,   0],
       [ 62,  63, 311, ...,   0,   0,   0],
       [ 92, 290, 319, ...,   0,   0,   0]])

In [24]:
# Build a random-forest intent classifier.
# Code adapted from https://colab.research.google.com/github/ProfLuciano/cd/blob/gh-pages/notebooks/classification.ipynb#scrollTo=ekLczfhAeLh5
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fixed seed so the bootstrap samples and feature sub-sampling of the forest,
# and therefore the accuracies printed below, are the same on every
# Restart & Run All.
RANDOM_STATE = 42

model = RandomForestClassifier(n_estimators=1000, random_state=RANDOM_STATE)
model.fit(train_phrases_encoded, train_intents_encoded)
ypred_test = model.predict(test_phrases_encoded)
ypred_train = model.predict(train_phrases_encoded)

# Report accuracy on the data the model was fitted on and on the held-out test split.
print("ACC TRAINING:" + str(accuracy_score(train_intents_encoded, ypred_train)))
print("ACC TEST:" + str(accuracy_score(test_intents_encoded, ypred_test)))

ACC TRAINING:1.0
ACC TEST:0.7335203366058906
