In [1]:
# Importações necessárias
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf

In [2]:
# Carregamento do arquivoo
with open ("Sarcasm_Headlines_Dataset.json", 'r') as f:
    datastore = json.load(f)

In [3]:
# Constantes
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 20000

In [4]:
# Listas vazias
sentences = []
labels = []

# Percorre o arquivo e vai preenchendo as listas
for item in datastore:
    sentences.append(item['headline']) # Adiciona a lista
    labels.append(item['is_sarcastic']) # Adiciona a lista

In [5]:
training_sentences = sentences[0:training_size] # frases para treinamento
testing_sentences = sentences[training_size:] # frases para teste
training_labels = labels[0:training_size] # labels para treinamento
testing_labels = labels[training_size:] # labels para teste

In [6]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences) # Atribuindo o tokenizer
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type) # Adiciona os zeros para todas ficarem  com o mesmo tamanho

testing_sequences = tokenizer.texts_to_sequences(testing_sentences) # Atribuindo o tokenizer
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type) # Adiciona os zeros para todas ficarem  com o mesmo tamanho


In [7]:
# Transformação para que fique compatível com o tensorflow
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)
training_padded = np.array(training_padded)
testing_padded = np.array(testing_padded)

In [8]:
# Criação do modelo
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length), 
    tf.keras.layers.GlobalAveragePooling1D(),  
    tf.keras.layers.Dense(128, activation='relu'), 
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),  
    tf.keras.layers.Dropout(0.2), 
    tf.keras.layers.Dense(24, activation='relu'),  
    tf.keras.layers.Dense(1, activation='sigmoid')
])



In [9]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy']) # Compilação

In [10]:
num_epochs = 20 # número de epocas
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/20
625/625 - 9s - 14ms/step - accuracy: 0.5903 - loss: 0.6627 - val_accuracy: 0.7953 - val_loss: 0.4813
Epoch 2/20
625/625 - 3s - 5ms/step - accuracy: 0.8025 - loss: 0.4267 - val_accuracy: 0.8243 - val_loss: 0.3899
Epoch 3/20
625/625 - 3s - 5ms/step - accuracy: 0.8451 - loss: 0.3548 - val_accuracy: 0.8004 - val_loss: 0.4113
Epoch 4/20
625/625 - 3s - 5ms/step - accuracy: 0.8691 - loss: 0.3104 - val_accuracy: 0.8220 - val_loss: 0.4053
Epoch 5/20
625/625 - 3s - 5ms/step - accuracy: 0.8863 - loss: 0.2787 - val_accuracy: 0.8553 - val_loss: 0.3489
Epoch 6/20
625/625 - 3s - 5ms/step - accuracy: 0.8995 - loss: 0.2467 - val_accuracy: 0.8478 - val_loss: 0.3500
Epoch 7/20
625/625 - 3s - 5ms/step - accuracy: 0.9056 - loss: 0.2329 - val_accuracy: 0.8517 - val_loss: 0.3398
Epoch 8/20
625/625 - 3s - 5ms/step - accuracy: 0.9103 - loss: 0.2220 - val_accuracy: 0.8310 - val_loss: 0.4057
Epoch 9/20
625/625 - 3s - 5ms/step - accuracy: 0.9207 - loss: 0.2020 - val_accuracy: 0.8457 - val_loss: 0.3810


In [11]:
# Foi possível chegar a um resultado melhor com uma redução 33% no número de épocas 
sentence = [       # Frases para teste
    "granny starting to fear spiders in the garden migth be real",
    "the weather today is brigth and sunny"
]
sequence = tokenizer.texts_to_sequences(sentence) # Tornando a frase em sequencias de palavras
padded= pad_sequences(sequence, maxlen=max_length, padding=padding_type,truncating=trunc_type) # Deixando todas do mesmo tamanho
print(model.predict(padded)) # Predicao

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
[[0.9975208 ]
 [0.02571205]]
