In [1]:
# Importações necessárias
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import tensorflow as tf

In [2]:
# Carregamento do arquivoo
with open ("Sarcasm_Headlines_Dataset.json", 'r') as f:
    datastore = json.load(f)

In [3]:
# Constantes
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 20000

In [4]:
# Listas vazias
sentences = []
labels = []

# Percorre o arquivo e vai preenchendo as listas
for item in datastore:
    sentences.append(item['headline']) # Adiciona a lista
    labels.append(item['is_sarcastic']) # Adiciona a lista

In [5]:
training_sentences = sentences[0:training_size] # frases para treinamento
testing_sentences = sentences[training_size:] # frases para teste
training_labels = labels[0:training_size] # labels para treinamento
testing_labels = labels[training_size:] # labels para teste

In [6]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences) # Atribuindo o tokenizer
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type) # Adiciona os zeros para todas ficarem  com o mesmo tamanho

testing_sequences = tokenizer.texts_to_sequences(testing_sentences) # Atribuindo o tokenizer
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type) # Adiciona os zeros para todas ficarem  com o mesmo tamanho


In [7]:
# Transformação para que fique compatível com o tensorflow
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)
training_padded = np.array(training_padded)
testing_padded = np.array(testing_padded)


In [8]:
# Criação do modelo
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length), # camada de embedding
    tf.keras.layers.GlobalAveragePooling1D(), # camade de pooling
    tf.keras.layers.Dense(24, activation='relu'), # camada densa 2
    tf.keras.layers.Dense(1, activation='sigmoid') # camada densa 1
])



In [9]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy']) # Compilação
model.summary()   

In [10]:
num_epochs = 30 # número de epocas
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/30
625/625 - 4s - 6ms/step - accuracy: 0.5711 - loss: 0.6765 - val_accuracy: 0.5794 - val_loss: 0.6425
Epoch 2/30
625/625 - 2s - 4ms/step - accuracy: 0.7387 - loss: 0.5484 - val_accuracy: 0.8071 - val_loss: 0.4650
Epoch 3/30
625/625 - 2s - 4ms/step - accuracy: 0.8167 - loss: 0.4175 - val_accuracy: 0.8281 - val_loss: 0.4028
Epoch 4/30
625/625 - 2s - 4ms/step - accuracy: 0.8480 - loss: 0.3551 - val_accuracy: 0.8320 - val_loss: 0.3825
Epoch 5/30
625/625 - 2s - 4ms/step - accuracy: 0.8682 - loss: 0.3154 - val_accuracy: 0.8505 - val_loss: 0.3565
Epoch 6/30
625/625 - 2s - 4ms/step - accuracy: 0.8798 - loss: 0.2894 - val_accuracy: 0.8514 - val_loss: 0.3491
Epoch 7/30
625/625 - 2s - 4ms/step - accuracy: 0.8882 - loss: 0.2682 - val_accuracy: 0.8338 - val_loss: 0.3635
Epoch 8/30
625/625 - 2s - 4ms/step - accuracy: 0.9003 - loss: 0.2440 - val_accuracy: 0.8478 - val_loss: 0.3536
Epoch 9/30
625/625 - 2s - 4ms/step - accuracy: 0.9091 - loss: 0.2253 - val_accuracy: 0.8419 - val_loss: 0.3671
E

In [11]:
sentence = [       # Frases para teste
    "granny starting to fear spiders in the garden migth be real",
    "the weather today is brigth and sunny"
]
sequence = tokenizer.texts_to_sequences(sentence) # Tornando a frase em sequencias de palavras
padded= pad_sequences(sequence, maxlen=max_length, padding=padding_type,truncating=trunc_type) # Deixando todas do mesmo tamanho
print(model.predict(padded)) # Predicao

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step
[[0.9841677 ]
 [0.00323261]]
