In [10]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

## Tokenizer: turning text into tokens

In [2]:
# Train data
sentences = [
    "I love my dog",
    "I love my cat",
    "Fuck you, you stupid bitch",
    "I love u",
    "my dog!!!"
]
# Words that are not in the fitted vocabulary are replaced by the OOV token
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(sentences)

# Show the word -> integer index mapping built from the training sentences
word_idx = tokenizer.word_index
print(word_idx)

{'<OOV>': 1, 'i': 2, 'love': 3, 'my': 4, 'dog': 5, 'you': 6, 'cat': 7, 'fuck': 8, 'stupid': 9, 'bitch': 10, 'u': 11}


In [None]:
# IPython introspection (`??` shows the object's source/docstring); a development aid only
tokenizer??

## Converting text into sequences

In [None]:
test_data = [
    "Fuck you dumbass trick",
    "Hoe ass bitch",
    "Hi, How are you today you dumbass stupid ass hoe ass trick"
]
# Encode the test sentences with the tokenizer fitted above
sequences = tokenizer.texts_to_sequences(test_data)

# Pad at the end of each sequence so all rows share one length
padded = pad_sequences(
    sequences,
    maxlen=None,
    padding='post',
)
padded

## Sarcasm Detection (Text Classification)

In [24]:
import requests
from sklearn.model_selection import train_test_split

In [25]:
# Download the sarcasm headlines dataset (a JSON document)
res = requests.get(
    "https://storage.googleapis.com/learning-datasets/sarcasm.json",
    timeout=30,  # do not hang the kernel indefinitely on a stalled connection
)
# Fail fast on an HTTP error instead of trying to parse an error page as JSON
res.raise_for_status()
data = res.json()

In [26]:
# Split the records into two parallel lists: headline text and sarcasm label
sentences = [record["headline"] for record in data]
labels = [record["is_sarcastic"] for record in data]

In [27]:
# Settings shared by the tokenizer, padding and model cells
vocab_size = 10_000      # tokenizer num_words and Embedding input size
embedding_dim = 16       # width of each embedding vector
max_length = 100         # every padded sequence has this length
trunc_type = 'post'      # cut over-long sequences at the end
padding_type = 'post'    # pad short sequences at the end
oov_tok = "<OOV>"        # placeholder for out-of-vocabulary words

In [28]:
# Hold out 20% of the headlines for validation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Build the vocabulary from the training headlines only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

# Encode both splits with that single fitted tokenizer
training_seq = tokenizer.texts_to_sequences(X_train)
val_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate both splits identically to max_length
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_padded = pad_sequences(training_seq, **pad_options)
val_padded = pad_sequences(val_seq, **pad_options)

In [12]:
# Wrap features and labels as NumPy arrays before handing them to model.fit
train_data, train_labels = np.array(training_padded), np.array(y_train)
test_data, test_labels = np.array(val_padded), np.array(y_test)

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D

In [14]:
# Embedding -> average over the sequence -> small dense layer -> sigmoid output
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [15]:
# Train for a fixed number of epochs, scoring the held-out split after each one
epochs = 30
history = model.fit(
    train_data,
    train_labels,
    epochs=epochs,
    validation_data=(test_data, test_labels),
    verbose=2,  # one summary line per epoch
)

Epoch 1/30
668/668 - 1s - loss: 0.6741 - accuracy: 0.5723 - val_loss: 0.6147 - val_accuracy: 0.6715 - 1s/epoch - 2ms/step
Epoch 2/30
668/668 - 1s - loss: 0.4510 - accuracy: 0.8258 - val_loss: 0.3826 - val_accuracy: 0.8418 - 628ms/epoch - 940us/step
Epoch 3/30
668/668 - 1s - loss: 0.3200 - accuracy: 0.8733 - val_loss: 0.3465 - val_accuracy: 0.8527 - 624ms/epoch - 934us/step
Epoch 4/30
668/668 - 1s - loss: 0.2700 - accuracy: 0.8949 - val_loss: 0.3341 - val_accuracy: 0.8562 - 627ms/epoch - 939us/step
Epoch 5/30
668/668 - 1s - loss: 0.2351 - accuracy: 0.9098 - val_loss: 0.3313 - val_accuracy: 0.8609 - 622ms/epoch - 931us/step
Epoch 6/30
668/668 - 1s - loss: 0.2094 - accuracy: 0.9195 - val_loss: 0.3601 - val_accuracy: 0.8429 - 624ms/epoch - 935us/step
Epoch 7/30
668/668 - 1s - loss: 0.1880 - accuracy: 0.9296 - val_loss: 0.3522 - val_accuracy: 0.8538 - 623ms/epoch - 933us/step
Epoch 8/30
668/668 - 1s - loss: 0.1700 - accuracy: 0.9379 - val_loss: 0.3604 - val_accuracy: 0.8527 - 627ms/epoch - 

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart per epoch.

  Args:
    history: Object whose ``history`` mapping holds the recorded series
      (as returned by ``model.fit``).
    string: Metric name to plot; the series stored under ``'val_' + string``
      is drawn alongside it.
  """
  for series_name in (string, 'val_' + string):
    plt.plot(history.history[series_name], label=series_name)
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend()
  plt.show()
  
# Draw the training-vs-validation curve for each tracked metric
for metric in ("accuracy", "loss"):
  plot_graphs(history, metric)

In [None]:
# Persist the trained model to disk so a later cell can reload it with load_model
model.save('sentiment.h5')

In [31]:
from tensorflow.keras.models import load_model

# Restore the saved classifier from disk
model = load_model('sentiment.h5')

# Rebuild the tokenizer with the training settings and refit it on the same
# training headlines, so word indices match what the model was trained on
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(X_train)
def predict_sentiment(sentence):
    """Classify one or more headlines as sarcastic or not.

    Args:
        sentence: A single string, or an iterable of strings, to classify.

    Returns:
        A list with one label per input text: "Sarcastic" when the model's
        output exceeds 0.5, otherwise "Not Sarcastic". Empty input yields
        an empty list.
    """
    # A bare string would otherwise be iterated character by character and
    # scored as many one-letter "texts".
    texts = [sentence] if isinstance(sentence, str) else list(sentence)
    if not texts:
        # Nothing to score; skip tokenizing and the model call entirely.
        return []

    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    # The model ends in a single sigmoid unit, so flatten the (n, 1) output
    # to get one scalar probability per text.
    yhat = model.predict(padded).ravel()
    return ["Sarcastic" if p > 0.5 else "Not Sarcastic" for p in yhat]

In [32]:
# Sample headlines to score with the reloaded classifier
sentence = [
    "granny starting to fear spiders in the garden might be real",
    "game of thrones season finale showing this sunday night",
    "Government makes it seem like UFO might be real!",
    "Hi, my name is ... nice to meet you...",
    "Might be real!!",
]
results = predict_sentiment(sentence)
results



['Sarcastic', 'Not Sarcastic', 'Sarcastic', 'Not Sarcastic', 'Not Sarcastic']