# Simple NLP Model with sarcasm dataset

## Step 0. Library Import

In [1]:
import json
import urllib
import numpy as np 

import tensorflow as tf 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint



## Step 1. Load Dataset

In [2]:
url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
url_save_path = 'sarcasm.json'


In [3]:
urllib.request.urlretrieve(url, url_save_path)

('sarcasm.json', <http.client.HTTPMessage at 0x7f07cb2a75d0>)

In [4]:
with open(url_save_path) as f:
    json_dataset = json.load(f)

In [5]:
json_dataset[:5]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

## Step 2. Data Preprocessing

### 2-a. Train / Test split

In [6]:
X = []
y = []
for data in json_dataset:
    X.append(data['headline'])
    y.append(data['is_sarcastic'])

In [7]:
for i in range(5):
    print('headline : ', X[i])
    print('is sarcastic ? :', ['No' if i == 0 else 'Yes'][0])

headline :  former versace store clerk sues over secret 'black code' for minority shoppers
is sarcastic ? : No
headline :  the 'roseanne' revival catches up to our thorny political mood, for better and worse
is sarcastic ? : Yes
headline :  mom starting to fear son's web series closest thing she will have to grandchild
is sarcastic ? : Yes
headline :  boehner just wants wife to listen, not come up with alternative debt-reduction ideas
is sarcastic ? : Yes
headline :  j.k. rowling wishes snape happy birthday in the most magical way
is sarcastic ? : Yes


In [8]:
train_size = 20000

X_train = X[:train_size]
y_train = y[:train_size]

X_valid = X[train_size:]
y_valid = y[train_size:]

### 2-b. Tokenizer

In [9]:
vocab_size = 1000
oov_tok = "<OOV>"

In [10]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

In [11]:
tokenizer.fit_on_texts(X_train)

In [12]:
X_train = tokenizer.texts_to_sequences(X_train)
X_valid = tokenizer.texts_to_sequences(X_valid)

In [13]:
X_train[:5]

[[328, 1, 799, 1, 1, 47, 389, 1, 1, 6, 1, 1],
 [4, 1, 1, 1, 23, 2, 161, 1, 390, 1, 6, 251, 9, 889],
 [153, 890, 2, 891, 1, 1, 595, 1, 221, 133, 36, 45, 2, 1],
 [1, 38, 213, 382, 2, 1, 29, 288, 23, 10, 1, 1, 1, 958],
 [715, 672, 1, 1, 1, 662, 553, 5, 4, 92, 1, 90]]

In [14]:
X_valid[:5]

[[1, 1, 1, 1, 30, 1, 1, 5, 519, 109],
 [202, 1, 8, 31, 1, 1, 2, 854, 773],
 [18, 380, 191, 2, 915, 76, 8, 4, 1],
 [1, 1, 299, 337, 3, 1, 1],
 [162, 1, 1, 6, 1, 1, 348, 1]]

### 2-c. Pad Sequences

In [15]:
max_length = 120
trunc_type = 'post'
pad_type = 'post'

In [16]:
X_train_padded = pad_sequences(X_train, maxlen=max_length, truncating=trunc_type, padding=pad_type)
X_valid_padded = pad_sequences(X_valid, maxlen=max_length, truncating=trunc_type, padding=pad_type)

In [17]:
X_train_padded[:1]

array([[328,   1, 799,   1,   1,  47, 389,   1,   1,   6,   1,   1,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0]], dtype=int32)

In [18]:
X_valid_padded[:1]

array([[  1,   1,   1,   1,  30,   1,   1,   5, 519, 109,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0]], dtype=int32)

In [19]:
print(X_train_padded.shape, X_valid_padded.shape)

(20000, 120) (6709, 120)


### 2-d. label type : list -> numpy array

In [20]:
print('X Datatype : ')
print(type(X_train), type(X_valid))
print('X_padded Datatype : ')
print(type(X_train_padded), type(X_valid_padded))
print('-------------------------')
print('y Datatype : ')
print(type(y_train), type(y_valid))

X Datatype : 
<class 'list'> <class 'list'>
X_padded Datatype : 
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
-------------------------
y Datatype : 
<class 'list'> <class 'list'>


In [21]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)

In [22]:
print('final X Datatype : ')
print(type(X_train_padded), type(X_valid_padded))
print('-------------------------')
print('final y Datatype : ')
print(type(y_train), type(y_valid))

final X Datatype : 
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
-------------------------
final y Datatype : 
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


## Step 3. Modeling

In [24]:
embedding_dim = 16
vocab_size = 1000
max_length = 120

In [25]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.5)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])


In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           16000     
                                                                 
 bidirectional (Bidirectiona  (None, 120, 128)         41472     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 120, 128)         98816     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                        

## Step 4. Model Compile

In [27]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

## Step 5. Model Checkpoint

In [28]:
filepath = 'my_checkpoint.ckpt'
cp = ModelCheckpoint(
    filepath=filepath,
    save_weights_only=True,
    save_best_only=True,
    monitor='val_loss',
    verbose=1
)

## Step 6. Model Fit

In [29]:
epochs=17

In [30]:
model.fit(
    X_train_padded, y_train,
    validation_data = (X_valid_padded, y_valid),
    callbacks=[cp],
    epochs=epochs
)

Epoch 1/17
Epoch 1: val_loss improved from inf to 0.40550, saving model to my_checkpoint.ckpt
Epoch 2/17
Epoch 2: val_loss improved from 0.40550 to 0.38672, saving model to my_checkpoint.ckpt
Epoch 3/17
Epoch 3: val_loss improved from 0.38672 to 0.37126, saving model to my_checkpoint.ckpt
Epoch 4/17
Epoch 4: val_loss did not improve from 0.37126
Epoch 5/17
Epoch 5: val_loss did not improve from 0.37126
Epoch 6/17
Epoch 6: val_loss did not improve from 0.37126
Epoch 7/17
Epoch 7: val_loss did not improve from 0.37126
Epoch 8/17
Epoch 8: val_loss did not improve from 0.37126
Epoch 9/17
Epoch 9: val_loss did not improve from 0.37126
Epoch 10/17
Epoch 10: val_loss did not improve from 0.37126
Epoch 11/17
Epoch 11: val_loss did not improve from 0.37126
Epoch 12/17
Epoch 12: val_loss did not improve from 0.37126
Epoch 13/17
Epoch 13: val_loss did not improve from 0.37126
Epoch 14/17
Epoch 14: val_loss did not improve from 0.37126
Epoch 15/17
Epoch 15: val_loss did not improve from 0.37126
Ep

<keras.callbacks.History at 0x7f07c5158890>

##

## Step 7. Model Evaludate & Save

In [None]:
model.load_weights(filepath)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbdb2b4e1d0>

In [None]:
model.evaluate(X_valid_padded, y_valid)



[0.36871227622032166, 0.8323147892951965]

In [31]:
model.save('sarcasm_3687.h5')