# 단어 토큰화, Embedding, LSTM layer를 활용한 뉴스 데이터 sarcasm 판단
https://teddylee777.github.io/tensorflow/news-sarcasm


In [1]:
import json
import tensorflow as tf
import numpy as np
import urllib
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional
from tensorflow.keras.models import Sequential

In [2]:
url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
urllib.request.urlretrieve(url, 'sarcasm.json')

('sarcasm.json', <http.client.HTTPMessage at 0x7fbee4c5cba8>)

In [3]:
with open('sarcasm.json', 'r') as f:
    data = json.load(f)

In [4]:
data[:5]

[{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
  'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'article_link': 'https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365',
  'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697',
  'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'article_link': 'https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302',
  'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'article_link': 'https://www.huffingtonpost.com/entry/jk-rowling-w

In [5]:
labels = []
sentences = []
for d in data:
    sentences.append(d['headline'])
    labels.append(d['is_sarcastic'])

train_ratio = 0.8
train_size = int(len(data) * train_ratio)
train_size, len(data)

(21367, 26709)

In [25]:
train_sentences = sentences[:train_size]
valid_sentences = sentences[train_size:]
train_labels = labels[:train_size]
valid_labels = labels[train_size:]

In [29]:
# vocab_size는 Token화 진행시 최대 빈도숫자가 높은 1000개의 단어만을 활용하겠다는 의미  
vocab_size = 1000
token = Tokenizer(vocab_size, oov_token='<OOV>')  
token.fit_on_texts(sentences)
word_index = token.word_index

In [30]:
# Tokenizer로 sentences를 numerical value로 변환
train_sequences = token.texts_to_sequences(train_sentences)
valid_sequences = token.texts_to_sequences(valid_sentences)
train_sentences[:5]

["former versace store clerk sues over secret 'black code' for minority shoppers",
 "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
 "mom starting to fear son's web series closest thing she will have to grandchild",
 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
 'j.k. rowling wishes snape happy birthday in the most magical way']

In [31]:
train_sequences[:5]

[[308, 1, 679, 1, 1, 48, 382, 1, 1, 6, 1, 1],
 [4, 1, 1, 1, 22, 2, 166, 1, 416, 1, 6, 258, 9, 1],
 [145, 838, 2, 907, 1, 1, 582, 1, 221, 143, 39, 46, 2, 1],
 [1, 36, 224, 400, 2, 1, 29, 319, 22, 10, 1, 1, 1, 968],
 [767, 719, 1, 908, 1, 623, 594, 5, 4, 95, 1, 92]]

# 6. 문장의 길이 맞추기 (pad_sequences)
학습을 위해서는 input의 길이가 동일 해야    
지금의 sequences는 길이가 들쭉날쭉    
pad_sequences를 통해 길이가 긴 문장을 자르거나, 길이가 짧은 문장은 padding     

In [32]:
train_padded = pad_sequences(train_sequences, truncating='post', padding='post', maxlen=120)
valid_padded = pad_sequences(valid_sequences, truncating='post', padding='post', maxlen=120)

#7. label을 np.array로 변환
list 타입은 허용하지 않기 때문에, labels를 np.array로 변환

In [33]:
train_labels = np.asarray(train_labels)
valid_labels = np.asarray(valid_labels)
train_labels, valid_labels

(array([0, 0, 1, ..., 0, 1, 1]), array([1, 1, 1, ..., 0, 0, 0]))

#8. 모델링 (Modeling)
vocab_size = 1000이므로 단어들은 1000차원 공간안에 정의되어 있다고 말할 수 있다          
이를 16차원으로 내려 Data Sparsity를 해결

In [24]:
model = Sequential([Embedding(vocab_size, embedding_dim=16, input_length=_maxlen), Bidirectional(LSTM(32)),
        Dense(24, activation='relu'), Dense(1, activation='sigmoid')])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 16)           16000     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                12544     
_________________________________________________________________
dense_2 (Dense)              (None, 24)                1560      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 25        
Total params: 30,129
Trainable params: 30,129
Non-trainable params: 0
_________________________________________________________________


In [18]:
checkpoint_path = 'best_performed_model.ckpt'
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, 
                                                save_weights_only=True, 
                                                save_best_only=True, 
                                                monitor='val_loss',
                                                verbose=1)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(train_padded, train_labels, 
                    validation_data=(valid_padded, valid_labels),
                    callbacks=[checkpoint],
                    epochs=20, 
                    verbose=2)

Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.38456, saving model to best_performed_model.ckpt
668/668 - 42s - loss: 0.4470 - accuracy: 0.7767 - val_loss: 0.3846 - val_accuracy: 0.8186
Epoch 2/20

Epoch 00002: val_loss improved from 0.38456 to 0.37276, saving model to best_performed_model.ckpt
668/668 - 41s - loss: 0.3497 - accuracy: 0.8390 - val_loss: 0.3728 - val_accuracy: 0.8289
Epoch 3/20

Epoch 00003: val_loss improved from 0.37276 to 0.36713, saving model to best_performed_model.ckpt
668/668 - 42s - loss: 0.3268 - accuracy: 0.8517 - val_loss: 0.3671 - val_accuracy: 0.8282
Epoch 4/20

Epoch 00004: val_loss improved from 0.36713 to 0.36418, saving model to best_performed_model.ckpt
668/668 - 41s - loss: 0.3109 - accuracy: 0.8602 - val_loss: 0.3642 - val_accuracy: 0.8340
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.36418
668/668 - 42s - loss: 0.3023 - accuracy: 0.8653 - val_loss: 0.3815 - val_accuracy: 0.8265
Epoch 6/20

Epoch 00006: val_loss did not improve f