In [32]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
import warnings
# Silence all warnings for the rest of the session (no category filter is given,
# so deprecation notices from the libraries used below are hidden as well).
warnings.filterwarnings(action='ignore')

In [3]:
# Fetch the IMDB review dataset; no num_words is passed, so the vocabulary is not capped here.
imdb_train, imdb_test = imdb.load_data()
X_train, y_train = imdb_train
X_test, y_test = imdb_test

In [4]:
# Report how many reviews each split contains.
for label, reviews in (('훈련용 리뷰 : ', X_train), ('테스트용 리뷰 : ', X_test)):
    print(label, len(reviews))

훈련용 리뷰 :  25000
테스트용 리뷰 :  25000


In [5]:
# Count the distinct label values present in each split.
for label, targets in (('훈련 카테고리 : ', y_train), ('테스트 카테고리 : ', y_test)):
    print(label, len(set(targets)))

훈련 카테고리 :  2
테스트 카테고리 :  2


In [6]:
# Both splits contain exactly two label values, so this is treated as a binary classification task.

In [7]:
# Peek at the first training sample: its integer-encoded review and its label.
first_review, first_label = X_train[0], y_train[0]
print(first_review, first_label)

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32] 1


In [9]:
# Tally how many training reviews carry each label value
# (first printed row: label values, second row: their counts).
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
label_table = np.asarray((unique_elements, counts_elements))
print(label_table)

[[    0     1]
 [12500 12500]]


In [11]:
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [12]:
# Reload the dataset, this time passing num_words=vocab_size to restrict the vocabulary.
# The earlier, unrestricted X_train/X_test are overwritten.
vocab_size = 1000
imdb_train, imdb_test = imdb.load_data(num_words=vocab_size)
(X_train, y_train), (X_test, y_test) = imdb_train, imdb_test

In [13]:
# Give every review the same length (max_len) so the splits can be fed to the model as batches.
max_len = 500
X_train, X_test = (
    pad_sequences(split, maxlen=max_len) for split in (X_train, X_test)
)

In [14]:
# Model hyper-parameters.
embedding_dim = 100
hidden_units = 128

# Embedding -> GRU -> single sigmoid unit (one probability per review).
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GRU(hidden_units),
    Dense(1, activation='sigmoid'),
])

In [16]:
# Stop training once validation loss has failed to improve for 4 consecutive epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=4,
    verbose=1,
)

# Keep on disk only the weights that achieved the best validation accuracy so far.
mc = ModelCheckpoint(
    filepath='GRU_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [17]:
# The metric is registered under the name 'acc'; the checkpoint callback monitors 'val_acc' accordingly.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

In [18]:
# Train with 20% of the training data held out for validation; `es` and `mc` run every epoch.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=15,
    callbacks=[es, mc],
)

Epoch 1/15
Epoch 1: val_acc improved from -inf to 0.83120, saving model to GRU_model.h5
Epoch 2/15


  saving_api.save_model(


Epoch 2: val_acc improved from 0.83120 to 0.83760, saving model to GRU_model.h5
Epoch 3/15
Epoch 3: val_acc improved from 0.83760 to 0.83940, saving model to GRU_model.h5
Epoch 4/15
Epoch 4: val_acc improved from 0.83940 to 0.86400, saving model to GRU_model.h5
Epoch 5/15
Epoch 5: val_acc improved from 0.86400 to 0.86820, saving model to GRU_model.h5
Epoch 6/15
 61/313 [====>.........................] - ETA: 3:02 - loss: 0.2473 - acc: 0.9050

KeyboardInterrupt: 

In [19]:
# Restore the checkpoint written by the ModelCheckpoint callback and score it on the test split.
loaded_model = load_model('GRU_model.h5')
test_metrics = loaded_model.evaluate(X_test, y_test)
test_acc = test_metrics[1]  # index 1 is the value the original cell reported as accuracy
print('test acc : ', test_acc)

test acc :  0.8728399872779846


In [23]:
# Word -> integer index lookup shipped with the IMDB dataset.
# sentiment_predict adds 3 to these values before feeding them to the model.
word_to_index = imdb.get_word_index()

In [29]:
def sentiment_predict(new_sentence):
    """Print the predicted sentiment (positive / negative) of a raw review.

    The text is stripped of everything except ASCII letters, digits and
    spaces, lower-cased, converted to integer ids, padded to ``max_len``
    and scored by ``loaded_model``.

    Relies on the notebook globals ``word_to_index``, ``vocab_size``,
    ``max_len``, ``pad_sequences`` and ``loaded_model``.

    Args:
        new_sentence: Review text as a plain string.

    Returns:
        None. The verdict is printed.
    """
    new_sentence = re.sub('[^0-9a-zA-Z ]', '', new_sentence).lower()

    # Ids 0, 1, 2 are reserved for <pad>, <sos>, <unk>; real words start at 3.
    unk_index = 2
    index_offset = 3

    encoded = []
    for word in new_sentence.split():
        raw_index = word_to_index.get(word)
        # The vocabulary limit applies to the *shifted* id: the data was loaded
        # with num_words=vocab_size and the Embedding layer has vocab_size rows,
        # so any id >= vocab_size must be mapped to <unk>.
        if raw_index is not None and raw_index + index_offset < vocab_size:
            encoded.append(raw_index + index_offset)
        else:
            encoded.append(unk_index)

    # NOTE(review): the training sample printed earlier starts with id 1 (<sos>);
    # no such marker is prepended here — confirm whether that is intended.
    pad_sequence = pad_sequences([encoded], maxlen=max_len)

    # predict() returns one row with one value; index it explicitly rather than
    # converting a 2-D array to float (deprecated in recent NumPy releases).
    score = float(loaded_model.predict(pad_sequence)[0][0])

    if score > 0.5:
        print('{:.2f}% 확률로 긍정 리뷰입니다.'.format(score * 100))
    else:
        print('{:.2f}% 확률로 부정 리뷰입니다.'.format((1 - score) * 100))

In [33]:
# Sample 1: a clearly negative movie review.
test_input = (
    "This movie was just way too overrated. "
    "The fighting was not professional and in slow motion. "
    "I was expecting more from a 200 million budget movie. "
    "The little sister of T.Challa was just trying too hard to be funny. "
    "The story was really dumb as well. "
    "Don't watch this movie if you are going because others say its great "
    "unless you are a Black Panther fan or Marvels fan."
)

sentiment_predict(test_input)

98.69% 확률로 부정 리뷰입니다.


In [34]:
# Sample 2: a clearly positive movie review (adjacent literals are joined into one string).
test_input = (
    " I was lucky enough to be included in the group to see the advanced screening in Melbourne on the 15th of April, 2012. And, firstly, I need to say a big thank-you to Disney and Marvel Studios. "
    "Now, the film... how can I even begin to explain how I feel about this film? It is, as the title of this review says a 'comic book triumph'. I went into the film with very, very high expectations and I was not disappointed. "
    "Seeing Joss Whedon's direction and envisioning of the film come to life on the big screen is perfect. The script is amazingly detailed and laced with sharp wit a humor. The special effects are literally mind-blowing and the action scenes are both hard-hitting and beautifully choreographed."
)

sentiment_predict(test_input)

94.23% 확률로 긍정 리뷰입니다.


In [35]:
# Sample 3: a product complaint rather than a movie review, i.e. text unlike the training data.
test_input = (
    "I think the size marking is wrong. "
    "I bought it to fit, but it's too small. "
    "I am offended. "
    "Please refund."
)

sentiment_predict(test_input)

64.49% 확률로 긍정 리뷰입니다.


In [39]:
# 숙제 : sarcastic 뉴스 이진분류 모델 만들기(Word2Vec 또는 Glove로 진행해도 된다.)
import pandas as pd

df=pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
