# TEXT MINING for PRACTICE: text classification using Deep Learning
- Keras를 이용한 텍스트마이닝
---

# 1. 패키지 설치

In [1]:
import sys
!{sys.executable} -m pip uninstall tensorflow -y

Uninstalling tensorflow-2.0.0rc1:
  Successfully uninstalled tensorflow-2.0.0rc1


In [2]:
# Install the TensorFlow 2.0 nightly preview build into the kernel's interpreter.
# NOTE(review): the following cell installs the pinned tensorflow==2.0.0-rc1
# release and the configuration cell reports version 2.0.0-rc1 -- confirm
# whether this nightly install is still needed.
!{sys.executable} -m pip install tf-nightly-2.0-preview



In [3]:
!{sys.executable} -m pip install tensorflow==2.0.0-rc1
!{sys.executable} -m pip install tensorflow_hub tensorflow_datasets tf-nightly-2.0-preview numpy

Collecting tensorflow==2.0.0-rc1
  Using cached https://files.pythonhosted.org/packages/03/13/b8964a1e3e4b01025c4a3d09cdeb3c4634fb93f3726b934a3e2b66dba924/tensorflow-2.0.0rc1-cp36-cp36m-macosx_10_11_x86_64.whl
Collecting tb-nightly<1.15.0a20190807,>=1.15.0a20190806 (from tensorflow==2.0.0-rc1)
  Using cached https://files.pythonhosted.org/packages/bc/88/24b5fb7280e74c7cf65bde47c171547fd02afb3840cff41bcbe9270650f5/tb_nightly-1.15.0a20190806-py3-none-any.whl
[31mtf-nightly 2.1.0.dev20191121 has requirement keras-preprocessing>=1.1.0, but you'll have keras-preprocessing 1.0.5 which is incompatible.[0m
[31mtf-nightly 2.1.0.dev20191121 has requirement protobuf>=3.8.0, but you'll have protobuf 3.6.1 which is incompatible.[0m
[31mtf-nightly 2.1.0.dev20191121 has requirement tb-nightly<2.2.0a0,>=2.1.0a0, but you'll have tb-nightly 1.15.0a20190806 which is incompatible.[0m
[31mtf-nightly-2-0-preview 2.0.0.dev20191002 has requirement tb-nightly<2.2.0a0,>=2.1.0a0, but you'll have tb-nightl

[31mtf-nightly 2.1.0.dev20191121 has requirement keras-preprocessing>=1.1.0, but you'll have keras-preprocessing 1.0.5 which is incompatible.[0m
[31mtf-nightly 2.1.0.dev20191121 has requirement protobuf>=3.8.0, but you'll have protobuf 3.6.1 which is incompatible.[0m
[31mtensorflow 2.0.0rc1 has requirement tb-nightly<1.15.0a20190807,>=1.15.0a20190806, but you'll have tb-nightly 2.1.0a20191121 which is incompatible.[0m
[31mtb-nightly 2.1.0a20191121 has requirement grpcio>=1.24.3, but you'll have grpcio 1.16.1 which is incompatible.[0m
Installing collected packages: tb-nightly
  Found existing installation: tb-nightly 1.15.0a20190806
    Uninstalling tb-nightly-1.15.0a20190806:
      Successfully uninstalled tb-nightly-1.15.0a20190806
Successfully installed tb-nightly-2.1.0a20191121


# 2. 구성 설정 / 확인

In [4]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np


import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds

# Report the runtime configuration: TensorFlow version, eager-execution flag,
# TF-Hub version, and whether any GPU device is visible.
print("버전: ", tf.__version__)
print("즉시 실행 모드: ", tf.executing_eagerly())
print("허브 버전: ", hub.__version__)

# An empty device list is falsy, which selects the "unavailable" message.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "사용 가능" if gpu_devices else "사용 불가능"
print("GPU ", gpu_status)

버전:  2.0.0-rc1
즉시 실행 모드:  True
허브 버전:  0.7.0
GPU  사용 불가능


# 3. 데이터셋 다운로드

In [5]:
# Split the IMDB training set 60/40 into training and validation portions.
# This yields 15,000 training, 10,000 validation, and 25,000 test examples.
#
# The splits are expressed as percent-slice strings instead of
# tfds.Split.TRAIN.subsplit([6, 4]): the subsplit API is the legacy form and is
# rejected by later tensorflow_datasets releases, which an unpinned install
# would pull in.
# NOTE(review): the exact examples that land in each portion may differ from
# what subsplit produced -- only the 60/40 proportions are preserved.
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews",
    split=("train[:60%]", "train[60%:]", "test"),
    as_supervised=True)  # yield (text, label) tuples instead of feature dicts

In [6]:
# Pull a single batch of 10 (review text, label) pairs to inspect the data.
first_batch = next(iter(train_data.batch(10)))
train_examples_batch, train_labels_batch = first_batch

# Display the raw review strings of that batch.
train_examples_batch

<tf.Tensor: id=224, shape=(10,), dtype=string, numpy=
array([b"As a lifelong fan of Dickens, I have invariably been disappointed by adaptations of his novels.<br /><br />Although his works presented an extremely accurate re-telling of human life at every level in Victorian Britain, throughout them all was a pervasive thread of humour that could be both playful or sarcastic as the narrative dictated. In a way, he was a literary caricaturist and cartoonist. He could be serious and hilarious in the same sentence. He pricked pride, lampooned arrogance, celebrated modesty, and empathised with loneliness and poverty. It may be a clich\xc3\xa9, but he was a people's writer.<br /><br />And it is the comedy that is so often missing from his interpretations. At the time of writing, Oliver Twist is being dramatised in serial form on BBC television. All of the misery and cruelty is their, but non of the humour, irony, and savage lampoonery. The result is just a dark, dismal experience: the story p

In [7]:
# Display the labels that belong to the 10 reviews fetched above.
# The recorded output is an int64 tensor of 0/1 values
# (presumably 1 = positive, 0 = negative -- confirm against the dataset card).
train_labels_batch

<tf.Tensor: id=225, shape=(10,), dtype=int64, numpy=array([1, 1, 1, 1, 1, 1, 0, 1, 1, 0])>

# 4. 단어 임베딩 사용하기 (20차원으로 축소)

In [8]:
# Pre-trained text-embedding module from TensorFlow Hub, wrapped as a Keras
# layer so it can serve as the first layer of the model.
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],    # each example is a single scalar string
    dtype=tf.string,
    trainable=True,    # let training update the embedding weights too
)

# Sanity check: embed the first three reviews (the recorded result has shape (3, 20)).
hub_layer(train_examples_batch[:3])

<tf.Tensor: id=405, shape=(3, 20), dtype=float32, numpy=
array([[ 3.9819887 , -4.4838037 ,  5.177359  , -2.3643482 , -3.2938678 ,
        -3.5364532 , -2.4786978 ,  2.5525482 ,  6.688532  , -2.3076782 ,
        -1.9807833 ,  1.1315885 , -3.0339816 , -0.7604128 , -5.743445  ,
         3.4242578 ,  4.790099  , -4.03061   , -5.992149  , -1.7297493 ],
       [ 3.4232912 , -4.230874  ,  4.1488533 , -0.29553518, -6.802391  ,
        -2.5163853 , -4.4002395 ,  1.905792  ,  4.7512794 , -0.40538004,
        -4.3401685 ,  1.0361497 ,  0.9744097 ,  0.71507156, -6.2657013 ,
         0.16533905,  4.560262  , -1.3106939 , -3.1121316 , -2.1338716 ],
       [ 3.8508697 , -5.003031  ,  4.8700504 , -0.04324996, -5.893603  ,
        -5.2983093 , -4.004676  ,  4.1236343 ,  6.267754  ,  0.11632943,
        -3.5934832 ,  0.8023905 ,  0.56146765,  0.9192484 , -7.3066816 ,
         2.8202746 ,  6.2000837 , -3.5709393 , -4.564525  , -2.305622  ]],
      dtype=float32)>

# 5. 모델 구성

In [9]:
# Binary sentiment classifier, stacked in order:
#   1. the TF-Hub embedding layer (string -> embedding vector),
#   2. a 16-unit ReLU hidden layer,
#   3. a single sigmoid unit producing the probability of the positive class.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 20)                400020    
_________________________________________________________________
dense (Dense)                (None, 16)                336       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


# 6. 훈련

In [10]:
# Configure training: binary cross-entropy matches the single sigmoid output,
# Adam is the optimizer, and accuracy is tracked alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [11]:
# Input pipelines: shuffle the training examples with a 10,000-element buffer
# and feed both splits in mini-batches of 512.
train_batches = train_data.shuffle(10000).batch(512)
validation_batches = validation_data.batch(512)

# Train for 20 epochs, checking the validation split after each epoch.
# `history` keeps the per-epoch metrics returned by fit().
history = model.fit(
    train_batches,
    epochs=20,
    validation_data=validation_batches,
    verbose=1,
)

Epoch 1/20


W1122 18:10:08.745627 4749467072 deprecation.py:323] From /Users/seojungwon/anaconda3/lib/python3.6/site-packages/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# 7. 평가 (테스트 세트)

In [12]:
# Score the trained model on the held-out test split, in batches of 512.
test_batches = test_data.batch(512)
results = model.evaluate(test_batches, verbose=2)

# Pair each reported value with its metric name and print it to three decimals.
for metric_name, metric_value in zip(model.metrics_names, results):
    print("%s: %.3f" % (metric_name, metric_value))

49/49 - 3s - loss: 0.3210 - accuracy: 0.8603
loss: 0.321
accuracy: 0.860
