In [2]:
import tensorflow as tf 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.ticker as ticker 
import urllib 
import sys 
import os 
import zipfile
import six

import inspect
import logging
import nose
import shutil
import six
import subprocess
import tarfile
import tempfile
from contextlib import closing
from six.moves.urllib.request import urlretrieve 


glove_zip_file = "glove.6B.zip"
glove_vectors_file = "glove.6B.100d.txt" # 훈련된 데이터

snli_zip_file = "snli_1.0.zip" 
snli_dev_file = "snli_1.0_dev.txt"
snli_full_dataset_file = "snli_1.0_train.txt" # dataset

#862 MB 
if (not os.path.isfile(glove_zip_file) and 
    not os.path.isfile(glove_vectors_file)): 
    urlretrieve ("http://nlp.stanford.edu/data/glove.6B.zip", glove_zip_file)

#94.6 MB 
if (not os.path.isfile(snli_zip_file) and 
    not os.path.isfile(snli_full_dataset_file)): 
    urlretrieve ("https://nlp.stanford.edu/projects/snli/snli_1.0.zip", snli_zip_file)

print("파일 다운로드 완료")    
    
def unzip_single_file(zip_file_name, output_file_name):
    if not os.path.isfile(output_file_name): 
        with open(output_file_name, 'wb') as out_file: 
            with zipfile.ZipFile(zip_file_name) as zipped: 
                for info in zipped.infolist(): 
                    if output_file_name in info.filename: 
                        with zipped.open(info) as requested_file:
                            out_file.write(requested_file.read())
                            return

unzip_single_file(glove_zip_file, glove_vectors_file) 
unzip_single_file(snli_zip_file, snli_full_dataset_file) 

print("파일 unzip 완료")

glove_wordmap = {}
with open(glove_vectors_file, "r",encoding='UTF8') as glove:
    for line in glove:
        name, vector = tuple(line.split(" ", 1))
        glove_wordmap[name] = np.fromstring(vector, sep=" ")

print("공백으로 구분된 형식의 Python 사전 만든 후 토큰화 하기")

# sequence 만들기
def sentence2sequence(sentence):
    tokens = sentence.lower().split(" ") 
    rows = [] 
    words = [] 
    #Greedy search for tokens 
    for token in tokens:
        i = len(token) 
        while len(token) > 0 and i > 0:
            word = token[:i] 
            if word in glove_wordmap:
                rows.append(glove_wordmap[word])
                words.append(word) 
                token = token[i:]
                i = len(token)
            else:
                i = i-1 
    return rows, words

# RNN 이용
run_size = 64
rnn = tf.contrib.rnn.BasicRNNCell(run_size)


# 네트워크 상수 정의
max_hypothesis_length, max_evidence_length = 30, 30 
batch_size, vector_size, hidden_size = 128, 50, 64

lstm_size = hidden_size

weight_decay = 0.0001

learning_rate = 1

input_p, output_p = 0.5, 0.5

training_iterations_count = 100000

display_step = 10



def score_setup(row): # 점수를 세팅한다.
    convert_dict = {
        'entailment': 0, #긍정
        'neutral': 1,    #중립
        'contradiction': 2 #부정
    }
    score = np.zeros((3,))
    for x in range(1,6):
        tag = row["label"+str(x)]
        if tag in convert_dict: score[convert_dict[tag]] += 1 
    return score / (1.0*np.sum(score))

def fit_to_size(matrix, shape): #알맞은 사이즈로
    res = np.zeros(shape)
    slices = [slice(0, min(dim, shape[e])) for e, dim in enumerate(matrix.shape)]
    res[slices] = matrix[slices] 
    return res

def split_data_into_scores(): # (가설, 증거), 정답, 점수 반환
    import csv 
    with open(snli_full_dataset_file,"r") as data:
        train = csv.DictReader(data, delimiter='\t') 
        evi_sentences = [] 
        hyp_sentences = [] 
        labels = [] 
        scores = [] 
        for row in train: 
            hyp_sentences.append(np.vstack(
                sentence2sequence (row["sentence1"].lower())[0]))
            evi_sentences.append(np.vstack(
                sentence2sequence(row["sentence2"].lower())[0]))
            labels.append(row["gold_label"])
            scores.append(score_setup(row))
            
            hyp_sentences = np.stack([fit_to_size(x, (max_hypothesis_length, vector_size)) 
                                      for x in hyp_sentences]) 
            evi_sentences = np.stack([fit_to_size(x, (max_evidence_length, vector_size)) 
                                      for x in evi_sentences])
            
            return (hyp_sentences, evi_sentences), labels, np.array(scores)
        
data_feature_list, correct_values, correct_scores = split_data_into_scores()

l_h, l_e = max_hypothesis_length, max_evidence_length
N, D, H = batch_size, vector_size, hidden_size 
l_seq = l_h + l_e


tf.reset_default_graph()

lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)

print("tenserflow를 이용해 LSTM 정의")

lstm_drop = tf.contrib.rnn.DropoutWrapper(lstm, input_p, output_p)

print("정규화를 위해 dropout 구현")


# N   : 데이터 하위 집합을 훈련하는데 사용하는 각 batch 요소 수
# 1_h : 가설의 최대 길이 또는 요소 수
# 1_e : 증거의 최대 길이

hyp = tf.placeholder(tf.float32, [N, l_h, D], 'hypothesis') # 트레이닝 중 가설
evi = tf.placeholder(tf.float32, [N, l_e, D], 'evidence')   # 트레이닝 중 증거
y = tf.placeholder(tf.float32, [N, 3], 'label') # 트레이닝 중 정답

lstm_back = tf.contrib.rnn.BasicRNNCell(lstm_size)

lstm_drop_back = tf.contrib.rnn.DropoutWrapper(lstm_back, input_p, output_p)

fc_initializer = tf.random_normal_initializer(stddev=0.1)

fc_weight = tf.get_variable('fc_weight', [2 * hidden_size, 3], initializer = fc_initializer)

fc_bias = tf.get_variable('bias', [3])


tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.nn.l2_loss(fc_weight))

x = tf.concat([hyp, evi], 1)
x = tf.transpose(x, [1, 0, 2])
x = tf.reshape(x, [-1, vector_size])
x = tf.split(x, l_seq,)

# rnn outputs 생성!  -1은 최신 출력
rnn_outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(lstm, lstm_back, x, dtype=tf.float32)

classification_scores = tf.matmul(rnn_outputs[-1], fc_weight) + fc_bias # 분류 점수!
# 0: 긍정
# 1: 중립
# 2: 부정

print("모델 완성!")

with tf.variable_scope('Accuracy'):
    predicts = tf.cast(tf.argmax(classification_scores, 1), dtype=tf.int32)
    y_label = tf.cast(tf.argmax(y, 1), dtype=tf.int32)
    corrects = tf.equal(predicts, y_label)
    num_corrects = tf.reduce_sum(tf.cast(corrects, dtype=tf.float32))
    accuracy = tf.reduce_mean(tf.cast(corrects, dtype=tf.float32))
    
with tf.variable_scope("loss"):  #full batch는 너무 오래 걸리므로 mini-batch 사용!
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits = classification_scores, labels = y) 
    loss = tf.reduce_mean(cross_entropy) 
    total_loss = loss + weight_decay * tf.add_n( tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))

optimizer = tf.train.GradientDescentOptimizer(learning_rate)
opt_op = optimizer.minimize(total_loss)

print("TensoeFlow로 정확도(accuracy) 계산 표현 완료")

print("학습 시작")

init = tf.global_variables_initializer()

# tqdm을 이용해 진행사항을 추적해줌!
tqdm_installed = False 
try:
    from tqdm import tqdm
    tqdm_installed = True 
except:
    pass

# Launch the Tensorflow session 

sess = tf.Session() 
sess.run(init)

training_iterations = range(0, training_iterations_count, batch_size)

if tqdm_installed:
    training_iterations = tqdm(training_iterations)

for i in training_iterations:
    batch = np.random.randint(data_feature_list[0].shape[0], size=batch_size)

    hyps, evis, ys = (data_feature_list[0][batch, :], 
                      data_feature_list[1][batch, :], 
                      correct_scores[batch])
    
    sess.run([opt_op], feed_dict={hyp: hyps, evi: evis, y: ys}) # 훈련!
    
    # 추적!
    if (i / batch_size) % display_step == 0:
        #acc = sess.run(accuracy, feed_dict={hyp: hyps, evi: evis, y: ys})
        tmp_loss = sess.run(loss, feed_dict={hyp: hyps, evi: evis, y: ys}) 
        print("Iter " + str(i/batch_size) + ", Mini-batch Loss= " + \
          "{:.6f}".format(tmp_loss))

acc = sess.run(accuracy, feed_dict={hyp: hyps, evi: evis, y: ys})
print("Accuracy : ", acc)

print("학습 완료!")
print()
print()
print()

# 검증!

evidences = ["Maurita and Jade both were at the scene of the car crash."]
hypotheses = ["Multiple people saw the accident."]

sentence1 = [fit_to_size(np.vstack(sentence2sequence (evidence)[0]),
                         (30, 50)) for evidence in evidences]
sentence2 = [fit_to_size(np.vstack(sentence2sequence (hypothesis)[0]),
                         (30, 50)) for hypothesis in hypotheses]

prediction = sess.run(classification_scores, feed_dict={ hyp: (sentence1 * N),
                                                         evi: (sentence2 * N),
                                                         y: [[0,0,0]]*N})

label = sess.run(y, feed_dict={ hyp: (sentence1 * N),
                                evi: (sentence2 * N),
                                y: [[0,0,0]]*N})

print("   Label  : " + ["Positive", "Neutral", "Negative"][np.argmax (label[0])])
print("Prediction: " + ["Positive", "Neutral", "Negative"][np.argmax (prediction[0])])

sess.close()


파일 다운로드 완료
파일 unzip 완료
공백으로 구분된 형식의 Python 사전 만든 후 토큰화 하기
tenserflow를 이용해 LSTM 정의
정규화를 위해 dropout 구현
모델 완성!
TensoeFlow로 정확도(accuracy) 계산 표현 완료
학습 시작


  0%|                                                                                          | 0/782 [00:00<?, ?it/s]

Iter 0.0, Mini-batch Loss= 0.210463


  1%|▉                                                                                 | 9/782 [00:01<02:47,  4.62it/s]

Iter 10.0, Mini-batch Loss= 0.006261


  2%|█▉                                                                               | 19/782 [00:02<01:44,  7.28it/s]

Iter 20.0, Mini-batch Loss= 0.002778


  4%|███                                                                              | 29/782 [00:03<01:24,  8.91it/s]

Iter 30.0, Mini-batch Loss= 0.001738


  5%|████                                                                             | 39/782 [00:03<01:14, 10.01it/s]

Iter 40.0, Mini-batch Loss= 0.001250


  6%|█████                                                                            | 49/782 [00:04<01:08, 10.77it/s]

Iter 50.0, Mini-batch Loss= 0.000969


  8%|██████                                                                           | 59/782 [00:05<01:03, 11.39it/s]

Iter 60.0, Mini-batch Loss= 0.000789


  9%|███████▏                                                                         | 69/782 [00:05<01:00, 11.84it/s]

Iter 70.0, Mini-batch Loss= 0.000663


 10%|████████▏                                                                        | 79/782 [00:06<00:57, 12.21it/s]

Iter 80.0, Mini-batch Loss= 0.000571


 11%|█████████▏                                                                       | 89/782 [00:07<00:55, 12.47it/s]

Iter 90.0, Mini-batch Loss= 0.000501


 13%|██████████▎                                                                      | 99/782 [00:07<00:53, 12.74it/s]

Iter 100.0, Mini-batch Loss= 0.000446


 14%|███████████▏                                                                    | 109/782 [00:08<00:51, 12.95it/s]

Iter 110.0, Mini-batch Loss= 0.000401


 15%|████████████▏                                                                   | 119/782 [00:09<00:50, 13.12it/s]

Iter 120.0, Mini-batch Loss= 0.000365


 16%|█████████████▏                                                                  | 129/782 [00:09<00:49, 13.28it/s]

Iter 130.0, Mini-batch Loss= 0.000334


 18%|██████████████▏                                                                 | 139/782 [00:10<00:47, 13.43it/s]

Iter 140.0, Mini-batch Loss= 0.000308


 19%|███████████████▏                                                                | 149/782 [00:10<00:46, 13.55it/s]

Iter 150.0, Mini-batch Loss= 0.000286


 20%|████████████████▎                                                               | 159/782 [00:11<00:45, 13.66it/s]

Iter 160.0, Mini-batch Loss= 0.000267


 22%|█████████████████▎                                                              | 169/782 [00:12<00:44, 13.71it/s]

Iter 170.0, Mini-batch Loss= 0.000250


 23%|██████████████████▎                                                             | 179/782 [00:12<00:43, 13.79it/s]

Iter 180.0, Mini-batch Loss= 0.000236


 24%|███████████████████▎                                                            | 189/782 [00:13<00:42, 13.88it/s]

Iter 190.0, Mini-batch Loss= 0.000222


 25%|████████████████████▎                                                           | 199/782 [00:14<00:41, 13.95it/s]

Iter 200.0, Mini-batch Loss= 0.000211


 27%|█████████████████████▍                                                          | 209/782 [00:14<00:40, 14.04it/s]

Iter 210.0, Mini-batch Loss= 0.000200


 28%|██████████████████████▍                                                         | 219/782 [00:15<00:39, 14.10it/s]

Iter 220.0, Mini-batch Loss= 0.000190


 29%|███████████████████████▍                                                        | 229/782 [00:16<00:39, 14.14it/s]

Iter 230.0, Mini-batch Loss= 0.000182


 31%|████████████████████████▍                                                       | 239/782 [00:16<00:38, 14.18it/s]

Iter 240.0, Mini-batch Loss= 0.000174


 32%|█████████████████████████▍                                                      | 249/782 [00:17<00:37, 14.24it/s]

Iter 250.0, Mini-batch Loss= 0.000167


 33%|██████████████████████████▍                                                     | 259/782 [00:18<00:36, 14.28it/s]

Iter 260.0, Mini-batch Loss= 0.000160


 34%|███████████████████████████▌                                                    | 269/782 [00:18<00:35, 14.33it/s]

Iter 270.0, Mini-batch Loss= 0.000154


 36%|████████████████████████████▌                                                   | 279/782 [00:19<00:35, 14.35it/s]

Iter 280.0, Mini-batch Loss= 0.000149


 37%|█████████████████████████████▌                                                  | 289/782 [00:20<00:34, 14.37it/s]

Iter 290.0, Mini-batch Loss= 0.000143


 38%|██████████████████████████████▌                                                 | 299/782 [00:20<00:33, 14.39it/s]

Iter 300.0, Mini-batch Loss= 0.000139


 40%|███████████████████████████████▌                                                | 309/782 [00:21<00:32, 14.41it/s]

Iter 310.0, Mini-batch Loss= 0.000134


 41%|████████████████████████████████▋                                               | 319/782 [00:22<00:32, 14.42it/s]

Iter 320.0, Mini-batch Loss= 0.000130


 42%|█████████████████████████████████▋                                              | 329/782 [00:22<00:31, 14.45it/s]

Iter 330.0, Mini-batch Loss= 0.000126


 43%|██████████████████████████████████▋                                             | 339/782 [00:23<00:30, 14.46it/s]

Iter 340.0, Mini-batch Loss= 0.000122


 45%|███████████████████████████████████▋                                            | 349/782 [00:24<00:29, 14.46it/s]

Iter 350.0, Mini-batch Loss= 0.000119


 46%|████████████████████████████████████▋                                           | 359/782 [00:24<00:29, 14.48it/s]

Iter 360.0, Mini-batch Loss= 0.000116


 47%|█████████████████████████████████████▋                                          | 369/782 [00:25<00:28, 14.50it/s]

Iter 370.0, Mini-batch Loss= 0.000113


 48%|██████████████████████████████████████▊                                         | 379/782 [00:26<00:27, 14.53it/s]

Iter 380.0, Mini-batch Loss= 0.000110


 50%|███████████████████████████████████████▊                                        | 389/782 [00:26<00:26, 14.56it/s]

Iter 390.0, Mini-batch Loss= 0.000107


 51%|████████████████████████████████████████▊                                       | 399/782 [00:27<00:26, 14.59it/s]

Iter 400.0, Mini-batch Loss= 0.000104


 52%|█████████████████████████████████████████▊                                      | 409/782 [00:27<00:25, 14.62it/s]

Iter 410.0, Mini-batch Loss= 0.000102


 54%|██████████████████████████████████████████▊                                     | 419/782 [00:28<00:24, 14.64it/s]

Iter 420.0, Mini-batch Loss= 0.000100


 55%|███████████████████████████████████████████▉                                    | 429/782 [00:29<00:24, 14.67it/s]

Iter 430.0, Mini-batch Loss= 0.000097


 56%|████████████████████████████████████████████▉                                   | 439/782 [00:29<00:23, 14.69it/s]

Iter 440.0, Mini-batch Loss= 0.000095


 57%|█████████████████████████████████████████████▉                                  | 449/782 [00:30<00:22, 14.71it/s]

Iter 450.0, Mini-batch Loss= 0.000093


 59%|██████████████████████████████████████████████▉                                 | 459/782 [00:31<00:21, 14.72it/s]

Iter 460.0, Mini-batch Loss= 0.000091


 60%|███████████████████████████████████████████████▉                                | 469/782 [00:31<00:21, 14.74it/s]

Iter 470.0, Mini-batch Loss= 0.000090


 61%|█████████████████████████████████████████████████                               | 479/782 [00:32<00:20, 14.76it/s]

Iter 480.0, Mini-batch Loss= 0.000088


 63%|██████████████████████████████████████████████████                              | 489/782 [00:33<00:19, 14.78it/s]

Iter 490.0, Mini-batch Loss= 0.000086


 64%|███████████████████████████████████████████████████                             | 499/782 [00:33<00:19, 14.80it/s]

Iter 500.0, Mini-batch Loss= 0.000085


 65%|████████████████████████████████████████████████████                            | 509/782 [00:34<00:18, 14.82it/s]

Iter 510.0, Mini-batch Loss= 0.000083


 66%|█████████████████████████████████████████████████████                           | 519/782 [00:34<00:17, 14.84it/s]

Iter 520.0, Mini-batch Loss= 0.000082


 68%|██████████████████████████████████████████████████████                          | 529/782 [00:35<00:17, 14.85it/s]

Iter 530.0, Mini-batch Loss= 0.000080


 69%|███████████████████████████████████████████████████████▏                        | 539/782 [00:36<00:16, 14.87it/s]

Iter 540.0, Mini-batch Loss= 0.000079


 70%|████████████████████████████████████████████████████████▏                       | 549/782 [00:36<00:15, 14.89it/s]

Iter 550.0, Mini-batch Loss= 0.000077


 71%|█████████████████████████████████████████████████████████▏                      | 559/782 [00:37<00:14, 14.90it/s]

Iter 560.0, Mini-batch Loss= 0.000076


 73%|██████████████████████████████████████████████████████████▏                     | 569/782 [00:38<00:14, 14.92it/s]

Iter 570.0, Mini-batch Loss= 0.000075


 74%|███████████████████████████████████████████████████████████▏                    | 579/782 [00:38<00:13, 14.93it/s]

Iter 580.0, Mini-batch Loss= 0.000074


 75%|████████████████████████████████████████████████████████████▎                   | 589/782 [00:39<00:12, 14.94it/s]

Iter 590.0, Mini-batch Loss= 0.000073


 77%|█████████████████████████████████████████████████████████████▎                  | 599/782 [00:40<00:12, 14.95it/s]

Iter 600.0, Mini-batch Loss= 0.000072


 78%|██████████████████████████████████████████████████████████████▎                 | 609/782 [00:40<00:11, 14.95it/s]

Iter 610.0, Mini-batch Loss= 0.000071


 79%|███████████████████████████████████████████████████████████████▎                | 619/782 [00:41<00:10, 14.96it/s]

Iter 620.0, Mini-batch Loss= 0.000070


 80%|████████████████████████████████████████████████████████████████▎               | 629/782 [00:42<00:10, 14.96it/s]

Iter 630.0, Mini-batch Loss= 0.000069


 82%|█████████████████████████████████████████████████████████████████▎              | 639/782 [00:42<00:09, 14.96it/s]

Iter 640.0, Mini-batch Loss= 0.000068


 83%|██████████████████████████████████████████████████████████████████▍             | 649/782 [00:43<00:08, 14.96it/s]

Iter 650.0, Mini-batch Loss= 0.000067


 84%|███████████████████████████████████████████████████████████████████▍            | 659/782 [00:44<00:08, 14.97it/s]

Iter 660.0, Mini-batch Loss= 0.000066


 86%|████████████████████████████████████████████████████████████████████▍           | 669/782 [00:44<00:07, 14.97it/s]

Iter 670.0, Mini-batch Loss= 0.000065


 87%|█████████████████████████████████████████████████████████████████████▍          | 679/782 [00:45<00:06, 14.98it/s]

Iter 680.0, Mini-batch Loss= 0.000064


 88%|██████████████████████████████████████████████████████████████████████▍         | 689/782 [00:45<00:06, 14.99it/s]

Iter 690.0, Mini-batch Loss= 0.000064


 89%|███████████████████████████████████████████████████████████████████████▌        | 699/782 [00:46<00:05, 15.00it/s]

Iter 700.0, Mini-batch Loss= 0.000063


 91%|████████████████████████████████████████████████████████████████████████▌       | 709/782 [00:47<00:04, 15.01it/s]

Iter 710.0, Mini-batch Loss= 0.000062


 92%|█████████████████████████████████████████████████████████████████████████▌      | 719/782 [00:47<00:04, 15.03it/s]

Iter 720.0, Mini-batch Loss= 0.000061


 93%|██████████████████████████████████████████████████████████████████████████▌     | 729/782 [00:48<00:03, 15.03it/s]

Iter 730.0, Mini-batch Loss= 0.000061


 95%|███████████████████████████████████████████████████████████████████████████▌    | 739/782 [00:49<00:02, 15.04it/s]

Iter 740.0, Mini-batch Loss= 0.000060


 96%|████████████████████████████████████████████████████████████████████████████▌   | 749/782 [00:49<00:02, 15.05it/s]

Iter 750.0, Mini-batch Loss= 0.000059


 97%|█████████████████████████████████████████████████████████████████████████████▋  | 759/782 [00:50<00:01, 15.06it/s]

Iter 760.0, Mini-batch Loss= 0.000059


 98%|██████████████████████████████████████████████████████████████████████████████▋ | 769/782 [00:51<00:00, 15.07it/s]

Iter 770.0, Mini-batch Loss= 0.000058


100%|███████████████████████████████████████████████████████████████████████████████▋| 779/782 [00:51<00:00, 15.08it/s]

Iter 780.0, Mini-batch Loss= 0.000058


100%|████████████████████████████████████████████████████████████████████████████████| 782/782 [00:51<00:00, 15.07it/s]


Accuracy :  0.97
학습 완료!



   Label  : Positive
Prediction: Neutral


In [92]:

sess = tf.Session() 
sess.run(init)

evidences = ["Maurita and Jade both were at the scene of the car crash."]
hypotheses = ["Multiple people saw the accident."]

sentence1 = [fit_to_size(np.vstack(sentence2sequence (evidence)[0]),
                         (30, 50)) for evidence in evidences]
sentence2 = [fit_to_size(np.vstack(sentence2sequence (hypothesis)[0]),
                         (30, 50)) for hypothesis in hypotheses]

prediction = sess.run(classification_scores, feed_dict={ hyp: (sentence1 * N),
                                                         evi: (sentence2 * N),
                                                         y: [[0,0,0]]*N})

label = sess.run(y, feed_dict={ hyp: (sentence1 * N),
                                evi: (sentence2 * N),
                                y: [[0,0,0]]*N})

print("   Label  : " + ["Positive", "Neutral", "Negative"][np.argmax (label[0])])
print("Prediction: " + ["Positive", "Neutral", "Negative"][np.argmax (prediction[0])])

sess.close()


   Label  : Positive
Prediction: Positive
