In [2]:
import os
import numpy as np
import pandas as pd
import gensim
from scipy.sparse import find
import tensorflow as tf
import random
import time
import datetime
import re
import MeCab

from jamo import h2j, j2hcj
import gc



In [3]:
# Module-level MeCab tagger instance.
# NOTE(review): no visible cell reads this global (Word2Vec.parse_sentence
# builds its own tagger), so it may be a leftover — confirm before removing.
mecab = MeCab.Tagger()

In [4]:
def load_w2v_dic(path, flag, w2v_size, filetype):
    """Load a saved gensim Word2Vec model from ``path`` by flag and vector size.

    A file matches when its name contains ``"<flag>.<filetype>"`` and also
    contains ``str(w2v_size)``; both are plain substring checks, so the size
    may in principle also match digits elsewhere in the name (e.g. inside a
    timestamp prefix).

    Args:
        path: Directory to search.
        flag: Dictionary flavour tag embedded in the file name (e.g. ``'WT'``).
        w2v_size: Vector size embedded in the file name.
        filetype: File extension without the leading dot (e.g. ``'bin'``).

    Returns:
        The model object returned by ``gensim.models.Word2Vec.load``.

    Raises:
        FileNotFoundError: If no file in ``path`` matches (previously this
            surfaced as an uninformative ``IndexError``).
    """
    suffix = flag + '.' + filetype
    size_tag = str(w2v_size)

    # Sort the candidates so the selected file does not depend on the
    # arbitrary ordering of os.listdir when several models match.
    modelNames = sorted(
        fn for fn in os.listdir(path) if suffix in fn and size_tag in fn
    )
    if not modelNames:
        raise FileNotFoundError(
            "no Word2Vec model matching '*{}*{}' found in {!r}".format(size_tag, suffix, path)
        )

    model = gensim.models.Word2Vec.load(os.path.join(path, modelNames[0]))
    print(modelNames[0]+' is loaded')
    return model

In [8]:
# Processing differs per dataset.
dataDir = 'data/uplusInput/train'
allFileNames = os.listdir(dataDir)

# Every directory entry that is not dot-prefixed, as a path under dataDir.
filePaths = [
    os.path.join(dataDir, entry)
    for entry in allFileNames
    if not entry.startswith('.')
]

# Read each CSV in turn; when several exist, the names below stay bound to
# the last one read.
for fpn in filePaths:
    if not fpn.endswith('.csv'):
        continue
    datas = pd.read_csv(fpn)
    input_x = datas['content']
    y_train = datas[['neu','pos','neg']]
        

In [9]:
# Same loading procedure as for the training data, pointed at the test directory.
dataDir = 'data/uplusInput/test'
allFileNames = os.listdir(dataDir)

# Every directory entry that is not dot-prefixed, as a path under dataDir.
filePaths = [
    os.path.join(dataDir, entry)
    for entry in allFileNames
    if not entry.startswith('.')
]

# Read each CSV in turn; when several exist, the names below stay bound to
# the last one read.
for fpn in filePaths:
    if not fpn.endswith('.csv'):
        continue
    datas = pd.read_csv(fpn)
    test_input_x = datas['content']
    y_test = datas[['neu','pos','neg']]


In [10]:
####################################################
# divide train/test set function                   #
####################################################
def divide(x, y, train_prop):
    """Shuffle ``x`` and ``y`` with one shared permutation and split them.

    Args:
        x: Samples; must support NumPy integer-array indexing
            (e.g. an ``np.ndarray``).
        y: Labels aligned with ``x``; same indexing requirement.
        train_prop: Fraction of the samples that goes to the training part.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``. The training parts hold
        the first ``round(train_prop * len(x))`` shuffled rows and the test
        parts hold the remainder (empty when ``train_prop`` is 1).
    """
    n_total = len(x)
    n_train = int(round(train_prop * n_total))

    # The permutation comes from NumPy, so the seed has to be applied to a
    # NumPy generator; seeding the stdlib `random` module (as before) had no
    # effect on it. A local RandomState keeps the split reproducible without
    # touching global random state.
    order = np.random.RandomState(1234).permutation(n_total)
    x_shuffled = x[order]
    y_shuffled = y[order]

    # Slice from n_train onward rather than with a negative offset: a
    # negative offset of zero (`[-0:]`) selects everything, which made the
    # test part equal to the whole data set when train_prop was 1.
    return (
        x_shuffled[:n_train],
        x_shuffled[n_train:],
        y_shuffled[:n_train],
        y_shuffled[n_train:],
    )

In [11]:
def find_vector(word_array, word_vectors, word2vector_size=300):
    """Map each token to its embedding, drawing a small random vector for unknown tokens.

    Args:
        word_array: Iterable of tokens.
        word_vectors: Object exposing a ``vocab`` container (membership test)
            and item lookup by token.
        word2vector_size: Length of the random vector generated for tokens
            that are not in ``word_vectors.vocab``.

    Returns:
        List with one vector per token, in input order.
    """
    known_tokens = word_vectors.vocab
    return [
        word_vectors[token]
        if token in known_tokens
        # Out-of-vocabulary token: Gaussian noise with standard deviation 0.01.
        else np.random.normal(scale=1e-2, size=word2vector_size)
        for token in word_array
    ]

In [12]:
def batch_iter(x_train, x_train2, y_train, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset with two aligned input streams.

    Args:
        x_train: First input array; must support NumPy integer-array indexing
            when ``shuffle`` is true.
        x_train2: Second input array, aligned row-by-row with ``x_train``.
        y_train: Label array, aligned row-by-row with ``x_train``.
        batch_size: Maximum number of rows per batch.
        num_epochs: Number of passes over the data.
        shuffle: When true, draw a fresh permutation (shared by all three
            arrays) at the start of every epoch.

    Yields:
        For every batch, a ``zip`` over the ``(x1, x2, y)`` rows in that batch.
        Nothing is yielded for empty input.
    """
    data_size = len(x_train)
    # Ceiling division; unlike int((n - 1) / batch_size) + 1 this gives zero
    # batches for empty input instead of a single empty batch.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_x_data = x_train[shuffle_indices]
            shuffled_x_data2 = x_train2[shuffle_indices]
            shuffled_y_data = y_train[shuffle_indices]
        else:
            # This branch used to assign a misspelled name, so the first
            # batch failed with an unbound variable when shuffle was False.
            shuffled_x_data = x_train
            shuffled_x_data2 = x_train2
            shuffled_y_data = y_train

        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield zip(
                shuffled_x_data[start_index:end_index],
                shuffled_x_data2[start_index:end_index],
                shuffled_y_data[start_index:end_index],
            )

In [16]:
# Word2Vec settings shared by both pre-trained dictionaries.
SAVE_PATH = './wordDic/'
word2vector_size = 150
window = 5      # gensim `window`: max distance between the current and predicted word
min_count = 2   # gensim `min_count`: words with a lower total frequency are ignored

# Dictionary flavours: 'WT' = MeCab word + POS tag, 'JM' = jamo units.
flag1, flag2 = 'WT', 'JM'

# Load the WT model first, then the JM model, keeping only the keyed vectors handy.
model1 = load_w2v_dic(SAVE_PATH, flag1, word2vector_size, 'bin')
word_vectors_WT = model1.wv

model2 = load_w2v_dic(SAVE_PATH, flag2, word2vector_size, 'bin')
word_vectors_JM = model2.wv

201804231620_150_WT.bin is loaded
201804231704_150_JM.bin is loaded


In [17]:
# Hyperparameter / run-configuration flags read by the cells below.

# Every sentence is padded with zero vectors or truncated to this many tokens.
max_seq_size = 150
# Convolution filter heights and the number of filters for each; passed to TextCNN.
filter_sizes = [2, 3]
num_filters = [128, 128]

# Per-branch multipliers applied to the pooled convolution outputs inside TextCNN.
input_weights = [1,1]
# Keep probability fed to the dropout layer during training steps (evaluation feeds 1.0).
dropout_keep_prob = 0.7
# Coefficient of the L2 penalty added to the loss in TextCNN.
l2_reg_lambda = 0.1

batch_size = 32
num_epochs = 100
# Evaluate on the test arrays / save a checkpoint whenever the global step
# is a multiple of these values.
evaluate_every = 200
checkpoint_every = 200
# Passed to tf.train.Saver as max_to_keep.
num_checkpoints = 5

# Device-placement options intended for the TensorFlow session configuration.
allow_soft_placement = True
log_device_placement = False

In [18]:
# Per-sentence embedding sequences for the two tokenizations (WT and JM),
# each forced to exactly max_seq_size vectors.
input_x_vec1 = []
input_x_vec2 = []

w2v1 = Word2Vec(word_vec_size=word2vector_size, window=window, min_count=min_count, flag=flag1)
w2v2 = Word2Vec(word_vec_size=word2vector_size, window=window, min_count=min_count, flag=flag2)

for sentence in input_x:
    tokens_wt = w2v1.make_input([sentence])  # WT
    tokens_jm = w2v2.make_input([sentence])  # JM
    vectors_wt = find_vector(tokens_wt, word_vectors_WT, word2vector_size)
    vectors_jm = find_vector(tokens_jm, word_vectors_JM, word2vector_size)

    # Truncate to max_seq_size, then top up with zero vectors when shorter.
    vectors_wt = vectors_wt[:max_seq_size]
    vectors_wt.extend(
        np.zeros(word2vector_size) for _ in range(max_seq_size - len(vectors_wt))
    )

    vectors_jm = vectors_jm[:max_seq_size]
    vectors_jm.extend(
        np.zeros(word2vector_size) for _ in range(max_seq_size - len(vectors_jm))
    )

    input_x_vec1.append(vectors_wt)
    input_x_vec2.append(vectors_jm)

# The raw text is no longer needed once it has been embedded.
del input_x

In [21]:
# Per-sentence embedding sequences for the test text, built the same way as
# for the training text and reusing the w2v1 / w2v2 tokenizers.
test_input_x_vec1 = []
test_input_x_vec2 = []

for sentence in test_input_x:
    tokens_wt = w2v1.make_input([sentence])  # WT
    tokens_jm = w2v2.make_input([sentence])  # JM
    vectors_wt = find_vector(tokens_wt, word_vectors_WT, word2vector_size)
    vectors_jm = find_vector(tokens_jm, word_vectors_JM, word2vector_size)

    # Truncate to max_seq_size, then top up with zero vectors when shorter.
    vectors_wt = vectors_wt[:max_seq_size]
    vectors_wt.extend(
        np.zeros(word2vector_size) for _ in range(max_seq_size - len(vectors_wt))
    )

    vectors_jm = vectors_jm[:max_seq_size]
    vectors_jm.extend(
        np.zeros(word2vector_size) for _ in range(max_seq_size - len(vectors_jm))
    )

    test_input_x_vec1.append(vectors_wt)
    test_input_x_vec2.append(vectors_jm)

# Raw text and tokenizers are not needed past this point.
del test_input_x, w2v1, w2v2


In [22]:


# Stack the padded sequences into dense arrays and drop the Python lists.
x_train1 = np.array(input_x_vec1)
x_train2 = np.array(input_x_vec2)
del input_x_vec1, input_x_vec2

x_test1 = np.array(test_input_x_vec1)
x_test2 = np.array(test_input_x_vec2)
del test_input_x_vec1, test_input_x_vec2

# Labels as plain arrays.
# NOTE(review): this rebinds y_train / y_test in place, so the cell is not
# idempotent with respect to the loading cells above.
y_train = np.array(y_train)
y_test = np.array(y_test)

# The word2vec models and their vectors are no longer referenced below.
del model1, model2
del word_vectors_WT, word_vectors_JM



In [15]:
# del input_x_vec
# del test_input_x_vec 
# del input_x
# del test_input_x
# del w2v
# del model
# del word_vectors

In [15]:
# divide dataset into train/test set - used when a single document is split into both sets
# x_train, x_test, y_train, y_test = divide(np.array(input_x_vec),np.array(input_y),train_prop=1)

In [34]:
class TextCNN(object):
    """
    A CNN for text classification over two pre-embedded input streams.
    One convolution + max-pooling branch is built per (filter size, filter count)
    pair; branch i reads input stream i and its pooled output is multiplied by
    input_weights[i]. The pooled features are concatenated, passed through
    dropout and projected to class scores.
    <Parameters>
        - sequence_length: maximum sentence length
        - num_classes: number of classes
        - embedding_size: dimension of the embedding vector of each word
        - filter_sizes: sizes of the convolutional filters (= how many words each filter spans) (e.g. "3, 4, 5")
        - num_filters: number of filters for each filter size
        - l2_reg_lambda: coefficient of the L2 penalty on the output-layer W and b
        - input_weights: per-branch multipliers for the pooled outputs
    """
    def make_filter_set(self, filter_size, num_filters):
        """Pair filter sizes with filter counts, cycling through the shorter list.

        Plain ints are treated as one-element lists. Returns a list of
        ``(filter_size, num_filters)`` tuples as long as the longer argument.
        """
        if type(filter_size) is int:
            filter_size = [filter_size]
        if type(num_filters) is int:
            num_filters = [num_filters]

        sizes_lead = len(filter_size) >= len(num_filters)
        if sizes_lead:
            longer, shorter = filter_size, num_filters
        else:
            longer, shorter = num_filters, filter_size

        pairs = []
        for position, item in enumerate(longer):
            # Wrap around the shorter list once it runs out.
            partner = shorter[position % len(shorter)]
            pairs.append((item, partner) if sizes_lead else (partner, item))
        return pairs

    def __init__(
            self, sequence_length, num_classes, 
            embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0, input_weights=[1.0]):
        """Build the graph: placeholders, conv/pool branches, dropout, scores, loss and accuracy."""

        # Placeholders for the two input streams, the labels and the dropout keep probability.
        stream_shape = [None, sequence_length, embedding_size]
        self.input_x1 = tf.placeholder(tf.float32, stream_shape, name="input_x1")
        self.input_x2 = tf.placeholder(tf.float32, stream_shape, name="input_x2")
        streams = [self.input_x1, self.input_x2]

        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Running L2 regularization term (only the output layer contributes).
        l2_loss = tf.constant(0.0)

        filter_plan = self.make_filter_set(filter_size=filter_sizes, num_filters=num_filters)

        # One convolution + max-pool branch per planned filter set.
        # NOTE(review): branch idx reads streams[idx] and input_weights[idx], so
        # more filter sets than streams / weights would raise IndexError.
        num_filters_total = 0
        pooled_outputs = []

        for idx, (filter_size, num_filter) in enumerate(filter_plan):
            with tf.name_scope("conv-%s" % filter_size):
                # Add a trailing channel axis so the stream can go through conv2d.
                input_vec = tf.expand_dims(streams[idx], -1)

                input_height = int(input_vec.shape[1])
                input_width = int(input_vec.shape[2])

                # The filter spans the full embedding width.
                filter_shape = [filter_size, input_width, 1, num_filter]
                print("filter shape : "+str(filter_shape))

                # Tunable area: stddev / truncated_normal <-> random_normal.
                W = tf.Variable(tf.truncated_normal(filter_shape), name="W")
                b = tf.Variable(tf.random_normal(shape=[num_filter]), name="b")
                conv = tf.nn.conv2d(
                    input_vec, W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                # Bias + ReLU nonlinearity.
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                print("conv result shape : "+str(h.shape))

            with tf.name_scope("maxpool-%s" % filter_size):
                # Pool over the whole convolution output so each filter yields one value.
                conv_height = int(h.shape[1])
                conv_width = int(h.shape[2])

                pool_size = [1, conv_height, conv_width, 1]
                print("pool size : "+ str(pool_size))
                pooled = tf.nn.max_pool(
                    h, ksize=pool_size, strides=[1, 1, 1, 1], padding='VALID', name="pool")
                print("pooled shape : "+ str(pooled.shape))
                pooled_outputs.append(input_weights[idx] * pooled)
                num_filters_total += num_filter

        with tf.name_scope("dropout"):
            # Concatenate along the filter axis, flatten, then apply dropout.
            self.h_pool = tf.concat(axis=3, values=pooled_outputs)
            self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Final (unnormalized) scores and predictions
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Mean cross-entropy loss plus the weighted L2 term.
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=self.input_y, logits=self.scores)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Fraction of rows whose arg-max prediction equals the arg-max label.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

In [36]:
# 3. train the model and test
with tf.Graph().as_default():
    # Build the session from the placement flags defined with the other
    # hyperparameters; they were declared but never handed to the session.
    session_conf = tf.ConfigProto(
        allow_soft_placement=allow_soft_placement,
        log_device_placement=log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        with tf.device('/cpu:0'):
            cnn = TextCNN(sequence_length=len(x_train1[0]),
                          num_classes=y_train.shape[1],
                          embedding_size=word2vector_size,
                          filter_sizes=filter_sizes,
                          num_filters=num_filters,
                          l2_reg_lambda=l2_reg_lambda,
                          input_weights=input_weights)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            def train_step(x_batch1, x_batch2, y_batch):
                """
                A single training step: run the optimizer on one batch and log its summaries.

                Failures are reported and swallowed (best effort), so one bad
                batch does not abort the whole run; the global step does not
                advance in that case.
                """
                feed_dict = {
                    cnn.input_x1: x_batch1,
                    cnn.input_x2: x_batch2,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: dropout_keep_prob
                }
                try:
                    _, step, summaries, loss, accuracy = sess.run(
                        [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                        feed_dict)
                    train_summary_writer.add_summary(summaries, step)
                except Exception as e:
                    print(e)

            def dev_step(x_batch1, x_batch2, y_batch, writer=None):
                """
                Evaluates model on a dev set (dropout disabled) and prints loss / accuracy.
                """
                feed_dict = {
                    cnn.input_x1: x_batch1,
                    cnn.input_x2: x_batch2,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = batch_iter(x_train1, x_train2, y_train, batch_size, num_epochs)

            # Step value for which evaluation / checkpointing was last considered.
            last_handled_step = tf.train.global_step(sess, global_step)

            try:
                # Training loop. For each batch...
                for batch in batches:
                    gc.collect()  # works around memory pressure between batches
                    x_batch1, x_batch2, y_batch = zip(*batch)
                    train_step(x_batch1, x_batch2, y_batch)
                    current_step = tf.train.global_step(sess, global_step)

                    # If train_step swallowed an error the counter did not move.
                    # Skip the periodic work then, otherwise a step stuck on a
                    # multiple of the interval (including 0) would evaluate and
                    # checkpoint again on every single batch.
                    if current_step == last_handled_step:
                        continue
                    last_handled_step = current_step

                    if current_step % evaluate_every == 0:
                        print("\nEvaluation:")
                        dev_step(x_test1, x_test2, y_test, writer=dev_summary_writer)
                        print("")
                    if current_step % checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
            finally:
                # Flush pending events to disk; the writers were never closed
                # before, so the tail of the event files could be lost.
                train_summary_writer.close()
                dev_summary_writer.close()

filter shape : [2, 150, 1, 128]
conv result shape : (?, 149, 1, 128)
pool size : [1, 149, 1, 1]
pooled shape : (?, 1, 1, 128)
filter shape : [3, 150, 1, 128]
conv result shape : (?, 148, 1, 128)
pool size : [1, 148, 1, 1]
pooled shape : (?, 1, 1, 128)
Writing to D:\emotionTest\runs\1524718696


Evaluation:
2018-04-26T13:59:35.316984: step 200, loss 4.55601, acc 0.7

Saved model checkpoint to D:\emotionTest\runs\1524718696\checkpoints\model-200


Evaluation:
2018-04-26T14:01:24.815103: step 400, loss 2.80501, acc 0.73

Saved model checkpoint to D:\emotionTest\runs\1524718696\checkpoints\model-400


Evaluation:
2018-04-26T14:01:48.786026: step 600, loss 2.87532, acc 0.79

Saved model checkpoint to D:\emotionTest\runs\1524718696\checkpoints\model-600


Evaluation:
2018-04-26T14:03:20.028471: step 800, loss 1.46162, acc 0.82

Saved model checkpoint to D:\emotionTest\runs\1524718696\checkpoints\model-800


Evaluation:
2018-04-26T14:04:59.976373: step 1000, loss 1.0377, acc 0.78

Saved model

In [1]:
class Word2Vec(object):
    """
    Tokenizes sentences in one of several modes and builds / saves gensim
    Word2Vec dictionaries from them.
    <Parameters>
        - word_vec_size : vector size per token
        - window : forwarded to gensim as `window`
        - min_count : forwarded to gensim as `min_count`
        - flag : tokenization mode
          1) W  : MeCab morphological analysis, surface forms only
          2) WT : MeCab surface form + '/' + POS tag
          3) JM : jamo units
          4) LN : fixed-length chunks of N characters (N = 1, 2, ...); a bare 'L' uses N = 1
    <Function>
        - make_input(sentence_list) : flat token list for the given sentences
        - make_w2v_dic(sentence_list) : train a gensim model on the tokenized sentences
        - save_w2v_dic(sentence_list, path) : train and save the model under `path`
    """
    def __init__(self, word_vec_size=300, window=5, min_count=2, flag='W'):
        self.word_vec_size = word_vec_size
        self.window = window
        self.min_count = min_count
        self.flag = flag

    def _get_tagger(self):
        """Return this instance's MeCab tagger, creating it on first use."""
        tagger = getattr(self, '_tagger', None)
        if tagger is None:
            tagger = self._tagger = MeCab.Tagger()
        return tagger

    @staticmethod
    def _tokens_from_mecab(parse_result, with_pos):
        """Turn raw MeCab output text into tokens, optionally as 'surface/POS'."""
        tokens = []
        for info in (parse_result or '').split('\n'):
            if info == 'EOS' or info == '':
                continue
            # Split surface from features at the tab first: splitting on ','
            # first (as before) produced an empty token when the surface
            # form itself was a comma.
            surface, tab, features = info.partition('\t')
            if with_pos and tab:
                tokens.append(surface + '/' + features.split(',')[0])
            else:
                tokens.append(surface)
        return tokens

    def _mode_param(self):
        """Derive the mode-specific parameter for parse_sentence from the flag."""
        if self.flag.startswith('W'):
            return len(self.flag) > 1 and self.flag[1] == 'T'
        if self.flag.startswith('L'):
            # Accept multi-digit chunk sizes; a bare 'L' previously left the
            # parameter unbound, it now means one character per chunk.
            return int(self.flag[1:]) if len(self.flag) > 1 else 1
        return ''

    def parse_sentence(self, text, param):
        """Tokenize one sentence according to the flag.

        Whitespace is removed first. `param` is the POS-tagging switch for
        the 'W' modes and the chunk length for the 'L' modes.

        Returns:
            A list of tokens, or False when `text` is not a string.
        """
        if not isinstance(text, str):
            return False

        text = ''.join(text.split())  # remove all whitespace

        if self.flag.startswith('W'):
            return self._tokens_from_mecab(self._get_tagger().parse(text), param)
        if self.flag == 'JM':
            return list(j2hcj(h2j(text)))
        if self.flag.startswith('L'):
            n = param
            return [text[i:i+n] for i in range(0, len(text), n)]
        return []

    def make_input(self, sentence_list):
        """Tokenize every sentence and return all tokens as one flat list.

        Entries that are not strings, or that produce no tokens, are skipped.
        """
        input_x = []
        if not sentence_list:
            return input_x

        param = self._mode_param()
        for text in sentence_list:
            result = self.parse_sentence(text=text, param=param)
            if result:
                input_x.extend(result)
        return input_x

    def make_w2v_dic(self, sentence_list):
        """Train a gensim Word2Vec model on the tokenized sentences.

        Returns:
            `(model, word_vectors)`, or False when `sentence_list` is not a list.
        """
        if type(sentence_list) != list:
            return False

        # gensim expects an iterable of token lists, one per sentence. The
        # flat token array used before made every token its own "sentence",
        # which gensim then iterated character by character.
        tokenized = (self.make_input([sentence]) for sentence in sentence_list)
        sentences = [tokens for tokens in tokenized if tokens]

        # NOTE(review): `size=` is the pre-4.0 gensim keyword, as used by the
        # original code; newer releases call it `vector_size`.
        model = gensim.models.Word2Vec(min_count=self.min_count, window=self.window, size=self.word_vec_size)
        model.build_vocab(sentences)
        model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
        return model, model.wv

    def save_w2v_dic(self, sentence_list, path):
        """Train a dictionary and save it as `<path>/<timestamp>_<size>_<flag>.bin`.

        Raises:
            TypeError: If `sentence_list` is not a list.
        """
        result = self.make_w2v_dic(sentence_list)
        if result is False:
            raise TypeError("sentence_list must be a list")
        model, word_vectors = result

        # Use the instance's vector size; the bare name `word_vec_size` used
        # before is not defined in this scope.
        file_stem = (datetime.datetime.now().strftime('%Y%m%d%H%M')
                     + '_' + str(self.word_vec_size) + '_' + self.flag)
        SAVE_NAME = os.path.join(path, file_stem)

        model.save(SAVE_NAME+'.bin')
        print("Dictionary is saved : "+SAVE_NAME)

In [29]:
# NOTE(review): `x_train` is not defined in any visible cell (only x_train1 /
# x_train2 are), so this probe presumably relies on stale kernel state and
# would fail on a fresh Restart & Run All — confirm and remove or update.
x_train.shape[2]

int