## 요약
- word2vec embedding
    - 300features, 40minwords, 10contexte
- 각 문장을 전처리+토큰화
- 토큰화 된 문장에 w2v 사전 단어가 있으며 모두 더해 평균
    - 토큰별로 진행
        - 각 문장에대한 sequence가 달라지게 됨
- 각 문장에 대한 단어 벡터들을 input으로 LSTM 모델로 학습
    - 각 문장에 대해 sequence가 다르기 때문에
        - dynamic rnn의 sequence_length 조절
    - input shape
        - (batch, dynamic sequence len, 300)
- rnn의 hidden layer는 2개까지 쌓아봄
- softmax + cross_entropy

## dynamic rnn
- sequence의 크기를 동적으로 
- tf.nn.dynamic_rnn(sequence_length =[...])

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings(action="ignore")


In [106]:
train = pd.read_csv("../data/movie/labeledTrainData.tsv",
                    header = 0,
                    delimiter="\t",
                    quoting=3)
submit_test = pd.read_csv("../data/movie/testData.tsv",
                    header = 0,
                    delimiter="\t",
                    quoting=3)

In [3]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_wordlist(review, remove_stopwords=False):
    # 1. HTML 태그 지우기
    review_text = BeautifulSoup(review).get_text()
    # 2. 알파벳 빼고 다 지움
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    # 3. 소문자로 바꾸고 단어 분리
    words = review_text.lower().split()
    # 4. 정지단어 제거
    if remove_stopwords:
        stops = set(stopwordswords.word("english"))
        words = [w for w in words if not w in stops]
    
    return (words)

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) >0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

20000

In [107]:
train_data = train[0:int(len(train) * 0.8)]
test_data = train[int(len(train) * 0.8):]
print(len(train_data))
print(len(test_data))

20000
5000


In [130]:
train_data['review']

0        "With all this stuff going down at the moment ...
1        "\"The Classic War of the Worlds\" by Timothy ...
2        "The film starts with a manager (Nicholas Bell...
3        "It must be assumed that those who praised thi...
4        "Superbly trashy and wondrously unpretentious ...
5        "I dont know why people think this is such a b...
6        "This movie could have been very good, but com...
7        "I watched this video at a friend's house. I'm...
8        "A friend of mine bought this film for £1, and...
9        "<br /><br />This movie is full of references....
10       "What happens when an army of wetbacks, towelh...
11       "Although I generally do not like remakes beli...
12       "\"Mr. Harvey Lights a Candle\" is anchored by...
13       "I had a feeling that after \"Submerged\", thi...
14       "note to George Litman, and others: the Myster...
15       "Stephen King adaptation (scripted by King him...
16       "`The Matrix' was an exciting summer blockbust.

In [143]:
%%time
train_sentences = []
test_sentences = []
submit_sentences = []
for review in train_data["review"]:
    token_seq = []
    token_seq+=(review_to_sentences(review, tokenizer))
    train_sentences.append(token_seq)

for review in test_data["review"]:
    token_seq = []
    token_seq+=(review_to_sentences(review, tokenizer))
    test_sentences.append(token_seq)
    
for reveiw in submit_test["review"]:
    token_seq = []
    token_seq+=(review_to_sentences(review, tokenizer))
    submit_sentences.append(token_seq)

Wall time: 2min 28s


In [5]:
model = Word2Vec.load("300features_40minwords_10context")

In [144]:
# 각 문장에 해당하는 단어들이 있으면 그 벡터를 다 더해서 평균
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,), dtype = "float32")
    nwords = 0
    index2word_set = set(model.wv.index2word)
    
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec, model[word])
    featureVec = np.divide(featureVec, nwords)
    return featureVec

def getAvgFeatureList(review, model, num_features):
    sentence_fv_lis = []
    sequence_length = []
    for r in review:
        featurevec_list = []
        for token in r:
            featurevec_list.append(makeFeatureVec(token, model, num_features))
        sentence_fv_lis.append(featurevec_list)
        sequence_length.append(len(featurevec_list))
    return sentence_fv_lis,sequence_length

In [145]:
%%time
train_feature_list,train_seq = getAvgFeatureList(train_sentences, model, model.wv.syn0.shape[1])
test_feature_list,test_seq = getAvgFeatureList(test_sentences, model, model.wv.syn0.shape[1])
submit_feature_list,submit_seq = getAvgFeatureList(submit_sentences, model, model.wv.syn0.shape[1])

Wall time: 4min 9s


In [146]:
s_train_seq = sorted(train_seq)
s_test_seq = sorted(test_seq)
s_submit_seq = sorted(submit_seq)

In [147]:
print(s_train_seq[-1])
print(s_test_seq[-1])
print(s_submit_seq[-1])

282
118
9


In [151]:
import tensorflow as tf

input_dim = model.wv.syn0.shape[1] # word2vec 학습시킨 차원 - 300
# review 각 문장들은 전처리과정을 거치고 token화가 되는데
# 정렬했을때 가장 많은 토큰수로 sorseq[-1]을 sequence dim으로 해준다
input_seq = sortseq[-1] # 282
# rnn 학습할때는 동적으로 sequence length를 설정해준다
output_dim = 1
hidden_dim = 256
learning_rate = 0.01

X = tf.placeholder(tf.float32, [None, input_seq, input_dim])
Y = tf.placeholder(tf.float32, [None, output_dim])
dropout_prob = tf.placeholder(tf.float32)

def create_rnn_cell():
    return tf.contrib.rnn.BasicLSTMCell(num_units = hidden_dim)

cell1 = create_rnn_cell()
cell1 = tf.nn.rnn_cell.DropoutWrapper(cell1, output_keep_prob = dropout_prob)
cell2 = create_rnn_cell()


multi_cell = tf.nn.rnn_cell.MultiRNNCell([cell1,cell2])

outputs, states = tf.nn.dynamic_rnn(multi_cell, X, sequence_length = train_seq,dtype=tf.float32)
print("outputs.shape : {}".format(outputs.shape))

outputs.shape : (?, 282, 256)
