## 요약
- word2vec embedding
    - 300 features, 40 min words, 10 context
- 각 문장을 전처리+토큰화
- 토큰화된 문장의 단어가 w2v 사전에 있으면 해당 벡터를 모두 더해 평균
    - 토큰별로 진행
        - 각 문장에대한 sequence가 달라지게 됨
- 각 문장에 대한 단어 벡터들을 input으로 LSTM 모델로 학습
    - 각 문장에 대해 sequence가 다르기 때문에
        - dynamic rnn의 sequence_length 조절
    - input shape
        - (batch, dynamic sequence len, 300)
- rnn의 hidden layer는 2개까지 쌓아봄
- softmax + cross_entropy

## dynamic rnn
- sequence의 크기를 동적으로 조절
- tf.nn.dynamic_rnn(sequence_length =[...])

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings(action="ignore")


In [106]:
# Labeled reviews: tab-separated, first row is the header.
# quoting=3 is csv.QUOTE_NONE, so double quotes inside a review are kept as plain text.
train = pd.read_csv(
    "../data/movie/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3
)
# Second review file, read with the same options (presumably the unlabeled
# submission set — confirm against the data source).
submit_test = pd.read_csv(
    "../data/movie/testData.tsv", header=0, delimiter="\t", quoting=3
)

In [220]:
# Load NLTK's pickled Punkt tokenizer for English; it is used below to split
# each review into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def review_to_wordlist(review, remove_stopwords=False):
    """Convert one piece of raw review text into a list of lowercase words.

    Args:
        review: Raw text, possibly containing HTML markup.
        remove_stopwords: When True, drop NLTK's English stop words.

    Returns:
        list[str]: Lowercased alphabetic tokens in their original order.
    """
    # 1. Strip HTML tags.
    review_text = BeautifulSoup(review).get_text()
    # 2. Replace every non-letter character with a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Lowercase and split on whitespace.
    words = review_text.lower().split()
    # 4. Optionally remove stop words.
    if remove_stopwords:
        # Fixed: this used to read `stopwordswords.word("english")`, an undefined
        # name that raised NameError whenever remove_stopwords=True.
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    Args:
        review: Raw review text.
        tokenizer: Object with a ``tokenize(text)`` method returning sentence strings.
        remove_stopwords: Forwarded to ``review_to_wordlist``.

    Returns:
        list[list[str]]: One word list per non-empty sentence, in order.
    """
    return [
        review_to_wordlist(sentence_text, remove_stopwords)
        for sentence_text in tokenizer.tokenize(review.strip())
        if len(sentence_text) > 0
    ]

In [107]:
# Split the labeled data: the first 80% of rows for training, the rest for testing.
split_at = int(len(train) * 0.8)
train_data = train[:split_at]
test_data = train[split_at:]
print(len(train_data))
print(len(test_data))

20000
5000


In [406]:
# Inspect the first raw review to see what the text looks like before cleaning.
train['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [211]:
# Problem: the text gets tokenized on whitespace (see the "! ! ! ..." run in the
# output below, which produces many empty sentences).
# NOTE(review): `train_seq` and `train_sentences` are only assigned in cells that
# appear further down in this notebook, so this cell depends on out-of-order execution.
print(train_seq[4543])
print(train_data['review'][4543])
print(train_sentences[4543])

282
"Smallville episode Justice is the best episode of Smallville ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! It's my favorite episode of Smallville! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! !"
[['smallville', 'episode', 'justice', 'is', 'the', 'best', 'episode', 'of', 'smallville'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], 

In [373]:
def sentence_sequence(data, tokenizer):
    """Tokenize every review and keep only sentences of three or more words.

    Args:
        data: Iterable of raw review strings.
        tokenizer: Sentence tokenizer passed through to ``review_to_sentences``.

    Returns:
        list[list[list[str]]]: One entry per review, in input order. A review whose
        sentences are all two words or shorter contributes an empty list, so the
        result stays aligned with ``data``.
    """
    kept_per_review = []
    for review in data:
        sentences = review_to_sentences(review, tokenizer)
        # Only sentences with more than two words are kept.
        kept_per_review.append([words for words in sentences if len(words) > 2])
    return kept_per_review

In [374]:
%%time
# Tokenize each data set into per-review lists of filtered sentences.
train_sentences = sentence_sequence(train_data["review"], tokenizer)
test_sentences = sentence_sequence(test_data["review"], tokenizer)
submit_sentences = sentence_sequence(submit_test["review"], tokenizer)

Wall time: 2min 41s


In [190]:
%%time
# NOTE: this cell is an earlier draft of the tokenization step.
# Its previous body had three defects:
#   * it called `sentence_sequence(train)` without the required `tokenizer`
#     argument (TypeError) and discarded the result;
#   * the training loop appended the whole review once per long sentence, so
#     `train_sentences` no longer lined up one-to-one with the reviews;
#   * test/submit sentences were left unfiltered, unlike the training ones.
# It now delegates to `sentence_sequence` so all three data sets are processed
# the same way and stay aligned with their source rows.
train_sentences = sentence_sequence(train_data["review"], tokenizer)
test_sentences = sentence_sequence(test_data["review"], tokenizer)
submit_sentences = sentence_sequence(submit_test["review"], tokenizer)

Wall time: 2min 43s


In [5]:
# Load a previously trained Word2Vec model from disk.
# Going by the file name: 300 features, min word count 40, context window 10 — TODO confirm.
model = Word2Vec.load("300features_40minwords_10context")

In [408]:
# Shape of the embedding matrix: (number of vocabulary words, vector dimension).
model.wv.syn0.shape

(8315, 300)

In [409]:
# Vocabulary words in the model's index order.
# NOTE(review): this displays the whole vocabulary; consider slicing before sharing.
model.wv.index2word

['and',
 'the',
 'i',
 'a',
 'it',
 'this',
 'is',
 'as',
 'of',
 'movie',
 'story',
 'child',
 'wolf',
 'was',
 'but',
 'to',
 't',
 'have',
 'that',
 'when',
 'my',
 'made',
 'saw',
 'alone',
 'glad',
 'chirin',
 'for',
 'with',
 'in',
 'you',
 'on',
 'he',
 'are',
 'his',
 'be',
 'one',
 'so',
 'from',
 'like',
 'just',
 'about',
 's',
 'can',
 'more',
 'very',
 'up',
 'no',
 'even',
 'had',
 'will',
 'other',
 'into',
 'great',
 'because',
 'him',
 'too',
 'movies',
 'any',
 'watch',
 'seen',
 'many',
 'life',
 'never',
 'did',
 've',
 'such',
 'm',
 'though',
 'doesn',
 'again',
 'll',
 'am',
 'hard',
 'each',
 'ending',
 'maybe',
 'night',
 'need',
 'use',
 'others',
 'mother',
 'heart',
 'kill',
 'stories',
 'feeling',
 'age',
 'says',
 'sad',
 'kills',
 'depth',
 'follows',
 'meaning',
 'rare',
 'beautifully',
 'cry',
 'unusual',
 'broken',
 'eat',
 'grew',
 'anime',
 'tape',
 'mountain',
 'choices',
 'broke',
 'loneliness',
 'tomorrow',
 'cruelty',
 'someday',
 'encountered',


In [428]:
from sklearn.cluster import KMeans

# Cluster the word vectors, using one cluster per 100 vocabulary words (rounded down).
word_vectors = model.wv.syn0
num_clusters = word_vectors.shape[0] // 100
print(num_clusters)

83


In [429]:
%%time
# Fit k-means on the word vectors; `idx` holds one cluster id per vocabulary word.
# random_state is fixed because k-means initialisation is random: without it the
# cluster ids (and every output derived from them) change on each run.
kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=0)
idx = kmeans_clustering.fit_predict(word_vectors)

Wall time: 13.8 s


In [453]:
# Cluster id assigned to the first row of `word_vectors`.
idx[0]

66

In [430]:
# Map each vocabulary word to the id of the cluster it was assigned to.
word_centroid_map = {
    word: cluster_id for word, cluster_id in zip(model.wv.index2word, idx)
}

In [431]:
# Display the word -> cluster id mapping.
# NOTE(review): this prints the whole dictionary; consider showing only a few items.
word_centroid_map

{'and': 66,
 'the': 66,
 'i': 66,
 'a': 66,
 'it': 66,
 'this': 66,
 'is': 25,
 'as': 66,
 'of': 66,
 'movie': 66,
 'story': 66,
 'child': 66,
 'wolf': 66,
 'was': 25,
 'but': 66,
 'to': 17,
 't': 66,
 'have': 66,
 'that': 76,
 'when': 66,
 'my': 66,
 'made': 25,
 'saw': 66,
 'alone': 74,
 'glad': 66,
 'chirin': 74,
 'for': 66,
 'with': 66,
 'in': 66,
 'you': 0,
 'on': 66,
 'he': 66,
 'are': 77,
 'his': 66,
 'be': 33,
 'one': 66,
 'so': 66,
 'from': 66,
 'like': 66,
 'just': 66,
 'about': 66,
 's': 26,
 'can': 41,
 'more': 54,
 'very': 74,
 'up': 66,
 'no': 76,
 'even': 66,
 'had': 25,
 'will': 17,
 'other': 66,
 'into': 34,
 'great': 66,
 'because': 66,
 'him': 66,
 'too': 74,
 'movies': 66,
 'any': 66,
 'watch': 66,
 'seen': 66,
 'many': 66,
 'life': 66,
 'never': 66,
 'did': 66,
 've': 66,
 'such': 66,
 'm': 66,
 'though': 66,
 'doesn': 41,
 'again': 66,
 'll': 66,
 'am': 66,
 'hard': 66,
 'each': 66,
 'ending': 10,
 'maybe': 66,
 'night': 66,
 'need': 41,
 'use': 76,
 'others': 66,

In [434]:
# Print the member words and the size of each of the first ten clusters.
for cluster in range(10):
    print("cluster {}".format(cluster))
    words = [
        word
        for word, cluster_id in word_centroid_map.items()
        if cluster_id == cluster
    ]
    print(words)
    print(len(words))

cluster 0
['you', 'time', 'then', 'your', 'watching', 'minutes', 'looking', 'money', 'seeing', 'laugh', 'myself', 'chance', 'please', 'yourself', 'hours', 'avoid', 'buy', 'viewing', 'rent', 'wait', 'free', 'unless', 'copy', 'laughing', 'store', 'paid', 'skip', 'price', 'hopes', 'advice', 'favor', 'test', 'costs', 'vote', 'strongly', 'cost', 'rental', 'dollars', 'blockbuster', 'buying', 'paying', 'renting', 'disbelief', 'wasting', 'plague', 'bucks', 'curiosity', 'ticket', 'popcorn', 'awake', 'urge', 'earned', 'preview', 'purchase', 'patience', 'relax', 'chuckle', 'goers', 'cents', 'yours', 'insomnia', 'anytime', 'vomit', 'beers', 'admission']
65
cluster 1
['famous', 'la', 'master', 'latter', 'lord', 'fox', 'tarzan', 'regular', 'golden', 'fame', 'hall', 'prime', 'latest', 'irish', 'featured', 'beloved', 'lincoln', 'notorious', 'grey', 'tour', 'whereas', 'infamous', 'le', 'silver', 'doc', 'briefly', 'ironically', 'notes', 'brazil', 'waters', 'roman', 'companion', 'babe', 'blues', 'solo', 

In [450]:
# Look the vector up through `model.wv`, as the rest of this notebook does.
# Indexing the model object directly (`model['and']`) is deprecated in gensim.
model.wv['and']

array([ 0.13031492, -0.02957438,  0.11854155,  0.02205045,  0.01125446,
        0.06360631, -0.0541133 , -0.13853987,  0.04139017, -0.01589334,
       -0.09426113,  0.0706529 , -0.06750637,  0.05334873, -0.06013417,
       -0.00360388,  0.03565174, -0.05744587,  0.04229328, -0.12201357,
       -0.09853052, -0.00588993,  0.07471609,  0.00071117,  0.02712767,
        0.16124359, -0.05526419, -0.01098994, -0.01337571,  0.02846422,
       -0.05886051,  0.00610646,  0.0004728 ,  0.12077574,  0.01123986,
       -0.10856149, -0.07009662,  0.00557431,  0.03714091,  0.12360331,
       -0.00066999, -0.04225912, -0.10717808,  0.02079637, -0.02573308,
       -0.04159555, -0.07778424,  0.05747209,  0.09268185, -0.04096794,
        0.047696  , -0.00438409,  0.05928781,  0.13909656,  0.02211961,
        0.02986236,  0.05733318, -0.00567353,  0.03526219,  0.06250302,
        0.02805644,  0.0093689 ,  0.03432084, -0.03004945, -0.02753009,
        0.11449958, -0.110161  , -0.00454438,  0.00658672, -0.02

In [424]:
# Vocabulary words in the model's index order.
# NOTE(review): same display as an earlier cell in this notebook; likely redundant.
model.wv.index2word

['and',
 'the',
 'i',
 'a',
 'it',
 'this',
 'is',
 'as',
 'of',
 'movie',
 'story',
 'child',
 'wolf',
 'was',
 'but',
 'to',
 't',
 'have',
 'that',
 'when',
 'my',
 'made',
 'saw',
 'alone',
 'glad',
 'chirin',
 'for',
 'with',
 'in',
 'you',
 'on',
 'he',
 'are',
 'his',
 'be',
 'one',
 'so',
 'from',
 'like',
 'just',
 'about',
 's',
 'can',
 'more',
 'very',
 'up',
 'no',
 'even',
 'had',
 'will',
 'other',
 'into',
 'great',
 'because',
 'him',
 'too',
 'movies',
 'any',
 'watch',
 'seen',
 'many',
 'life',
 'never',
 'did',
 've',
 'such',
 'm',
 'though',
 'doesn',
 'again',
 'll',
 'am',
 'hard',
 'each',
 'ending',
 'maybe',
 'night',
 'need',
 'use',
 'others',
 'mother',
 'heart',
 'kill',
 'stories',
 'feeling',
 'age',
 'says',
 'sad',
 'kills',
 'depth',
 'follows',
 'meaning',
 'rare',
 'beautifully',
 'cry',
 'unusual',
 'broken',
 'eat',
 'grew',
 'anime',
 'tape',
 'mountain',
 'choices',
 'broke',
 'loneliness',
 'tomorrow',
 'cruelty',
 'someday',
 'encountered',


In [175]:
# Average the vectors of the words in one sentence that exist in the word2vec vocabulary.
def makeFeatureVec(words, model, num_features):
    """Return the mean word vector of the in-vocabulary words in ``words``.

    Args:
        words: Iterable of word strings (one tokenized sentence).
        model: Object exposing word vectors through ``model.wv`` (supports
            ``word in model.wv`` and ``model.wv[word]``).
        num_features: Dimensionality of the word vectors.

    Returns:
        numpy.ndarray of shape ``(num_features,)``. When none of the words is in
        the vocabulary (or ``words`` is empty) an all-zero vector is returned.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    # Membership is tested directly on the keyed vectors; the previous version
    # rebuilt a set of the whole vocabulary on every call.
    keyed_vectors = model.wv

    for word in words:
        if word in keyed_vectors:
            nwords += 1
            featureVec = np.add(featureVec, keyed_vectors[word])
    # Guard the division: with no known word the old code divided by zero and
    # returned a vector of NaNs.
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

def getAvgFeatureList(review, model, num_features):
    """Build per-sentence average vectors for every review.

    Args:
        review: Iterable of reviews, each a list of tokenized sentences.
        model: Word2Vec model passed through to ``makeFeatureVec``.
        num_features: Dimensionality of the word vectors.

    Returns:
        tuple: ``(vectors, lengths)`` where ``vectors[i]`` is the list of sentence
        vectors of review ``i`` and ``lengths[i]`` is how many there are.
    """
    vectors_per_review = []
    lengths_per_review = []
    for sentences in review:
        sentence_vectors = [
            makeFeatureVec(sentence, model, num_features) for sentence in sentences
        ]
        vectors_per_review.append(sentence_vectors)
        lengths_per_review.append(len(sentence_vectors))
    return vectors_per_review, lengths_per_review

In [383]:
%%time
# Build the sentence vectors and per-review sentence counts for each data set.
num_features = model.wv.syn0.shape[1]
train_feature_list, train_seq = getAvgFeatureList(train_sentences, model, num_features)
test_feature_list, test_seq = getAvgFeatureList(test_sentences, model, num_features)
submit_feature_list, submit_seq = getAvgFeatureList(submit_sentences, model, num_features)

Wall time: 4min 36s


In [384]:
# Sorted copies of the per-review sentence counts; the last element of each is the maximum.
s_train_seq, s_test_seq, s_submit_seq = (
    sorted(lengths) for lengths in (train_seq, test_seq, submit_seq)
)

In [392]:
# Sanity check: shape of the first sentence vector of the first review in each
# data set, and the largest per-review sentence count (last element of each sorted list).
# NOTE(review): `[0][0]` assumes the first review of each set kept at least one sentence.
print("train shape  : {}".format(train_feature_list[0][0].shape))
print("test shape  : {}".format(test_feature_list[0][0].shape))
print("submit shape  : {}".format(submit_feature_list[0][0].shape))
print("max(train_seq) : {}".format(s_train_seq[-1]))
print("max(test_seq) : {}".format(s_test_seq[-1]))
print("max(submit_seq) : {}".format(s_submit_seq[-1]))

train shape  : (300,)
test shape  : (300,)
submit shape  : (300,)
max(train_seq) : 146
max(test_seq) : 90
max(submit_seq) : 78


In [399]:
# Number of sentence vectors kept for the first training review.
print(len(train_feature_list[0]))

15


In [405]:
# Shape of a single sentence vector (first sentence of the first training review).
print(train_feature_list[0][0].shape)

(300,)


In [396]:
import tensorflow as tf

tf.reset_default_graph()

input_dim = model.wv.syn0.shape[1]  # dimensionality of the trained word2vec vectors
# Each review is a sequence of sentence vectors. The time dimension of the input
# is fixed to the largest sentence count among the training reviews (the last
# element of the sorted lengths), so inputs fed to X must have exactly this many steps.
input_seq = s_train_seq[-1]
# The true length of each sequence is supplied to the RNN through `seq_len` below.
output_dim = 1
hidden_dim = 256
learning_rate = 0.01

X = tf.placeholder(tf.float32, [None, input_seq, input_dim])
print("X.shape : {}".format(X.shape))
Y = tf.placeholder(tf.float32, [None, output_dim])
print("Y.shape : {}".format(Y.shape))
dropout_prob = tf.placeholder(tf.float32)
# Per-example sequence lengths. Previously the Python list `train_seq` was wired
# straight into dynamic_rnn, which tied the graph to one batch made of the entire
# training set. The default keeps that behaviour when nothing is fed, while
# allowing the lengths of any other batch to be fed instead.
seq_len = tf.placeholder_with_default(
    tf.constant(train_seq, dtype=tf.int32), shape=[None]
)


def create_rnn_cell():
    """Return a new BasicLSTMCell with `hidden_dim` units."""
    return tf.contrib.rnn.BasicLSTMCell(num_units = hidden_dim)


# Two stacked LSTM layers; only the first one has dropout applied to its outputs.
cell1 = create_rnn_cell()
cell1 = tf.nn.rnn_cell.DropoutWrapper(cell1, output_keep_prob = dropout_prob)
cell2 = create_rnn_cell()


multi_cell = tf.nn.rnn_cell.MultiRNNCell([cell1,cell2])

outputs, states = tf.nn.dynamic_rnn(multi_cell, X, sequence_length = seq_len, dtype=tf.float32)
print("outputs.shape : {}".format(outputs.shape))

X.shape : (?, 146, 300)
Y.shape : (?, 1)
outputs.shape : (?, 146, 256)
