In [1]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.layers.core import Activation,Dense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import nltk

Using TensorFlow backend.


In [3]:
# Load the training and test reviews from tab-separated files.
# quoting=3 corresponds to csv.QUOTE_NONE, so quote characters inside the
# review text are read literally instead of being treated as field quoting.
train = pd.read_csv(
    'labeledTrainData.tsv',
    sep="\t",
    header=0,
    quoting=3,
)
test = pd.read_csv(
    'testData.tsv',
    sep="\t",
    header=0,
    quoting=3,
)

In [4]:
def review_to_wordlist(review):
    """Strip markup from a raw review and reduce it to lowercase letters.

    The review is parsed with BeautifulSoup's ``html.parser`` to drop HTML
    tags, every character outside ``a-z``/``A-Z`` is replaced by a space,
    and the result is lowercased.

    Note: despite the name, the return value is a single cleaned string,
    not a list of words.
    """
    plain_text = BeautifulSoup(review, 'html.parser').get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)
    return letters_only.lower()

In [5]:
# Clean the raw review text of both frames; the 'review' column is
# overwritten with the cleaned string returned by review_to_wordlist.
train['review'] = train['review'].apply(review_to_wordlist)
test['review'] = test['review'].apply(review_to_wordlist)


In [6]:
# Hold out 20% of the training frame for validation. The split is stratified
# on the sentiment label and uses a fixed random_state so it is repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train['review'].values,
    train['sentiment'].values,
    test_size=0.2,
    shuffle=True,
    stratify=train['sentiment'].values,
    random_state=42,
)

In [7]:
# Load a saved gensim Word2Vec model from disk.
# NOTE(review): the name `model` is rebound to the Keras Sequential network
# later in this notebook, so this cell must be re-run before sent2vec is
# called again after that point.
from gensim.models import Word2Vec
model=Word2Vec.load('Word2Vector-300features_40minwords_10context')



In [10]:
# Tokenizer and English stop-word list used by sent2vec below.
from nltk import word_tokenize
from nltk.corpus import stopwords
# stopwords.words() returns a list; sent2vec tests membership against it.
stop_words = stopwords.words('english')

In [11]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    """Return the L2-normalised sum of the word vectors of the words in ``s``.

    ``s`` is converted to a lowercase string, tokenised with ``word_tokenize``,
    and stripped of stop words and non-alphabetic tokens. The vectors of the
    remaining words that exist in the Word2Vec vocabulary are summed and the
    sum is divided by its Euclidean norm.

    Returns a 1-D numpy array. If no token has a vector (or the summed vector
    has zero norm) an all-zero vector of the embedding size is returned.

    Relies on the module-level names ``model`` (Word2Vec model or keyed
    vectors) and ``stop_words``.
    """
    # Word2Vec exposes its vectors on ``.wv``; fall back to the object itself
    # so an already-extracted KeyedVectors instance works as well.
    vectors = getattr(model, "wv", model)
    # Fall back to 300, the size the original code hard-coded.
    dim = getattr(vectors, "vector_size", 300)
    # A set gives O(1) membership tests instead of scanning the list per token.
    stop_set = set(stop_words)

    tokens = word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_set and w.isalpha()]

    M = []
    for w in tokens:
        try:
            M.append(vectors[w])
        except KeyError:
            # Out-of-vocabulary word: skip it. Any other exception (e.g. the
            # wrong object bound to `model`) is deliberately not swallowed.
            continue

    if not M:
        return np.zeros(dim)

    v = np.sum(M, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid a division by zero that would fill the vector with NaN.
        return np.zeros(dim)
    return v / norm

In [12]:
# Embed every training and validation review as one sentence vector.
xtrain_word2vec = list(map(sent2vec, xtrain))
xvalid_word2vec = list(map(sent2vec, xvalid))

In [13]:
# scale the data before any neural net:
from sklearn import preprocessing
scl = preprocessing.StandardScaler()
# Learn the per-feature mean and variance from the training vectors only.
xtrain_word2vec_scl = scl.fit_transform(xtrain_word2vec)
# Reuse the training statistics for validation. Calling fit_transform here
# would refit the scaler on the validation set, so the two splits would be
# scaled differently and validation statistics would leak into preprocessing.
xvalid_word2vec_scl = scl.transform(xvalid_word2vec)

In [14]:
# To move further, i.e. with LSTMs, we need to tokenize the text data
from keras.preprocessing import sequence, text
token = text.Tokenizer(num_words=None)
max_len = 80
# Tokenizer methods:
#   fit_on_texts(texts)       builds the token dictionary from a list of
#                             documents (each element is one document).
#   texts_to_sequences(texts) converts the documents to sequences of word ids,
#                             shape [len(texts), len(text)], i.e.
#                             (number of documents, length of each document).
#   texts_to_matrix(texts)    converts the documents to a matrix of shape
#                             [len(texts), num_words].
# Build the vocabulary from the training reviews only, so the validation set
# stays unseen during preprocessing; words that occur only in validation are
# dropped by texts_to_sequences.
token.fit_on_texts(list(xtrain))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# zero pad the sequences (every sequence ends up with length max_len)
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index
# Tokenizer attributes:
#   word_index   dict mapping each word to its integer id, starting from 1
#   word_counts  dict with the number of occurrences of each word over all documents
#   word_docs    dict with the number of documents each word appears in
#   index_docs   dict with the number of documents each word id appears in
# Reverse lookup: id -> word.
index_word = {v: k for k, v in word_index.items()}

In [15]:
# Spot-check the reverse mapping: display the word that was assigned id 2.
index_word[2]

'and'

In [32]:
# Dropout is used below but was never imported, so import it here with
# SpatialDropout1D.
from keras.layers.core import Dropout, SpatialDropout1D
from keras.callbacks import EarlyStopping
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64

# NOTE(review): this rebinds `model`, which earlier in the notebook held the
# Word2Vec model read by sent2vec.
model = Sequential()
# Tokenizer ids run from 1 to len(word_index) and 0 is the padding value, so
# the embedding table needs len(word_index) + 1 rows; otherwise the highest
# word id is out of range.
model.add(Embedding(len(word_index) + 1, EMBEDDING_SIZE,
                    input_length=max_len))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

# Single-unit binary output: softmax over one unit is always 1.0, so the
# network could never predict the negative class. Sigmoid yields a
# probability in (0, 1), which is what binary_crossentropy expects.
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [34]:
BATCH_SIZE = 64
NUM_EPOCHS = 10
# Fit the model with early stopping callback: training stops once val_loss
# has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
# Pass the callback under the name it was created with; the original passed
# an undefined name (`earlystop`), which raises NameError.
model.fit(
    xtrain_pad,
    ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(xvalid_pad, yvalid),
    callbacks=[early_stopping],
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x2b4ca978>