In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, Input, Dense, LSTM, GRU, Bidirectional, TimeDistributed, Dropout
from keras import backend as K
from keras import optimizers
from keras.models import Model
import nltk
import re
import matplotlib.pyplot as plt
import sys
from sklearn.metrics import roc_auc_score
from nltk import tokenize
import seaborn as sns
import os                                                                                                                                               
import html
from functools import reduce

Using TensorFlow backend.


In [2]:
# Shared L2 kernel regularizer handed to the LSTM and Dense layers of the model below.
# NOTE(review): a factor of 1e-13 adds almost nothing to the loss - confirm this strength is intended.
l2_reg = regularizers.l2(1e-13)

In [3]:
# Directory holding the train/validation/test TSV files read further down.
# The trailing slash combined with the '/name.tsv' suffixes appended later gives
# paths with a double slash (e.g. 'data/imdb_kaggle//train.tsv').
CLAS_PATH='data/imdb_kaggle/'
# Create the directory when it is missing; no error if it already exists.
os.makedirs(CLAS_PATH, exist_ok=True)

# Marker tokens that get_textsFromColumn prepends to every text.
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

In [4]:
def product(x, kernel):
    """Dot ``x`` with ``kernel`` through the Keras backend.

    ``kernel`` is given a trailing axis so it can be passed to ``K.dot``; the
    last axis of the product is then squeezed away before returning.
    """
    expanded_kernel = K.expand_dims(kernel)
    projected = K.dot(x, expanded_kernel)
    return K.squeeze(projected, axis=-1)

class AttentionContext(Layer):
    """Attention pooling that collapses axis 1 of a 3-D input into one vector.

    For an input ``x`` with three axes the layer computes

        uit = tanh(x . W + b)
        ait = uit . u
        a   = exp(ait) / (sum over axis 1 of exp(ait) + epsilon)

    and returns ``sum over axis 1 of (a * x)``, whose shape is
    ``(input_shape[0], input_shape[-1])``. When an input mask is supplied,
    masked positions receive zero weight before normalisation. The mask is
    not passed on to downstream layers.

    Arguments:
        W_r, u_r, b_r: regularizer specs for ``W``, ``u`` and ``b``
            (resolved with ``regularizers.get``).
        W_c, u_c, b_c: constraint specs for ``W``, ``u`` and ``b``
            (resolved with ``constraints.get``).
        bias: whether the bias vector ``b`` is added before the ``tanh``.
        **kwargs: forwarded to the base ``Layer`` initializer.
    """

    def __init__(self,
                 W_r=None, u_r=None, b_r=None,
                 W_c=None, u_c=None, b_c=None,
                 bias=True, **kwargs):
        super(AttentionContext, self).__init__(**kwargs)

        # Must come after super().__init__(): the base initializer in Keras 2
        # assigns supports_masking = False and would overwrite an earlier value.
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_r = regularizers.get(W_r)
        self.u_r = regularizers.get(u_r)
        self.b_r = regularizers.get(b_r)

        self.W_c = constraints.get(W_c)
        self.u_c = constraints.get(u_c)
        self.b_c = constraints.get(b_c)

        self.bias = bias

    def build(self, input_shape):
        """Create ``W`` (square), the optional bias ``b`` and the context vector ``u``."""
        assert len(input_shape) == 3
        feature_dim = input_shape[-1]

        # name/shape are passed by keyword, matching the Keras 2 add_weight
        # signature instead of relying on the legacy positional-shape shim.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(feature_dim, feature_dim),
                                 initializer=self.init,
                                 regularizer=self.W_r,
                                 constraint=self.W_c)
        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(feature_dim,),
                                     initializer='zeros',
                                     regularizer=self.b_r,
                                     constraint=self.b_c)

        self.u = self.add_weight(name='{}_u'.format(self.name),
                                 shape=(feature_dim,),
                                 initializer=self.init,
                                 regularizer=self.u_r,
                                 constraint=self.u_c)

        super(AttentionContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        uit = product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = product(uit, self.u)

        # Un-normalised attention weights.
        a = K.exp(ait)

        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division finite when every weight along axis 1 is zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = {
            'W_r': regularizers.serialize(self.W_r),
            'u_r': regularizers.serialize(self.u_r),
            'b_r': regularizers.serialize(self.b_r),
            'W_c': constraints.serialize(self.W_c),
            'u_c': constraints.serialize(self.u_c),
            'b_c': constraints.serialize(self.b_c),
            'bias': self.bias,
        }
        base_config = super(AttentionContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [5]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the index cut-off in generateEmbeddings.
max_vocab=200000
# Number of word slots kept per sentence (last axis of the encoded index tensor).
max_words_per_sent=40
# Number of sentences kept per text (middle axis of the encoded index tensor).
max_senten=10

In [6]:
# Width of each embedding row; has to match the vectors in the glove.6B.200d file loaded below.
embed_size=200
# Fraction of the shuffled labelled samples held out as the validation slice.
VALIDATION_SPLIT = 0.2

In [7]:
# Matches runs of two or more spaces.
re1 = re.compile(r'  +')

def fixup(x):
    """Clean leftover markup/escape artefacts out of a raw text string.

    A fixed list of literal substitutions is applied in order (the order
    matters, e.g. the lone-backslash rule runs after the escaped-newline and
    escaped-quote rules). Afterwards HTML entities are unescaped and every
    run of two or more spaces is collapsed to a single space.
    """
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
        ('#', ''),
    )
    cleaned = x
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    unescaped = html.unescape(cleaned)
    return re1.sub(' ', unescaped)

def get_textsFromColumn(df, columnName):
    """Return the cleaned texts of ``df[columnName]`` as a Python list.

    Every value is cast to ``str``, prefixed with a newline plus the BOS and
    FLD marker tokens and the field number ``1``, and then passed through
    ``fixup``.
    """
    prefix = f'\n{BOS} {FLD} 1 '
    raw_column = df[columnName].astype(str)
    cleaned_column = (prefix + raw_column).apply(fixup)
    return list(cleaned_column.values)

In [8]:
# The three splits are read as tab-separated files without a header row,
# so their columns are addressed by position (0, 1, ...).
tsv_format = {'sep': '\t', 'header': None}
df_trn = pd.read_csv(CLAS_PATH+'/train.tsv', **tsv_format)
df_val = pd.read_csv(CLAS_PATH+'/validation.tsv', **tsv_format)
df_test = pd.read_csv(CLAS_PATH+'/test.tsv', **tsv_format)

In [9]:
# Column 1 of every split holds the text; run all three splits through the same preparation.
trn_texts, val_texts, test_texts = (
    get_textsFromColumn(split_df, 1) for split_df in (df_trn, df_val, df_test)
)

In [10]:
# Column 0 of the train and validation frames, concatenated train-first.
# This is the same order in which trn_val_texts is assembled below.
labels = np.concatenate([df_trn[0], df_val[0]])

In [11]:
# Train texts followed by validation texts: the samples that have labels.
trn_val_texts = trn_texts + val_texts
# Labelled texts plus the test texts; this combined list is what the tokenizer is fitted on.
trn_texts_tok = trn_val_texts + test_texts

In [12]:
# Character length of the first prepared text (marker prefix included).
len(trn_val_texts[0])

2275

In [13]:
# oov_token is meant to be the string that stands in for out-of-vocabulary
# words; the earlier boolean True ended up as a non-string key in word_index.
# '<unk>' cannot collide with a real token because fixup rewrites literal
# '<unk>' in the texts and the Tokenizer's default filters strip '<' and '>'.
trn_tokenizer = Tokenizer(num_words=max_vocab, oov_token='<unk>')
# The vocabulary is built over train, validation and test texts together.
trn_tokenizer.fit_on_texts(trn_texts_tok)

In [14]:
# Size of the tokenizer's word -> index mapping after fitting.
len(trn_tokenizer.word_index)

124316

In [15]:
# Split every train/validation text into sentences with NLTK's sentence tokenizer;
# the result is one list of sentence strings per text.
paras = [tokenize.sent_tokenize(text) for text in trn_val_texts]

In [16]:
def generateSubmission(df, predictValues, csvFile):
    """Write an ``id,label`` CSV pairing column 0 of ``df`` with the predictions.

    Arguments:
        df: frame whose column ``0`` supplies the ``id`` values.
        predictValues: one prediction per row of ``df``. Any array-like whose
            flattened length equals ``len(df)`` is accepted, so the
            ``(N, 1)`` array returned by ``model.predict`` works as well as
            a flat list.
        csvFile: destination path (or buffer) handed to ``DataFrame.to_csv``.

    Raises:
        ValueError: if the number of predictions differs from ``len(df)``.
    """
    # Flatten explicitly instead of relying on pandas accepting a 2-D array
    # for a single column.
    flat_predictions = np.asarray(predictValues).reshape(-1)
    if len(flat_predictions) != len(df):
        raise ValueError(
            'got %d predictions for %d rows' % (len(flat_predictions), len(df)))

    # Rows are paired by position; the frame index is not written out.
    result_df = pd.DataFrame({'id': np.asarray(df[0]), 'label': flat_predictions})
    result_df.to_csv(csvFile, index=False)

In [17]:
def generateEmbeddings(texts, max_senten, max_words_per_sent):
    """Encode ``texts`` as a zero-padded tensor of tokenizer word indices.

    Each text is split into sentences with NLTK and each sentence into words
    with ``text_to_word_sequence``. Words are looked up in
    ``trn_tokenizer.word_index``.

    Arguments:
        texts: sequence of strings to encode.
        max_senten: number of sentences kept per text; later ones are dropped.
        max_words_per_sent: number of word slots per sentence; further words
            are dropped.

    Returns:
        ``int32`` array of shape ``(len(texts), max_senten, max_words_per_sent)``.
        Unused slots stay 0. Words that are missing from the vocabulary, or
        whose index is not below ``max_vocab``, are skipped without using up
        a slot.
    """
    data = np.zeros((len(texts), max_senten, max_words_per_sent), dtype='int32')
    vocab = trn_tokenizer.word_index
    # Sentence-split the texts that were actually passed in, so the function
    # encodes its argument rather than a precomputed global list.
    for i, text in enumerate(texts):
        sentences = tokenize.sent_tokenize(text)[:max_senten]
        for j, sent in enumerate(sentences):
            k = 0
            for word in text_to_word_sequence(sent):
                if k >= max_words_per_sent:
                    break
                # .get avoids a KeyError for words the tokenizer has not seen.
                token_index = vocab.get(word)
                if token_index is None or token_index >= max_vocab:
                    continue
                data[i, j, k] = token_index
                k += 1
    return data
# Index tensor for the combined train+validation texts,
# shaped (len(trn_val_texts), max_senten, max_words_per_sent).
data = generateEmbeddings(trn_val_texts, max_senten, max_words_per_sent)

In [18]:
# Alias for the tokenizer's word -> index mapping; used below to build the embedding matrix.
word_index = trn_tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

Total 124316 unique tokens.


In [19]:
def extractWordAndEmbedding(line):
    """Parse one whitespace-separated embedding line.

    The first field is the word; all remaining fields are converted to a
    ``float32`` numpy vector. Returns the pair ``(word, vector)``.
    """
    fields = line.split()
    return (fields[0], np.asarray(fields[1:], dtype='float32'))
    
# Path of the GloVe text file (despite the name this is a file, not a directory).
GLOVE_DIR = "data/embedding/glove.6B.200d.txt"
# word -> float32 vector for every line that could be parsed.
embeddings_index = {}
# The encoding is pinned so loading does not depend on the platform default,
# and `with` closes the handle even if reading fails part-way.
with open(GLOVE_DIR, encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        try:
            (word, embedding) = extractWordAndEmbedding(line)
        except (IndexError, ValueError) as err:
            # Blank line (IndexError) or non-numeric field (ValueError):
            # report the offending line and keep loading.
            print('Skipping malformed GloVe line %d: %s' % (line_number, err))
            continue
        embeddings_index[word] = embedding
print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [20]:
# One row per tokenizer index plus row 0, which no word_index entry maps to.
embedding_matrix = np.zeros((len(word_index) + 1, embed_size))
absent_words = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: its row stays all-zeros.
        absent_words += 1
        continue
    embedding_matrix[i] = embedding_vector
print('Total absent words are', absent_words, 'which is', "%0.2f" % (absent_words * 100 / len(word_index)), '% of total words')

Total absent words are 49203 which is 39.58 % of total words


In [21]:
# A fixed seed makes the train/validation partition identical on every
# Restart & Run All; the shuffle was previously unseeded.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
# data and labels are reordered with the same permutation so pairs stay aligned.
# Both names are rebound here, so re-running only this cell permutes them again.
data = data[indices]
labels = labels[indices]
# Number of samples taken from the end of the shuffled arrays for validation.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [22]:
# Split at an explicit boundary index. Slicing with a negative count breaks
# when the count is 0: data[:-0] is empty and data[-0:] is the whole array.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]
# NOTE(review): the heading announces class counts but the raw label arrays are printed.
print('Number of positive and negative reviews in traing and validation set')
print(y_train)
print(y_val)

Number of positive and negative reviews in traing and validation set
[1 1 1 ... 0 1 1]
[0 1 1 ... 1 0 0]


In [23]:
# Vocabulary size; the Embedding layer below uses this value plus one as its input dimension.
len(word_index)

124316

In [24]:
# Frozen lookup table initialised from embedding_matrix: one row per tokenizer
# index (plus row 0), applied to sequences of max_words_per_sent token ids.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=embed_size,
    weights=[embedding_matrix],
    input_length=max_words_per_sent,
    trainable=False,
)

In [25]:
# --- Word-level encoder: one sentence of max_words_per_sent token ids -> one vector.
# The inputs carry integer token ids, so they are declared int32 to match the
# int32 tensor built by generateEmbeddings (they were declared float32 before).
word_input = Input(shape=(max_words_per_sent,), dtype='int32')
word_sequences = embedding_layer(word_input)
word_lstm = Bidirectional(LSTM(150, return_sequences=True, kernel_regularizer=l2_reg))(word_sequences)
word_dense = TimeDistributed(Dense(200, kernel_regularizer=l2_reg))(word_lstm)
# Attention pooling over the word positions of the sentence.
word_att = AttentionContext()(word_dense)
wordEncoder = Model(word_input, word_att)

# --- Sentence-level encoder: max_senten sentences per text -> one prediction.
sent_input = Input(shape=(max_senten, max_words_per_sent), dtype='int32')
# Apply the word encoder to every sentence of a text independently.
sent_encoder = TimeDistributed(wordEncoder)(sent_input)
sent_lstm = Bidirectional(LSTM(150, return_sequences=True, kernel_regularizer=l2_reg))(sent_encoder)
sent_dense = TimeDistributed(Dense(200, kernel_regularizer=l2_reg))(sent_lstm)
# Attention pooling over the sentences, followed by dropout.
sent_att = Dropout(0.5)(AttentionContext()(sent_dense))
non_lin = Dense(8, activation='relu')(sent_att)
# Single sigmoid unit for the binary label.
preds = Dense(1, activation='sigmoid')(non_lin)
model = Model(sent_input, preds)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])

In [26]:
model.summary()
# Write the model to best_model4.h5 only when the monitored val_loss improves.
checkpoint = ModelCheckpoint(
    'best_model4.h5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    verbose=0,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10, 40)            0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 10, 200)           25385200  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 10, 300)           421200    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 10, 200)           60200     
_________________________________________________________________
attention_context_2 (Attenti (None, 200)               40400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 1608      
__________

In [27]:
# Train for 10 epochs in batches of 100, evaluating on the held-out slice after
# each epoch; the checkpoint callback handles saving.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=100,
    callbacks=[checkpoint],
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# Input tensor for prediction, shaped (len(test_texts), max_senten, max_words_per_sent), dtype int32.
data_test = generateEmbeddings(test_texts, max_senten, max_words_per_sent)

In [29]:
# Model outputs for the test tensor; the final layer is a single sigmoid unit.
# NOTE(review): this rebinds `preds`, which earlier named the model's output tensor.
# NOTE(review): `model` holds the weights from the last epoch; the file written by
# ModelCheckpoint is not reloaded here - confirm that is intended.
preds = model.predict(data_test)

In [30]:
# Threshold the sigmoid outputs: strictly above 0.5 becomes 1, everything else 0.
preds1 = (preds > 0.5).astype(int)

In [31]:
# Write the thresholded test predictions, keyed by column 0 of df_test, to submission_han.csv.
generateSubmission(df_test, preds1, 'submission_han.csv')