In [1]:
import os
import sys
import codecs
import operator
import numpy as np
import re
from time import time


In [2]:
import tensorflow
print(tensorflow.__version__)

2.2.0


In [3]:
import _pickle as cPickle

In [4]:
data_path = './data/doc-level/'

### Reading preprocess data

In [5]:
def read_pickle(data_path, file_name):

    f = open(os.path.join(data_path, file_name), 'rb')
    read_file = cPickle.load(f)
    f.close()

    return read_file

def save_pickle(data_path, file_name, data):

    f = open(os.path.join(data_path, file_name), 'wb')
    cPickle.dump(data, f)
    print(" file saved to: %s"%(os.path.join(data_path, file_name)))
    f.close()

In [6]:
words_idx = read_pickle(data_path, 'words_idx.pkl')

In [7]:
idx_words = read_pickle(data_path, 'idx_words.pkl')

In [8]:
data = read_pickle(data_path, 'data.pkl')

In [9]:
label = read_pickle(data_path, 'label.pkl')

### Preparing training and validation set

In [10]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical


In [11]:
rand_idx = np.arange(len(data))
np.random.shuffle(rand_idx)

data = data[rand_idx]
label = to_categorical(label)[rand_idx]

data_size = len(data)

test_x = data[0:1000]
test_y = label[0:1000]

dev_x = data[1000:5000]
dev_y = label[1000:5000]

train_x = data[5000:int(data_size)]
train_y = label[5000:int(data_size)]


In [12]:
maxlen = np.max([len(d) for d in data])

#import operator
#words_idx = [x for (x, _) in sorted(words_idx.items(), key=operator.itemgetter(1))]

In [13]:
train_x_ = sequence.pad_sequences(train_x, maxlen)
dev_x_ = sequence.pad_sequences(dev_x, maxlen)
test_x_ = sequence.pad_sequences(test_x, maxlen)

In [14]:
train_x_ = np.array(train_x_)
train_y = np.array(train_y)

dev_x_ = np.array(dev_x_)
dev_y = np.array(dev_y)

test_x_ = np.array(test_x_)
test_y = np.array(test_y)

### Data iterator

In [15]:
class Dataiterator():
    '''
      1) Iteration over minibatches using next(); call reset() between epochs to randomly shuffle the data
      2) Access to the entire dataset using all()
    '''
    
    def __init__(self, X, y, seq_length=32, decoder_dim=300, batch_size=32):      
        self.X = X 
        self.y = y 
        self.num_data = len(X) # total number of examples
        self.batch_size = batch_size # batch size
        self.reset() # initial: shuffling examples and set index to 0
    
    def __iter__(self): # iterates data
        return self


    def reset(self): # initials
        self.idx = 0
        self.order = np.random.permutation(self.num_data) # shuffling examples by providing randomized ids 
        
    def __next__(self): # return model inputs - outputs per batch
        X_ids = [] # hold ids per batch 
        while len(X_ids) < self.batch_size:
            X_id = self.order[self.idx] # copy random id from initial shuffling
            X_ids.append(X_id)
            self.idx += 1 # 
            if self.idx >= self.num_data: # exception if all examples of data have been seen (iterated)
                self.reset()
                raise StopIteration()
                
        batch_X = self.X[np.array(X_ids)] # X values (encoder input) per batch
        batch_y = self.y[np.array(X_ids)] # y_in values (decoder input) per batch
        return batch_X, batch_y

          
    def all(self): # return all data examples
        return self.X, self.y

### LSTM Model for document level sentiment classification

In [16]:
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Input
from keras.models import Model

Using TensorFlow backend.


### Input Layer

In [17]:
input_layer = Input(shape=(maxlen,), dtype="int32", name="input")

### Layer to train embedding weights of words

In [18]:
voc_size = len(words_idx)
embedding = Embedding(input_dim=voc_size, output_dim=300, mask_zero=True, input_length=maxlen, name="embedding")(input_layer)

### RNN-based layer 

In [19]:
dropout = 0.1
recurrent_dropout = 0.1
lstm = LSTM(
    units=300, 
    dropout=dropout, 
    recurrent_dropout=recurrent_dropout, 
    name="lstm"
)(embedding)

### Prediction layer

In [20]:
dropout = Dropout(0.5, name="dropout")(lstm)
output = Dense(3, activation="softmax", name="output")(dropout)

### Construct the model

In [30]:
model = Model(inputs=input_layer, outputs=output)

In [23]:
optimizer = opt.RMSprop(lr=0.001, rho=0.9, epsilon=1e-06, clipnorm=10, clipvalue=0)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [24]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 1016)              0         
_________________________________________________________________
embedding (Embedding)        (None, 1016, 300)         3000900   
_________________________________________________________________
lstm (LSTM)                  (None, 300)               721200    
_________________________________________________________________
dropout (Dropout)            (None, 300)               0         
_________________________________________________________________
output (Dense)               (None, 3)                 903       
Total params: 3,723,003
Trainable params: 3,723,003
Non-trainable params: 0
_________________________________________________________________


### Training with batch generator

In [25]:
batch_size = 50

In [26]:
train_steps_epoch = len(train_x_)/batch_size
batch_train_iter = Dataiterator(train_x_, train_y, batch_size)

In [27]:
val_steps_epoch = len(dev_x_)/batch_size
batch_val_iter = Dataiterator(dev_x_, dev_y, batch_size)

In [28]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

def train_generator(model, batch_train_iter, batch_val_iter):
    earlystop_callbacks = [EarlyStopping(monitor='val_loss', patience=10),
                     ModelCheckpoint(filepath=os.path.join('./','{epoch:02d}-{loss:.2f}.check'), \
                                     monitor='val_loss', save_best_only=False, \
                                     save_weights_only=True)
                     ]
    
    def train_gen():
        while True:
            train_batches = [[X, y] for X, y in batch_train_iter]
            for train_batch in train_batches:
                yield train_batch
                
    def val_gen():
        while True:
            val_batches = [[X, y] for X, y in batch_val_iter]
            for val_batch in val_batches:
                yield val_batch
                
    history = model.fit(train_gen(), validation_data=val_gen(), \
                                  validation_steps=val_steps_epoch, steps_per_epoch=train_steps_epoch, \
                                  epochs = 20, callbacks = earlystop_callbacks)
    return history
      

In [29]:
history=train_generator(model, batch_train_iter, batch_val_iter)

AttributeError: 'Model' object has no attribute '_in_multi_worker_mode'