In [1]:
import pandas as pd
import numpy as np
from keras.models import Model
from nltk.tokenize import TweetTokenizer
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, BatchNormalization
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
HOME = '../../'
DATA = HOME + 'data/'
MODEL = HOME + 'model/'
RECORD = DATA + 'summary.csv'

combine_test = False

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

tok = TweetTokenizer()
train = pd.read_csv(DATA + 'cleaned_train.csv')[:3000]
test = pd.read_csv(DATA + 'cleaned_test.csv')[:1000]
train_sentences = train['comment_text_cleaned']
test_sentences = test['comment_text_cleaned']

if combine_test: sentences = pd.concat([train_sentences, test_sentences])
else: sentences = train_sentences

text_length = sentences.apply(lambda x: len(tok.tokenize(x)))
mean_length = text_length.mean()
std_length = text_length.std()

print(train.shape)
print(test.shape)
print(mean_length)
print(std_length)

(3000, 27)
(1000, 21)
74.24433333333333
110.13973346701432


In [3]:
# grid search params

class grid_search_parameters(object):
    # Here needs to be modified
    params = {'max_features': [200000], 
              'epochs': [1], ################################################################################
              'batch_size': [32],
              'max_len': [np.round(mean_length + 3*std_length).astype(int)], # max sequence length
              'dropout': [0.2],
              'patience': [5],
              'model_file': [MODEL + 'lstm_best.hdf5'],
              'loss': ['binary_crossentropy'],
              'label_len': [len(label_cols)],
              # fixed grid search params separate line
              'embed_trainable': [True, False],
              'batch_normalization': [True, False],
              'activation': ['relu', 'tanh', 'elu', 'sigmoid'],
              'lstm_activation': ['relu', 'tanh', 'elu', 'sigmoid'],
              'lstm_units': [350, 250, 150],
              'dense_units': [350, 250, 150],
              'lstm_layer_size': [2, 1],
              'dense_layer_size': [3, 2, 1],
              'embedding_param': [{'file': '/home/kai/data/resources/glove/glove.6B.50d.txt', 'size': 50, 'type': 'glove'},
                                  {'file': '/home/kai/data/resources/glove/glove.6B.100d.txt', 'size': 100, 'type': 'glove'},
                                  {'file': '/home/kai/data/resources/glove/glove.6B.200d.txt', 'size': 200, 'type': 'glove'},
                                  {'file': '/home/kai/data/resources/glove/glove.6B.300d.txt', 'size': 300, 'type': 'glove'},
                                  {'file': '/home/kai/data/resources/FastText/wiki.en.bin', 'size':300, 'type': 'fasttext'}]
              }
    param_keys = [
        'max_features', 'epochs', 'batch_size', 'max_len', 'dropout', 'patience', 'model_file', 'loss',\
        'label_len', 'embed_trainable', 'batch_normalization', 'activation', 'lstm_activation', 'lstm_units',\
        'dense_units', 'lstm_layer_size', 'dense_layer_size', 'embed_type', 'embed_file', 'embed_size'
    ]
    
    def __init__(self):
        self._idx = np.zeros(len(self.params.keys())).astype(int)
        ub = []
        for x in self.params.keys():
            ub.append(len(self.params[x]))
        self._ub = np.array(ub).astype(int)
        self._terminate = False
    
    def _next_idx(self):
        i = 0
        self._idx[i] = (self._idx[i] + 1) % self._ub[i]
        i += 1
        end_loop = (self._idx[i-1]!=0 or i==self._idx.shape[0])
        terminate = (self._idx[i-1]==0 and i==self._idx.shape[0])
        while(not end_loop):
            self._idx[i] = (self._idx[i] + 1) % self._ub[i]
            i += 1
            end_loop = (self._idx[i-1]!=0 or i==self._idx.shape[0])
            terminate = (self._idx[i-1]==0 and i==self._idx.shape[0])
        self._terminate = terminate

    def next_param(self):
        if not self._terminate:
            # Here needs to be modified
            tmp = self._get_grid_search_param(self.params['max_features'][self._idx[0]],
                                              self.params['epochs'][self._idx[1]],
                                              self.params['batch_size'][self._idx[2]],
                                              self.params['max_len'][self._idx[3]],
                                              self.params['dropout'][self._idx[4]],
                                              self.params['patience'][self._idx[5]],
                                              self.params['model_file'][self._idx[6]],
                                              self.params['loss'][self._idx[7]],
                                              self.params['label_len'][self._idx[8]],
                                              self.params['embed_trainable'][self._idx[9]],
                                              self.params['batch_normalization'][self._idx[10]],
                                              self.params['activation'][self._idx[11]],
                                              self.params['lstm_activation'][self._idx[12]],
                                              self.params['lstm_units'][self._idx[13]],
                                              self.params['dense_units'][self._idx[14]],
                                              self.params['lstm_layer_size'][self._idx[15]],
                                              self.params['dense_layer_size'][self._idx[16]],
                                              self.params['embedding_param'][self._idx[17]]['type'],
                                              self.params['embedding_param'][self._idx[17]]['file'],
                                              self.params['embedding_param'][self._idx[17]]['size'])
            self._next_idx()
            return tmp
        else: return None
        
    def _get_grid_search_param(self, *x): return dict(zip(self.param_keys, x))
    
    # get_model, train, recorder
    def grid_search_on_model(self, get_model, train, recorder, record_file, x, y, tokenizer,\
                             val_size=0.8, shuffle=True):
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=val_size, shuffle=False)#shuffle)
        params = self.next_param()
        print(params)
        record_start = True
        while params != None:
            model = get_model(params, tokenizer)
            train_auc, val_auc, val_loss = train(model, x_train, y_train, x_val, y_val, params)
            recorder(record_file, train_auc, val_auc, val_loss, params, record_start)
            record_start = False
            params = self.next_param()
            print(params)

# record grid search results
def recorder(record_file, train_auc, val_auc, val_loss, params, initialize=False):
    if initialize:
        head = ''
        for x in params.keys():
            head += x + ','
        head += 'train_auc,val_auc,val_loss\n'
        with open(record_file, 'w') as f: f.write(head)
    r = ''
    for x in params.values():
        r += str(x) + ','
    r += '%.6f,%.6f,%.6f\n'%(train_auc, val_auc, val_loss)
    with open(record_file, 'a') as f: f.write(r)
    print('train_auc {},val_auc {},val_loss {}\n\n'.format(train_auc, val_auc, val_loss))

print('done')

done


In [4]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

def glove_get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(embedding_file, encoding='utf8'))
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i < max_features:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def fasttext_get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i < max_features:
            embedding_vector = ft_model.get_word_vector(word).astype('float32')
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def get_embedding_matrix(embed_type, file, size, max_features, tokenizer):
    if embed_type == 'fasttext': return fasttext_get_embedding_matrix(file, size, max_features, tokenizer)
    else: return glove_get_embedding_matrix(file, size, max_features, tokenizer)

def get_rnn_model(params, tokenizer):
    embed_type = params['embed_type']
    embed_file = params['embed_file']
    embed_size = params['embed_size']
    lstm_units = params['lstm_units']
    lstm_activation = params['lstm_activation']
    dense_units = params['dense_units']
    activation = params['activation']
    embed_trainable = params['embed_trainable']
    batch_normalization = params['batch_normalization']
    
    max_len = params['max_len']
    dropout = params['dropout']
    loss = params['loss']
    label_len = params['label_len']
    max_features = params['max_features']
    
    embedding_matrix, inp_len = get_embedding_matrix(embed_type, embed_file, embed_size, max_features, tokenizer)
    input = Input(shape=(max_len, ))
    x = Embedding(inp_len, embed_size, weights=[embedding_matrix], trainable=embed_trainable)(input)
    for i in range(params['lstm_layer_size']):
        x = Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=dropout,\
                               recurrent_dropout=dropout, activation=lstm_activation))(x)
    x = GlobalMaxPool1D()(x)
    if batch_normalization:
        x = BatchNormalization()(x)
    for i in range(params['dense_layer_size']):
        x = Dense(dense_units, activation=activation)(x)
    x = Dropout(dropout)(x)
    x = Dense(label_len, activation='sigmoid')(x)
    model = Model(inputs=input, outputs=x)
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

def train_model(model, x, y, x_val, y_val, params):
    batch_size = params['batch_size']
    epochs = params['epochs']
    patience = params['patience']
    model_file = params['model_file']
    
    checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    earlystopping = EarlyStopping(monitor="val_loss", mode="min", patience=patience)
    callbacks_list = [checkpoint, earlystopping]
    history = model.fit(x, y, batch_size=batch_size, epochs=epochs,\
                        validation_data=(x_val,y_val), callbacks=callbacks_list)
    
    # predict
    model.load_weights(model_file)
    y_train = model.predict(x, verbose=1)
    y_pre = model.predict(x_val, verbose=1)
    
    # compute the scores
    val_loss = history.history['val_loss'][-1]
    val_auc = roc_auc_score(y_val, y_pre)
    train_auc = roc_auc_score(y, y_train)
    
    return val_loss, val_auc, train_auc

print('done')

done


In [5]:
param_class = grid_search_parameters()

tokenizer = text.Tokenizer(num_words=param_class.params['max_features'][0])
tokenizer.fit_on_texts(sentences.values)
tokenized_train = tokenizer.texts_to_sequences(train_sentences.values)

x = sequence.pad_sequences(tokenized_train, maxlen=param_class.params['max_len'][0])
y = train[label_cols].values

param_class.grid_search_on_model(get_rnn_model, train_model, recorder, RECORD, x, y, tokenizer)
print('done')

{'activation': 'relu', 'batch_normalization': True, 'dense_units': 350, 'patience': 5, 'epochs': 1, 'max_len': 405, 'model_file': '../../model/lstm_best.hdf5', 'batch_size': 32, 'embed_file': '/home/kai/data/resources/glove/glove.6B.200d.txt', 'dense_layer_size': 3, 'embed_size': 200, 'loss': 'binary_crossentropy', 'lstm_activation': 'relu', 'embed_type': 'glove', 'max_features': 200000, 'lstm_layer_size': 1, 'lstm_units': 350, 'dropout': 0.2, 'embed_trainable': True, 'label_len': 6}
Train on 600 samples, validate on 2400 samples
Epoch 1/1
Epoch 00001: val_loss improved from inf to 0.65650, saving model to ../../model/lstm_best.hdf5
train_auc 0.6564994517962138,val_auc 0.5,val_loss 0.5




IndexError: list index out of range