In [1]:
import pandas as pd
import numpy as np
from keras.models import Model
from nltk.tokenize import TweetTokenizer
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, BatchNormalization
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from fastText import load_model

Using TensorFlow backend.


In [2]:
HOME = '../../'
DATA = HOME + 'data/'
MODEL = HOME + 'model/'
RECORD = DATA + 'summary.csv'

combine_test = False

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

tok = TweetTokenizer()
train = pd.read_csv(DATA + 'cleaned_train.csv')
train_sentences = train['comment_text_cleaned']
sentences = train_sentences

text_length = sentences.apply(lambda x: len(tok.tokenize(x)))
mean_length = text_length.mean()
std_length = text_length.std()

print(train.shape)
print(mean_length)
print(std_length)

(5000, 27)
75.8668
115.62577101


In [12]:
# grid search params
import pandas as pd
import json

class grid_search_generator(object):
    # Here needs to be modified
    def __init__(self, config_file_url=None):
        if config_file_url == None:
            self.params = {'max_features': [200000], 
                      'epochs': [40], 
                      'batch_size': [1024],
                      'max_len': [int(np.round(mean_length + 3*std_length))], # max sequence length
                      'dropout': [0.2, 0.5],
                      'patience': [5],
                      'model_file': [MODEL + 'lstm_best.hdf5'],
                      'loss': ['binary_crossentropy'],
                      'label_len': [len(label_cols)],
                      # fixed grid search params separate line
                      'embed_trainable': [True, False],
                      'batch_normalization': [True, False],
                      'activation': ['relu', 'tanh', 'sigmoid'],
                      'lstm_activation': ['relu', 'tanh', 'sigmoid'],
                      'lstm_units': [5, 50, 100, 200],
                      'dense_units': [50, 100, 200, 300],
                      'lstm_layer_size': [1, 2],
                      'dense_layer_size': [1, 2, 3],
                      'embedding_param': [{'embed_file': '../../data/glove.6B.50d.txt', 'embed_size': 50, 'embed_type': 'glove'},
                                          {'embed_file': '../../data/glove.6B.100d.txt', 'embed_size': 50, 'embed_type': 'glove'},
                                          {'embed_file': '../../data/glove.6B.200d.txt', 'embed_size': 50, 'embed_type': 'glove'},
                                          {'embed_file': '../../data/glove.6B.300d.txt', 'embed_size': 50, 'embed_type': 'glove'},
                                          {'embed_file': '../../data/fasttext.txt', 'embed_size': 200, 'embed_type': 'glove'}]
                    }

            self.binding = {'embedding_param': ['embed_type', 'embed_file', 'embed_size']}
            self.score_name_list = ['val_loss', 'val_auc', 'train_auc']
            
            self.single_keys = [key for key in self.params.keys() if key not in self.binding.keys()]
            self.binding_keys = list(self.binding.keys())

            self._terminate = False
        else:
            with open(config_file_url, 'r') as f:
                data = json.load(f)
                self.params = data['params']
                self.binding = data['binding']
                self.score_name_list = data['score_name_list']
                self.single_keys = data['single_keys']
                self.binding_keys = data['binding_keys']
                self._idx = data['_idx']
                
        self._terminate = False
        self.keys = list(self.single_keys)
        self.keys.extend(self.binding_keys)
        self.param_keys = list(self.single_keys)
        for i in range(len(self.single_keys), len(self.keys)):
            self.param_keys.extend(self.binding[self.keys[i]])
        
        if config_file_url == None: self._idx = [int(0) for i in range(len(self.keys))]
        self._ub = [len(self.params[key]) for key in self.keys]
    
    def _next_idx(self):
        i = 0
        self._idx[i] = (self._idx[i] + 1) % self._ub[i]
        i += 1
        end_loop = (self._idx[i-1]!=0 or i==len(self._idx))
        terminate = (self._idx[i-1]==0 and i==len(self._idx))
        while(not end_loop):
            self._idx[i] = (self._idx[i] + 1) % self._ub[i]
            i += 1
            end_loop = (self._idx[i-1]!=0 or i==len(self._idx))
            terminate = (self._idx[i-1]==0 and i==len(self._idx))
        self._terminate = terminate

    def _next_param_list(self):
        if not self._terminate:
            input = [self.params[self.keys[i]][self._idx[i]] for i in range(len(self.single_keys))]
            for i in range(len(self.single_keys), len(self.keys)):
                input.extend([self.params[self.keys[i]][self._idx[i]][key] for key in self.binding[self.keys[i]]])
            self._next_idx()
            return input
        else: return None
        
    def get_csv(self, csv_url):
        value_list = [float('nan') for i in range(len(self.score_name_list))]
        column_name = list(self.param_keys)
        column_name.extend(self.score_name_list)
        param_list = []
        
        param = self._next_param_list()
        while param != None:
            param.extend(value_list)
            param_list.append(param)
            param = self._next_param_list()
        pd.DataFrame(param_list, columns=column_name).to_csv(csv_url, index=False)
        self._idx = [0 for i in range(len(self.keys))]
        self._terminate = False
        print('successfully generated grid search csv file\n')
        return 0
    
    def _get_grid_search_param(self, keys, values): return dict(zip(keys, values))
    
    def next_param(self, url):
        value = self._next_param_list()
        if value == None:
            with open(url, 'w') as f: f.write('terminate')
            return None
        param_dict = self._get_grid_search_param(self.param_keys, value)
        with open(url, 'w') as f:
            data = {
                'params': self.params,
                'binding': self.binding,
                'score_name_list': self.score_name_list,
                'single_keys': self.single_keys,
                'binding_keys': self.binding_keys,
                '_idx': self._idx
            }
            json.dump(data, f)
        return param_dict
    
    # get_model, train, recorder
    def grid_search_on_model(self, get_model, train, recorder, record_file, x, y, tokenizer,\
                             val_size=0.2, shuffle=True, url='a.json'):
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=val_size, shuffle=shuffle)
        params = self.next_param(url)
        record_start = True
        while params != None:
            print(params)
            model = get_model(params, tokenizer)
            train_auc, val_auc, val_loss = train(model, x_train, y_train, x_val, y_val, params)
            recorder(record_file, params, record_start, train_auc, val_auc, val_loss)
            record_start = False
            params = self.next_param(url)
            
print('done')

done


In [4]:
# record grid search results
def recorder(record_file, params, initialize, train_auc, val_auc, val_loss):
    if initialize:
        head = ''
        for x in params.keys():
            head += x + ','
        head += 'train_auc,val_auc,val_loss\n'
        with open(record_file, 'w') as f: f.write(head)
    r = ''
    for x in params.values():
        r += str(x) + ','
    r += '%.6f,%.6f,%.6f\n'%(train_auc, val_auc, val_loss)
    with open(record_file, 'a') as f: f.write(r)
    print('train_auc: {}, val_auc: {}, val_loss: {}\n\n'.format(train_auc, val_auc, val_loss))

print('done')

done


In [7]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

def glove_get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(embedding_file))
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i < max_features:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def fasttext_get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    word_index = tokenizer.word_index
    ft_model = load_model(embedding_file)
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i < max_features:
            embedding_vector = ft_model.get_word_vector(word).astype('float32')
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def get_embedding_matrix(embed_type, file, size, max_features, tokenizer):
    if embed_type == 'fasttext': return fasttext_get_embedding_matrix(file, size, max_features, tokenizer)
    else: return glove_get_embedding_matrix(file, size, max_features, tokenizer)

def get_rnn_model(params, tokenizer):
    embed_type = params['embed_type']
    embed_file = params['embed_file']
    embed_size = params['embed_size']
    lstm_units = params['lstm_units']
    lstm_activation = params['lstm_activation']
    dense_units = params['dense_units']
    activation = params['activation']
    embed_trainable = params['embed_trainable']
    batch_normalization = params['batch_normalization']
    
    max_len = params['max_len']
    dropout = params['dropout']
    loss = params['loss']
    label_len = params['label_len']
    max_features = params['max_features']
    
    embedding_matrix, inp_len = get_embedding_matrix(embed_type, embed_file, embed_size, max_features, tokenizer)
    input = Input(shape=(max_len, ))
    x = Embedding(inp_len, embed_size, weights=[embedding_matrix], trainable=embed_trainable)(input)
    for i in range(params['lstm_layer_size']):
        x = Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=dropout,\
                               recurrent_dropout=dropout, activation=lstm_activation))(x)
    x = GlobalMaxPool1D()(x)
    if batch_normalization:
        x = BatchNormalization()(x)
    for i in range(params['dense_layer_size']):
        x = Dense(dense_units, activation=activation)(x)
    x = Dropout(dropout)(x)
    x = Dense(label_len, activation='sigmoid')(x)
    model = Model(inputs=input, outputs=x)
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

def train_model(model, x, y, x_val, y_val, params):
    batch_size = params['batch_size']
    epochs = params['epochs']
    patience = params['patience']
    model_file = params['model_file']
    
    checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    earlystopping = EarlyStopping(monitor="val_loss", mode="min", patience=patience)
    callbacks_list = [checkpoint, earlystopping]
    history = model.fit(x, y, batch_size=batch_size, epochs=epochs,\
                        validation_data=(x_val,y_val), callbacks=callbacks_list)
    
    # predict
    model.load_weights(model_file)
    y_train = model.predict(x, verbose=1)
    y_pre = model.predict(x_val, verbose=1)
    
    # compute the scores
    val_loss = history.history['val_loss'][-1]
    val_auc = roc_auc_score(y_val, y_pre)
    train_auc = roc_auc_score(y, y_train)
    
    return val_loss, val_auc, train_auc

print('done')

done


In [13]:
param_class = grid_search_generator()

tokenizer = text.Tokenizer(num_words=param_class.params['max_features'][0])
tokenizer.fit_on_texts(sentences.values)
tokenized_train = tokenizer.texts_to_sequences(train_sentences.values)

x = sequence.pad_sequences(tokenized_train, maxlen=param_class.params['max_len'][0])
y = train[label_cols].values

param_class.grid_search_on_model(get_rnn_model, train_model, recorder, RECORD, x, y, tokenizer)
print('done')

{'max_features': 10, 'epochs': 1, 'batch_size': 32, 'max_len': 423, 'dropout': 0.2, 'patience': 5, 'model_file': '../../model/lstm_best.hdf5', 'loss': 'binary_crossentropy', 'label_len': 6, 'embed_trainable': True, 'batch_normalization': False, 'activation': 'relu', 'lstm_activation': 'tanh', 'lstm_units': 5, 'dense_units': 10, 'lstm_layer_size': 1, 'dense_layer_size': 1, 'embed_type': 'glove', 'embed_file': '../../data/glove.6B.50d.txt', 'embed_size': 50}
Train on 1000 samples, validate on 4000 samples
Epoch 1/1
train_auc: 0.6610714311599731, val_auc: 0.5153796293740789, val_loss: 0.5248197521065308


{'max_features': 10, 'epochs': 1, 'batch_size': 32, 'max_len': 423, 'dropout': 0.2, 'patience': 5, 'model_file': '../../model/lstm_best.hdf5', 'loss': 'binary_crossentropy', 'label_len': 6, 'embed_trainable': False, 'batch_normalization': False, 'activation': 'relu', 'lstm_activation': 'tanh', 'lstm_units': 5, 'dense_units': 10, 'lstm_layer_size': 1, 'dense_layer_size': 1, 'embed_type': 

KeyboardInterrupt: 

In [14]:
# continue case

# param_class = grid_search_generator('a.json')

# tokenizer = text.Tokenizer(num_words=param_class.params['max_features'][0])
# tokenizer.fit_on_texts(sentences.values)
# tokenized_train = tokenizer.texts_to_sequences(train_sentences.values)

# x = sequence.pad_sequences(tokenized_train, maxlen=param_class.params['max_len'][0])
# y = train[label_cols].values

# param_class.grid_search_on_model(get_rnn_model, train_model, recorder, RECORD, x, y, tokenizer)
# print('done')

{'max_features': 10, 'epochs': 1, 'batch_size': 32, 'max_len': 423, 'dropout': 0.2, 'patience': 5, 'model_file': '../../model/lstm_best.hdf5', 'loss': 'binary_crossentropy', 'label_len': 6, 'embed_trainable': True, 'batch_normalization': False, 'activation': 'relu', 'lstm_activation': 'tanh', 'lstm_units': 5, 'dense_units': 10, 'lstm_layer_size': 1, 'dense_layer_size': 2, 'embed_type': 'glove', 'embed_file': '../../data/glove.6B.50d.txt', 'embed_size': 50}
Train on 1000 samples, validate on 4000 samples
Epoch 1/1
train_auc: 0.5480913205146789, val_auc: 0.5606760135327692, val_loss: 0.5901663339588018


{'max_features': 10, 'epochs': 1, 'batch_size': 32, 'max_len': 423, 'dropout': 0.2, 'patience': 5, 'model_file': '../../model/lstm_best.hdf5', 'loss': 'binary_crossentropy', 'label_len': 6, 'embed_trainable': False, 'batch_normalization': False, 'activation': 'relu', 'lstm_activation': 'tanh', 'lstm_units': 5, 'dense_units': 10, 'lstm_layer_size': 1, 'dense_layer_size': 2, 'embed_type': 