In [1]:
import pandas as pd
import numpy as np
from keras.models import Model
from nltk.tokenize import TweetTokenizer
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, BatchNormalization
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from fastText import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
HOME = '/home/kai/data/kaggle/toxic/hz/'
DATA = HOME + 'data/'
MODEL = HOME + 'model/'
RECORD = DATA + 'summary.csv'

combine_test = False

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

tok = TweetTokenizer()
# train = pd.read_csv(DATA + 'cleaned_train.csv')
train = pd.read_csv('/home/kai/data/kaggle/toxic/dataset/training/emoji_train.csv')

train_sentences = train['comment_text_cleaned']
sentences = train_sentences

text_length = sentences.apply(lambda x: len(tok.tokenize(x)))
mean_length = text_length.mean()
std_length = text_length.std()

print(train.shape)
print(mean_length)
print(std_length)

(159571, 10)
74.75983104699475
110.47788051973407


In [3]:
# grid search params
import pandas as pd
import json

class grid_search_generator(object):
    # Here needs to be modified
    def __init__(self, config_file_url=None):
        if config_file_url == None:
            self.params = {'max_features': [200000], 
                      'epochs': [40], 
                      'batch_size': [1024],
                      'max_len': [int(np.round(mean_length + 3*std_length))], # max sequence length
                      'dropout': [0.2, 0.5],
                      'patience': [5],
                      'model_file': [MODEL + 'lstm_best.hdf5'],
                      'loss': ['binary_crossentropy'],
                      'label_len': [len(label_cols)],
                      # fixed grid search params separate line
                      'embed_trainable': [True, False],
                      'batch_normalization': [True, False],
                      'activation': ['relu', 'tanh', 'sigmoid'],
                      'lstm_activation': ['relu', 'tanh', 'sigmoid'],
                      'lstm_units': [5, 50, 100, 200],
                      'dense_units': [50, 100, 200, 300],
                      'lstm_layer_size': [1, 2],
                      'dense_layer_size': [1, 2, 3],
                      'embedding_param': [{'embed_file': '/home/kai/data/resources/glove/glove.6B.50d.txt', 'embed_size': 50, 'embed_type': 'glove'},
                                          {'embed_file': '/home/kai/data/resources/glove/glove.6B.100d.txt', 'embed_size': 100, 'embed_type': 'glove'},
                                          {'embed_file': '/home/kai/data/resources/glove/glove.6B.200d.txt', 'embed_size': 200, 'embed_type': 'glove'},
                                          {'embed_file': '/home/kai/data/resources/glove/glove.6B.300d.txt', 'embed_size': 300, 'embed_type': 'glove'},
                                          {'embed_file': '/home/kai/data/resources/FastText/wiki.en.bin', 'embed_size': 300, 'embed_type': 'fasttext'}]
                    }

            self.binding = {'embedding_param': ['embed_type', 'embed_file', 'embed_size']}
            self.score_name_list = ['val_loss', 'val_auc', 'train_auc']
            
            self.single_keys = [key for key in self.params.keys() if key not in self.binding.keys()]
            self.binding_keys = list(self.binding.keys())

            self._terminate = False
        else:
            with open(config_file_url, 'r') as f:
                data = json.load(f)
                self.params = data['params']
                self.binding = data['binding']
                self.score_name_list = data['score_name_list']
                self.single_keys = data['single_keys']
                self.binding_keys = data['binding_keys']
                self._idx = data['_idx']
                
        self._terminate = False
        self.keys = list(self.single_keys)
        self.keys.extend(self.binding_keys)
        self.param_keys = list(self.single_keys)
        for i in range(len(self.single_keys), len(self.keys)):
            self.param_keys.extend(self.binding[self.keys[i]])
        
        if config_file_url == None: self._idx = [int(0) for i in range(len(self.keys))]
        self._ub = [len(self.params[key]) for key in self.keys]
    
    def _next_idx(self):
        i = 0
        self._idx[i] = (self._idx[i] + 1) % self._ub[i]
        i += 1
        end_loop = (self._idx[i-1]!=0 or i==len(self._idx))
        terminate = (self._idx[i-1]==0 and i==len(self._idx))
        while(not end_loop):
            self._idx[i] = (self._idx[i] + 1) % self._ub[i]
            i += 1
            end_loop = (self._idx[i-1]!=0 or i==len(self._idx))
            terminate = (self._idx[i-1]==0 and i==len(self._idx))
        self._terminate = terminate

    def _next_param_list(self):
        if not self._terminate:
            input = [self.params[self.keys[i]][self._idx[i]] for i in range(len(self.single_keys))]
            for i in range(len(self.single_keys), len(self.keys)):
                input.extend([self.params[self.keys[i]][self._idx[i]][key] for key in self.binding[self.keys[i]]])
            self._next_idx()
            return input
        else: return None
        
    def get_csv(self, csv_url):
        value_list = [float('nan') for i in range(len(self.score_name_list))]
        column_name = list(self.param_keys)
        column_name.extend(self.score_name_list)
        param_list = []
        
        param = self._next_param_list()
        while param != None:
            param.extend(value_list)
            param_list.append(param)
            param = self._next_param_list()
        pd.DataFrame(param_list, columns=column_name).to_csv(csv_url, index=False)
        self._idx = [0 for i in range(len(self.keys))]
        self._terminate = False
        print('successfully generated grid search csv file\n')
        return 0
    
    def _get_grid_search_param(self, keys, values): return dict(zip(keys, values))
    
    def next_param(self, url):
        value = self._next_param_list()
        if value == None:
            with open(url, 'w') as f: f.write('terminate')
            return None
        param_dict = self._get_grid_search_param(self.param_keys, value)
        with open(url, 'w') as f:
            data = {
                'params': self.params,
                'binding': self.binding,
                'score_name_list': self.score_name_list,
                'single_keys': self.single_keys,
                'binding_keys': self.binding_keys,
                '_idx': self._idx
            }
            json.dump(data, f)
        return param_dict
    
    # get_model, train, recorder
    def grid_search_on_model(self, get_model, train, recorder, record_file, x, y, tokenizer,\
                             val_size=0.2, shuffle=True, url='a.json'):
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=val_size, shuffle=shuffle)
        params = self.next_param(url)
        record_start = True
        while params != None:
            print(params)
            model = get_model(params, tokenizer)
            train_auc, val_auc, val_loss = train(model, x_train, y_train, x_val, y_val, params)
            recorder(record_file, params, record_start, train_auc, val_auc, val_loss)
            record_start = False
            params = self.next_param(url)
            
print('done')

done


In [4]:
# record grid search results
def recorder(record_file, params, initialize, train_auc, val_auc, val_loss):
    if initialize:
        head = ''
        for x in params.keys():
            head += x + ','
        head += 'train_auc,val_auc,val_loss\n'
        with open(record_file, 'w') as f: f.write(head)
    r = ''
    for x in params.values():
        r += str(x) + ','
    r += '%.6f,%.6f,%.6f\n'%(train_auc, val_auc, val_loss)
    with open(record_file, 'a') as f: f.write(r)
    print('train_auc: {}, val_auc: {}, val_loss: {}\n\n'.format(train_auc, val_auc, val_loss))

print('done')

done


In [48]:

def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

def glove_get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
#     with open(embedding_file) as ef:
#         embeddings_index = dict(get_coefs(*o.strip().split()) for o in ef.readlines())
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(embedding_file, encoding='utf8'))
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i < max_features:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def fasttext_get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    word_index = tokenizer.word_index
    ft_model = load_model(embedding_file)
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i < max_features:
            embedding_vector = ft_model.get_word_vector(word).astype('float32')
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def get_embedding_matrix(embed_type, file, size, max_features, tokenizer):
    if embed_type == 'fasttext': return fasttext_get_embedding_matrix(file, size, max_features, tokenizer)
    else: return glove_get_embedding_matrix(file, size, max_features, tokenizer)

def get_rnn_model(params, tokenizer):
    embed_type = params['embed_type']
    embed_file = params['embed_file']
    embed_size = params['embed_size']
    lstm_units = params['lstm_units']
    lstm_activation = params['lstm_activation']
    dense_units = params['dense_units']
    activation = params['activation']
    embed_trainable = params['embed_trainable']
    batch_normalization = params['batch_normalization']
    
    max_len = params['max_len']
    dropout = params['dropout']
    loss = params['loss']
    label_len = params['label_len']
    max_features = params['max_features']
    
    embedding_matrix, inp_len = get_embedding_matrix(embed_type, embed_file, embed_size, max_features, tokenizer)
    input = Input(shape=(max_len, ))
    x = Embedding(inp_len, embed_size, weights=[embedding_matrix], trainable=embed_trainable)(input)
    for i in range(params['lstm_layer_size']):
        x = Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=dropout,\
                               recurrent_dropout=dropout, activation=lstm_activation))(x)
    x = GlobalMaxPool1D()(x)
    if batch_normalization:
        x = BatchNormalization()(x)
    for i in range(params['dense_layer_size']):
        x = Dense(dense_units, activation=activation)(x)
    x = Dropout(dropout)(x)
    x = Dense(label_len, activation='sigmoid')(x)
    model = Model(inputs=input, outputs=x)
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

def train_model(model, x, y, x_val, y_val, params):
    batch_size = params['batch_size']
    epochs = params['epochs']
    patience = params['patience']
    model_file = params['model_file']
    
    checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    earlystopping = EarlyStopping(monitor="val_loss", mode="min", patience=patience)
    callbacks_list = [checkpoint, earlystopping]
    history = model.fit(x, y, batch_size=batch_size, epochs=epochs,\
                        validation_data=(x_val,y_val), callbacks=callbacks_list)
    
    # predict
    model.load_weights(model_file)
    y_train = model.predict(x, verbose=1)
    y_pre = model.predict(x_val, verbose=1)
    
    # compute the scores
    val_loss = history.history['val_loss'][-1]
    if np.isnan(y_val).any():
        print('y_val contains Nan')
        y_val = np.nan_to_num(y_val)
    if np.isnan(y_pre).any():
        print('y_pre contains Nan')
        y_pre = np.nan_to_num(y_pre)
    if np.isnan(y_train).any():
        print('y_train contains Nan')
        y_train = np.nan_to_num(y_train)
    val_auc = roc_auc_score(y_val, y_pre)
    train_auc = roc_auc_score(y, y_train)
    
    return val_loss, val_auc, train_auc

print('done')

done


In [None]:
param_class = grid_search_generator()

tokenizer = text.Tokenizer(num_words=param_class.params['max_features'][0])
tokenizer.fit_on_texts(sentences.values)
tokenized_train = tokenizer.texts_to_sequences(train_sentences.values)

x = sequence.pad_sequences(tokenized_train, maxlen=param_class.params['max_len'][0])
y = train[label_cols].values

param_class.grid_search_on_model(get_rnn_model, train_model, recorder, RECORD, x, y, tokenizer)
print('done')

{'loss': 'binary_crossentropy', 'patience': 5, 'max_len': 406, 'max_features': 200000, 'embed_size': 50, 'lstm_activation': 'relu', 'label_len': 6, 'dense_layer_size': 1, 'embed_trainable': True, 'epochs': 40, 'batch_size': 1024, 'activation': 'relu', 'batch_normalization': True, 'dropout': 0.2, 'embed_file': '/home/kai/data/resources/glove/glove.6B.50d.txt', 'dense_units': 50, 'model_file': '/home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5', 'lstm_units': 5, 'lstm_layer_size': 1, 'embed_type': 'glove'}
Train on 127656 samples, validate on 31915 samples
Epoch 1/40
Epoch 00001: val_loss improved from inf to 0.63772, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 2/40
Epoch 00002: val_loss improved from 0.63772 to 0.58772, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 3/40
Epoch 00003: val_loss improved from 0.58772 to 0.54284, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 4/40
Epoch 00004: val_loss imp

Epoch 23/40
Epoch 00023: val_loss improved from 0.20178 to 0.19637, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 24/40
Epoch 00024: val_loss improved from 0.19637 to 0.19146, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 25/40
Epoch 00025: val_loss improved from 0.19146 to 0.18700, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 26/40
Epoch 00026: val_loss improved from 0.18700 to 0.18296, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 27/40
Epoch 00027: val_loss improved from 0.18296 to 0.17927, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 28/40
Epoch 00028: val_loss improved from 0.17927 to 0.17593, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 29/40
Epoch 00029: val_loss improved from 0.17593 to 0.17288, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 30/40
Epoch 00030: val_loss improved from 

In [None]:
#continue case

param_class = grid_search_generator('a.json')

tokenizer = text.Tokenizer(num_words=param_class.params['max_features'][0])
tokenizer.fit_on_texts(sentences.values)
tokenized_train = tokenizer.texts_to_sequences(train_sentences.values)

x = sequence.pad_sequences(tokenized_train, maxlen=param_class.params['max_len'][0])
y = train[label_cols].values

param_class.grid_search_on_model(get_rnn_model, train_model, recorder, RECORD, x, y, tokenizer)
print('done')

{'embed_trainable': True, 'patience': 5, 'embed_type': 'glove', 'batch_size': 1024, 'dense_units': 50, 'epochs': 40, 'activation': 'relu', 'model_file': '/home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5', 'max_len': 406, 'loss': 'binary_crossentropy', 'max_features': 200000, 'lstm_layer_size': 1, 'lstm_activation': 'tanh', 'dropout': 0.2, 'label_len': 6, 'lstm_units': 5, 'embed_file': '/home/kai/data/resources/glove/glove.6B.50d.txt', 'batch_normalization': True, 'dense_layer_size': 3, 'embed_size': 50}
Train on 127656 samples, validate on 31915 samples
Epoch 1/40
Epoch 00001: val_loss improved from inf to 0.10372, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 2/40
Epoch 00002: val_loss improved from 0.10372 to 0.06889, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 3/40
Epoch 00003: val_loss improved from 0.06889 to 0.05970, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 4/40
Epoch 00004: val_loss imp

Epoch 00001: val_loss improved from inf to 0.07845, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 2/40
Epoch 00002: val_loss improved from 0.07845 to 0.06584, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 3/40
Epoch 00003: val_loss improved from 0.06584 to 0.05806, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 4/40
Epoch 00004: val_loss improved from 0.05806 to 0.05532, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 5/40
Epoch 00005: val_loss improved from 0.05532 to 0.05308, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 6/40
Epoch 00006: val_loss improved from 0.05308 to 0.05129, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 7/40
Epoch 00007: val_loss improved from 0.05129 to 0.05015, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 8/40
Epoch 00008: val_loss improved from 0.05015 to 0.04942, sav

Epoch 7/40
Epoch 00007: val_loss improved from 0.05525 to 0.05322, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 8/40
Epoch 00008: val_loss improved from 0.05322 to 0.05218, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 9/40
Epoch 00009: val_loss improved from 0.05218 to 0.05118, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 10/40
Epoch 00010: val_loss improved from 0.05118 to 0.05082, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 11/40
Epoch 00011: val_loss did not improve
Epoch 12/40
Epoch 00012: val_loss did not improve
Epoch 13/40
Epoch 00013: val_loss improved from 0.05082 to 0.05041, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 14/40
Epoch 00014: val_loss did not improve
Epoch 15/40
Epoch 00015: val_loss did not improve
Epoch 16/40
Epoch 00016: val_loss did not improve
Epoch 17/40
Epoch 00017: val_loss did not improve
Epoch 18/40
Epoch 00018

Epoch 12/40
Epoch 00012: val_loss did not improve
Epoch 13/40
Epoch 00013: val_loss improved from 0.05049 to 0.05033, saving model to /home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5
Epoch 14/40
Epoch 00014: val_loss did not improve
Epoch 15/40
Epoch 00015: val_loss did not improve
Epoch 16/40
Epoch 00016: val_loss did not improve
Epoch 17/40
Epoch 00017: val_loss did not improve
Epoch 18/40
Epoch 00018: val_loss did not improve
train_auc: 0.05212143960743139, val_auc: 0.9760183759068148, val_loss: 0.9850226663420835


{'embed_trainable': True, 'patience': 5, 'embed_type': 'glove', 'batch_size': 1024, 'dense_units': 50, 'epochs': 40, 'activation': 'tanh', 'model_file': '/home/kai/data/kaggle/toxic/hz/model/lstm_best.hdf5', 'max_len': 406, 'loss': 'binary_crossentropy', 'max_features': 200000, 'lstm_layer_size': 1, 'lstm_activation': 'sigmoid', 'dropout': 0.2, 'label_len': 6, 'lstm_units': 5, 'embed_file': '/home/kai/data/resources/glove/glove.6B.50d.txt', 'batch_normalization': True

In [34]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, shuffle=True)
np.isnan(x_train).any()

False

In [37]:
np.isnan(x).any()

False

In [38]:
np.isnan(y).any()

False

In [42]:
y1 = np.array([0,0,0,1])
y2 = np.array([0,0,0,0])
roc_auc_score(y2, y1)

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.