In [1]:
# grid search params
import pandas as pd
import json

class grid_search_generator(object):
    """Enumerate a hyper-parameter grid and run a file-backed search over it.

    ``params`` maps every key to the list of values to try.  A key that also
    appears in ``bindings`` is a *bound* key: each of its values is a dict and
    ``bindings[key]`` names the entries of that dict that are expanded into
    separate output columns, so those entries always change together.  Every
    other key is a *single* key and contributes one column.

    The combinations are written to a CSV file.  ``search`` removes the first
    row of that file before each run and records the scores in a second CSV.
    The grid definition is stored as JSON so that the static helpers
    (``add_params``, ``delete_params``, ``search``) can rebuild the generator
    from ``config_file_url`` alone.
    """

    def __init__(self, config_file_url='grid_search_config.json',
                 params=None, bindings=None, score_name_list=None, csv_url=None):
        """Create a new grid, or reload one from its JSON config.

        Args:
            config_file_url: path of the JSON config.  Written when a new grid
                is created, read otherwise.
            params: dict ``key -> list of values``.
            bindings: dict ``bound key -> list of sub-keys`` to expand.
            score_name_list: names of the scores reported by a model run.
            csv_url: when given, the full grid is written to this CSV file.

        A new grid is created only when ``params``, ``bindings`` and
        ``score_name_list`` are all given; if any of them is ``None`` the
        definition is loaded from ``config_file_url`` instead.
        """
        new_generator = (params is not None and bindings is not None
                         and score_name_list is not None)
        if new_generator:
            self.params = params
            self.binding = bindings
            self.score_name_list = score_name_list

            self.single_keys = [key for key in self.params if key not in self.binding]
            self.binding_keys = list(self.binding)

            self._save_config(config_file_url)
        else:
            with open(config_file_url, 'r') as f:
                data = json.load(f)
            self.params = data['params']
            self.binding = data['binding']
            self.score_name_list = data['score_name_list']
            self.single_keys = data['single_keys']
            self.binding_keys = data['binding_keys']

        # Iteration order: single keys first, bound keys after them.
        self.keys = list(self.single_keys) + list(self.binding_keys)
        # Column names of the generated CSV: bound keys are replaced by their sub-keys.
        self.param_keys = list(self.single_keys)
        for bound_key in self.binding_keys:
            self.param_keys.extend(self.binding[bound_key])

        self._reset_config()

        if csv_url is not None:
            self.get_csv(csv_url)

    def _reset_config(self):
        """Rewind the enumeration to the first combination."""
        self._idx = [0] * len(self.keys)
        self._ub = [len(self.params[key]) for key in self.keys]
        # A key without any value means the grid has no combination at all.
        self._terminate = 0 in self._ub

    def _save_config(self, config_file_url):
        """Write the grid definition to ``config_file_url`` as JSON."""
        data = {
            'params': self.params,
            'binding': self.binding,
            'score_name_list': self.score_name_list,
            'single_keys': self.single_keys,
            'binding_keys': self.binding_keys
        }
        # Serialise before opening the file so that a value JSON cannot encode
        # does not leave a truncated config behind.
        payload = json.dumps(data)
        with open(config_file_url, 'w') as f:
            f.write(payload)

    def _next_idx(self):
        """Advance the per-key indices like an odometer (first key fastest).

        Sets ``self._terminate`` once every position has wrapped back to 0.
        """
        for pos in range(len(self._idx)):
            self._idx[pos] = (self._idx[pos] + 1) % self._ub[pos]
            if self._idx[pos] != 0:
                # No carry into the next position: this is a fresh combination.
                self._terminate = False
                return
        self._terminate = True

    def _next_param_list(self):
        """Return the current combination as a flat list, or ``None`` when done.

        The values follow the order of ``self.param_keys``.
        """
        if self._terminate:
            return None
        n_single = len(self.single_keys)
        row = [self.params[self.keys[i]][self._idx[i]] for i in range(n_single)]
        for i in range(n_single, len(self.keys)):
            bound_value = self.params[self.keys[i]][self._idx[i]]
            row.extend(bound_value[sub_key] for sub_key in self.binding[self.keys[i]])
        self._next_idx()
        return row

    def _get_param_frame(self):
        """Return every combination of the grid as a DataFrame (one row each)."""
        rows = []
        self._reset_config()
        row = self._next_param_list()
        while row is not None:
            rows.append(row)
            row = self._next_param_list()
        return pd.DataFrame(rows, columns=self.param_keys)

    def get_csv(self, csv_url):
        """Write the full grid to ``csv_url``, replacing any existing file."""
        self._get_param_frame().to_csv(csv_url, index=False)
        print('successfully generated grid search csv file\n')

    @staticmethod
    def _get_grid_search_param(keys, values):
        """Pair ``keys`` with ``values`` into a parameter dict."""
        return dict(zip(keys, values))

    @staticmethod
    def _get_next_param(csv_url, param_list):
        """Pop the first row of ``csv_url`` and return it as a dict.

        ``param_list`` holds the parameter names, in column order.  Returns
        ``None`` when the file contains no data rows.  The row is removed from
        the file before the caller has used it.
        """
        df = pd.read_csv(csv_url)
        if df.empty:
            return None
        param = grid_search_generator._get_grid_search_param(param_list, df.iloc[0].values)
        df.iloc[1:].to_csv(csv_url, index=False)
        return param

    @staticmethod
    def _recorder(values, record_csv, score_name_list, params, param_keys, label_list,
                  confusion_matrices, initialize):
        """Write one result row to ``record_csv`` and print the scores.

        The row holds the parameter values, then the scores, then one entry of
        ``confusion_matrices`` per label in ``label_list``.  The label columns
        are left out when ``label_list`` or ``confusion_matrices`` is ``None``.
        ``initialize`` selects between overwriting the file (with a header)
        and appending to it (without one).
        """
        with_labels = label_list is not None and confusion_matrices is not None

        head = list(param_keys)
        head.extend(score_name_list)
        content = [params[key] for key in param_keys]
        content.extend(values)
        if with_labels:
            head.extend(label_list)
            content.extend(confusion_matrices[label] for label in label_list)

        row = pd.DataFrame([content], columns=head)
        if initialize:
            row.to_csv(record_csv, mode='w', header=True, index=False)
        else:
            row.to_csv(record_csv, mode='a', header=False, index=False)

        print_str = ', '.join('%s:%.6f' % (name, value)
                              for name, value in zip(score_name_list, values)) + '\n'
        print(print_str)

    @staticmethod
    def add_params(csv_url, param_dict, config_file_url='grid_search_config.json'):
        """Extend the grid with new values and append the new combinations.

        For each key of ``param_dict`` the values not already present in the
        config are combined with the current values of all other keys, the
        resulting rows are appended to ``csv_url``, and the config is updated.
        Keys without any new value are skipped.
        """
        for key, candidates in param_dict.items():
            # Reload on every key so that values added for earlier keys are
            # part of the combinations generated for this one.
            grid_search = grid_search_generator(config_file_url=config_file_url)
            original = list(grid_search.params[key])
            new_values = []
            for value in candidates:
                if value not in original and value not in new_values:
                    new_values.append(value)
            if not new_values:
                continue
            grid_search.params[key] = new_values
            grid_search._get_param_frame().to_csv(csv_url, mode='a', header=False, index=False)
            grid_search.params[key] = new_values + original
            grid_search._save_config(config_file_url)

        print('successfully append the new parameters')

    @staticmethod
    def delete_params(csv_url, param_dict, config_file_url='grid_search_config.json'):
        """Remove values from the pending CSV and from the stored config.

        ``param_dict`` maps a column name (a single key or a bound sub-key) to
        the values to drop.  Rows of ``csv_url`` holding any of these values
        are deleted; for a bound sub-key, every bound value whose sub-key
        matches is removed from the config.
        """
        csv = pd.read_csv(csv_url)
        grid_search = grid_search_generator(config_file_url)

        for key, values in param_dict.items():
            for value in values:
                csv = csv[csv[key] != value]
        csv.to_csv(csv_url, index=False)

        for key, values in param_dict.items():
            if key in grid_search.single_keys:
                grid_search.params[key] = [v for v in grid_search.params[key] if v not in values]
            else:
                for bkey in grid_search.binding_keys:
                    if key in grid_search.binding[bkey]:
                        grid_search.params[bkey] = [v for v in grid_search.params[bkey]
                                                    if v[key] not in values]
                        break
        grid_search._save_config(config_file_url)
        print('successfully delete the parameters')

    @staticmethod
    def search(remain_csv, record_csv, X, y, model_run, other_model_dependency_dict, label_list=None,
               config_file_url='grid_search_config.json', X_y_split=False, val_size=0.1, shuffle=True):
        """Run ``model_run`` for every row left in ``remain_csv``.

        Args:
            remain_csv: CSV of pending combinations; one row is removed per run.
            record_csv: CSV receiving one result row per run.  It is
                overwritten by the first run of each call.
            X, y: data handed to ``model_run``.
            model_run: callable doing the training.  With ``X_y_split=False``
                it is called as ``model_run(param, X, y, deps)`` and must
                return ``(scores, confusion_matrices)``.  With
                ``X_y_split=True`` it is called as
                ``model_run(param, X_train, X_val, y_train, y_val, deps)`` and
                must return the scores only.
            other_model_dependency_dict: passed through to ``model_run``.
            label_list: labels whose confusion matrices are recorded.
            config_file_url: JSON config describing the grid.
            X_y_split: split ``X``/``y`` once here instead of in ``model_run``.
            val_size, shuffle: forwarded to ``train_test_split``.
        """
        grid_search = grid_search_generator(config_file_url)
        if X_y_split:
            # Imported here so the class does not rely on a name that another
            # part of the file happens to import.
            from sklearn.model_selection import train_test_split
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, shuffle=shuffle)
        param = grid_search_generator._get_next_param(remain_csv, grid_search.param_keys)
        header = True
        while param is not None:
            print(param)
            if X_y_split:
                scores = list(model_run(param, X_train, X_val, y_train, y_val, other_model_dependency_dict))
                # This calling convention returns no confusion matrices.
                confusion_matrices = None
            else:
                scores, confusion_matrices = model_run(param, X, y, other_model_dependency_dict)
            grid_search_generator._recorder(scores, record_csv, grid_search.score_name_list, param,
                                            grid_search.param_keys, label_list, confusion_matrices, header)
            header = False
            param = grid_search_generator._get_next_param(remain_csv, grid_search.param_keys)
            
# Printed after the class definition above has been executed.
print('done')

done


In [2]:
import numpy as np
import pandas as pd
from keras.models import Model
from nltk.tokenize import TweetTokenizer
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, BatchNormalization
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle

# Project-relative locations, all derived from HOME by string concatenation.
HOME = '../../'
DATA = HOME + 'data/'
MODEL = HOME + 'model/'
RECORD = DATA + 'summary.csv'

# NOTE(review): not read anywhere in the visible part of the file.
combine_test = False

# Target label columns; the commented-out lines are alternative label sets.
# label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# label_cols = ['toxic', 'obscene', 'insult']
label_cols = ['toxic']

tok = TweetTokenizer()
train = pd.read_csv(DATA + 'cleaned_train.csv')
# Alternative training file, currently disabled:
# train = pd.read_csv('/home/kai/data/kaggle/toxic/dataset/training/emoji_train.csv')

train_sentences = train['comment_text_cleaned']
# Alias of train_sentences (same object, not a copy).
sentences = train_sentences

# Number of TweetTokenizer tokens per comment; the mean and standard deviation
# are used further down to choose the candidate `max_len` values.
text_length = sentences.apply(lambda x: len(tok.tokenize(x)))
mean_length = text_length.mean()
std_length = text_length.std()

print(train.shape)
print(mean_length)
print(std_length)

Using TensorFlow backend.


(159571, 27)
74.7713180966
110.453204391


In [3]:
def get_coefs(word,*arr):
    """Return ``word`` together with the remaining arguments as a float32 array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def glove_get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    """Build an embedding matrix from a text file of word vectors.

    Each useful line of ``embedding_file`` is a word followed by
    ``embed_size`` whitespace-separated numbers.

    Args:
        embedding_file: path of the UTF-8 vector file.
        embed_size: dimension of the vectors stored in the file.
        max_features: upper bound on the number of rows of the matrix.
        tokenizer: object exposing ``word_index``, a dict ``word -> row index``.

    Returns:
        ``(embedding_matrix, nb_words)`` where the matrix has shape
        ``(nb_words, embed_size)``.  Rows of words without a vector stay zero.

    Raises:
        ValueError: if the file has content but not a single line with
            ``embed_size`` numbers, which points to a wrong ``embed_size``.
    """
    embeddings_index = {}
    skipped = 0
    # The context manager closes the (potentially very large) file on exit.
    with open(embedding_file, encoding='utf8') as fh:
        for line in fh:
            fields = line.split()
            if not fields:
                continue
            if len(fields) != embed_size + 1:
                # Header lines ("count dim") and tokens containing whitespace
                # cannot be parsed as <word> <embed_size numbers>.
                skipped += 1
                continue
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
    if skipped and not embeddings_index:
        raise ValueError('no vector of dimension %d found in %s'
                         % (embed_size, embedding_file))

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i < nb_words:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def fasttext_get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    """Build an embedding matrix by querying a model loaded with ``load_model``.

    Returns ``(embedding_matrix, nb_words)``; the matrix has shape
    ``(nb_words, embed_size)`` and row ``i`` holds the vector of the word
    whose index in ``tokenizer.word_index`` is ``i``.
    """
    vocabulary = tokenizer.word_index
    # NOTE(review): `load_model` is not imported in the visible part of this
    # file; presumably it comes from the fastText Python package - confirm.
    ft_model = load_model(embedding_file)
    nb_words = min(max_features, len(vocabulary) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word in vocabulary:
        row = vocabulary[word]
        if row >= max_features:
            continue
        vector = ft_model.get_word_vector(word).astype('float32')
        if vector is None:
            continue
        embedding_matrix[row] = vector
    return embedding_matrix, nb_words

def get_embedding_matrix(embed_type, file, size, max_features, tokenizer):
    """Dispatch to the fastText loader for ``'fasttext'``, else to the GloVe one."""
    loader = fasttext_get_embedding_matrix if embed_type == 'fasttext' else glove_get_embedding_matrix
    return loader(file, size, max_features, tokenizer)

def get_rnn_model(params, tokenizer):
    """Build and compile the bidirectional-LSTM classifier described by ``params``.

    Architecture: embedding -> ``lstm_layer_size`` bidirectional LSTM layers ->
    global max pooling -> optional batch normalisation -> ``dense_layer_size``
    dense layers -> dropout -> sigmoid output with ``label_len`` units.
    The same ``dropout`` rate is used for the LSTM input dropout, the LSTM
    recurrent dropout and the final Dropout layer.
    """
    (embed_type, embed_file, embed_size,
     lstm_units, lstm_activation,
     dense_units, activation,
     embed_trainable, batch_normalization,
     max_len, dropout, loss, label_len, max_features) = (
        params[name] for name in (
            'embed_type', 'embed_file', 'embed_size',
            'lstm_units', 'lstm_activation',
            'dense_units', 'activation',
            'embed_trainable', 'batch_normalization',
            'max_len', 'dropout', 'loss', 'label_len', 'max_features'))

    # The embedding layer starts from the pre-trained vectors.
    embedding_matrix, vocab_rows = get_embedding_matrix(
        embed_type, embed_file, embed_size, max_features, tokenizer)

    inputs = Input(shape=(max_len, ))
    net = Embedding(vocab_rows, embed_size, weights=[embedding_matrix],
                    trainable=embed_trainable)(inputs)

    for _ in range(params['lstm_layer_size']):
        recurrent = LSTM(lstm_units, return_sequences=True, dropout=dropout,
                         recurrent_dropout=dropout, activation=lstm_activation)
        net = Bidirectional(recurrent)(net)

    net = GlobalMaxPool1D()(net)
    if batch_normalization:
        net = BatchNormalization()(net)

    for _ in range(params['dense_layer_size']):
        net = Dense(dense_units, activation=activation)(net)

    net = Dropout(dropout)(net)
    outputs = Dense(label_len, activation='sigmoid')(net)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

def train_model(model, x, y, x_val, y_val, params):
    """Fit ``model``, reload its best checkpoint and score it on the validation set.

    The weights with the lowest validation loss are saved to
    ``params['model_file']`` during training and loaded back before
    predicting.  ``y_val`` is indexed by label name, one column per entry of
    the module-level ``label_cols``.

    Returns:
        ``(val_loss, best_epoch, val_auc, train_auc, M)`` where ``best_epoch``
        is the 0-based epoch with the lowest validation loss, ``train_auc`` is
        always 0 (it is not computed) and ``M`` maps each label to its
        confusion matrix at a 0.5 threshold.
    """
    batch_size, epochs, patience, model_file = (
        params[name] for name in ('batch_size', 'epochs', 'patience', 'model_file'))

    callbacks_list = [
        ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min'),
        EarlyStopping(monitor="val_loss", mode="min", patience=patience),
    ]
    history = model.fit(x, y, batch_size=batch_size, epochs=epochs,
                        validation_data=(x_val, y_val), callbacks=callbacks_list)

    # Predict with the best checkpoint rather than the last epoch's weights.
    model.load_weights(model_file)
    y_pre = model.predict(x_val, verbose=1)

    # First epoch that reached the lowest validation loss.
    val_losses = history.history['val_loss']
    best_epoch = val_losses.index(min(val_losses))
    val_loss = val_losses[best_epoch]
    val_auc = roc_auc_score(y_val, y_pre)
    train_auc = 0

    thres = 0.5
    predicted = pd.DataFrame(y_pre, columns=label_cols)
    M = {
        label: confusion_matrix(y_val[label], (predicted[label] > thres) * 1)
        for label in label_cols
    }

    return val_loss, best_epoch, val_auc, train_auc, M

# def balance(x_train, y_train):
#     df = pd.concat([x_train, y_train], axis=1)
#     df['t'] = df[label_cols].sum(axis=1)
#     clean = df[df['t']==0].sample(n=df[df['toxic']==1].shape[0])
#     zs = clean.shape[0]
#     remain = df[df['t']!=0]
#     print(zs-remain[remain['toxic']==1].shape[0], remain[remain['toxic'] == 1].shape[0])
#     toxic = remain[remain['toxic'] == 1].sample(n=zs, replace=True)
#     o = pd.concat([clean, toxic])
#     remain = remain[remain['toxic'] == 0]
#     print(max(0,zs-o[o['obscene']==1].shape[0]), remain[remain['obscene'] == 1].shape[0])
#     obscene = remain[remain['obscene'] == 1].sample(n=max(0,zs-o[o['obscene']==1].shape[0]), replace=True)
#     o = pd.concat([clean, obscene])
#     remain = remain[remain['obscene'] == 0]
#     print(max(0,zs-o[o['insult']==1].shape[0]), remain[remain['insult'] == 1].shape[0])
#     insult = remain[remain['insult'] == 1].sample(n=max(0,zs-o[o['insult']==1].shape[0]),  replace=True)
#     o = shuffle(pd.concat([o, insult]))
#     return o['comment_text_cleaned'], o[label_cols]

def balance(x_train, y_train):
    """Down-sample the unlabelled rows to match the first label's positives.

    Keeps every row where ``label_cols[0]`` equals 1, draws the same number of
    rows whose labels in ``label_cols`` sum to 0, shuffles the union, and
    returns its ``'comment_text_cleaned'`` column and its label columns.
    """
    merged = pd.concat([x_train, y_train], axis=1)
    merged['t'] = merged[label_cols].sum(axis=1)

    negatives = merged[merged['t'] == 0]
    positives = merged[merged[label_cols[0]] == 1]
    sampled_negatives = negatives.sample(n=positives.shape[0])

    balanced = shuffle(pd.concat([positives, sampled_negatives]))
    return balanced['comment_text_cleaned'], balanced[label_cols]

def model_run(param, X, y, other_model_dependency_dict):
    """Train and evaluate one model for the parameter set ``param``.

    The tokenizer is fitted on all of ``X``; the data is then split 80/20 and
    only the training part is balanced.  ``other_model_dependency_dict`` is
    accepted but not used.

    Returns:
        ``([val_loss, best_epoch, val_auc, train_auc], M)`` with ``M`` the
        per-label confusion matrices.
    """
    tokenizer = text.Tokenizer(num_words=param['max_features'])
    tokenizer.fit_on_texts(X.values)

    x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True)
    x_train, y_train = balance(x_train, y_train)

    def encode(texts):
        # Texts -> integer sequences padded to the configured length.
        as_ids = tokenizer.texts_to_sequences(texts.values)
        return sequence.pad_sequences(as_ids, maxlen=param['max_len'])

    x_train = encode(x_train)
    x_val = encode(x_val)

    model = get_rnn_model(param, tokenizer)
    *scores, M = train_model(model, x_train, y_train, x_val, y_val, param)
    print(M)
    return scores, M

# Printed after the function definitions above have been executed.
print('done')

done


In [4]:
# Search space: each key maps to the list of values to try.  Only `max_len`
# has more than one candidate here, so the grid holds two combinations.
params = {'max_features': [200000], 
          'epochs': [2], 
          'batch_size': [32],
          # Candidate sequence lengths: mean token count plus 3 and plus 2 standard deviations.
          'max_len': [int(np.round(mean_length + 3*std_length)), int(np.round(mean_length + 2*std_length))],
          'dropout': [0.5],
          'patience': [5],
          'model_file': [MODEL + 'lstm_best.hdf5'],
          'loss': ['binary_crossentropy'],
          'label_len': [len(label_cols)],
          'embed_trainable': [True],
          'batch_normalization': [False],
          'activation': ['relu'],
          'lstm_activation': ['tanh'],
          'lstm_units': [50],
          'dense_units': [50],
          'lstm_layer_size': [1],
          'dense_layer_size': [1],
          # Bound parameter: file, size and type of an embedding always change together.
          'embedding_param': [{'embed_file': '../../data/glove.6B.50d.txt', 'embed_size': 50, 'embed_type': 'glove'}]
        }

# Sub-keys of each bound parameter that are expanded into separate columns.
bindings = {'embedding_param': ['embed_type', 'embed_file', 'embed_size']}

# Names of the four scores returned by model_run, in the same order.
score_name_list = ['val_loss', 'best_epoch', 'val_auc', 'train_auc']

# Pending combinations and recorded results are stored in files named after the first label.
grid_search_csv_url = label_cols[0] + '.csv'
grid_search_result_csv_url = label_cols[0] + '_result.csv'
# Creating the generator writes the default JSON config and, because csv_url
# is given, the CSV holding the full grid.
param_class = grid_search_generator(params=params, bindings=bindings,
                                    score_name_list=score_name_list, csv_url=grid_search_csv_url)

# Runs model_run once per row of the pending CSV; X_y_split=False leaves the
# train/validation split to model_run itself.
grid_search_generator.search(remain_csv=grid_search_csv_url, record_csv=grid_search_result_csv_url,
                             X=train_sentences, y=train[label_cols], model_run=model_run,
                             other_model_dependency_dict=None, X_y_split=False, label_list=label_cols)

print('done')

successfully generated grid search csv file

{'max_features': 200000, 'epochs': 2, 'batch_size': 32, 'max_len': 406, 'dropout': 0.5, 'patience': 5, 'model_file': '../../model/lstm_best.hdf5', 'loss': 'binary_crossentropy', 'label_len': 1, 'embed_trainable': True, 'batch_normalization': False, 'activation': 'relu', 'lstm_activation': 'tanh', 'lstm_units': 50, 'dense_units': 50, 'lstm_layer_size': 1, 'dense_layer_size': 1, 'embed_type': 'glove', 'embed_file': '../../data/glove.6B.50d.txt', 'embed_size': 50}
Train on 24622 samples, validate on 31915 samples
Epoch 1/2
Epoch 2/2
{'toxic': array([[25236,  3696],
       [  211,  2772]])}
val_loss:0.307263, best_epoch:0.000000, val_auc:0.966849, train_auc:0.000000

{'max_features': 200000, 'epochs': 2, 'batch_size': 32, 'max_len': 296, 'dropout': 0.5, 'patience': 5, 'model_file': '../../model/lstm_best.hdf5', 'loss': 'binary_crossentropy', 'label_len': 1, 'embed_trainable': True, 'batch_normalization': False, 'activation': 'relu', 'lstm_activa