In [2]:
from fastText import load_model
# grid search params
from sklearn.utils import shuffle
import pandas as pd
import json

class grid_search_generator(object):
    """Enumerate a hyper-parameter grid and drive a resumable grid search.

    ``params`` maps each parameter name to the list of candidate values.
    ``bindings`` maps a key of ``params`` to a list of sub-keys; each
    candidate of such a "bound" key is a dict, and its sub-key values are
    emitted as separate columns that always vary together.

    The full grid is written to a CSV file (one row per combination).
    ``search`` then repeatedly removes a random row from that file, runs the
    model with it and appends the scores to a record CSV.  The grid
    definition itself is persisted as JSON so that the class-level helpers
    (``add_params``, ``delete_params``, ``search``) can reload it.
    """

    def __init__(self, config_file_url='grid_search_config.json',
                 params=None, bindings=None, score_name_list=None, csv_url=None):
        """Create a generator from explicit arguments or from a saved config.

        Args:
            config_file_url: JSON file the grid definition is saved to (new
                generator) or loaded from (existing generator).
            params: dict of parameter name -> list of candidate values.
            bindings: dict of bound key -> list of sub-keys of its candidates.
            score_name_list: names of the scores produced by the model run.
            csv_url: when given, the full grid is written to this CSV file.

        A new generator is created only when ``params``, ``bindings`` and
        ``score_name_list`` are all supplied; otherwise the definition is
        read from ``config_file_url``.
        """
        new_generator = (params is not None and bindings is not None
                         and score_name_list is not None)
        if new_generator:
            self.params = params
            self.binding = bindings
            self.score_name_list = score_name_list

            self.single_keys = [key for key in self.params if key not in self.binding]
            self.binding_keys = list(self.binding)

            self._save_config(config_file_url)
        else:
            with open(config_file_url, 'r') as f:
                data = json.load(f)
            self.params = data['params']
            self.binding = data['binding']
            self.score_name_list = data['score_name_list']
            self.single_keys = data['single_keys']
            self.binding_keys = data['binding_keys']

        # Enumeration order: plain keys first, bound keys afterwards.
        self.keys = list(self.single_keys) + list(self.binding_keys)
        # Column names of the generated CSV: bound keys are expanded to sub-keys.
        self.param_keys = list(self.single_keys)
        for bound_key in self.binding_keys:
            self.param_keys.extend(self.binding[bound_key])

        self._reset_config()

        if csv_url is not None:
            self.get_csv(csv_url)

    def _reset_config(self):
        """Rewind the enumeration to the first combination of the grid."""
        self._idx = [0] * len(self.keys)
        self._ub = [len(self.params[key]) for key in self.keys]
        # A grid without keys, or with a key that has no candidates, contains
        # no combination at all; start terminated instead of indexing into an
        # empty candidate list.
        self._terminate = (not self.keys) or (0 in self._ub)

    def _save_config(self, config_file_url):
        """Persist the grid definition as JSON to ``config_file_url``."""
        data = {
            'params': self.params,
            'binding': self.binding,
            'score_name_list': self.score_name_list,
            'single_keys': self.single_keys,
            'binding_keys': self.binding_keys
        }
        with open(config_file_url, 'w') as f:
            json.dump(data, f)

    def _next_idx(self):
        """Advance the index vector like an odometer (first key fastest).

        Sets ``self._terminate`` to True once every position has wrapped
        around to zero, i.e. the whole grid has been enumerated.
        """
        for pos, bound in enumerate(self._ub):
            self._idx[pos] = (self._idx[pos] + 1) % bound
            if self._idx[pos] != 0:
                # No carry into the next position: enumeration continues.
                self._terminate = False
                return
        self._terminate = True

    def _next_param_list(self):
        """Return the current combination as a flat list and advance.

        Returns:
            A list ordered like ``self.param_keys``, or None when the grid
            is exhausted.
        """
        if self._terminate:
            return None
        n_single = len(self.single_keys)
        row = [self.params[key][pos]
               for key, pos in zip(self.keys[:n_single], self._idx)]
        for key, pos in zip(self.keys[n_single:], self._idx[n_single:]):
            candidate = self.params[key][pos]
            row.extend(candidate[sub_key] for sub_key in self.binding[key])
        self._next_idx()
        return row

    def _get_param_frame(self):
        """Return a DataFrame with one row per combination of the grid."""
        self._reset_config()
        rows = []
        row = self._next_param_list()
        while row is not None:
            rows.append(row)
            row = self._next_param_list()
        return pd.DataFrame(rows, columns=self.param_keys)

    def get_csv(self, csv_url):
        """Write the full grid to ``csv_url`` (header row, no index column)."""
        self._get_param_frame().to_csv(csv_url, index=False)
        print('successfully generated grid search csv file\n')

    @staticmethod
    def _get_grid_search_param(keys, values):
        """Pair column names with row values into a parameter dict."""
        return dict(zip(keys, values))

    @staticmethod
    def _get_next_param(csv_url, param_list):
        """Pop one random row from the remaining-grid CSV.

        The chosen row is removed from the file by rewriting it without that
        row.  Returns the row as a dict keyed by ``param_list``, or None when
        the file has no data rows left.
        """
        df = shuffle(pd.read_csv(csv_url))
        if df.empty:
            return None
        param = grid_search_generator._get_grid_search_param(param_list, df.iloc[0].values)
        df.iloc[1:].to_csv(csv_url, index=False)
        return param

    @staticmethod
    def _recorder(values, record_csv, score_name_list, params, param_keys, label_list,
                  confusion_matrices, initialize):
        """Write one result row to ``record_csv`` and print the scores.

        The row holds the parameter values (in ``param_keys`` order), then
        ``values`` (the scores) and, when ``label_list`` is given, the entry
        of ``confusion_matrices`` for each label.

        Args:
            initialize: when true the file is (re)created with a header row;
                otherwise the row is appended without a header.
        """
        head = list(param_keys)
        head.extend(score_name_list)
        content = [params[key] for key in param_keys]
        content.extend(values)
        if label_list is not None:
            head.extend(label_list)
            content.extend(confusion_matrices[label] for label in label_list)

        pd.DataFrame([content], columns=head).to_csv(
            record_csv, mode='w' if initialize else 'a',
            header=bool(initialize), index=False)

        print(', '.join(name + ':' + ('%.6f' % score)
                        for name, score in zip(score_name_list, values)) + '\n')

    @staticmethod
    def add_params(csv_url, param_dict, config_file_url='grid_search_config.json'):
        """Extend the grid with new candidate values.

        For every key of ``param_dict`` the combinations of its *new* values
        with all current values of the other keys are appended to
        ``csv_url``, and the saved config is updated.  Keys that contribute
        no new value are skipped.
        """
        for key, candidates in param_dict.items():
            # Reload for every key so that values added for earlier keys are
            # part of the combinations generated for later ones.
            grid_search = grid_search_generator(config_file_url=config_file_url)
            original = list(grid_search.params[key])
            new_values = []
            for value in candidates:
                if value not in original and value not in new_values:
                    new_values.append(value)
            if not new_values:
                continue
            grid_search.params[key] = new_values
            grid_search._get_param_frame().to_csv(csv_url, mode='a', header=False, index=False)
            grid_search.params[key] = new_values + original
            grid_search._save_config(config_file_url)

        print('successfully append the new parameters')

    @staticmethod
    def delete_params(csv_url, param_dict, config_file_url='grid_search_config.json'):
        """Remove candidate values from the remaining grid and the config.

        ``param_dict`` maps a CSV column name (a plain key or a sub-key of a
        bound key) to the values to drop.  Rows of ``csv_url`` holding any of
        these values are deleted and the saved config is updated to match.
        """
        remaining = pd.read_csv(csv_url)
        grid_search = grid_search_generator(config_file_url)

        for key, unwanted in param_dict.items():
            for value in unwanted:
                remaining = remaining[remaining[key] != value]
        remaining.to_csv(csv_url, index=False)

        for key, unwanted in param_dict.items():
            if key in grid_search.single_keys:
                grid_search.params[key] = [value for value in grid_search.params[key]
                                           if value not in unwanted]
                continue
            # Otherwise the key is a sub-key: filter the candidates of the
            # first bound key that declares it.
            for bkey in grid_search.binding_keys:
                if key in grid_search.binding[bkey]:
                    grid_search.params[bkey] = [value for value in grid_search.params[bkey]
                                                if value[key] not in unwanted]
                    break
        grid_search._save_config(config_file_url)
        print('successfully delete the parameters')

    @staticmethod
    def search(remain_csv, record_csv, X, y, model_run, other_model_dependency_dict, label_list=None,
               config_file_url='grid_search_config.json', X_y_split=True, val_size=0.1, shuffle=True):
        """Run ``model_run`` for every row left in ``remain_csv``.

        Each iteration pops a random row from ``remain_csv``, calls
        ``model_run(param, X, y, other_model_dependency_dict)``, which must
        return ``(scores, confusion_matrices)``, and records the outcome in
        ``record_csv``.

        ``X_y_split``, ``val_size`` and ``shuffle`` are accepted for
        compatibility but are currently not used by this method.

        NOTE(review): ``record_csv`` is always re-created by the first
        recorded row, so resuming an interrupted search overwrites earlier
        records -- confirm this is intended.
        """
        grid_search = grid_search_generator(config_file_url)
        param = grid_search_generator._get_next_param(remain_csv, grid_search.param_keys)
        header = True
        while param is not None:
            print(param)
            scores, confusion_matrices = model_run(param, X, y, other_model_dependency_dict)
            grid_search_generator._recorder(scores, record_csv, grid_search.score_name_list, param,
                                            grid_search.param_keys, label_list, confusion_matrices, header)
            header = False
            param = grid_search_generator._get_next_param(remain_csv, grid_search.param_keys)
            
# Printed once the definitions above have been evaluated.
print('done')

done


In [3]:
import numpy as np
import pandas as pd
# #from nltk.tokenize import TweetTokenizer
# from keras.layers import Dense, Embedding, Input, LSTM, GRU, Conv1D, Bidirectional, GlobalMaxPool1D, Dropout
# from keras.layers import SpatialDropout1D, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, BatchNormalization
# from keras.preprocessing import text, sequence
from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Project-relative directories, built from the repository root.
HOME = '../../'
DATA, MODEL = HOME + 'data/', HOME + 'model/'

# Load the training and test CSV files from the dataset directory.
PATH = '~/data/toxic/data/'
train, test = (pd.read_csv(PATH + csv_name) for csv_name in ('train.csv', 'test.csv'))

print(train.shape, test.shape)

# Target columns; each is used as one output label of the model.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# tok = TweetTokenizer()
# # train = pd.read_csv(DATA + 'cleaned_train.csv')[ : 1000]
# train = pd.read_csv('/home/kai/data/kaggle/toxic/dataset/training/emoji_train.csv')

# train_sentences = train['comment_text_cleaned']
# sentences = train_sentences

# text_length = sentences.apply(lambda x: len(tok.tokenize(x)))
# mean_length = text_length.mean()
# std_length = text_length.std()

# print(train.shape)
# print(mean_length)
# print(std_length)

ModuleNotFoundError: No module named 'keras'

In [None]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into ``(word, vector)``.

    Args:
        word: the token (first field of the line).
        *arr: the remaining fields, i.e. the vector components.

    Returns:
        A ``(word, numpy float32 array)`` tuple.

    Raises:
        ValueError: if a component cannot be converted to a float.
    """
    # The interactive debugger breakpoint that used to sit here stopped
    # execution on every single line of the embedding file.
    return word, np.asarray(arr, dtype='float32')

def glove_get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    """Build an embedding matrix from a whitespace-separated text vector file.

    Args:
        embedding_file: path of a UTF-8 text file with one ``word v1 v2 ...``
            entry per line.
        embed_size: number of columns of the returned matrix.
        max_features: upper bound on the number of rows (vocabulary size).
        tokenizer: object exposing ``word_index`` (word -> integer index).

    Returns:
        ``(embedding_matrix, nb_words)`` where the matrix has shape
        ``(nb_words, embed_size)``; rows of words missing from the file stay
        zero.
    """
    # Close the file deterministically instead of leaking the handle, and
    # skip blank lines, which carry no word to parse.
    with open(embedding_file, encoding='utf8') as embedding_fh:
        embeddings_index = dict(get_coefs(*line.strip().split())
                                for line in embedding_fh if line.strip())
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, i in word_index.items():
        if i < max_features:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def fasttext_get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    """Build an embedding matrix by querying a model loaded with ``load_model``.

    Returns ``(embedding_matrix, nb_words)``; the matrix has shape
    ``(nb_words, embed_size)`` and row ``i`` holds the vector of the word
    whose tokenizer index is ``i`` (only indices below ``max_features``).
    """
    vocabulary = tokenizer.word_index
    ft_model = load_model(embedding_file)
    nb_words = min(max_features, len(vocabulary) + 1)
    embedding_matrix = np.zeros((nb_words, embed_size))
    for word, row in vocabulary.items():
        if row >= max_features:
            continue
        # The vector is converted right away, so it can never be None here.
        embedding_matrix[row] = ft_model.get_word_vector(word).astype('float32')
    return embedding_matrix, nb_words

def get_embedding_matrix(embed_type, file, size, max_features, tokenizer):
    """Dispatch to the embedding loader matching ``embed_type``.

    ``'fasttext'`` selects the fastText loader; any other value falls back
    to the GloVe text-file loader.
    """
    loader = (fasttext_get_embedding_matrix if embed_type == 'fasttext'
              else glove_get_embedding_matrix)
    return loader(file, size, max_features, tokenizer)

def get_rnn_model(params, tokenizer):
    """Build and compile the bidirectional-GRU + Conv1D classifier.

    Keys read from ``params``: ``embed_type``, ``embed_file``,
    ``embed_size``, ``lstm_units``, ``embed_trainable``, ``max_len``,
    ``loss``, ``label_len`` and ``max_features``.

    Args:
        params: dict of hyper-parameters (see the keys above).
        tokenizer: fitted tokenizer whose ``word_index`` selects the
            embedding rows.

    Returns:
        The compiled Keras ``Model``.
    """
    embed_type = params['embed_type']
    embed_file = params['embed_file']
    embed_size = params['embed_size']
    lstm_units = params['lstm_units']
    embed_trainable = params['embed_trainable']

    max_len = params['max_len']
    loss = params['loss']
    label_len = params['label_len']
    max_features = params['max_features']

    sequence_input = Input(shape=(max_len, ))
    embedding_matrix, inp_len = get_embedding_matrix(embed_type, embed_file, embed_size, max_features, tokenizer)
    # The debugger breakpoint that used to follow this call blocked every
    # unattended run of the grid search, so it has been removed.
    x = Embedding(inp_len, embed_size, weights=[embedding_matrix], trainable=embed_trainable)(sequence_input)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(lstm_units, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
    # Summarise the sequence with both average and max pooling.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])
    # One sigmoid unit per label.
    preds = Dense(label_len, activation="sigmoid")(x)
    model = Model(sequence_input, preds)

    model.compile(loss=loss, optimizer=Adam(lr=1e-3), metrics=['accuracy'])

    return model
    
#     embedding_matrix, inp_len = get_embedding_matrix(embed_type, embed_file, embed_size, max_features, tokenizer)
#     input = Input(shape=(max_len, ))
#     x = Embedding(inp_len, embed_size, weights=[embedding_matrix], trainable=embed_trainable)(input)
#     for i in range(params['lstm_layer_size']):
#         x = Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=dropout,\
#                                recurrent_dropout=dropout, activation=lstm_activation))(x)
#     x = GlobalMaxPool1D()(x)
#     if batch_normalization:
#         x = BatchNormalization()(x)
#     for i in range(params['dense_layer_size']):
#         x = Dense(dense_units, activation=activation)(x)
#     x = Dropout(dropout)(x)
#     x = Dense(label_len, activation='sigmoid')(x)
#     model = Model(inputs=input, outputs=x)
#     model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
#     return model

from keras.callbacks import Callback
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC during training.

    Args:
        validation_data: ``(X_val, y_val)`` pair used for scoring.
        interval: the score is printed on epochs whose zero-based index is
            a multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) would skip Callback.__init__ entirely; name
        # this class so the base-class initialiser actually runs.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation data and print the ROC-AUC score."""
        # ``logs`` is unused here; a None default avoids a shared mutable dict.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))


def train_model(model, x, y, x_val, y_val, params):
    """Fit ``model``, reload its best checkpoint and compute the scores.

    Returns:
        ``(val_loss, best_epoch, val_auc, train_auc, M)`` where ``val_loss``
        is the smallest validation loss in the history, ``best_epoch`` its
        zero-based epoch index, the AUCs come from the reloaded checkpoint,
        and ``M`` maps each label in ``label_cols`` to its confusion matrix
        at a 0.5 threshold.
    """
    batch_size, epochs = params['batch_size'], params['epochs']
    # NOTE(review): read but never used -- early stopping below is fixed at 3.
    patience = params['patience']

    name_fields = ('embed_type', 'embed_size', 'lstm_units', 'lstm_activation',
                   'dense_units', 'activation', 'embed_trainable',
                   'batch_normalization', 'max_len')
    model_file = '../../model/' + '_'.join('{}'.format(params[field]) for field in name_fields) + '.hdf5'

    # NOTE(review): checkpointing and early stopping follow val_acc, while the
    # reported val_loss / best_epoch follow val_loss -- confirm this is intended.
    callbacks_list = [
        ModelCheckpoint(model_file, monitor='val_acc', verbose=1, save_best_only=True, mode='max'),
        EarlyStopping(monitor="val_acc", mode="max", patience=3),
        RocAucEvaluation(validation_data=(x_val, y_val), interval=1),
    ]
    history = model.fit(x, y, batch_size=batch_size, epochs=epochs,
                        validation_data=(x_val, y_val), callbacks=callbacks_list)

    # Score with the checkpointed weights rather than the last epoch's.
    model.load_weights(model_file)
    train_pred = model.predict(x, verbose=1)
    val_pred = model.predict(x_val, verbose=1)

    val_losses = history.history['val_loss']
    best_epoch = val_losses.index(min(val_losses))
    val_loss = val_losses[best_epoch]
    val_auc = roc_auc_score(y_val, val_pred)
    train_auc = roc_auc_score(y, train_pred)

    # Binarise the predictions per label and compare with the ground truth.
    thres = 0.5
    val_pred_frame = pd.DataFrame(val_pred, columns=label_cols)
    M = {}
    for label in label_cols:
        hard_labels = (val_pred_frame[label] > thres) * 1
        M[label] = confusion_matrix(y_val[label], hard_labels)

    return val_loss, best_epoch, val_auc, train_auc, M

def model_run(param, X, y, other_model_dependency_dict, X_val=None, y_val=None):
    """Tokenize the texts, build the model and train it for one parameter set.

    Args:
        param: dict of hyper-parameters for this run.
        X: training texts (an object exposing ``.values``).
        y: training labels.
        other_model_dependency_dict: unused; kept for the grid-search
            callback signature.
        X_val, y_val: optional validation texts and labels.  When both are
            omitted, 10% of ``X`` / ``y`` is held out (shuffled) for
            validation.

    Returns:
        ``([val_loss, best_epoch, val_auc, train_auc], M)`` as produced by
        ``train_model``.

    Raises:
        ValueError: if only one of ``X_val`` / ``y_val`` is supplied.
    """
    if X_val is None and y_val is None:
        # The grid search calls this function without validation data, which
        # previously failed on ``X_val.values``; hold out a split instead.
        X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, shuffle=True)
    elif X_val is None or y_val is None:
        raise ValueError('X_val and y_val must be given together')

    tokenizer = text.Tokenizer(num_words=param['max_features'])
    tokenizer.fit_on_texts(X.values)
    tokenized_train = tokenizer.texts_to_sequences(X.values)
    tokenized_val = tokenizer.texts_to_sequences(X_val.values)
    x = sequence.pad_sequences(tokenized_train, maxlen=param['max_len'])
    x_val = sequence.pad_sequences(tokenized_val, maxlen=param['max_len'])

    model = get_rnn_model(param, tokenizer)
    # Train on the padded sequences built above; the previous call referenced
    # the undefined names ``x_train`` and ``y_train``.
    val_loss, best_epoch, val_auc, train_auc, M = train_model(model, x, y, x_val, y_val, param)
    return [val_loss, best_epoch, val_auc, train_auc], M

# Printed once the definitions above have been evaluated.
print('done')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


done


In [None]:
import numpy as np
def get_coefs(word, *arr):
    """Turn one parsed embedding line into ``(word, float32 vector)``.

    When the components cannot be converted to floats, a placeholder key
    paired with a 300-element zero vector is returned instead of raising.
    """
    try:
        vector = np.asarray(arr, dtype='float32')
    except ValueError:
        # Unparseable line: map it onto a single throw-away entry.
        return 'nnnnnnnaaaaaaa@@!', np.zeros(300)
    return word, vector
# Parse the whole text vector file into a word -> vector dictionary.
file = '/home/kai/data/resources/FastText/wiki.en.vec'
# Use a context manager so the file handle is closed once parsing is done.
with open(file, encoding='utf8') as embedding_fh:
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in embedding_fh)
 


In [None]:
# params = {'max_features': [100000,150000], 
#           'epochs': [8],#,5,6], 
#           'batch_size': [128],64,32],
#           'max_len': [250,150],#int(np.round(mean_length + 2*std_length)), int(np.round(mean_length + 1*std_length))],
#           'dropout': [0.1],
#           'patience': [3],
#           'loss': ['binary_crossentropy'],
#           'label_len': [len(label_cols)],
#           'embed_trainable': [True, False],
#           'batch_normalization': [False],
#           'activation': ['relu']#, 'tanh', 'sigmoid'],
#           'lstm_activation': ['tanh'],
#           'lstm_units': [64, 128],#GRU, LSTM
#           'dense_units': [100],
#           'lstm_layer_size': [1],
#           'dense_layer_size': [1],
#           'embedding_param': [{'embed_file': '/home/kai/data/resources/glove/glove.840B.300d.txt', 'embed_size': 300, 'embed_type': 'glove'},
#                               {'embed_file': '/home/kai/data/resources/glove/glove.twitter.27B.200d.txt', 'embed_size': 200, 'embed_type': 'glove'},
#                               {'embed_file': '/home/kai/data/resources/FastText/wiki.en.bin', 'embed_size': 300, 'embed_type': 'fasttext'}]
#         }

# Hyper-parameter grid: each key maps to the list of candidate values to try.
# The key order is kept by the generator and becomes the column order of the
# generated CSV (plain keys first, then the sub-keys of bound keys).
params = {'max_features': [100000], 
          'epochs': [4],#,5,6], 
          'batch_size': [128],
          'max_len': [150],#int(np.round(mean_length + 2*std_length)), int(np.round(mean_length + 1*std_length))],
          'dropout': [0.1], # not used
          'patience': [1],
          'loss': ['binary_crossentropy'],
          'label_len': [len(label_cols)],
          'embed_trainable': [False],
          'batch_normalization': [False], # not used
          'activation': ['relu'], # not used
          'lstm_activation': ['tanh'], # not used
          'lstm_units': [128], 
          'dense_units': [50], # not used
          'lstm_layer_size': [1], # not used
          'dense_layer_size': [1], # not used
          # Bound parameter: file, size and type of an embedding vary together.
          'embedding_param': [{'embed_file': '/home/kai/data/resources/glove/glove.6B.300d.txt', 'embed_size': 300, 'embed_type': 'glove'}]  
         }

# Sub-keys of each bound parameter, in the order they appear as CSV columns.
bindings = {'embedding_param': ['embed_type', 'embed_file', 'embed_size']}

# Names of the scores returned by model_run, in the order it returns them.
score_name_list = ['val_loss', 'best_epoch', 'val_auc', 'train_auc']

# Remaining-combinations file and results file of the grid search.
grid_search_csv_url = 'grid_search.csv'
grid_search_result_csv_url = 'grid_search_result.csv'
# Creating the generator saves the grid config and writes the full grid CSV.
param_class = grid_search_generator(params=params, bindings=bindings,
                                    score_name_list=score_name_list, csv_url=grid_search_csv_url)

# Run model_run once per remaining combination and record scores plus the
# per-label confusion matrices.  X_y_split is accepted by search() but is not
# used by it.
grid_search_generator.search(remain_csv=grid_search_csv_url, record_csv=grid_search_result_csv_url,
                             X=train['comment_text'], y=train[label_cols], model_run=model_run,
                             other_model_dependency_dict=None, X_y_split=True, label_list=label_cols)

print('done')