In [1]:
from fastText import load_model
import re, os
import numpy as np
import pandas as pd
import pdb
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc

In [2]:
import timeit, time, datetime
from keras import regularizers
from keras.models import Model
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Number of trailing words kept from each comment. Experiment with this.
window_length = 200

# Load the FastText binary and read the width of its word vectors.
print('\nLoading FT model')
ft_model = load_model('/home/kai/data/resources/FastText/wiki.en.bin')
n_features = ft_model.get_dimension()
print(n_features)


Loading FT model
300


In [4]:
# # config
# RNN_UNITS = 50 # LSTM hidden layer unit number
# DENSE_UNITS = 50
# DROPOUT = 0.3 # dropout rate
# BATCH_SIZE = 128
# EPOCHS = 1

In [5]:
# Target columns; their order fixes the column order of label arrays
# and of the model's output units.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [6]:
def normalize(s):
    """
    Spell out '&', '@' and the decimal digits as space-padded English words.

    Every occurrence is replaced on its own (so '12' becomes ' one  two ');
    all other characters are returned unchanged. Lower-casing, IP masking
    and punctuation handling are not performed by this function.
    """
    spoken_forms = (
        ('&', ' and '),
        ('@', ' at '),
        ('0', ' zero '),
        ('1', ' one '),
        ('2', ' two '),
        ('3', ' three '),
        ('4', ' four '),
        ('5', ' five '),
        ('6', ' six '),
        ('7', ' seven '),
        ('8', ' eight '),
        ('9', ' nine '),
    )
    for symbol, spoken in spoken_forms:
        s = s.replace(symbol, spoken)
    return s

# Read the cleaned train and test CSVs (train first, then test).
print('\nLoading data')
train, test = map(pd.read_csv, (
    '/home/kai/data/haoyan/ToxicClassificationCopy3/data/cleaned_train.csv',
    '/home/kai/data/haoyan/ToxicClassificationCopy3/data/cleaned_test.csv',
))
#train['comment_text'] = train['comment_text'].fillna('_empty_')
#test['comment_text'] = test['comment_text'].fillna('_empty_')


Loading data


In [7]:
# train = train.sample(frac=0.05)
# test = test.sample(frac=0.01)
# train.shape, test.shape

In [8]:
# Sanity check: (rows, columns) of the loaded train and test frames.
train.shape, test.shape

((159571, 27), (153164, 21))

In [9]:
def text_to_vector(text):
    """
    Convert a string into a fixed-size matrix of FastText word vectors.

    The text is normalized, split on whitespace and truncated to its last
    `window_length` words. Row i of the result holds the vector of the i-th
    kept word; rows beyond the number of words stay zero.

    Returns a float32 array of shape (window_length, n_features).
    """
    words = normalize(text).split()
    window = words[-window_length:]

    # Allocate float32 up front: every caller stores the result in a float32
    # array, so a float64 buffer would only be cast back down again.
    x = np.zeros((window_length, n_features), dtype='float32')

    for i, word in enumerate(window):
        x[i, :] = ft_model.get_word_vector(word)

    return x

def df_to_data(df):
    """
    Build the network input tensor for every row of a dataframe.

    Reads the 'comment_text_cleaned' column and returns a float32 array of
    shape (len(df), window_length, n_features), one text_to_vector() matrix
    per row.
    """
    comments = df['comment_text_cleaned'].values
    features = np.zeros((len(df), window_length, n_features), dtype='float32')

    # Each `slot` is a view into `features`, so writing through it fills the tensor.
    for slot, comment in zip(features, comments):
        slot[:] = text_to_vector(comment)

    return features

In [10]:
# Hold out the last 10% of rows for validation. Despite its name,
# `shuffled_train` is the frame in its original order (no shuffle), so the
# final 10% of rows always forms the validation set.
shuffled_train = train
split_index = round(0.9 * len(train))
df_train, df_val = shuffled_train.iloc[:split_index], shuffled_train.iloc[split_index:]

# Validation labels/features and test features as fixed arrays.
y_val = df_val[label_cols].values
x_val = df_to_data(df_val)
x_test = df_to_data(test)

In [11]:
# Expected layout: (validation rows, window_length, n_features).
x_val.shape

(15957, 200, 300)

# Persist the validation features and labels (np.save appends '.npy').
for npy_path, array in (
    ('/home/kai/data/shiyi/toxic/saved_files/x_val_np', x_val),
    ('/home/kai/data/shiyi/toxic/saved_files/y_val_np', y_val),
):
    np.save(npy_path, array)

In [12]:
def data_generator(df, window_length, n_features, batch_size):
    """
    Given a raw dataframe, generates infinite batches of FastText vectors.

    Parameters
    ----------
    df : DataFrame with a 'comment_text_cleaned' column and the `label_cols`.
    window_length, n_features : dimensions of one sample; used to size the
        batch buffer. NOTE: text_to_vector() reads the notebook-level
        `window_length` / `n_features`, so these must match those values.
    batch_size : number of samples per yielded batch.

    Yields
    ------
    (batch_x, batch_y) : float32 arrays of shape
        (batch_size, window_length, n_features) and (batch_size, len(label_cols)).
        Only full batches are yielded; a partially filled batch at the end of
        a pass is completed with rows from the next (reshuffled) pass.

    Raises
    ------
    ValueError if `df` has no rows (the loop could never fill a batch).
    """
    if len(df) == 0:
        raise ValueError('data_generator needs a non-empty dataframe')

    batch_i = 0 # Counter inside the current batch vector
    batch_x = None # The current batch's x data
    batch_y = None # The current batch's y data

    while True: # Loop forever
        # Shuffle df each pass. For bagging, frac could be set below 1
        # and/or rows sampled with replacement.
        df = df.sample(frac=1)

        # Pull both columns out once per pass instead of building a Series
        # for every row with iterrows().
        comments = df['comment_text_cleaned'].values
        labels = df[label_cols].values

        for comment, label_row in zip(comments, labels):
            if batch_x is None:
                # Fresh buffers per batch, so an already-yielded batch is never overwritten.
                batch_x = np.zeros((batch_size, window_length, n_features), dtype='float32')
                batch_y = np.zeros((batch_size, len(label_cols)), dtype='float32')

            batch_x[batch_i] = text_to_vector(comment)
            batch_y[batch_i] = label_row
            batch_i += 1

            if batch_i == batch_size:
                # Ready to yield the batch
                yield batch_x, batch_y
                batch_x = None
                batch_y = None
                batch_i = 0

In [26]:
# def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

# def get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
#     embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(embedding_file, encoding='utf8'))
#     all_embs = np.stack(embeddings_index.values())
#     word_index = tokenizer.word_index
#     nb_words = min(max_features, len(word_index))
#     embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(), (nb_words, embed_size))
#     for word, i in word_index.items():
#         if i < max_features:
#             embedding_vector = embeddings_index.get(word)
#             if embedding_vector is not None: embedding_matrix[i] = embedding_vector
#     return embedding_matrix

def get_lstm_model(window_length, n_features, rnn_units, dense_units, label_cols, dropout, mode='LSTM', bidirection=False, load_model=False, load_model_file=None):
    """
    Build and compile the recurrent multi-label classifier.

    Architecture: Input(window_length, n_features) -> LSTM or GRU (optionally
    wrapped in Bidirectional, returning sequences) -> GlobalMaxPool1D ->
    Dense(dense_units, relu) -> Dropout -> Dense(len(label_cols), sigmoid).

    Parameters
    ----------
    window_length, n_features : shape of one input sample.
    rnn_units, dense_units : widths of the recurrent and hidden dense layers.
    label_cols : list of target names; its length sets the output width.
    dropout : rate used for the RNN input/recurrent dropout and the Dropout layer.
    mode : 'GRU' selects a GRU layer; any other value selects an LSTM.
    bidirection : wrap the recurrent layer in Bidirectional when true.
    load_model : when true, load weights from `load_model_file` before compiling.
    load_model_file : path of the weights file (required if `load_model`).

    Returns the compiled model (binary cross-entropy loss, adam, accuracy metric).

    Raises ValueError if `load_model` is true but no `load_model_file` is given.
    """
    if load_model and load_model_file is None:
        raise ValueError('load_model_file must be provided when load_model=True')

    # `inputs` rather than `input`, so the builtin is not shadowed.
    inputs = Input(shape=(window_length, n_features))

    # Build only the recurrent layer that is actually used.
    rnn_class = GRU if mode == 'GRU' else LSTM
    rnn_layer = rnn_class(rnn_units, return_sequences=True, dropout=dropout, recurrent_dropout=dropout)
    if bidirection:
        rnn_layer = Bidirectional(rnn_layer)

    x = rnn_layer(inputs)
    x = GlobalMaxPool1D()(x)
    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(dropout)(x)
    x = Dense(len(label_cols), activation='sigmoid')(x)
    model = Model(inputs=inputs, outputs=x)

    if load_model:
        model.load_weights(load_model_file)

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

# def get_lstm_model_temp():
#     model = Sequential()
#     model.add(Dense(32, activation='relu', input_dim=100))
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(optimizer='rmsprop',
#                   loss='binary_crossentropy',
#                   metrics=['accuracy'])
#     return model

# def train_model_temp(model, data, labels):
#     model.fit(data, one_hot_)

def train_model(model, model_file, window_length, n_features, batch_size, epochs, df_train, x_val, y_val):
    """
    Fit `model` on batches streamed from `df_train` and return it.

    Batches come from data_generator(); one epoch is
    round(len(df_train) / batch_size) steps. After each epoch the model is
    validated on (x_val, y_val); the weights with the lowest val_loss are
    checkpointed to `model_file`, and training stops early once val_loss has
    not improved for 5 epochs.
    """
    steps = round(len(df_train) / batch_size)
    print('steps:')
    print(steps)

    callbacks_list = [
        ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min'),
        EarlyStopping(monitor="val_loss", mode="min", patience=5),
    ]

    model.fit_generator(
        generator=data_generator(df_train, window_length, n_features, batch_size),
        steps_per_epoch=steps,
        epochs=epochs,
        validation_data=(x_val, y_val),
        callbacks=callbacks_list,
    )
    return model

def predict(model, file_path, x_test):
    """
    Load the weights stored at `file_path` into `model`, then return its
    predictions for `x_test` (with Keras progress output enabled).
    """
    model.load_weights(file_path)
    predictions = model.predict(x_test, verbose=1)
    return predictions

def evaluate(model, file_path, x_val, y_val, label_cols, metrics='ROC'):
    """
    Score the weights stored at `file_path` on the validation set.

    Loads the weights, predicts on `x_val`, and computes the ROC AUC of every
    label column separately.

    Returns a 3-tuple:
      * roc_auc_score(y_val, predictions) over all labels,
      * a multi-line text report with one line per label,
      * a dict mapping label name -> AUC.

    Raises ValueError when `metrics` is anything other than 'ROC'.
    """
    model.load_weights(file_path)
    y_val_preds = model.predict(x_val)

    auc_per_label = {}
    report_lines = []
    for col_idx, label in enumerate(label_cols):
        fpr, tpr, thresholds = roc_curve(y_val[:, col_idx], y_val_preds[:, col_idx], pos_label=1.0)
        label_auc = auc(fpr, tpr)
        auc_per_label[label] = label_auc
        report_lines.append(
            '\nLabel: {:20} Threashold count: {} \t AUC: {}'.format(label, len(thresholds), label_auc)
        )
    individual_label_auc_report = ''.join(report_lines) + '\n'

    if metrics != 'ROC':
        raise ValueError('The chosen metrics is not implemented yet')
    return roc_auc_score(y_val, y_val_preds), individual_label_auc_report, auc_per_label
    
def save(y_test, label_cols, path, is_train=False, build_id=None):
    """
    Write predictions into a copy of a sample CSV and save it under `path`.

    Parameters
    ----------
    y_test : predictions, one column per entry of `label_cols`.
    label_cols : names of the columns of the template that are overwritten.
    path : directory containing the template CSV; the result is written
        there too. A trailing separator is optional.
    is_train : when true, use 'sample_train.csv' as template and the 'trn_'
        output prefix; otherwise 'sample_submission.csv' and 'sub_'.
    build_id : identifier embedded in the output file name. Defaults to the
        notebook-level `BUILD_ID`, which must then be defined.

    Output file: <path>/<prefix><build_id>.csv (written without the index).
    """
    if build_id is None:
        # Backward-compatible default: the identifier set by the calling cell.
        build_id = BUILD_ID

    if is_train:
        template_name, file_name = 'sample_train.csv', 'trn_'
    else:
        template_name, file_name = 'sample_submission.csv', 'sub_'

    # os.path.join works whether or not `path` ends with a separator.
    submission = pd.read_csv(os.path.join(path, template_name))
    submission[label_cols] = y_test
    print('submission shape:')
    print(submission.shape)
    submission.to_csv(os.path.join(path, file_name + build_id + '.csv'), index=False)

    
# Marker printed once every definition in this cell has been executed.
print('done')

done


In [14]:
def get_filename(file_dir, file_id):
    """
    Search `file_dir` recursively for a file whose name contains `file_id`.

    Returns the bare file name (without its directory) of the first match
    encountered by os.walk, or None when nothing matches.
    """
    needle = str(file_id)
    for _root, _dirs, names in os.walk(file_dir):
        match = next((name for name in names if needle in name), None)
        if match is not None:
            return match
    return None

In [18]:
import gc
# Run a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

187

In [None]:
%%time

# Train, evaluate and predict in one-epoch rounds for each RNN mode.
# Every round checkpoints the model, appends timing/score information to
# `records_file`, appends one CSV row to `record_csv`, and writes a test-set
# prediction file through save().
records_file = 'timefile_FastText.txt'
record_csv = 'fasttext_record.csv'

# Resume control: with fresh_start = False the first mode continues from the
# checkpoint whose file name contains PREV_ID, so PREV_ID must then be set.
fresh_start = True
add_epoch = 10 # number of one-epoch training rounds run for each RNN mode
PREV_ID = None #'1518995754'

# Hyper-parameters shared by every round.
DROPOUT = 0.1
BATCH_SIZE = 32
RNN_UNITS = 50
bidirectional = True
DENSE_UNITS = 50

SAVE_DIR = '/home/kai/data/shiyi/toxic/'
SUB_DIR = SAVE_DIR + 'submissions/'
model_saved_dir = SAVE_DIR + 'models/'

for mode_index, mode in enumerate(['GRU', 'LSTM']):
    if mode_index > 0:
        # A checkpoint written for the previous RNN type must not be loaded
        # into a different architecture: every further mode starts fresh.
        fresh_start = True
        PREV_ID = None

    for _ in range(add_epoch):
        EPOCHS = 1 # each round trains a single epoch
        current_epoch = 1 # value for a fresh start; overwritten below when resuming

        start_time = timeit.default_timer()
        print(SUB_DIR)

        ID = str(int(time.time()))
        if fresh_start:
            print('getting fresh model')
            model = get_lstm_model(window_length, n_features, RNN_UNITS, DENSE_UNITS, label_cols, DROPOUT, mode, bidirectional)
            fresh_start = False
        else:
            if PREV_ID is None:
                raise ValueError("Since it's not a fresh start, please provide the PREV_ID so the model can be loaded")
            prev_model_name = get_filename(model_saved_dir, PREV_ID)
            if prev_model_name is None:
                raise FileNotFoundError('No saved model containing ID {} found in {}'.format(PREV_ID, model_saved_dir))
            LOAD_MODEL_FILE = model_saved_dir + prev_model_name
            model = get_lstm_model(window_length, n_features, RNN_UNITS, DENSE_UNITS, label_cols, DROPOUT, mode, bidirectional, load_model=True, load_model_file=LOAD_MODEL_FILE)
            # The checkpoint name ends in '..._<epoch>_<ID>.hdf5'.
            current_epoch = int(LOAD_MODEL_FILE.split('_')[-2]) + 1
            print('Saved model loaded: {}, epochs: {}'.format(PREV_ID, current_epoch - 1))

        BUILD_ID = 'FastText_cleaned_{}_{}_{}_{}_{}_{}_{}_{}_{}_{}'.format(mode, bidirectional, window_length, n_features, RNN_UNITS, DENSE_UNITS, DROPOUT, BATCH_SIZE, current_epoch, ID)

        PREV_ID = ID
        print('BUILD_ID: ' + BUILD_ID)
        MODEL_FILE = SAVE_DIR + 'models/mod_' + BUILD_ID + '.hdf5'
        print('Model will be saved at: ' +str(MODEL_FILE))

        print('training')
        model = train_model(model, MODEL_FILE, window_length, n_features, BATCH_SIZE, EPOCHS, df_train, x_val, y_val)

        elapsed_time = timeit.default_timer() - start_time
        print('training time: {}s'.format(elapsed_time))
        with open(records_file,'a') as f:
            f.write('##################################################################\n')
            f.write('##################################################################\n')
            f.write('\nBUILD_ID: ' + BUILD_ID + '\ntraining: '+str(elapsed_time))

        ############################################################################
        print('evaluating')
        roc, report, auc_per_label = evaluate(model, MODEL_FILE, x_val, y_val, label_cols)
        print('ROC on each label: {}'.format(report))
        print('ROC: {}'.format(roc))
        with open(records_file,'a') as f:
            f.write('\nROC on each label: {}'.format(report))
            f.write('\nROC Average: {}'.format(roc))

        ############################################################################
        # One CSV row per round; the values below describe the model as built
        # by get_lstm_model (adam optimizer, relu hidden dense layer).
        optimizer = 'adam'
        rnn_mode = mode
        dense_activation = 'relu'
        history = model.history.history
        # Keras reports accuracy as 'acc' or 'accuracy' depending on its version.
        acc_key = 'acc' if 'acc' in history else 'accuracy'

        # DROPOUT is a fraction, so it is written with %s (a %d would truncate it to 0).
        result = '%s,%s,%s,%d,%d,%s,%s,%d,%d,%s,%s,%d,%d,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f\n' % (
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            ID, optimizer, window_length, n_features, bidirectional, rnn_mode,
            RNN_UNITS, DENSE_UNITS, dense_activation, DROPOUT, BATCH_SIZE, current_epoch,
            history['loss'][0], history[acc_key][0], history['val_loss'][0], history['val_' + acc_key][0],
            roc, auc_per_label['toxic'], auc_per_label['severe_toxic'], auc_per_label['obscene'],
            auc_per_label['threat'], auc_per_label['insult'], auc_per_label['identity_hate'],
        )

        with open(record_csv, 'a') as f:
            f.write(result)

        ############################################################################
        print('predicting')
        start_time = timeit.default_timer()
        y_test = predict(model, MODEL_FILE, x_test)

        elapsed_time = timeit.default_timer() - start_time
        print('predicting time: {}s'.format(elapsed_time))
        with open(records_file,'a') as f:
            f.write('\npredicting: '+str(elapsed_time)+'\n')

        ############################################################################
        # save() names its output file after the BUILD_ID set above.
        save(y_test, label_cols, SUB_DIR)

        with open(records_file,'a') as f:
            model.summary(print_fn=lambda x: f.write(x + '\n'))
            for key, value in model.history.history.items():
                f.write('\nperformance: '+str(key)+': '+str(value))
            for key, value in model.history.params.items():
                f.write('\nparams: '+str(key)+': '+str(value))
            f.write('\n')

        print('done')

/home/kai/data/shiyi/toxic/submissions/
getting fresh model
BUILD_ID: FastText_cleaned_GRU_True_200_300_50_50_0.1_32_1_1519090847
Model will be saved at: /home/kai/data/shiyi/toxic/models/mod_FastText_cleaned_GRU_True_200_300_50_50_0.1_32_1_1519090847.hdf5
training
steps:
4488
Epoch 1/1
Epoch 00001: val_loss improved from inf to 0.04801, saving model to /home/kai/data/shiyi/toxic/models/mod_FastText_cleaned_GRU_True_200_300_50_50_0.1_32_1_1519090847.hdf5
training time: 1558.7051081610844s
evaluating
ROC on each label: 
Label: toxic                Threashold count: 925 	 AUC: 0.9789213536454304
Label: severe_toxic         Threashold count: 279 	 AUC: 0.9913409670820945
Label: obscene              Threashold count: 575 	 AUC: 0.9891487975171223
Label: threat               Threashold count: 155 	 AUC: 0.9826076570063496
Label: insult               Threashold count: 685 	 AUC: 0.9829358059983604
Label: identity_hate        Threashold count: 275 	 AUC: 0.9905977940091485

ROC: 0.98592539587

KeyboardInterrupt: 

# Make sure the experiment log starts with the column header.
# Re-running this cell never adds a second header, and rows that were
# written before the header existed are kept after it.
record_csv = 'fasttext_record.csv'
col_names = 'date,ID,optimizer,window_length,n_features,bidirectional,rnn_mode,RNN_UNITS,DENSE_UNITS,dense_activation,DROPOUT,BATCH_SIZE,EPOCH,trn_loss,trn_acc,val_loss,val_acc,val_auc,toxic,severe_toxic,obscene,threat,insult,identity_hate\n'

existing_records = ''
if os.path.exists(record_csv):
    with open(record_csv) as f:
        existing_records = f.read()

if not existing_records.startswith(col_names):
    with open(record_csv, 'w') as f:
        f.write(col_names + existing_records)


# Labels and features for every training row (validation rows included):
# `shuffled_train` is the complete training frame.
y_trn = shuffled_train[label_cols].values
x_trn = df_to_data(shuffled_train)

# Sanity check: feature tensor and label matrix must have the same row count.
x_trn.shape, y_trn.shape

# Rebuild the network and load the checkpoint whose file name contains the
# run ID 1518834132.
SAVE_DIR = '/home/kai/data/shiyi/toxic/'
model_saved_dir = SAVE_DIR + 'models/'

model_9839_name = get_filename(model_saved_dir, '1518834132')
if model_9839_name is None:
    # get_filename returns None when nothing matches; fail with a clear message
    # instead of a TypeError from the string concatenation below.
    raise FileNotFoundError('No saved model containing ID 1518834132 found in ' + model_saved_dir)
model_file = model_saved_dir + model_9839_name

# The architecture arguments must match the ones the checkpoint was trained with.
model_9839 = get_lstm_model(
    window_length=200,
    n_features=300,
    rnn_units=50,
    dense_units=50,
    label_cols=label_cols,
    dropout=0.1,
    mode='LSTM',
    bidirection=True,
    load_model=True,
    load_model_file=model_file,
)

In [21]:
# Show the resolved checkpoint path.
model_file

'/home/kai/data/shiyi/toxic/models/mod_FastText_cleaned_200_300_50_50_0.1_32_6_1518834132.hdf5'

In [22]:
# Predict on the full training set. predict() reloads the weights from
# model_file before predicting.
y_trn_preds = predict(model_9839, model_file, x_trn)



In [23]:
# Directory passed to save(): it holds the sample CSV templates and receives
# the prediction files.
SUB_DIR = SAVE_DIR + 'submissions/'

In [25]:
# Identifier embedded by save() in the output file names: a fixed '9839_'
# prefix followed by the current Unix timestamp.
BUILD_ID = '9839_' + str(int(time.time()))
BUILD_ID

'9839_1519155658'

In [27]:
# Write the training-set predictions (template 'sample_train.csv', prefix 'trn_').
save(y_trn_preds, label_cols, SUB_DIR, is_train=True)

submission shape:
(159571, 7)


In [28]:
# Predict on the test set with the same checkpoint (predict() reloads the
# weights from model_file).
y_test = predict(model_9839, model_file, x_test)



In [29]:
# Write the test-set predictions (template 'sample_submission.csv', prefix 'sub_').
save(y_test, label_cols, SUB_DIR, is_train=False)

submission shape:
(153164, 7)
