#### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import os
from common import preprocess_text, eval_pred

In [None]:
# This flag has to be set, otherwise training fails with
# CancelledError: [_Derived_]RecvAsync is cancelled
# Reference: https://github.com/tensorflow/tensorflow/issues/33721
os.environ.update(TF_FORCE_GPU_ALLOW_GROWTH="true")

In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Input
from tensorflow.keras.layers import LSTM, GRU, Bidirectional, GlobalMaxPooling1D, Dropout
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

#### Read Data

In [2]:
# Load the raw splits; the test labels are shipped in a separate file
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')
test_y = pd.read_csv('../data/test_labels.csv')

# Append every column of test_y except its first one to the test frame
# (column-wise concat, i.e. rows are matched on the frames' index)
test_df = pd.concat(
    (test_df, test_y.iloc[:, 1:]),
    axis='columns',
    sort=False,
)
print(train_df.shape, test_df.shape)

(159571, 8) (153164, 8)


#### Initialization

In [3]:
# Collects the score returned by every experiment, keyed by run name
scores_tracker = {}
non_toxic_label = 'non_toxic'
comment_col = 'comment_text'

# Label columns are everything after the first two columns of train_df.
# The derived non-toxic column is filtered out explicitly: a later cell adds
# it to train_df in place, so re-running this cell would otherwise pick it up
# as an extra label and no longer match the 6-unit output layer.
class_labels = [col for col in train_df.columns[2:] if col != non_toxic_label]
class_labels

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
# A comment is non-toxic (1) when none of the class labels is set,
# i.e. 1 minus the row-wise maximum over the label columns
train_df[non_toxic_label] = train_df[class_labels].max(axis='columns').rsub(1)

# Missing comments are replaced by the placeholder text 'unknown'
train_df[comment_col] = train_df[comment_col].fillna(value='unknown')
test_df[comment_col] = test_df[comment_col].fillna(value='unknown')

#### 2. Neural Network Model
- TensorFlow Keras is run on a GPU here

##### Functions

In [7]:
# Preprocessing
def preprocess_nn(train_df, test_df,
                  comment_col=comment_col,
                  class_labels=class_labels,
                  max_features=20000,
                  maxlen=100):
    '''Turn the raw comments into padded integer sequences for NN training.

    The training rows are shuffled with a fixed seed, a tokenizer limited to
    max_features words is fitted on the training comments only, and both
    splits are converted to sequences padded to maxlen.

    Returns (tokenizer, train_y, train_seq, test_seq) where train_y holds
    the class_labels values of the shuffled training rows.
    '''

    # Shuffle once (fixed seed) so labels and sequences stay aligned
    shuffled = train_df.sample(frac=1, random_state=123).copy()
    train_texts = shuffled[comment_col].values
    train_y = shuffled[class_labels].values
    test_texts = test_df[comment_col].values

    # Vocabulary is learnt from the training comments only
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_texts))

    # Same tokenise-then-pad treatment for train first, then test
    train_seq, test_seq = (
        sequence.pad_sequences(tokenizer.texts_to_sequences(texts),
                               maxlen=maxlen)
        for texts in (train_texts, test_texts)
    )

    return tokenizer, train_y, train_seq, test_seq

def get_embedding(emb_fp, tokenizer,
                  max_features=20000,
                  embed_size=128):
    '''Build the embedding matrix for the tokenizer vocabulary
    from the pretrained vectors stored in emb_fp.

    Parameters
    ----------
    emb_fp : str
        Path to a UTF-8 text file with one whitespace-separated
        "word v1 v2 ... vN" entry per line.
    tokenizer : object
        Fitted tokenizer exposing a word_index dict (word -> int index).
    max_features : int
        Vocabulary cap; words whose index is >= max_features are skipped.
    embed_size : int
        Expected vector length; lines that do not split into exactly
        1 + embed_size tokens (blank lines included) are ignored.

    Returns
    -------
    numpy.ndarray, shape (min(max_features, len(word_index) + 1), embed_size)
        Row i is the pretrained vector of the word with index i when the
        word is present in emb_fp, otherwise a draw from a normal
        distribution with the mean/std of the pretrained vectors.

    Raises
    ------
    AssertionError
        If emb_fp is None.
    ValueError
        If emb_fp holds no vector of length embed_size.
    '''

    assert emb_fp is not None, 'emb_fp is not specified'

    emb_index = {}
    with open(emb_fp, encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            # Discard entries with vector size not equal to embed_size
            # (observed with the GloVe twitter embedding) before converting
            # them, so blank or malformed lines cannot break the parsing
            if len(parts) != embed_size + 1:
                continue
            emb_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

    if not emb_index:
        raise ValueError(
            f'no vector of size {embed_size} found in {emb_fp}')

    # np.stack needs a real sequence; a dict view is deprecated/rejected
    all_embs = np.stack(list(emb_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    word_index = tokenizer.word_index
    # Keras tokenizers number words from 1 (index 0 is reserved), so the
    # matrix needs len(word_index) + 1 rows to hold the last word
    nb_words = min(max_features, len(word_index) + 1)

    # Initialize emb matrix with normal distribution
    # from mean and std of passed embedding
    np.random.seed(123)
    emb_matrix = np.random.normal(emb_mean, emb_std,
                                  (nb_words, embed_size))

    # Replace random init value with that from passed embedding
    # if word is found
    for word, idx in word_index.items():
        if idx >= nb_words:
            continue
        emb_vector = emb_index.get(word)
        if emb_vector is not None:
            emb_matrix[idx] = emb_vector

    return emb_matrix

In [8]:
def get_nn_model(struc,
                 embedding,
                 max_features=20000,
                 maxlen=100,
                 embed_size=128,
                 dropout=0.1,
                 n_classes=6):
    '''Build and compile a bidirectional GRU/LSTM classifier.

    Architecture: Embedding -> Bidirectional(GRU|LSTM, 50 units) ->
    GlobalMaxPooling1D -> Dropout -> Dense(50, relu) -> Dropout ->
    Dense(n_classes, sigmoid), compiled with binary cross-entropy,
    the adam optimizer and the accuracy metric.

    Parameters
    ----------
    struc : str
        'GRU' or 'LSTM'; selects the recurrent layer.
    embedding : array or None
        Initial embedding matrix with embed_size columns. When None the
        embedding layer is created with max_features rows and its default
        initialisation.
    max_features : int
        Number of embedding rows used when embedding is None.
    maxlen : int
        Length of the input sequences.
    embed_size : int
        Width of the embedding vectors.
    dropout : float
        Rate of both dropout layers.
    n_classes : int
        Number of sigmoid output units (default 6, the previous fixed size).

    Raises
    ------
    AssertionError
        If struc is neither 'GRU' nor 'LSTM'.
    '''

    assert struc in ['GRU','LSTM'], 'struc type not supported'

    inp = Input(shape=(maxlen, ))
    if embedding is not None:
        # Take the row count from the matrix itself so that a matrix with
        # fewer rows than max_features (small vocabulary) still fits
        x = Embedding(embedding.shape[0], embed_size,
                      weights=[embedding])(inp)
    else:
        x = Embedding(max_features, embed_size)(inp)

    recurrent_layer = LSTM if struc=='LSTM' else GRU
    x = Bidirectional(recurrent_layer(50, return_sequences=True))(x)

    x = GlobalMaxPooling1D()(x)
    x = Dropout(dropout)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(dropout)(x)
    # One independent sigmoid per label (multi-label setup)
    x = Dense(n_classes, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [9]:
def get_pred_nn(model, model_fp,
                train_y, train_seq, test_seq,
                batch_size=32, epochs=5,
                validation_split=0.1, patience=20):
    '''Train the model and return its predictions for test_seq.

    The model is fitted on (train_seq, train_y) with a validation split;
    the checkpoint with the lowest val_loss is written to model_fp,
    reloaded after training and used to predict all labels at once.

    Parameters
    ----------
    model : compiled Keras model
    model_fp : str
        Checkpoint file path; its directory is created when missing.
    train_y, train_seq, test_seq : arrays
        Training targets, training inputs and test inputs.
    batch_size, epochs : int
        Passed to model.fit.
    validation_split : float
        Fraction of the training data held out for val_loss (default 0.1).
    patience : int
        Early-stopping patience in epochs (default 20). Note that with the
        default values patience exceeds epochs, so early stopping cannot
        trigger unless epochs is raised or patience lowered.

    Returns
    -------
    Output of model.predict(test_seq) using the best checkpoint weights.
    '''

    # Make sure the checkpoint directory exists; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test
    model_dir = os.path.dirname(os.path.abspath(model_fp))
    os.makedirs(model_dir, exist_ok=True)

    # Define callbacks
    checkpoint = ModelCheckpoint(model_fp, monitor='val_loss',
                                 verbose=1, save_best_only=True,
                                 mode='min')
    early = EarlyStopping(monitor='val_loss',
                          mode='min',
                          patience=patience)
    callbacks_list = [checkpoint, early]

    # Fit model
    model.fit(train_seq, train_y,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=validation_split,
              callbacks=callbacks_list)

    # Load best weights and make pred
    model.load_weights(model_fp)
    test_y = model.predict(test_seq)

    return test_y

def run_nn(struc, model_fp,
           train_df, test_df,
           embed_fp=None,
           embed_size=128,
           max_features=20000,
           maxlen=100,
           dropout=0.1,
           batch_size=32,
           epochs=2,
           class_labels=class_labels):
    '''Run one full NN cycle and return its evaluation score.

    Steps: preprocess the frames, optionally build an embedding matrix
    from embed_fp, create the struc ('GRU'/'LSTM') model, train it and
    predict the test set, then score the predictions with eval_pred.
    Progress is printed after every step.
    '''

    # Step 1: tokenize and pad
    tokenizer, train_y, train_seq, test_seq = preprocess_nn(
        train_df=train_df, test_df=test_df,
        class_labels=class_labels,
        max_features=max_features, maxlen=maxlen)
    print('1. Preprocessed completed\n')

    # Step 2: pretrained embedding matrix, only when a file is given
    if embed_fp is None:
        embedding = None
        print('2. No Embeddings\n')
    else:
        embedding = get_embedding(embed_fp, tokenizer,
                                  max_features=max_features,
                                  embed_size=embed_size)
        print('2. Embeddings generated\n')

    # Step 3: build the network
    model = get_nn_model(struc=struc, embedding=embedding,
                         max_features=max_features, maxlen=maxlen,
                         embed_size=embed_size, dropout=dropout)
    print(f'3. Model generated ({struc})\n')

    # Step 4: train and predict the test set
    preds = get_pred_nn(model, model_fp,
                        train_y, train_seq, test_seq,
                        batch_size=batch_size, epochs=epochs)
    print('4. Predictions completed\n')

    # Step 5: score the predictions against the test labels
    score = eval_pred(test_df, preds, class_labels)
    print('5. Evaluation completed\n')

    return score

##### RUN

In [10]:
# Model parameters (shared by every run below)
max_features, maxlen = 20_000, 100
embed_size = 100  # same dimension as the 100d GloVe files below
dropout = 0.1

# Training parameters
batch_size, epochs = 32, 2

# Embedding filepaths
glove_wiki_fp = '../glove/glove.6B/glove.6B.100d.txt'
glove_twitter_fp = '../glove/glove.twitter.27B/glove.twitter.27B.100d.txt'

##### 2.1 LSTM

In [12]:
%%time
score_lstm = run_nn(
    struc='LSTM',
    model_fp=r'models/weights_lstm.best.hdf5',
    train_df=train_df, test_df=test_df,
    embed_fp=None,
    embed_size=embed_size,
    max_features=max_features,
    maxlen=maxlen,
    dropout=dropout,
    batch_size=batch_size,
    epochs=epochs,
    class_labels=class_labels)
scores_tracker['nn_lstm'] = score_lstm

1. Preprocessed completed

2. No Embeddings

3. Model generated (LSTM)

Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.05069, saving model to models\weights_lstm.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.05069 to 0.04786, saving model to models\weights_lstm.best.hdf5
4. Predictions completed

Mean ROC-AUC: 0.9741297926161975
5. Evaluation completed

Wall time: 5min 21s


##### 2.2 GRU

In [13]:
%%time
score_gru = run_nn(
    struc='GRU',
    model_fp=r'models/weights_gru.best.hdf5',
    train_df=train_df, test_df=test_df,
    embed_fp=None,
    embed_size=embed_size,
    max_features=max_features,
    maxlen=maxlen,
    dropout=dropout,
    batch_size=batch_size,
    epochs=epochs,
    class_labels=class_labels)
scores_tracker['nn_gru'] = score_gru

1. Preprocessed completed

2. No Embeddings

3. Model generated (GRU)

Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.04936, saving model to models\weights_gru.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.04936 to 0.04654, saving model to models\weights_gru.best.hdf5
4. Predictions completed

Mean ROC-AUC: 0.9745754951558911
5. Evaluation completed

Wall time: 5min 28s


##### 2.3 GRU with GloVe embedding

In [14]:
%%time
score_gru_glove_wiki = run_nn(
    struc='GRU',
    model_fp=r'models/weights_gru_glove_wiki.best.hdf5',
    train_df=train_df, test_df=test_df,
    embed_fp=glove_wiki_fp,
    embed_size=embed_size,
    max_features=max_features,
    maxlen=maxlen,
    dropout=dropout,
    batch_size=batch_size,
    epochs=epochs,
    class_labels=class_labels)
scores_tracker['nn_gru_glove_wiki'] = score_gru_glove_wiki

1. Preprocessed completed



  if sys.path[0] == '':


2. Embeddings generated

3. Model generated (GRU)

Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.04664, saving model to models\weights_gru_glove_wiki.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.04664 to 0.04500, saving model to models\weights_gru_glove_wiki.best.hdf5
4. Predictions completed

Mean ROC-AUC: 0.9783651989946405
5. Evaluation completed

Wall time: 5min 36s


In [15]:
%%time
score_gru_glove_twitter = run_nn(
    struc='GRU',
    model_fp=r'models/weights_gru_glove_twitter.best.hdf5',
    train_df=train_df, test_df=test_df,
    embed_fp=glove_twitter_fp,
    embed_size=embed_size,
    max_features=max_features,
    maxlen=maxlen,
    dropout=dropout,
    batch_size=batch_size,
    epochs=epochs,
    class_labels=class_labels)
scores_tracker['nn_gru_glove_twitter'] = score_gru_glove_twitter

1. Preprocessed completed



  if sys.path[0] == '':


2. Embeddings generated

3. Model generated (GRU)

Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.04672, saving model to models\weights_gru_glove_twitter.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.04672 to 0.04522, saving model to models\weights_gru_glove_twitter.best.hdf5
4. Predictions completed

Mean ROC-AUC: 0.9796035068929402
5. Evaluation completed

Wall time: 6min 3s


##### 2.4 GRU with GloVe embedding and further preprocessing

In [16]:
# Build cleaned copies: the comment column is passed through preprocess_text,
# the original train_df / test_df are left unchanged
train_df2 = train_df.assign(
    **{comment_col: train_df[comment_col].apply(preprocess_text)})
test_df2 = test_df.assign(
    **{comment_col: test_df[comment_col].apply(preprocess_text)})

In [27]:
%%time
score_gru_glove_twitter_clean = run_nn(
    struc='GRU',
    model_fp=r'models/weights_gru_glove_twitter_clean.best.hdf5',
    train_df=train_df2, test_df=test_df2,
    embed_fp=glove_twitter_fp,
    embed_size=embed_size,
    max_features=max_features,
    maxlen=maxlen,
    dropout=dropout,
    batch_size=batch_size,
    epochs=epochs,
    class_labels=class_labels)
scores_tracker['nn_gru_glove_twitter_clean'] = score_gru_glove_twitter_clean

1. Preprocessed completed



  if sys.path[0] == '':


2. Embeddings generated

3. Model generated (GRU)

Epoch 1/2

Epoch 00001: val_loss improved from inf to 0.04584, saving model to models\weights_gru_glove_twitter_clean.best.hdf5
Epoch 2/2

Epoch 00002: val_loss improved from 0.04584 to 0.04522, saving model to models\weights_gru_glove_twitter_clean.best.hdf5
4. Predictions completed

Mean ROC-AUC: 0.9798407787592017
5. Evaluation completed

Wall time: 6min 1s
