In [1]:
import os

import numpy as np
import pandas as pd
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.models import Model
from keras.preprocessing import text, sequence
from nltk.tokenize import TweetTokenizer

Using TensorFlow backend.


In [2]:
# Directory that holds the cleaned CSV files (and, later, the embedding file).
PATH = '../../data/'

# Load the cleaned train / test tables and pick out the comment text column.
train = pd.read_csv(PATH + 'cleaned_train.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')

train_sentence = train['comment_text_cleaned']
test_sentence = test['comment_text_cleaned']

# Whitespace-token count of every comment, train and test stacked together.
text_length = pd.concat([
    sentences.apply(lambda comment: len(comment.split()))
    for sentences in (train_sentence, test_sentence)
])

# Summary statistics of the comment lengths.
mean_length = text_length.mean()
std_length = text_length.std()

# One value per output line: both table shapes, then mean and std of the lengths.
print(train.shape, test.shape, mean_length, std_length, sep='\n')

(159571, 27)
(153164, 21)
72.2062896702
114.017305915


In [3]:
# --- Run configuration -------------------------------------------------------
MAX_FEATURES = 20000  # vocabulary cap (passed to the tokenizer as num_words)
# Padded sequence length: mean comment length plus three standard deviations,
# rounded to the nearest integer.
MAX_LEN = np.round(mean_length + 3 * std_length).astype(int)
EMBED_SIZE = 50  # embedding vector width; must match the vectors in EMBEDDING_FILE
LSTM_UNITS = 50  # units of the LSTM layer that gets wrapped in Bidirectional
DENSE_UNITS = 50  # units of the hidden Dense layer
DROPOUT = 0.2  # rate shared by the LSTM dropouts and the Dropout layer
BATCH_SIZE = 32
EPOCHS = 2
# NOTE(review): the original comment here only said "200d" - presumably a
# reminder that a 200-dimensional GloVe file is an alternative; confirm.
EMBEDDING_FILE = 'glove.6B.50d.txt'

# Target columns, one binary label each.
label_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]
print(MAX_LEN)

414


In [4]:
# Build the vocabulary from the train and test comments together, keeping at
# most MAX_FEATURES words when texts are converted to sequences.
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(
    pd.concat([train_sentence, test_sentence]).values
)

# Comments -> lists of word indices.
tokenized_train, tokenized_test = (
    tokenizer.texts_to_sequences(sentences.values)
    for sentences in (train_sentence, test_sentence)
)

# Pad / truncate every index list to MAX_LEN entries.
X_train, X_test = (
    sequence.pad_sequences(token_ids, maxlen=MAX_LEN)
    for token_ids in (tokenized_train, tokenized_test)
)
# Label matrix with one column per entry of label_cols.
y = train[label_cols].values

# Show the first encoded training comment as a sanity check.
print(tokenized_train[0])

[634, 79, 2, 33, 46, 198, 26, 708, 3771, 8376, 805, 1, 140, 45, 1, 12, 226, 50, 5098, 17, 64, 2056, 159, 5, 492, 31, 127, 1177, 8377, 2249, 7, 51, 14, 12, 98, 2, 280, 28, 2, 43, 22, 150, 5, 1, 2550, 90]


In [5]:
def get_coefs(word, *arr):
    """Pair a token with its remaining fields converted to a float32 array.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields of the line; converted with
            ``np.asarray(..., dtype='float32')``.

    Returns:
        A ``(word, vector)`` tuple.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    """Build the initial embedding weight matrix from a pre-trained vector file.

    Every non-blank line of ``embedding_file`` must consist of a token followed
    by exactly ``embed_size`` whitespace-separated numbers.  Rows of the result
    whose word is present in the file receive the pre-trained vector; all other
    rows keep a random draw from a normal distribution with the mean and
    standard deviation of the pre-trained values.

    Args:
        embedding_file: Path of the embedding text file (read as UTF-8).
        embed_size: Number of values per embedding vector.
        max_features: Vocabulary cap; only word indices strictly below it are
            filled from the file.
        tokenizer: Object exposing ``word_index``, a mapping from word to
            integer row index.

    Returns:
        ``(embedding_matrix, nb_words)`` where ``embedding_matrix`` has shape
        ``(nb_words, embed_size)`` and
        ``nb_words == min(max_features, len(tokenizer.word_index) + 1)``.

    Raises:
        ValueError: If the file contains no vectors, or a line does not carry
            exactly ``embed_size`` values (or the values are not numeric).
    """
    embeddings_index = {}
    # `with` guarantees the handle is closed; the explicit encoding avoids
    # platform-dependent decode errors on non-ASCII tokens.
    with open(embedding_file, encoding='utf-8') as handle:
        for line_no, line in enumerate(handle, start=1):
            fields = line.strip().split()
            if not fields:
                continue  # tolerate blank lines (e.g. a trailing newline)
            word, coefs = fields[0], fields[1:]
            if len(coefs) != embed_size:
                raise ValueError(
                    '%s line %d: expected %d embedding values, found %d'
                    % (embedding_file, line_no, embed_size, len(coefs)))
            embeddings_index[word] = np.asarray(coefs, dtype='float32')

    if not embeddings_index:
        raise ValueError('no embedding vectors found in %s' % embedding_file)

    # np.stack needs a real sequence; a dict view is rejected by newer NumPy.
    all_embs = np.stack(list(embeddings_index.values()))

    word_index = tokenizer.word_index
    # The "+ 1" makes room for the highest index when the whole vocabulary fits
    # under max_features (presumably indices start at 1 - verify for the
    # tokenizer in use).
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(), (nb_words, embed_size))
    for word, i in word_index.items():
        if i < max_features:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def get_lstm_model(embedding_file, embed_size, max_features, tokenizer,
                   max_len, lstm_units, dense_units, label_cols, dropout):
    """Assemble and compile the bidirectional-LSTM multi-label classifier.

    Layer stack: Embedding (initialised from ``get_embedding_matrix``) ->
    Bidirectional LSTM returning sequences -> global max pooling -> ReLU Dense
    -> Dropout -> sigmoid Dense with one unit per label.

    Args:
        embedding_file, embed_size, max_features, tokenizer: Forwarded to
            ``get_embedding_matrix``.
        max_len: Length of each input sequence.
        lstm_units: Units of the LSTM layer.
        dense_units: Units of the hidden Dense layer.
        label_cols: Label names; only their count is used (output width).
        dropout: Rate used for the LSTM's input and recurrent dropout and for
            the Dropout layer.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    pretrained_weights, vocab_size = get_embedding_matrix(
        embedding_file, embed_size, max_features, tokenizer)

    token_ids = Input(shape=(max_len, ))
    hidden = Embedding(vocab_size, embed_size, weights=[pretrained_weights])(token_ids)

    recurrent = LSTM(lstm_units,
                     return_sequences=True,
                     dropout=dropout,
                     recurrent_dropout=dropout)
    hidden = Bidirectional(recurrent)(hidden)
    hidden = GlobalMaxPool1D()(hidden)
    hidden = Dense(dense_units, activation='relu')(hidden)
    hidden = Dropout(dropout)(hidden)
    scores = Dense(len(label_cols), activation='sigmoid')(hidden)

    model = Model(inputs=token_ids, outputs=scores)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_model(model, file_path, batch_size, epochs, X_train, y):
    """Fit ``model`` with checkpointing and early stopping on validation loss.

    Args:
        model: Compiled model exposing ``fit``.
        file_path: Where ``ModelCheckpoint`` writes the best weights
            (lowest ``val_loss``).
        batch_size: Mini-batch size passed to ``fit``.
        epochs: Number of epochs passed to ``fit``.
        X_train: Training inputs.
        y: Training targets.

    Returns:
        The same ``model`` object after fitting.  The history dict of the run
        is printed, not returned.
    """
    callbacks_list = [
        ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
                        save_best_only=True, mode='min'),
        EarlyStopping(monitor="val_loss", mode="min", patience=20),
    ]
    # validation_split=0.1 reserves a tenth of the supplied data for validation.
    history = model.fit(
        X_train,
        y,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.1,
        callbacks=callbacks_list,
    )
    print(history.history)
    return model

def predict(model, file_path, X_test):
    """Load weights from ``file_path`` into ``model`` and predict on ``X_test``.

    Args:
        model: Object exposing ``load_weights`` and ``predict``.
        file_path: Weights file handed to ``model.load_weights``.
        X_test: Inputs handed to ``model.predict`` (called with ``verbose=1``).

    Returns:
        Whatever ``model.predict`` returns.
    """
    model.load_weights(file_path)
    predictions = model.predict(X_test, verbose=1)
    return predictions
    
def save(model_name, y_test, label_cols, path, is_train=False):
    """Write a prediction matrix to CSV with one column per label.

    The output location depends on ``is_train``:

    * ``True``  -> ``<path>/lstm_one_file/train_<model_name>.csv``
    * ``False`` -> ``<path>/<model_name>/<model_name>.csv``

    Args:
        model_name: Name used in the output file (and directory) name.
        y_test: 2-D array-like of predictions, one column per label.
        label_cols: Column names for the CSV header.
        path: Base output directory, with or without a trailing separator.
        is_train: Selects the train-prediction location when true.

    Returns:
        None.  The CSV is written without the index column.
    """
    if is_train:
        out_dir = os.path.join(path, 'lstm_one_file')
        file_name = os.path.join(out_dir, 'train_' + model_name + '.csv')
    else:
        out_dir = os.path.join(path, model_name)
        file_name = os.path.join(out_dir, model_name + '.csv')

    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target directory exists before writing.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    submission = pd.DataFrame(y_test, columns=label_cols)
    submission.to_csv(file_name, index=False)
    
# Marker output: confirms that the helper definitions in this cell were executed.
print('done')

done


In [6]:
# Checkpoint file for the best weights, plus the sample-submission location.
file_path = '../../model/lstm_best.hdf5'
sample_submission_file_path = PATH + 'sample_submission.csv'

print('getting model')
model = get_lstm_model(
    embedding_file=PATH + EMBEDDING_FILE,
    embed_size=EMBED_SIZE,
    max_features=MAX_FEATURES,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    lstm_units=LSTM_UNITS,
    dense_units=DENSE_UNITS,
    label_cols=label_cols,
    dropout=DROPOUT,
)

print('training')
model = train_model(
    model,
    file_path,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    X_train=X_train,
    y=y,
)

# Prediction and export steps, currently disabled:
# print('predicting')
# y_test = predict(model, file_path, X_test)
# print('train predicting')
# y_train = predict(model, file_path, X_train)
#
# save('lstm', y_test, label_cols, PATH)
# save('lstm', y_train, label_cols, PATH, True)

print('done')

getting model
training
Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2
{'val_loss': [0.04743595612329328, 0.045119911826785639], 'val_acc': [0.98220328167725179, 0.98280903937180464], 'loss': [0.061505156335251408, 0.044636674059173063], 'acc': [0.97857668344104587, 0.98283813561817646]}
done


In [7]:
# Re-read the cleaned training table and encode it with the fitted tokenizer,
# then score it; predict() first loads the weights stored at file_path.
train = pd.read_csv(PATH + 'cleaned_train.csv')
tokenized_train = tokenizer.texts_to_sequences(
    train['comment_text_cleaned'].values
)
X_train = sequence.pad_sequences(tokenized_train, maxlen=MAX_LEN)
y_train = predict(model, file_path, X_train)
# Export of the train predictions, currently disabled:
# save('lstm', y_train, label_cols, PATH, True)



In [10]:
# p: model outputs on the training set, one column per label.
p = pd.DataFrame(data=y_train, columns=label_cols)
# r: the reference labels taken from the training table.
r = train.loc[:, label_cols]
# Both frames should have the same shape.
print(p.shape, r.shape)

(159571, 6) (159571, 6)


In [11]:
from sklearn.metrics import confusion_matrix

thres = 0.5  # decision threshold applied to the predicted scores


def f(x):
    """Return 1 where ``x`` exceeds ``thres`` and 0 elsewhere.

    Works on a scalar as well as element-wise on a pandas Series / array.
    """
    return (x > thres)*1


# Per-label confusion matrix of the reference labels in `r` (passed as y_true)
# against the thresholded predictions in `p` (passed as y_pred).
for label in label_cols:
    print(label)
    # Use a dedicated name here: rebinding the global `y` would overwrite the
    # training label matrix that the training cell passes to train_model.
    y_pred = f(p[label])  # vectorised over the whole column
    M = confusion_matrix(r[label], y_pred)
    print(M)

toxic
[[142571   1706]
 [  3451  11843]]
severe_toxic
[[157917     59]
 [  1455    140]]
obscene
[[149766   1356]
 [  1327   7122]]
threat
[[159093      0]
 [   478      0]]
insult
[[150086   1608]
 [  2353   5524]]
identity_hate
[[158063    103]
 [  1063    342]]
