In [26]:
import pandas as pd
import numpy as np
from keras import regularizers
from keras.models import Model
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, BatchNormalization
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [27]:
PATH = '../../data/'
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train0 = pd.read_csv(PATH + 'cleaned_train.csv').fillna('na')
train0['r'] = train0[label_cols].sum(axis=1)
train = pd.concat([train0[train0['r'] != 0], train0[train0['r'] == 0].sample(n=9000)])
test = pd.read_csv(PATH + 'cleaned_test.csv').fillna('na')

train_sentence = train['comment_text_cleaned']
test_sentence = test['comment_text_cleaned']

text_length = pd.concat([train_sentence.apply(lambda x: len(x.split())),\
                         test_sentence.apply(lambda x: len(x.split()))])

mean_length = text_length.mean()
std_length = text_length.std()

print(train.shape)
print(test.shape)
print(mean_length)
print(std_length)

(25225, 28)
(153164, 21)
69.1417239852
117.984804066


In [28]:
# config
MAX_FEATURES = 20000 # max num of words
MAX_LEN = np.round(mean_length + 3*std_length).astype(int) # max sequence length
EMBED_SIZE = 50 # embedding size
LSTM_UNITS = 200 # LSTM hidden layer unit number
DENSE_UNITS = 200
DROPOUT = 0.5 # dropout rate
BATCH_SIZE = 32
EPOCHS = 2
EMBEDDING_FILE = 'glove.6B.50d.txt'

print(MAX_LEN)

423


In [29]:
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(pd.concat([train_sentence, test_sentence]).values)
tokenized_train = tokenizer.texts_to_sequences(train_sentence.values)
tokenized_test = tokenizer.texts_to_sequences(test_sentence.values)

X_train = sequence.pad_sequences(tokenized_train, maxlen=MAX_LEN)
y = train[label_cols].values
X_test = sequence.pad_sequences(tokenized_test, maxlen=MAX_LEN)

print(tokenized_train[0])

[1335, 171, 5, 1029, 308, 17, 27, 98]


In [30]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

def get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(embedding_file))
    all_embs = np.stack(embeddings_index.values())
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(), (nb_words, embed_size))
    for word, i in word_index.items():
        if i < max_features:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def get_lstm_model(embedding_file, embed_size, max_features, tokenizer,\
                   max_len, lstm_units, dense_units, label_cols, dropout):
    embedding_matrix, inp_len = get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer)
    input = Input(shape=(max_len, ))
    x = Embedding(inp_len, embed_size, weights=[embedding_matrix])(input)
    x = Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=dropout, recurrent_dropout=dropout))(x)
    x = GlobalMaxPool1D()(x)
    x = BatchNormalization()(x)
    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(dropout)(x)
    x = Dense(len(label_cols), activation='sigmoid')(x)
    model = Model(inputs=input, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

def train_model(model, file_path, batch_size, epochs, X_train, y):
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    earlystopping = EarlyStopping(monitor="val_loss", mode="min", patience=20)
    callbacks_list = [checkpoint, earlystopping]
    model.fit(X_train, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)
    return model

def predict(model, file_path, X_test):
    model.load_weights(file_path)
    return model.predict(X_test, verbose=1)
    
def save(model_name, y_test, label_cols, path, is_train=False):
    if is_train:
        submission = pd.read_csv(path + 'sample_train.csv')
        file_name = 'train_' + model_name
    else:
        submission = pd.read_csv(path + 'sample_submission.csv')
        file_name = model_name
    submission[label_cols] = y_test
    submission.to_csv(path + model_name + '/' + file_name + '.csv', index=False)
    
print('done')

done


In [31]:
file_path = '../../model/lstm_best0.hdf5'
sample_submission_file_path = PATH + 'sample_submission.csv'

print('getting model')
model = get_lstm_model(PATH + EMBEDDING_FILE, EMBED_SIZE, MAX_FEATURES, tokenizer,\
                       MAX_LEN, LSTM_UNITS, DENSE_UNITS, label_cols, DROPOUT)
print('training')
model = train_model(model, file_path, BATCH_SIZE, EPOCHS, X_train, y)
print('predicting')
# y_test = predict(model, file_path, X_test)
# print('train predicting')
y_train = predict(model, file_path, X_train)

# save('lstm', y_test, label_cols, PATH)
save('lstm0', y_train, label_cols, PATH, True)

print('done')

getting model
training
Train on 22702 samples, validate on 2523 samples
Epoch 1/2
Epoch 2/2
predicting


ValueError: Length of values does not match length of index

In [32]:
train = pd.read_csv('../../data/cleaned_train.csv')
train_sentence = train['comment_text_cleaned']
tokenized_train = tokenizer.texts_to_sequences(train_sentence.values)
X_train = sequence.pad_sequences(tokenized_train, maxlen=MAX_LEN)
y_train = predict(model, file_path, X_train)
save('lstm0', y_train, label_cols, PATH, True)



In [34]:
# y = pd.DataFrame(y_train, columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])
l = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y = pd.read_csv('../../data/lstm0/train_lstm0.csv')
r = pd.read_csv('../../data/cleaned_train.csv')[l]

In [35]:
# 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
from sklearn.metrics import confusion_matrix
l = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
thres = 0.5
def f(x):
    return (x > thres)*1

for i in l:
    print(i)
    z = y[i].apply(f)
    M = confusion_matrix(r[i], z)
    print(M)

toxic
[[119381  24896]
 [   554  14740]]
severe_toxic
[[157202    774]
 [   895    700]]
obscene
[[148654   2468]
 [  1537   6912]]
threat
[[159010     83]
 [   409     69]]
insult
[[147417   4277]
 [  1289   6588]]
identity_hate
[[157396    770]
 [   847    558]]
