In [1]:
import pandas as pd
import numpy as np
from keras import regularizers
from keras.models import Model
from nltk.tokenize import TweetTokenizer
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Root folder for every data file this notebook reads.
PATH = '../../data/'

# Training frame used to fit the model, and the cleaned test frame.
train = pd.read_csv(PATH + 'lstm_one_file/lstm.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')

train_sentence = train['comment_text_cleaned']
test_sentence = test['comment_text_cleaned']

# Whitespace-token count per comment, train rows followed by test rows.
text_length = pd.concat([train_sentence, test_sentence]).apply(
    lambda comment: len(comment.split()))

# Mean / std of the comment length; the config cell derives MAX_LEN from them.
mean_length = text_length.mean()
std_length = text_length.std()

print(train.shape, test.shape, mean_length, std_length, sep='\n')

(42622, 28)
(153164, 21)
68.3695769871
124.349041979


In [3]:
# ---- configuration -------------------------------------------------------
# Text representation
MAX_FEATURES = 20000  # vocabulary cap handed to the tokenizer and the embedding
# Pad/truncate length: mean comment length plus three standard deviations.
MAX_LEN = np.round(mean_length + 3*std_length).astype(int)

# Network sizes
EMBED_SIZE = 50    # width of one word vector
LSTM_UNITS = 50    # hidden units of the LSTM layer
DENSE_UNITS = 50   # units of the dense layer after pooling
DROPOUT = 0.2      # dropout rate

# Optimisation
BATCH_SIZE = 32
EPOCHS = 2

# Pre-trained vectors; the file name says 50d, which matches EMBED_SIZE above.
EMBEDDING_FILE = 'glove.6B.50d.txt'

# Target columns, one sigmoid output per label.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
print(MAX_LEN)

441


In [4]:
# The vocabulary is fitted on train and test comments together, limited to MAX_FEATURES words.
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(np.concatenate([train_sentence.values, test_sentence.values]))

# Label matrix: one column per entry of label_cols.
y = train[label_cols].values

# Comments -> lists of word ids -> fixed-length id matrices of width MAX_LEN.
tokenized_train = tokenizer.texts_to_sequences(train_sentence.values)
X_train = sequence.pad_sequences(tokenized_train, maxlen=MAX_LEN)

tokenized_test = tokenizer.texts_to_sequences(test_sentence.values)
X_test = sequence.pad_sequences(tokenized_test, maxlen=MAX_LEN)

# Show the id sequence of the first training comment.
print(tokenized_train[0])

[6962, 8566, 362, 7, 68, 1446, 18, 2, 845, 1, 718, 248, 6962, 1, 4, 286, 7642, 230, 15, 22, 52, 8233, 17, 334, 489, 4, 566, 49, 14, 12, 953, 17, 8566, 362, 14, 12, 13, 5, 1168, 40, 9, 78, 49, 1, 4, 3282, 462, 22, 29, 11972, 9803, 17, 6312, 68, 1446, 18, 2, 845, 1, 50, 4, 321, 217, 8, 77, 11, 443]


In [5]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` where ``vector`` is ``arr`` converted to a float32 array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def get_embedding_matrix(embedding_file, embed_size, max_features, tokenizer):
    """Build the initial embedding weights from a text file of word vectors.

    Each usable line of ``embedding_file`` is ``<word> <v1> ... <v_embed_size>``
    separated by whitespace. Lines with any other number of fields (blank
    lines, a header, vectors of another width) are skipped.

    Parameters
    ----------
    embedding_file : str
        Path of the vector file; it is read as UTF-8.
    embed_size : int
        Number of coefficients expected per word.
    max_features : int
        Upper bound on the number of rows of the returned matrix.
    tokenizer : object
        Anything exposing ``word_index``, a mapping ``word -> integer id``.

    Returns
    -------
    (embedding_matrix, nb_words)
        ``embedding_matrix`` has shape ``(nb_words, embed_size)`` with
        ``nb_words = min(max_features, len(word_index) + 1)``. Rows of words
        found in the file hold the file's vector; every other row keeps a
        random draw from a normal distribution with the mean and standard
        deviation of all loaded coefficients.

    Raises
    ------
    ValueError
        If no line of the file carries exactly ``embed_size`` coefficients.
    """
    embeddings_index = {}
    # Context manager closes the handle; explicit UTF-8 avoids depending on
    # the platform's default encoding.
    with open(embedding_file, encoding='utf-8') as handle:
        for line in handle:
            fields = line.split()
            if len(fields) != embed_size + 1:
                continue
            embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

    if not embeddings_index:
        raise ValueError(
            'no vectors with %d coefficients found in %s' % (embed_size, embedding_file))

    # np.stack needs a real sequence; a dict view is rejected by current NumPy.
    all_embs = np.stack(list(embeddings_index.values()))

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(), (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue  # id falls outside the capped vocabulary
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix, nb_words

def get_lstm_model(embedding_file, embed_size, max_features, tokenizer,
                   max_len, lstm_units, dense_units, label_cols, dropout):
    """Assemble and compile the bidirectional-LSTM classifier.

    Layer stack, in order: Input of length ``max_len`` -> Embedding seeded
    with the matrix from ``get_embedding_matrix`` -> Bidirectional LSTM that
    returns the full sequence -> global max pooling -> Dense(relu) ->
    Dropout -> Dense(sigmoid) with one unit per entry of ``label_cols``.
    The model is compiled with binary cross-entropy, the Adam optimizer and
    accuracy as metric, then returned.
    """
    pretrained_weights, vocab_size = get_embedding_matrix(
        embedding_file, embed_size, max_features, tokenizer)

    token_ids = Input(shape=(max_len, ))
    embedded = Embedding(vocab_size, embed_size, weights=[pretrained_weights])(token_ids)

    # The same rate is used for input dropout and recurrent dropout.
    recurrent_layer = LSTM(lstm_units, return_sequences=True,
                           dropout=dropout, recurrent_dropout=dropout)
    encoded = Bidirectional(recurrent_layer)(embedded)
    pooled = GlobalMaxPool1D()(encoded)

    hidden = Dense(dense_units, activation='relu')(pooled)
    hidden = Dropout(dropout)(hidden)
    probabilities = Dense(len(label_cols), activation='sigmoid')(hidden)

    model = Model(inputs=token_ids, outputs=probabilities)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

def train_model(model, file_path, batch_size, epochs, X_train, y,
                validation_split=0.1, patience=20):
    """Fit ``model`` with checkpointing and early stopping on validation loss.

    Parameters
    ----------
    model : compiled Keras model
        Trained in place by ``model.fit``.
    file_path : str
        Where ``ModelCheckpoint`` writes the model whenever ``val_loss`` improves.
    batch_size, epochs : int
        Forwarded to ``model.fit``.
    X_train, y : array-like
        Inputs and targets forwarded to ``model.fit``.
    validation_split : float, optional
        Fraction of the training data held out for validation (default 0.1,
        the value that used to be hard-coded).
    patience : int, optional
        Epochs without ``val_loss`` improvement before training stops
        (default 20, the value that used to be hard-coded). Early stopping
        can only trigger when ``patience`` is smaller than ``epochs``.

    Returns
    -------
    The same ``model`` object after fitting. The best checkpoint is stored
    at ``file_path``; ``predict`` reloads it before scoring.
    """
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')
    earlystopping = EarlyStopping(monitor="val_loss", mode="min", patience=patience)
    callbacks_list = [checkpoint, earlystopping]
    h = model.fit(X_train, y, batch_size=batch_size, epochs=epochs,
                  validation_split=validation_split, callbacks=callbacks_list)
    # Per-epoch training / validation metrics.
    print(h.history)
    return model

def predict(model, file_path, X_test):
    """Load the weights stored at ``file_path`` into ``model`` and return its predictions for ``X_test``."""
    model.load_weights(file_path)
    predictions = model.predict(X_test, verbose=1)
    return predictions
    
def save(model_name, y_test, label_cols, path, is_train=False):
    """Write a prediction matrix to CSV with one column per label.

    Parameters
    ----------
    model_name : str
        Used to build the output file name (and, for test output, the folder).
    y_test : array-like
        Prediction matrix; its columns are named after ``label_cols``.
    label_cols : list of str
        Column names of the written CSV.
    path : str
        Base directory, with or without a trailing separator.
    is_train : bool, optional
        When True the file is ``<path>/lstm_one_file/train_<model_name>.csv``;
        otherwise it is ``<path>/<model_name>/<model_name>.csv``.

    The target directory is created when it does not exist yet. The CSV is
    written without the index column.
    """
    import os  # local import so the function does not rely on another cell

    if is_train:
        file_name = os.path.join(path, 'lstm_one_file', 'train_' + model_name + '.csv')
    else:
        file_name = os.path.join(path, model_name, model_name + '.csv')

    # to_csv does not create missing folders, so make sure the target exists.
    target_dir = os.path.dirname(file_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    submission = pd.DataFrame(y_test, columns=label_cols)
    submission.to_csv(file_name, index=False)
    
# Marker printed once this cell has finished executing.
print("done")

done


In [6]:
# Checkpoint location written by train_model and read back by predict.
file_path = '../../model/lstm_best.hdf5'
# Path of sample_submission.csv; not referenced by the rest of this cell.
sample_submission_file_path = PATH + 'sample_submission.csv'

print('getting model')
model = get_lstm_model(
    embedding_file=PATH + EMBEDDING_FILE,
    embed_size=EMBED_SIZE,
    max_features=MAX_FEATURES,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    lstm_units=LSTM_UNITS,
    dense_units=DENSE_UNITS,
    label_cols=label_cols,
    dropout=DROPOUT,
)

print('training')
model = train_model(model=model, file_path=file_path, batch_size=BATCH_SIZE,
                    epochs=EPOCHS, X_train=X_train, y=y)

# NOTE: the prediction and save steps below are currently disabled; the two
# banners are still printed, so their appearance in the output does not mean
# that predictions were computed.
print('predicting')
# disabled: y_test = predict(model, file_path, X_test)
print('train predicting')
# disabled: y_train = predict(model, file_path, X_train)

# disabled: save('lstm', y_test, label_cols, PATH)
# disabled: save('lstm', y_train, label_cols, PATH, True)

print('done')

getting model
training
Train on 38359 samples, validate on 4263 samples
Epoch 1/2
Epoch 2/2
{'val_loss': [0.076986934403326768, 0.050994507586546355], 'val_acc': [0.9750566845233245, 0.98436155818086424], 'loss': [0.1969421190524854, 0.074392549217616677], 'acc': [0.9233122156973399, 0.9763158559239804]}
predicting
train predicting
done


In [7]:
# Score every comment of cleaned_train.csv with the best checkpoint.
#
# Dedicated *_full names are used on purpose: rebinding `train`,
# `tokenized_train` and `X_train` here would leave `X_train` out of step with
# the label matrix `y`, so re-running the training cell afterwards would fail
# or fit on mismatched data.
train_full = pd.read_csv(PATH + 'cleaned_train.csv')
tokenized_train_full = tokenizer.texts_to_sequences(train_full['comment_text_cleaned'].values)
X_train_full = sequence.pad_sequences(tokenized_train_full, maxlen=MAX_LEN)

# One row of label probabilities per comment of the full training file.
y_train = predict(model, file_path, X_train_full)
# Persisting the scores is currently disabled:
# save('lstm', y_train, label_cols, PATH, True)



In [8]:
# r: reference table read from disk; p: model scores, one column per label.
r = pd.read_csv('../../data/sample_train.csv')
p = pd.DataFrame(data=y_train, columns=label_cols)

# Both frames must have the same number of rows for the comparison that follows.
print(p.shape, r.shape)

(159571, 6) (159571, 7)


In [9]:
from sklearn.metrics import confusion_matrix

# Probability cut-off: scores strictly above it count as a positive prediction.
thres = 0.5

def f(x):
    """Binarise ``x`` against ``thres``; works on a scalar or on a whole Series."""
    return (x > thres)*1

# One confusion matrix per label (rows: reference values, columns: predictions).
# The predictions are bound to `predicted`, not `y`, so the training label
# matrix `y` stays intact and the training cell can be re-run afterwards.
for label in label_cols:
    print(label)
    predicted = f(p[label])  # vectorised, no per-element apply needed
    print(confusion_matrix(r[label], predicted))

toxic
[[141892   2385]
 [  5037  10257]]
severe_toxic
[[151124   6852]
 [    40   1555]]
obscene
[[148595   2527]
 [  1531   6918]]
threat
[[157459   1634]
 [   163    315]]
insult
[[149881   1813]
 [  2693   5184]]
identity_hate
[[155770   2396]
 [   479    926]]
