### Models with GloVe embeddings ###

Available models:
- BiLSTM
- LSTM_CNN
- GRU_CNN
- LSTM_CNN2
- CNN_LSTM
- CNN_Large

Each model is described in the report.

In [None]:
import os

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from keras.preprocessing import text, sequence
from sklearn.model_selection import train_test_split

from models.all_models import BiLSTM, LSTM_CNN, GRU_CNN, LSTM_CNN2, CNN_LSTM, CNN_Large
from models.constants import MAX_LEN, BERT_MODEL, TRAIN, TEST, EMBED, MAX_WORDS

Download the pre-trained GloVe vectors (glove.840B.300d)

In [None]:
# Fetch the GloVe archive into embed/ and extract it into ~/.vector_cache,
# which is where the notebook looks for glove.840B.300d.txt.
# -p: do not fail if the directories already exist.
!mkdir -p embed ~/.vector_cache
# -nc: skip the download when the archive is already present.
!wget -nc -P embed http://nlp.stanford.edu/data/glove.840B.300d.zip
# -n: never overwrite an already-extracted file; -d: extraction directory.
!unzip -n embed/glove.840B.300d.zip -d ~/.vector_cache

- Read train and test data

- Split train data into train and validation sets in the ratio 90%-10%

In [None]:
# Location of the extracted GloVe vectors. This deliberately overrides the
# EMBED constant imported from models.constants. open() does not expand '~',
# so resolve it to the user's home directory up front.
EMBED = os.path.expanduser('~/.vector_cache/glove.840B.300d.txt')

# Fixed seed so the train/validation split is identical on every run.
SEED = 42

train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)

# Hold out 10% of the labelled data for validation.
train, val = train_test_split(train, test_size=0.1, random_state=SEED)

- Tokenize and pad sequence
- Get datasets for model training and evaluation in required form
- Get embedding matrix for embedding layer

In [None]:
def get_tokenized(train, val, test):
    """Turn the raw comment frames into padded integer sequences and label arrays.

    The tokenizer vocabulary is fitted on the training comments only and is
    then applied to all three splits.

    Args:
        train: DataFrame with a 'comment_text' column and the six label columns.
        val: DataFrame with the same columns as ``train``.
        test: DataFrame with a 'comment_text' column.

    Returns:
        Tuple ``(tokenizer, train_x, val_x, test_x, train_y, val_y)`` where the
        ``*_x`` values are the outputs of ``sequence.pad_sequences`` with
        ``maxlen=MAX_LEN`` and the ``*_y`` values are the label columns'
        ``.values``.
    """
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    def _clean(frame):
        # `.str.lower()` leaves missing values as NaN, which would reach the
        # tokenizer as floats; replace them with empty strings first.
        return frame['comment_text'].fillna('').astype(str).str.lower()

    train_x, val_x, test_x = _clean(train), _clean(val), _clean(test)

    train_y = train[label_cols].values
    val_y = val[label_cols].values

    tokenizer = text.Tokenizer(num_words=MAX_WORDS, lower=True)
    tokenizer.fit_on_texts(list(train_x))

    def _encode(texts):
        # Map words to vocabulary indices, then pad to a fixed length.
        return sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)

    return tokenizer, _encode(train_x), _encode(val_x), _encode(test_x), train_y, val_y



def get_embedding_matrix(tokenizer, embed_file, embed_size=300, max_words=None):
    """Build the embedding-layer weight matrix from a GloVe-style text file.

    Each line of the file is expected to be ``<token> <v1> ... <v_embed_size>``
    separated by single spaces. Row ``i`` of the result holds the vector of the
    word whose tokenizer index is ``i``; rows for words that are not in the
    file (and row 0, which no word index maps to) stay all-zero.

    Args:
        tokenizer: Object exposing ``word_index`` (dict of word -> int index).
        embed_file: Path to the embedding text file; a leading '~' is expanded.
        embed_size: Number of vector components per line (300 for
            glove.840B.300d).
        max_words: Number of rows in the matrix; words with an index at or
            above this are ignored. Defaults to the module-level ``MAX_WORDS``.

    Returns:
        ``numpy.ndarray`` of shape ``(max_words, embed_size)``.
    """
    if max_words is None:
        max_words = MAX_WORDS

    # Only vocabulary words can end up in the matrix, so write their rows
    # directly instead of keeping every vector of the file in memory.
    rows = {word: i for word, i in tokenizer.word_index.items() if i < max_words}

    embedding_matrix = np.zeros((max_words, embed_size))
    with open(os.path.expanduser(embed_file), encoding='utf8') as f:
        for line in f:
            # Split from the right at most `embed_size` times: tokens in the
            # file may themselves contain spaces, and an unbounded split would
            # push parts of the token into the float conversion.
            values = line.rstrip().rsplit(' ', embed_size)
            if len(values) != embed_size + 1:
                continue  # blank, header, or malformed line
            i = rows.get(values[0])
            if i is not None:
                embedding_matrix[i] = np.asarray(values[1:], dtype='float32')
    return embedding_matrix


In [None]:
# Tokenize and pad all three splits, then build the embedding weights
# for the fitted vocabulary.
(tokenizer,
 train_x, val_x, test_x,
 train_y, val_y) = get_tokenized(train, val, test)

embedding_matrix = get_embedding_matrix(tokenizer=tokenizer, embed_file=EMBED)

- Fit the model to train data
- Use early stopping to break when val loss increases
- Optimizer: Adam, Criterion: BCE

In [None]:
BATCH_SIZE = 128

# Stop training based on the validation loss (lower is better); no patience
# is passed, so the Keras default applies.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
)

# LSTM + CNN model initialised from the pre-computed embedding matrix.
model = LSTM_CNN(
    embedding_matrix,
    dropout=0.3,
    lstm_hidden=128,
    cnn_filter=64,
)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.fit(
    train_x,
    train_y,
    batch_size=BATCH_SIZE,
    epochs=5,
    verbose=1,
    callbacks=[es],
    validation_data=(val_x, val_y),
)


- Make predictions
- Match format of sample_submission

In [None]:
# Score the test set, then write the six predicted label columns into the
# sample-submission layout.
predictions = model.predict(
    test_x,
    batch_size=BATCH_SIZE,
    verbose=1,
)

submission = pd.read_csv('input/sample_submission.csv')
submission[[
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]] = predictions

submission.to_csv('submission.csv', index=False)