In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping

from models.all_models import BiLSTM, LSTM_CNN, GRU_CNN, LSTM_CNN2, CNN_LSTM, CNN_Large
from models.constants import MAX_LEN, BERT_MODEL, TRAIN, TEST, EMBED, MAX_WORDS

In [7]:
!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
!mkdir embed
!mv glove.840B.300d.zip embed/glove.840B.300d.txt
!cd ~/.vector_cache; unzip embed/glove.840B.300d.txt

--2020-11-30 12:08:15--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2020-11-30 12:08:15--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2020-11-30 12:08:15--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [10]:
EMBED = '~/.vector_cache/glove.840B.300d.txt'
train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)

train, val = train_test_split(train, test_size=0.1)

In [13]:


def get_tokenized(train, val, test):
    train_x = train['comment_text'].fillna('').str.lower()
    val_x = val['comment_text'].fillna('').str.lower()
    test_x = test['comment_text'].fillna('').str.lower()


    train_y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
    val_y = val[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
    
    tokenizer = text.Tokenizer(num_words=MAX_WORDS, lower=True)
    tokenizer.fit_on_texts(list(train_x))
    train_x = tokenizer.texts_to_sequences(train_x)
    val_x = tokenizer.texts_to_sequences(val_x)
    test_x = tokenizer.texts_to_sequences(test_x)

    train_x = sequence.pad_sequences(train_x, maxlen=MAX_LEN)
    val_x = sequence.pad_sequences(val_x, maxlen=MAX_LEN)
    test_x = sequence.pad_sequences(test_x, maxlen=MAX_LEN)
    return tokenizer, train_x, val_x, test_x, train_y, val_y



def get_embedding_matrix(tokenizer, embed_file):
    embeddings = {}
    with open(embed_file, encoding='utf8') as f:
        for line in f:
            values = line.rstrip().rsplit(' ')
            word, vec = values[0], np.asarray(values[1:], dtype='float32')
            embeddings[word] = vec


    embedding_matrix = np.zeros((MAX_WORDS, EMBED_SIZE))
    for word, i in tokenizer.word_index.items():
        if i < MAX_WORDS:
            embedding_vector = embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix


In [14]:
tokenizer, train_x, val_x, test_x, train_y, val_y = get_tokenized(train, val, test)
embedding_matrix = get_embedding_matrix(tokenizer, EMBED)

FileNotFoundError: [Errno 2] No such file or directory: '~/.vector_cache/glove.840B.300d.txt'

In [None]:
BATCH_SIZE = 128


es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
model = LSTM_CNN(embedding_matrix, dropout=0.3, lstm_hidden=128, cnn_filter=64)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=2, verbose=1,callbacks=[es],validation_data=(val_x,val_y))


In [None]:
def bert_tokenize(tokenizer, sentences):
    tokenized_sentences = []
    for sentence in sentences:
        tokenized_sentence = tokenizer.encode(sentence,add_special_tokens = True, truncation=True, max_length = MAX_LEN)
        tokenized_sentences.append(tokenized_sentence)
    return tokenized_sentences

def create_attention_masks(tokenized_and_padded_sentences):
    attention_masks = []
    for sentence in tokenized_and_padded_sentences:
        att_mask = [int(token_id > 0) for token_id in sentence]
        attention_masks.append(att_mask)
    return np.asarray(attention_masks)

def get_bert_tokenized(train, val, test):
    train_x = train['comment_text'].fillna('').str.lower()
    val_x = val['comment_text'].fillna('').str.lower()
    test_x = test['comment_text'].fillna('').str.lower()


    train_y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
    val_y = val[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    train_x = bert_tokenize(tokenizer,train_x)
    train_x = pad_sequences(train_x, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
    attn_train_x = create_attention_masks(train_x)
    
    val_x = bert_tokenize(tokenizer,val_x)
    val_x = pad_sequences(val_x, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
    attn_val_x = create_attention_masks(val_x)
    
    
    test_x = bert_tokenize(tokenizer,test_x)
    test_x = pad_sequences(test_x, maxlen=MAX_LEN,  dtype="long", value=0, truncating="post", padding="post")
    attn_text_x = create_attention_masks(test_x)
    
    return train_x, val_x, test_x, train_y, val_y, attn_train_x, attn_val_x, attn_test_x

In [None]:
train_x, val_x, test_x, train_y, val_y, attn_train_x, attn_val_x, attn_test_x = get_bert_tokenized(train, val, test)

In [None]:
BATCH_SIZE = 64


es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
model = BertModel(TFBertModel.from_pretrained('bert-base-uncased'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=2, verbose=1,callbacks=[es],validation_data=(val_x,val_y))


In [None]:
predictions = model.predict(test_x, batch_size=BATCH_SIZE, verbose=1)
submission = pd.read_csv('input/sample_submission.csv')
submission[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] = predictions
submission.to_csv('submission.csv', index=False)