In [2]:
!pip install keras==2.4.3

Collecting keras
  Downloading Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Installing collected packages: keras
Successfully installed keras-2.4.3


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer

from all_models import *

In [2]:
# Read the train and test frames from the cleaned/ directory.
train = pd.read_csv('cleaned/train.csv')
test = pd.read_csv('cleaned/test.csv')

# Hold out 10% of the training rows for validation. The seed is fixed so that
# the same rows land in each split on every run.
train, val = train_test_split(train, test_size=0.1, random_state=42)

In [None]:
def bert_tokenize(tokenizer, sentences):
    """Encode every sentence into a sequence of token ids.

    Each sentence is passed to ``tokenizer.encode`` with special tokens added
    and truncation enabled at ``MAX_LEN`` ids.

    Args:
        tokenizer: Object exposing an ``encode`` method.
        sentences: Iterable of sentences to encode.

    Returns:
        A list holding one encoded sequence per sentence, in input order.
    """
    return [
        tokenizer.encode(
            raw_sentence,
            add_special_tokens=True,
            truncation=True,
            max_length=MAX_LEN,
        )
        for raw_sentence in sentences
    ]

def create_attention_masks(tokenized_and_padded_sentences):
    """Build a 0/1 mask for every padded sequence.

    A position is marked 1 when its token id is greater than 0 and 0
    otherwise.

    Args:
        tokenized_and_padded_sentences: Iterable of sequences of token ids.

    Returns:
        ``np.ndarray`` built from the per-sequence masks, in input order.
    """
    mask_rows = [
        [int(token_id > 0) for token_id in padded_ids]
        for padded_ids in tokenized_and_padded_sentences
    ]
    return np.asarray(mask_rows)

def get_bert_tokenized(model_name, train, val, test):
    """Tokenize, pad and mask the comment text of the three data splits.

    Args:
        model_name: Name or path handed to ``BertTokenizer.from_pretrained``.
        train: DataFrame with a ``comment_text`` column and the six label
            columns (``toxic``, ``severe_toxic``, ``obscene``, ``threat``,
            ``insult``, ``identity_hate``).
        val: DataFrame with the same columns as ``train``.
        test: DataFrame with a ``comment_text`` column.

    Returns:
        The 8-tuple ``(train_x, val_x, test_x, train_y, val_y, attn_train_x,
        attn_val_x, attn_test_x)``. The ``*_x`` entries are token-id arrays
        padded/truncated to ``MAX_LEN``, the ``*_y`` entries are the label
        values of the corresponding frame, and the ``attn_*`` entries are the
        matching attention masks.
    """
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    train_y = train[label_columns].values
    val_y = val[label_columns].values

    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

    def encode_split(frame):
        """Return (padded token ids, attention mask) for one split."""
        token_ids = bert_tokenize(tokenizer, frame['comment_text'])
        # Pad with id 0 at the end; create_attention_masks treats ids > 0 as
        # real tokens, so the pad value and the mask rule must stay in sync.
        token_ids = pad_sequences(token_ids, maxlen=MAX_LEN, dtype="long",
                                  value=0, truncating="post", padding="post")
        return token_ids, create_attention_masks(token_ids)

    train_x, attn_train_x = encode_split(train)
    val_x, attn_val_x = encode_split(val)
    test_x, attn_test_x = encode_split(test)

    return train_x, val_x, test_x, train_y, val_y, attn_train_x, attn_val_x, attn_test_x

In [None]:
# Tokenize all three splits. The unpacking target is parenthesised so the
# statement can span several lines, and the tokenizer checkpoint name is passed
# as the first argument that get_bert_tokenized requires.
(train_x, val_x, test_x, train_y, val_y,
 attn_train_x, attn_val_x, attn_test_x) = get_bert_tokenized(
    'bert-base-uncased', train, val, test)

In [None]:
BATCH_SIZE = 64

# Stop as soon as validation loss fails to improve (default patience of 0).
# restore_best_weights rolls the model back to the best epoch when training is
# stopped early; otherwise it would keep the weights of the epoch that
# triggered the stop.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                   restore_best_weights=True)
model = BertModel(TFBertModel.from_pretrained('bert-base-uncased'))
# NOTE(review): 'adam' uses the optimizer's default learning rate; if BertModel
# fine-tunes the pretrained weights a much smaller rate is usually needed - confirm.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the attention masks (attn_train_x / attn_val_x) are not passed
# to fit - confirm whether BertModel expects them as an input.
model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=5,
          verbose=1, callbacks=[es], validation_data=(val_x, val_y))


In [None]:
# Score the test split with the trained model.
predictions = model.predict(test_x, batch_size=BATCH_SIZE, verbose=1)

# Overwrite the six label columns of the sample submission with the
# predictions and write the result without the index column.
label_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
submission = pd.read_csv('input/sample_submission.csv')
submission[label_columns] = predictions
submission.to_csv('submission.csv', index=False)