---Tested on AWS instance DeepLearning AMI---
---Jupyter notebook file---

In [4]:
!pip install keras==2.4.3
!pip install transformers==3.5.1

Collecting transformers
  Downloading transformers-3.5.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 16.4 MB/s eta 0:00:01
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
[K     |████████████████████████████████| 883 kB 57.7 MB/s eta 0:00:01
Collecting tokenizers==0.9.3
  Downloading tokenizers-0.9.3-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 58.1 MB/s eta 0:00:01
Collecting regex!=2019.12.17
  Downloading regex-2020.11.13-cp37-cp37m-manylinux2014_x86_64.whl (719 kB)
[K     |████████████████████████████████| 719 kB 59.0 MB/s eta 0:00:01
[?25hCollecting sentencepiece==0.1.91
  Downloading sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 60.0 MB/s eta 0:00:01
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sa

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer,TFBertModel

from models.all_models import BertModel as bm
from models.constants import MAX_LEN, BERT_MODEL, TRAIN, TEST

In [3]:
# Fixed seed so the train/validation split (and therefore the reported
# validation loss) is the same on every Restart & Run All.
SEED = 42

# TRAIN / TEST are CSV paths imported from models.constants.
train = pd.read_csv(TRAIN)
test = pd.read_csv(TEST)

# Hold out 10% of the labelled rows for validation / early stopping.
train, val = train_test_split(train, test_size=0.1, random_state=SEED)



In [8]:
def bert_tokenize(tokenizer, sentences):
    """Encode every sentence into a list of token ids.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=...,
            truncation=..., max_length=...)`` (a ``BertTokenizer`` in this
            notebook).
        sentences: iterable of strings to encode.

    Returns:
        A list with one entry per sentence, each entry being whatever
        ``tokenizer.encode`` returned for it. Special tokens are requested
        and truncation is capped at ``MAX_LEN``; no padding is applied here.
    """
    return [
        tokenizer.encode(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=MAX_LEN,
        )
        for text in sentences
    ]

def create_attention_masks(tokenized_and_padded_sentences):
    """Build a 0/1 mask for each row of token ids.

    A position is marked 1 when its token id is greater than zero and 0
    otherwise (the callers in this notebook pad with id 0, so padded
    positions end up masked out).

    Args:
        tokenized_and_padded_sentences: iterable of rows, each row an
            iterable of numeric token ids.

    Returns:
        ``np.ndarray`` built from the per-row lists of 0/1 integers.
    """
    return np.asarray([
        [int(token_id > 0) for token_id in row]
        for row in tokenized_and_padded_sentences
    ])

def get_bert_tokenized(model_name, train, val, test):
    """Turn the three data frames into padded token ids, masks and labels.

    Args:
        model_name: name/path handed to ``BertTokenizer.from_pretrained``.
        train: frame with a ``comment_text`` column and the six label columns.
        val: frame with the same columns as ``train``.
        test: frame with a ``comment_text`` column (labels are not read).

    Returns:
        Tuple ``(train_x, val_x, test_x, train_y, val_y, attn_train_x,
        attn_val_x, attn_test_x)`` where the ``*_x`` entries are the token
        ids padded/truncated to ``MAX_LEN`` by ``pad_sequences``, the
        ``attn_*`` entries are the matching masks from
        ``create_attention_masks`` and the ``*_y`` entries are the label
        values of the corresponding frame.
    """
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # Lower-case all three text columns before anything expensive happens.
    lowered_texts = [frame['comment_text'].str.lower() for frame in (train, val, test)]

    train_y = train[label_columns].values
    val_y = val[label_columns].values

    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

    def encode_split(texts):
        """Tokenise one split, pad it with id 0 and derive its mask."""
        token_ids = pad_sequences(
            bert_tokenize(tokenizer, texts),
            maxlen=MAX_LEN,
            dtype="long",
            value=0,
            truncating="post",
            padding="post",
        )
        return token_ids, create_attention_masks(token_ids)

    # Processed in the order train, val, test.
    (train_x, attn_train_x), (val_x, attn_val_x), (test_x, attn_test_x) = [
        encode_split(texts) for texts in lowered_texts
    ]

    return train_x, val_x, test_x, train_y, val_y, attn_train_x, attn_val_x, attn_test_x

In [9]:
# Tokenise, pad and mask the train / validation / test splits in one go,
# using the tokenizer named by BERT_MODEL.
(
    train_x, val_x, test_x,
    train_y, val_y,
    attn_train_x, attn_val_x, attn_test_x,
) = get_bert_tokenized(BERT_MODEL, train, val, test)

In [17]:
BATCH_SIZE = 64

# Patience is left at its default, so training stops at the first epoch whose
# val_loss fails to improve; restore_best_weights rolls the model back to the
# best epoch instead of keeping the weights from the worse, final one.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                   restore_best_weights=True)

# Load the encoder from the same checkpoint name the tokenizer was built from
# (BERT_MODEL) so the vocabulary and the embedding matrix cannot drift apart.
model = bm(TFBertModel.from_pretrained(BERT_MODEL))

# Freeze the pretrained encoder through the public Keras `trainable` flag, and
# do it BEFORE compile(): Keras only honours trainable changes made prior to
# compiling. (Writing the private `_trainable` attribute on each variable
# after compile is not a supported way to freeze weights.)
# Assumes `model.bert` is the wrapped Keras BERT layer/model - TODO confirm
# against models.all_models.BertModel.
model.bert.trainable = False

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the attention masks computed earlier (attn_train_x / attn_val_x)
# are not passed to fit(); unless the model derives its own mask, padded
# positions are presumably attended to - confirm the model's expected inputs.
model.fit(train_x, train_y, batch_size=BATCH_SIZE, epochs=5,
          verbose=1, callbacks=[es], validation_data=(val_x, val_y))


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/5








Epoch 2/5
Epoch 00002: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f86c7e3b0d0>

In [None]:
# Columns of the submission file that receive the model outputs, in the same
# order as the label columns used for training.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Score the test split, then drop the scores into the sample submission
# template and write it out without the index column.
predictions = model.predict(test_x, batch_size=BATCH_SIZE, verbose=1)
submission = pd.read_csv('input/sample_submission.csv')
submission[label_columns] = predictions
submission.to_csv('submission.csv', index=False)

