In [None]:
import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the dataset paths used by later cells can be checked by eye.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [1]:
#This notebook is run on Kaggle
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras.layers import Dense, Input, LSTM,GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate
from keras.optimizers import Adam, SGD
from keras.models import Model
import transformers
from transformers import TFAutoModel, AutoTokenizer,AutoModel
from tqdm.notebook import tqdm
from keras.callbacks import Callback
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
import re



In [2]:
MODEL = 'jplu/tf-xlm-roberta-large'
# Hugging Face model identifier; it is passed to both AutoTokenizer.from_pretrained
# and TFAutoModel.from_pretrained later in this notebook.

In [3]:
#tokenize
def encode(texts, tokenizer, maxlen=512):
    """Tokenize ``texts`` in one batch and return the model inputs as int32 arrays.

    Args:
        texts: Collection of strings handed to ``tokenizer.batch_encode_plus``.
        tokenizer: Any object exposing ``batch_encode_plus`` whose result can be
            indexed by ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``.
        maxlen: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        A 3-tuple ``(input_ids, attention_mask, token_type_ids)`` of ``np.int32`` arrays.
    """
    encoded = tokenizer.batch_encode_plus(
        texts,
        return_attention_masks=True,
        return_token_type_ids=True,
        pad_to_max_length=True,
        max_length=maxlen,
    )

    # Same order the model's three inputs are declared in build_model.
    wanted_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    return tuple(np.array(encoded[key], dtype=np.int32) for key in wanted_keys)

In [4]:
def build_model(transformer, max_len):
    """Build and compile a binary classifier on top of ``transformer``.

    The model takes three integer inputs of length ``max_len`` (``input_ids``,
    ``input_mask``, ``segment_ids``), feeds them to ``transformer`` as a tuple and
    uses the first element of its output. That output is reduced with global max
    pooling and global average pooling, the two results are concatenated, and a
    single sigmoid unit produces the prediction.

    Args:
        transformer: Callable Keras-compatible layer/model accepting the tuple
            ``(input_ids, input_mask, segment_ids)``; element ``[0]`` of its result
            is pooled (presumably the per-token hidden states - confirm for
            other checkpoints).
        max_len: Length of every input sequence.

    Returns:
        A compiled ``Model`` (Adam, binary cross-entropy, accuracy + AUC metrics).
    """
    input_ids = Input(shape=(max_len,), dtype=np.int32, name='input_ids')
    input_mask = Input(shape=(max_len,), dtype=np.int32, name='input_mask')
    segment_ids = Input(shape=(max_len,), dtype=np.int32, name='segment_ids')
    transformer_output = transformer((input_ids, input_mask, segment_ids))[0]

    # max + mean pooling
    # NOTE(review): both poolings run over every position, padding included;
    # the attention mask is not applied here.
    gp = GlobalMaxPooling1D()(transformer_output)
    ap = GlobalAveragePooling1D()(transformer_output)
    stack = concatenate([gp, ap], axis=1)

    output = Dense(1, activation='sigmoid')(stack)

    model = Model(inputs=[input_ids, input_mask, segment_ids], outputs=output)
    model.compile(
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias
        # that newer Keras releases reject. The value (2e-6) is unchanged.
        Adam(learning_rate=0.2e-5),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC()],
    )

    return model

In [5]:
# Pick a tf.distribute strategy: a TPU strategy when a TPU can be resolved,
# otherwise whatever strategy TensorFlow currently reports as the default.
try:
    # TPUClusterResolver() needs no arguments when the TPU_NAME environment
    # variable is set, which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # Resolution failed: continue without a TPU.
    tpu = None

if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [6]:
# Load the tokenizer published under the same identifier as the model (MODEL).
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=513.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [7]:
def get_head_tail(df, text_column, head_len, tail_len):
    """Shorten long texts to their first ``head_len`` and last ``tail_len`` words.

    Keeping only the head and the tail is a simple and effective way to handle
    long documents in text classification, as reported in "How to Fine-Tune BERT
    for Text Classification?" (Sun et al., 2019).

    Words are whitespace-separated tokens (``str.split()``). A text is only
    rewritten when it has more than ``head_len + tail_len`` words; shorter texts
    are left exactly as they were.

    Args:
        df: Source DataFrame; it is copied, never modified.
        text_column: Name of the column holding the text (string values).
        head_len: Number of leading words to keep (>= 0).
        tail_len: Number of trailing words to keep (>= 0).

    Returns:
        A copy of ``df`` in which ``text_column`` holds the (possibly shortened)
        text, plus four helper columns: ``text_head``, ``text_tail``,
        ``text_len_1`` (word count before) and ``text_len_2`` (word count after).

    Raises:
        ValueError: If ``head_len`` or ``tail_len`` is negative.
    """
    if head_len < 0 or tail_len < 0:
        raise ValueError('head_len and tail_len must be non-negative')

    input_df = df.copy()
    max_len = head_len + tail_len

    # Split every text once and reuse the word lists below.
    words = input_df[text_column].apply(lambda x: x.split())

    def _tail(tokens):
        # tokens[-0:] is the whole list, so a zero-length tail needs its own case.
        return tokens[-tail_len:] if tail_len > 0 else []

    input_df['text_head'] = words.apply(lambda tokens: ' '.join(tokens[:head_len]))
    input_df['text_tail'] = words.apply(lambda tokens: ' '.join(_tail(tokens)))
    input_df['text_len_1'] = words.apply(len)

    # Join head and tail as word lists so an empty head or tail leaves no stray space.
    head_plus_tail = words.apply(lambda tokens: ' '.join(tokens[:head_len] + _tail(tokens)))
    input_df[text_column] = np.where(input_df['text_len_1'] > max_len,
                                     head_plus_tail,
                                     input_df[text_column])
    input_df['text_len_2'] = input_df[text_column].apply(lambda x: len(x.split()))

    return input_df

In [8]:
def make_train_set(random_state=None):
    """Assemble a class-balanced, multilingual training frame.

    For every language code in the notebook-level ``langs`` list, reads the
    matching translated training CSV, keeps every row with ``toxic == 1`` plus a
    random sample of the same number of ``toxic == 0`` rows, and tags the rows
    with a ``lang`` column.

    Args:
        random_state: Optional seed (or RandomState) forwarded to
            ``DataFrame.sample``. The default ``None`` keeps the sampling
            unseeded, as before; pass an int for a reproducible training set.

    Returns:
        A DataFrame with the columns ``comment_text``, ``toxic`` and ``lang``.
        Row indices are those of the source CSVs (not reset). When ``langs`` is
        empty, an empty frame with ``comment_text`` and ``toxic`` is returned.
    """
    cols = ['comment_text', 'toxic']
    data_dir = '../input/jigsaw-train-multilingual-coments-google-api'

    # Collect the per-language pieces and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the growing frame on every iteration.
    pieces = []
    for lang in langs:
        train_lang = pd.read_csv(f'{data_dir}/jigsaw-toxic-comment-train-google-{lang}-cleaned.csv')[cols]
        train_lang = train_lang.assign(lang=lang)

        toxic_rows = train_lang.query('toxic==1')
        clean_rows = train_lang.query('toxic==0')
        # Never ask for more non-toxic rows than exist (sample() would raise).
        n_clean = min(len(toxic_rows), len(clean_rows))

        pieces.append(toxic_rows)
        pieces.append(clean_rows.sample(n_clean, random_state=random_state))

    if not pieces:
        return pd.DataFrame(columns=cols)
    return pd.concat(pieces)

In [9]:
# Competition validation and test sets.
valid = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
test = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/test.csv')
# Languages present in the test set, sorted so that the order (and therefore the
# row order of `train`) is the same on every run; iterating a set of strings is not.
langs = sorted(test['lang'].dropna().unique())
# make_train_set() reads `langs` from the notebook namespace.
train = make_train_set()

In [20]:
%%time
# Shorten every text to at most its first 126 and last 126 whitespace-separated words.
# The test set keeps its text in 'content'; train and validation use 'comment_text'.
train_head_tail = get_head_tail(train, text_column= 'comment_text',head_len=126, tail_len=126)
valid_head_tail = get_head_tail(valid, text_column= 'comment_text', head_len=126, tail_len=126)
test_head_tail = get_head_tail(test, text_column= 'content', head_len=126, tail_len=126)

CPU times: user 10.4 s, sys: 159 ms, total: 10.6 s
Wall time: 10.6 s


In [21]:
# Sequence length: passed to encode() as maxlen and to build_model() as max_len.
MAX_LEN = 256

In [22]:
%%time
# Each x_* is the (input_ids, attention_mask, token_type_ids) tuple returned by encode().
x_train = encode(train_head_tail['comment_text'].values, tokenizer, maxlen=MAX_LEN)
x_valid = encode(valid_head_tail['comment_text'].values, tokenizer, maxlen=MAX_LEN)
x_test = encode(test_head_tail['content'].values, tokenizer, maxlen=MAX_LEN)

# Labels come from the original frames; get_head_tail works on a copy and keeps the row order.
y_train = train['toxic'].values
y_valid = valid['toxic'].values

CPU times: user 4min 15s, sys: 1.44 s, total: 4min 17s
Wall time: 4min 16s


In [16]:
# Passed to Dataset.prefetch() below so tf.data chooses the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 2
# Global batch size: 16 examples for each replica reported by the strategy.
BATCH_SIZE = 16* strategy.num_replicas_in_sync
# NOTE(review): MAX_LEN is also assigned in an earlier cell; keep both values identical.
MAX_LEN = 256

In [23]:
# Validation pipeline, built step by step: slices -> batch -> cache -> prefetch.
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid_dataset = valid_dataset.batch(BATCH_SIZE)
valid_dataset = valid_dataset.cache()
valid_dataset = valid_dataset.prefetch(AUTO)

In [24]:
# Training pipeline, built step by step: slices -> shuffle -> batch -> repeat -> prefetch.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
# Shuffle buffer spans the whole training set.
train_dataset = train_dataset.shuffle(len(y_train))
train_dataset = train_dataset.batch(BATCH_SIZE)
# Repeats without end; the fit call bounds each epoch through steps_per_epoch.
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.prefetch(AUTO)

In [19]:
%%time
# Load the pretrained transformer and build/compile the classifier inside the
# distribution strategy's scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3271420488.0, style=ProgressStyle(descr…


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 256)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 256, 1024),  559890432   input_ids[0][0]                  
                                                                 input_mask[0][0]            

In [25]:
#fit the model using the training set
%%time
n_steps = train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
    shuffle=False,
)

Epoch 1/2


  num_elements)


Epoch 2/2
CPU times: user 3min 37s, sys: 11.3 s, total: 3min 49s
Wall time: 37min 29s


In [26]:
#fit the model using validation set
%%time
n_steps = valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=4,
    shuffle=False,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 8.92 s, sys: 613 ms, total: 9.53 s
Wall time: 2min 51s


In [28]:
# Start from the competition's sample submission and overwrite its 'toxic' column below.
submission = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')

In [35]:
# Score the encoded test set; the model's Dense(1) sigmoid head yields one value per row.
submission['toxic'] = model.predict(x_test, verbose=1)



In [36]:
# Quick look at the first predictions.
submission.head()

Unnamed: 0,id,toxic
0,0,0.012759
1,1,0.196475
2,2,0.629261
3,3,0.015343
4,4,0.026323


In [31]:
# Save the trained weights to checkpoint.h5, replacing the file if it already exists.
model.save_weights('checkpoint.h5', overwrite=True)