In [2]:
!pip install -q tqdm scikit-learn seaborn matplotlib numpy pandas transformers

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
%matplotlib inline

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import transformers

import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plan

  * Embed each paragraph using XLM-RoBERTa:
      * create a `[PARA_SIZE, SENT_SIZE, 128]` matrix of embeddings
  * Train a simple logistic regression model on top of the embeddings:
      * Input layer - `[PARA_SIZE, SENT_SIZE, 128]`
      * Flatten layer - `[PARA_SIZE * SENT_SIZE * 128]`
      * Dense layer - `[PARA_SIZE * SENT_SIZE, 1]`
      * Output layer - `[1]` <- open question: tanh or sigmoid activation? (Both are activations rather than losses; a sigmoid output pairs with a binary cross-entropy loss.)

## Create a dataset

In [4]:
# Read the Jigsaw toxic-comment training CSV from the current working directory.
data = pd.read_csv(filepath_or_buffer='./jigsaw-toxic-comment-train.csv')
# Bare trailing expression so the notebook renders the frame.
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
223544,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
223545,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
223546,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
223547,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0


In [5]:
from transformers import XLMRobertaTokenizer, TFXLMRobertaForSequenceClassification


# Checkpoint name shared by tokenizer and model so the two cannot drift apart.
PRETRAINED_NAME = 'jplu/tf-xlm-roberta-base'

tokenizer = XLMRobertaTokenizer.from_pretrained(PRETRAINED_NAME)
model = TFXLMRobertaForSequenceClassification.from_pretrained(PRETRAINED_NAME)

# Smoke test: encode one sentence and add a leading batch axis of size 1.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # batchsize 1
# Dummy label for the same single-example batch; not used by the forward pass below.
labels = tf.reshape(tf.constant(1), (-1, 1))  # batchsize 1

# Invoke the model through __call__ rather than .call() so Keras runs its
# usual pre/post-call machinery around the forward pass.
outputs = model(input_ids)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1885418496.0, style=ProgressStyle(descr…




In [6]:
# Show the first element of the smoke-test model output as a NumPy array.
outputs[0].numpy()

array([[ 0.23678419, -0.01497877]], dtype=float32)

In [7]:
# Tokenize each comment into model input ids and keep them in a new column.
data['encoded_text'] = data['comment_text'].map(tokenizer.encode)
# Trailing expression: show the frame including the new column.
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,encoded_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[0, 5443, 5868, 2320, 44084, 70, 27211, 7, 722..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[0, 391, 25, 11, 98251, 38, 1529, 14858, 90, 9..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[0, 28240, 332, 4, 87, 25, 39, 6183, 959, 3157..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"[0, 44, 5455, 87, 831, 25, 18, 3249, 2499, 277..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[0, 2583, 4, 14095, 4, 621, 759, 40814, 5, 285..."
...,...,...,...,...,...,...,...,...,...
223544,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0,"[0, 152, 1375, 56, 13450, 4, 87, 1957, 398, 83..."
223545,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0,"[0, 6, 69112, 116742, 1002, 26462, 4216, 69112..."
223546,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0,"[0, 6, 69112, 2420, 282, 98, 398, 756, 1564, 6..."
223547,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,0,1,0,1,0,"[0, 105030, 527, 29877, 58627, 15744, 62, 541,..."


In [8]:
# List the model's top-level Keras layers.
model.layers

[<transformers.modeling_tf_roberta.TFRobertaMainLayer at 0x7f0d8f312668>,
 <transformers.modeling_tf_roberta.TFRobertaClassificationHead at 0x7f0d8ee9d7f0>]

In [9]:
# Freeze the first top-level layer so training does not update its weights.
model.layers[0].trainable = False

In [10]:
# Print the per-layer parameter counts, split into trainable and non-trainable.
model.summary()

Model: "tfxlm_roberta_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  278043648 
_________________________________________________________________
classifier (TFRobertaClassif multiple                  592130    
Total params: 278,635,778
Trainable params: 592,130
Non-trainable params: 278,043,648
_________________________________________________________________


In [18]:
# The classification head returns raw, unnormalised scores (logits), so the
# loss must apply the softmax itself: from_logits=True. With the default
# (from_logits=False) the logits would be treated as probabilities.
model.compile(
    optimizer=keras.optimizers.RMSprop(learning_rate=1e-3),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # Accuracy compares argmax against the label, so it works on logits as-is.
    metrics=[keras.metrics.sparse_categorical_accuracy]
)

In [43]:
# Balance the classes: downsample non-toxic comments to the number of toxic ones.
toxic_df = data[data['toxic'] == 1]
# A fixed random_state makes the downsample reproducible across kernel restarts.
non_toxic_df = data[data['toxic'] == 0].sample(len(toxic_df), random_state=42)
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
# Row order is unchanged: sampled non-toxic rows first, then all toxic rows.
filtered_df = pd.concat([non_toxic_df, toxic_df])

# Plain Python lists consumed by the training generator.
train_x = list(filtered_df['encoded_text'])
train_y = list(filtered_df['toxic'])

In [44]:
def gen(max_len=512):
    """Yield ``([token_ids], label)`` training pairs, one example at a time.

    Reads the module-level ``train_x`` and ``train_y`` lists. Each id sequence
    is wrapped in an outer list, so every yielded input has a leading axis of
    size one.

    Sequences longer than ``max_len`` are cut to exactly ``max_len`` ids. The
    final id of the original sequence is kept in the last position instead of
    being sliced off (presumably the end-of-sequence token appended by
    ``tokenizer.encode`` -- confirm against the tokenizer in use).

    Args:
        max_len: maximum number of ids per yielded sequence (default 512).
    """
    for x, y in zip(train_x, train_y):
        ids = x[:max_len]
        if len(x) > max_len:
            # Keep the head of the sequence plus its original final token.
            ids = list(x[:max_len - 1]) + [x[-1]]
        yield [ids], y

# Wrap the Python generator as a tf.data pipeline: each element is an int64
# id tensor of shape [1, variable length] paired with a scalar int64 label.
train_dataset = tf.data.Dataset.from_generator(
    gen,
    output_types=(tf.int64, tf.int64),
    output_shapes=(tf.TensorShape([1, None]), tf.TensorShape([])),
)

In [47]:
# Train for three passes over the dataset, shuffling with a 100000-element buffer.
model.fit(x=train_dataset.shuffle(buffer_size=100000), epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f0d8f5c1a90>