# Comment Toxicity DL Model
---

## 1. Setup

In [None]:
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

In [1]:
import os
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
import numpy as np

In [2]:
np.expand_dims??

[1;31mSignature:[0m [0mnp[0m[1;33m.[0m[0mexpand_dims[0m[1;33m([0m[0ma[0m[1;33m,[0m [0maxis[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mSource:[0m   
[1;33m@[0m[0marray_function_dispatch[0m[1;33m([0m[0m_expand_dims_dispatcher[0m[1;33m)[0m[1;33m
[0m[1;32mdef[0m [0mexpand_dims[0m[1;33m([0m[0ma[0m[1;33m,[0m [0maxis[0m[1;33m)[0m[1;33m:[0m[1;33m
[0m    [1;34m"""
    Expand the shape of an array.

    Insert a new axis that will appear at the `axis` position in the expanded
    array shape.

    Parameters
    ----------
    a : array_like
        Input array.
    axis : int or tuple of ints
        Position in the expanded axes where the new axis (or axes) is placed.

        .. deprecated:: 1.13.0
            Passing an axis where ``axis > a.ndim`` will be treated as
            ``axis == a.ndim``, and passing ``axis < -a.ndim - 1`` will
            be treated as ``axis == 0``. This behavior is deprecated.

        .. versionchanged:: 1.18

In [3]:
df = pd.read_csv(os.path.join('jigsaw-toxic-comment-classification-challenge', 'train.csv', 'train.csv' ))

In [4]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
df.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


## 2. Preprocess

In [6]:
from tensorflow.keras.layers import TextVectorization

In [7]:
TextVectorization??

[1;31mInit signature:[0m
[0mTextVectorization[0m[1;33m([0m[1;33m
[0m    [0mmax_tokens[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mstandardize[0m[1;33m=[0m[1;34m'lower_and_strip_punctuation'[0m[1;33m,[0m[1;33m
[0m    [0msplit[0m[1;33m=[0m[1;34m'whitespace'[0m[1;33m,[0m[1;33m
[0m    [0mngrams[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0moutput_mode[0m[1;33m=[0m[1;34m'int'[0m[1;33m,[0m[1;33m
[0m    [0moutput_sequence_length[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mpad_to_max_tokens[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mvocabulary[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0midf_weights[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msparse[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mragged[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [1;33m**[0m[0mkwargs[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m

In [8]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [9]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [10]:
# Numbers of words in the vocab
MAX_FEATURES = 200000

In [11]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                              output_sequence_length=1800,
                              output_mode='int')

In [12]:
vectorizer.adapt(X.values)

In [13]:
vectorized_text = vectorizer(X.values)

In [14]:
# MCSHBAP - map, cache, shuffule, batch, prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [15]:
batch_X, batch_y = dataset.as_numpy_iterator().next()

In [16]:
int(len(dataset)*.7)

6981

In [17]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [18]:
train_generator = train.as_numpy_iterator()

In [19]:
train_generator.next()

(array([[   62,   211,    35, ...,     0,     0,     0],
        [   46,   185,    18, ...,     0,     0,     0],
        [   75,    46,   326, ...,     0,     0,     0],
        ...,
        [48556,    43,     6, ...,     0,     0,     0],
        [   10,    25,    11, ...,     0,     0,     0],
        [  186,   103,    53, ...,     0,     0,     0]], dtype=int64),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], dtype=int64))

## 3. Model

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [21]:
model = Sequential()

model.add(Embedding(MAX_FEATURES+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

In [22]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [23]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [None]:
history = model.fit(train, epochs=1, validation_data=val)

## 4. Predict

In [24]:
input_text = vectorizer("You suck!")

In [25]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [26]:
(model.predict(batch_X) > 0.5).astype(int)



array([[1, 1, 0, 1, 1, 1],
       [1, 1, 0, 0, 1, 1],
       [1, 1, 0, 1, 1, 1],
       [1, 1, 0, 0, 1, 1],
       [1, 1, 0, 1, 1, 1],
       [0, 1, 0, 0, 1, 1],
       [1, 1, 0, 0, 1, 1],
       [1, 1, 0, 1, 1, 1],
       [1, 1, 0, 1, 1, 1],
       [1, 1, 0, 1, 1, 1],
       [1, 1, 0, 0, 0, 1],
       [1, 1, 0, 0, 1, 1],
       [1, 1, 0, 0, 1, 1],
       [1, 1, 0, 1, 0, 1],
       [1, 1, 0, 1, 1, 1],
       [1, 1, 0, 0, 1, 1]])

## 5. Evaulate

In [28]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [29]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [30]:
for batch in test.as_numpy_iterator(): 
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



KeyboardInterrupt: 

In [31]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.037256062030792236, Recall:0.692307710647583, Accuracy:0.0


## 6. Test

In [None]:
!pip install gradio jinja2

In [33]:
import gradio as gr

In [None]:
model.save('toxicity.h5')

In [34]:
model = tf.keras.models.load_model('toxicity.h5')

In [35]:
input_str = vectorizer('i hate you!')

In [36]:
res = model.predict(np.expand_dims(input_str,0))



In [37]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [38]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')



In [39]:
interface.launch(share=True)

Running on local URL:  http://127.0.0.1:7862/
Running on public URL: https://39241.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<gradio.routes.App at 0x20556819bb0>,
 'http://127.0.0.1:7862/',
 'https://39241.gradio.app')

