### Imports

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy
import gradio as g

In [2]:
data = pd.read_csv('./train.csv') 

In [3]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


### Preprocess Data using TextVectorization and creating the dataset

In [4]:
x = data['comment_text']
y = data[data.columns[2:]].values

In [5]:
x

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [6]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [7]:
x.shape, y.shape

((159571,), (159571, 6))

In [8]:
MAX_FEATURES = 200000

In [9]:
vectorizer = TextVectorization(max_tokens = MAX_FEATURES,
                              output_sequence_length = 1800,
                              output_mode = 'int')

In [10]:
vectorizer

<keras.layers.preprocessing.text_vectorization.TextVectorization at 0x7fd8e0aa5150>

In [11]:
vectorizer.adapt(x.values)

In [12]:
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'to', 'of']

In [13]:
vectorizer('I love Boston')[:3]

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([   8,  457, 4174])>

In [14]:
vectorized_text = vectorizer(x.values)

In [15]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [16]:
# Data Building Pipeline: Map, Cache, Shuffle, Batch, Prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [17]:
dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(None, 1800), dtype=tf.int64, name=None), TensorSpec(shape=(None, 6), dtype=tf.int64, name=None))>

In [18]:
batch_x,batch_y = dataset.as_numpy_iterator().next()

In [19]:
batch_x.shape,batch_y.shape

((16, 1800), (16, 6))

In [20]:
train = dataset.take(int(len(dataset) * 0.7))
val = dataset.skip(int(len(dataset) * 0.7)).take(int(len(dataset) * 0.2))
test = dataset.skip(int(len(dataset) * 0.9)).take(int(len(dataset) * 0.1))

### Creating and Training a Deep Neural Network

In [21]:
model = Sequential()
model.add(Embedding(MAX_FEATURES + 1,32))
model.add(Bidirectional(LSTM(32,activation = 'tanh')))
model.add(Dense(128,activation = 'relu'))
model.add(Dense(256,activation = 'relu'))
model.add(Dense(128,activation = 'relu'))
model.add(Dense(6,activation = 'sigmoid'))

In [22]:
model.compile(loss = 'BinaryCrossentropy',optimizer = 'Adam')

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [24]:
history = model.fit(train,epochs = 2,validation_data = val)



### Making Prediction from the trained model

In [25]:
input_text = vectorizer("You suck, and you are not good at anything!!!")
result = model.predict(np.expand_dims(input_text,0))



In [26]:
result

array([[0.9735049 , 0.17865597, 0.86612546, 0.0242242 , 0.6861623 ,
        0.10093824]], dtype=float32)

In [27]:
data.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

### Model Evaluation

In [30]:
precision = Precision()
recall = Recall()
acc = CategoricalAccuracy()

In [31]:
for batch in test.as_numpy_iterator(): 
    X_true, y_true = batch
    yhat = model.predict(X_true)
    
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    precision.update_state(y_true, yhat)
    recall.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



In [32]:
print("Precision -> ",precision.result().numpy())
print("Recall -> ",recall.result().numpy())
print("Accuracy -> ",acc.result().numpy())

Precision ->  0.8262161
Recall ->  0.6313848
Accuracy ->  0.47141424


### Save the model for future use

In [35]:
model.save('comment_toxic.h5')

### Making a Gradio Application so that I can be used to make real predictions

In [37]:
model = tf.keras.models.load_model('comment_toxic.h5')

In [38]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(data.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [39]:
interface = g.Interface(fn = score_comment, 
                         inputs = g.inputs.Textbox(lines = 2, placeholder = 'Comment to score'),
                        outputs = 'text')

  "Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your component from gradio.components",


In [40]:
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Running on public URL: https://12774.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces: https://huggingface.co/spaces


(<gradio.routes.App at 0x7fd7e9dd9ed0>,
 'http://127.0.0.1:7860/',
 'https://12774.gradio.app')