<a href="https://colab.research.google.com/github/vyom10445/toxicity-classifier/blob/main/notebooks/Toxicity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [None]:
# Fetch the project repository into the kernel's current working directory.
# NOTE(review): if the directory already exists (cell re-run), git is expected to refuse to clone — confirm that is acceptable.
!git clone https://github.com/vyom10445/toxicity-classifier.git

In [None]:
import os

# Move into the cloned repository so the relative "data/..." path used when
# loading the CSV resolves.
# Guarded on the current directory name: without the guard, re-running this
# cell would look for a nested "toxicity-classifier/toxicity-classifier"
# directory and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != "toxicity-classifier":
    os.chdir("toxicity-classifier")

In [None]:
# Read the training CSV from the repository's data directory into a DataFrame.
train_csv_path = os.path.join(
    "data", "jigsaw-toxic-comment-classification-challenge", "train.csv"
)
df = pd.read_csv(train_csv_path)

In [None]:
from tensorflow.keras.layers import TextVectorization #preprocessing

In [None]:
# Inputs: the raw comment strings.
# Targets: the values of every column from the third one onward, as an array.
label_columns = df.columns[2:]
x = df["comment_text"]
y = df[label_columns].values

In [None]:
# Upper bound on vocabulary size (number of distinct words kept).
MAX_FEATURES = 200_000

In [None]:
# Text -> integer-id vectorizer: vocabulary capped at MAX_FEATURES tokens,
# every output sequence fixed at length 1800.
vectorizer = TextVectorization(
    output_mode='int',
    output_sequence_length=1800,
    max_tokens=MAX_FEATURES,
)

In [None]:
# Fit the vectorizer on the raw comment strings (passed as a NumPy array)
# so it can build its vocabulary before being used to encode text.
vectorizer.adapt(x.values)

In [None]:
# Encode every comment in one call.
# NOTE(review): this materializes all comments as length-1800 sequences at once —
# confirm this fits in memory on the target runtime.
vectorized_text = vectorizer(x.values)

In [None]:
# Build the input pipeline from the vectorized comments and their label rows.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle ONCE and keep that order for every later iteration.
# With the default reshuffle_each_iteration=True, each pass over the dataset
# sees a different order, so the take()/skip() train/val/test splits built
# from this dataset would not be disjoint (test examples leak into training).
# If training for several epochs, re-shuffle the *train* split after splitting.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  # helps prevent bottlenecks

In [None]:
# Split by batch count: first ~70% for training, next ~20% for validation,
# and ~10% starting at the 90% mark for testing.
# NOTE(review): the splits are only disjoint if `dataset` yields its elements
# in the same order on every iteration — confirm the shuffle configuration.
n_batches = len(dataset)
train = dataset.take(int(n_batches*.7))
val = dataset.skip(int(n_batches*.7)).take(int(n_batches*.2))
test = dataset.skip(int(n_batches*.9)).take(int(n_batches*.1))

In [None]:
#build a sequential model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Bidirectional,Dropout,Dense,Embedding

In [None]:
# Assemble the network as one Sequential stack (layers listed in forward order).
model = Sequential([
    # Embedding table: MAX_FEATURES + 1 rows, 32 dimensions per token id
    Embedding(MAX_FEATURES + 1, 32),
    # Bidirectional LSTM with 32 units per direction
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature-extractor layers
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output layer: 6 independent sigmoid units, one per predicted label
    Dense(6, activation='sigmoid'),
])

In [None]:
# Configure training: binary cross-entropy loss with the Adam optimizer
# (both selected by name, so their default settings apply).
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [None]:
# Train for a single epoch on the train split, validating on the val split;
# the returned object is kept in `history`.
history = model.fit(train , epochs=1 , validation_data=val)

In [None]:
# Make predictions: encode one hand-written sample comment with the fitted vectorizer.
input_text = vectorizer('You freaking suck! I am going to hit you.')

In [None]:
# Add a leading axis so the single encoded comment is passed as a batch of one.
res = model.predict(np.expand_dims(input_text,axis=0))

In [None]:
# Threshold the predicted scores at 0.5 and show them as 0/1 integers.
(res > 0.5).astype(int)

In [None]:
# Pull the first batch (inputs, labels) from the test split as NumPy arrays.
batch_X, batch_y = next(test.as_numpy_iterator())

In [None]:
# Predict on that test batch and threshold the scores at 0.5 into 0/1 integers.
(model.predict(batch_X) > 0.5).astype(int)

In [None]:
#evaluate model
from tensorflow.keras.metrics import Precision,Recall,CategoricalAccuracy

In [None]:
# Streaming metrics, updated batch by batch during evaluation.
pre = Precision()
re = Recall()
# BinaryAccuracy thresholds each prediction at 0.5 and compares it with its
# 0/1 label element-wise, which is what flattened multi-label outputs need.
# CategoricalAccuracy compares argmax positions instead, so on a flattened
# vector of independent labels it does not measure per-label correctness.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Accumulate precision / recall / accuracy over every batch of the test split.
for batch in test.as_numpy_iterator():
    # Unpack the batch into inputs and ground-truth labels
    X_true, y_true = batch
    # Predict silently: with the default verbosity every call prints its own
    # progress bar, which floods the cell output across the whole test split.
    yhat = model.predict(X_true, verbose=0)

    # Flatten labels and predictions to 1-D so each (sample, label) pair is
    # scored as one independent binary decision.
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [None]:
# Report the accumulated metric values as plain numbers.
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

In [None]:
# Test and Gradio: install the packages needed for the interactive demo
# (gradio is imported in a later cell).
# NOTE(review): versions are not pinned, so a re-run may pick up different releases.
!pip install gradio jinja2

In [None]:
import gradio as gr

In [None]:
# Persist the trained model to 'toxicity.h5' (relative to the current working directory).
model.save('toxicity.h5')

In [None]:
# Encode a second hand-written sample comment with the fitted vectorizer.
input_str = vectorizer('hey i freaken hate you!')

In [None]:
# Add a leading axis (axis 0) so the single encoded comment is passed as a batch of one.
res = model.predict(np.expand_dims(input_str,0))

In [None]:
# Display the raw prediction scores for the sample comment.
res

In [None]:
def score_comment(comment):
    """Score one comment and return a text report with a line per label.

    Uses the notebook-level ``vectorizer``, ``model`` and ``df``: the comment
    is vectorized as a batch of one, passed to ``model.predict``, and each
    score in the first result row is compared against 0.5.

    Args:
        comment: The comment text to score.

    Returns:
        A string with one ``"<column>: <True|False>"`` line (newline-terminated)
        for each column of ``df`` from the third onward.
    """
    encoded_batch = vectorizer([comment])
    results = model.predict(encoded_batch)
    label_names = df.columns[2:]

    return ''.join(
        '{}: {}\n'.format(label, results[0][position]>0.5)
        for position, label in enumerate(label_names)
    )

In [None]:
# Wire score_comment into a text-in / text-out Gradio interface.
comment_box = gr.Textbox(lines=2, placeholder='Comment to score')
prediction_box = gr.Textbox(lines=10, label="Prediction")
interface = gr.Interface(
    fn=score_comment,
    inputs=comment_box,
    outputs=prediction_box,
)

In [None]:
# Start the demo UI with sharing enabled.
# NOTE(review): share=True presumably exposes the app through a public link — confirm this is intended.
interface.launch(share=True)