In [1]:
import json
import os

# Kaggle API credentials. Read them from the environment so secrets are never
# saved inside the notebook; the empty-string fallback matches the previous
# placeholder values.
user = os.environ.get('KAGGLE_USERNAME', '')
key = os.environ.get('KAGGLE_KEY', '')

# Write the credentials file (~/.kaggle/kaggle.json) used by the `kaggle`
# command-line tool invoked in the following cell.
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)  # no-op when the directory already exists
kaggle_json = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_json, 'w') as f:
    # json.dump escapes quotes/backslashes that raw '%s' formatting would
    # have written verbatim, which would produce invalid JSON.
    json.dump({"username": user, "key": key}, f, separators=(',', ':'))
# Restrict the file to owner read/write; it is never made world-writable.
os.chmod(kaggle_json, 0o600)

In [None]:
# Download the competition archive with the kaggle command-line tool.
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

In [None]:
# Extract the downloaded competition archive.
!unzip /content/jigsaw-toxic-comment-classification-challenge.zip

In [None]:
# Extract the nested train and test CSV archives.
!unzip /content/train.csv.zip
!unzip /content/test.csv.zip

In [5]:
import pandas as pd
import tensorflow as tf
import numpy as np

In [6]:
# Load the training CSV into a DataFrame.
df = pd.read_csv('/content/train.csv')

In [None]:
# Preview the first rows of the training data.
df.head()

In [7]:
from tensorflow.keras.layers import TextVectorization

In [8]:
# Inputs are the raw comment strings; targets are every column from the
# third one onward, taken as a NumPy array.
X = df['comment_text']
y = df.iloc[:, 2:].values

In [9]:
MAX_FEATURES = 200_000  # number of words in the vocab

In [10]:
# Text-to-integer tokenizer: vocabulary capped at MAX_FEATURES tokens, and
# every output sequence has length 1800.
vectorizer = TextVectorization(
    output_mode='int',
    max_tokens=MAX_FEATURES,
    output_sequence_length=1800,
)

In [11]:
# Fit the vectorizer on the training comments.
vectorizer.adapt(X.values)

In [12]:
# Run the fitted vectorizer over all training comments at once.
vectorized_text = vectorizer(X.values)

In [13]:
# MCSHBAP - map, cache, shuffle, batch, prefetch.  Sources: from_tensor_slices, list_files
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle once and keep that order on every pass. With the default
# reshuffle_each_iteration=True the order changes each epoch, so the
# take()/skip() split below would pick different examples every epoch and
# validation examples would leak into training.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  # helps bottlenecks

In [14]:
# Split by batch count: the first 90% of batches are used for training, and
# the validation set is the following 10% of the total.
n_batches = len(dataset)
n_train_batches = int(n_batches * .9)
n_val_batches = int(n_batches * .1)
train = dataset.take(n_train_batches)
val = dataset.skip(n_train_batches).take(n_val_batches)

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [16]:
# Stack: embedding -> bidirectional LSTM -> three dense layers -> 6 sigmoid outputs.
model = Sequential([
    # Embedding table with MAX_FEATURES + 1 rows, each a 32-dimensional vector.
    Embedding(MAX_FEATURES + 1, 32),
    # Bidirectional LSTM layer
    Bidirectional(LSTM(64, activation='tanh')),
    # Fully connected feature-extractor layers
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output layer: six independent sigmoid units
    Dense(6, activation='sigmoid'),
])

In [17]:
# Binary cross-entropy loss with the Adam optimizer.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [None]:
# Print the model's layer summary.
model.summary()

In [None]:
# Train for 20 epochs; validation runs on `val`, limited to 30 steps.
history = model.fit(
    train,
    validation_data=val,
    validation_steps=30,
    epochs=20,
)

In [None]:
from matplotlib import pyplot as plt
# Plot the values recorded during training. DataFrame.plot() opens its own
# figure, so the size is passed via `figsize`; a separate plt.figure() call
# would only create an additional empty figure.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.show()

In [47]:
# Save the trained model to toxicity.h5 (relative to the working directory).
model.save('toxicity.h5')

In [None]:
# Mount Google Drive at /content/gdrive (Colab only).
from google.colab import drive
drive.mount('/content/gdrive')

In [49]:
# Copy the saved model file into the mounted Drive folder.
!cp /content/toxicity.h5 /content/gdrive/MyDrive

In [None]:
# Extract the test labels CSV archive.
!unzip /content/test_labels.csv.zip

In [27]:
# Load the test comments and their labels into separate DataFrames.
testdf = pd.read_csv('/content/test.csv')
test_labelsdf = pd.read_csv('/content/test_labels.csv')

In [None]:
# Preview the first rows of the test comments.
testdf.head()

In [None]:
# Preview the first rows of the test labels.
test_labelsdf.head()

In [30]:
# Label columns are every column after the first one in the labels file.
label_columns = test_labelsdf.columns[1:]
# NOTE(review): per the competition's data description, a label of -1 marks a
# test row that was not scored - confirm. Such rows carry no ground truth, so
# they are dropped here instead of being fed to the metrics as targets. When no
# -1 is present the mask is all True and nothing is removed.
# Assumes testdf and test_labelsdf list rows in the same order, as the
# positional pairing of comments and labels already requires.
scored_rows = (test_labelsdf[label_columns] != -1).all(axis=1).values
X_test = testdf.loc[scored_rows, "comment_text"]
y_test = test_labelsdf.loc[scored_rows, label_columns].values

In [31]:
# Vectorize the test comments with the same vectorizer used for training.
vectorized_test = vectorizer(X_test.values)

In [32]:
# Build the evaluation pipeline from the TEST tensors. Every step must chain
# from `testdata`; chaining from `dataset` would silently replace the test set
# with the training pipeline and make the evaluation meaningless.
testdata = tf.data.Dataset.from_tensor_slices((vectorized_test, y_test))
testdata = testdata.cache()
testdata = testdata.batch(16)
testdata = testdata.prefetch(8)

In [33]:
# Pull the first batch from the test pipeline as NumPy arrays.
batch_X, batch_y = next(testdata.as_numpy_iterator())

In [None]:
# Threshold the predicted probabilities at 0.5 and show them as 0/1 integers.
(model.predict(batch_X) > 0.5).astype(int)

In [35]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [36]:
# Streaming metrics accumulated over the evaluation batches.
pre = Precision()
re = Recall()
# The model emits independent sigmoid probabilities (multi-label), so accuracy
# must be computed per label after thresholding. CategoricalAccuracy compares
# argmax positions, which is meaningless for this kind of output.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Walk the test pipeline batch by batch and feed every metric.
for X_true, y_true in testdata.as_numpy_iterator():
    # Predict, then collapse predictions and targets to 1-D vectors
    yhat = model.predict(X_true).flatten()
    y_true = y_true.flatten()

    for metric in (pre, re, acc):
        metric.update_state(y_true, yhat)

In [None]:
# Report the accumulated metric values.
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
accuracy_value = acc.result().numpy()
print(f'Precision: {precision_value}, Recall:{recall_value}, Accuracy:{accuracy_value}')

In [16]:
# Copy the saved model file from the mounted Drive folder back into /content.
!cp /content/gdrive/MyDrive/toxicity.h5 /content

In [None]:
# Install the packages needed for the demo UI (versions are not pinned).
!pip install gradio jinja2

In [41]:
import gradio as gr

In [None]:
# Save the in-memory model to /content/toxicity.h5.
model.save('/content/toxicity.h5')

In [19]:
# Reload the model from the saved file, replacing the in-memory `model`.
model = tf.keras.models.load_model('toxicity.h5')

In [50]:
def score_comment(comment):
    """Score one comment and return a per-label report.

    The comment is vectorized with the module-level `vectorizer`, scored by
    the module-level `model`, and each score is compared against 0.5.

    Args:
        comment: The text to score.

    Returns:
        A string with one "<label>: <True|False>" line per label, where the
        label names are the columns of `df` from the third one onward.
    """
    scores = model.predict(vectorizer([comment]))[0]
    return ''.join(
        '{}: {}\n'.format(label, scores[position] > 0.5)
        for position, label in enumerate(df.columns[2:])
    )

In [None]:
# Gradio UI: a two-line text box feeding score_comment, with plain-text output.
# `gr.Textbox` is the current component API; the legacy `gr.inputs` namespace
# was deprecated and later removed, so it fails on a freshly installed Gradio.
interface = gr.Interface(fn=score_comment,
                         inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
                         outputs='text')

In [None]:
# Start the demo; share=True asks Gradio for a publicly reachable link.
interface.launch(share=True)