<a href="https://colab.research.google.com/github/timthedev07/toxic-comment-classification/blob/dev/toxic-comment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries.

In [1]:
import os
import string
import re
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Dropout
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.callbacks import EarlyStopping
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

Download the corpus `stopwords`.

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Custom function for cleaning input strings.

In [43]:
def custom_standardization(text: tf.Tensor):
    # to lower case
    text = tf.strings.lower(text)
    # expand contraction
    pairs = [
        ("won't", "will not"),
        ("can't", "can not"),
        ("n't", " not"),
        ("'re", " are"),
        ("'s", " is"),
        ("'d", " would"),
        ("'ll", " will"),
        ("'t", " not"),
        ("'ve", " have"),
        ("'m", " am"),
    ]
    for contracted, replacement in pairs:
        text = tf.strings.regex_replace(text, contracted, replacement)
    
    # clean special symbols
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?", " ")
    text = tf.strings.regex_replace(text, r'@([A-Za-z0-9_]+)', " ")
    text = tf.strings.regex_replace(text, r"\([^)]*\)", " ")
    text = tf.strings.regex_replace(text, r"[^A-Za-z0-9]+", " ")

    # remove stopwords
    for i in stop_words:
        text = tf.strings.regex_replace(text, f"[^A-Za-z0-9_]+{i}[^A-Za-z0-9_]+", " ")

    return text


Reading the dataset.

In [35]:
!rm -rf ./sample_data
!rm -rf ./content
!rm -rf ./data
!mkdir data

from google.colab import drive
drive.mount('/content/gdrive')

!cp "/content/gdrive/My Drive/datasets/toxic-comment-classification/train.csv.zip" data
!cp "/content/gdrive/My Drive/datasets/toxic-comment-classification/test.csv.zip" data
%cd data
!unzip -ojq train.csv.zip
!unzip -ojq test.csv.zip
!rm -rf train.csv.zip
!rm -rf test.csv.zip
%cd ..
data = pd.read_csv("data/train.csv")
testData = pd.read_csv("data/test.csv")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/data
/content


In [60]:
labels = {
    "clean": 0,
    "toxic": 1,
    "severe_toxic": 2,
    "obscene": 3,
    "threat": 4,
    "insult": 5,
    "identity_hate": 6,
}

# adding the new row "label" to indicate the label's corresponding number
for label, labelNum in labels.items():
    if label == "clean":
        continue
    data.loc[data[label] == 1, "label"] = labelNum

# set to 0(clean) on rows where no value is set for "label"
data.loc[data["label"].isna(), "label"] = 0

# change dtype
data["label"] = data["label"].astype(np.int8)

# one hot encoding for y data
y = pd.get_dummies(data["label"])

# all comments as x
x = data["comment_text"]

trainX, testX, trainY, testY = train_test_split(x, y, shuffle = True, random_state = 42, test_size = 0.2)

The model

In [25]:
VOCAB_SIZE = 15000
SEQUENCE_LENGTH = 120
EPOCHS = 10

def getTrainedModel(_trainX, _trainY, _testX, _testY):
    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=VOCAB_SIZE,
        output_mode='int',
        output_sequence_length=SEQUENCE_LENGTH)

    vectorize_layer.adapt(np.concatenate([_trainX, _testX]))

    embedding_dim = 32

    model = Sequential([
        vectorize_layer,
        Embedding(VOCAB_SIZE, embedding_dim, name="embedding"),
        GlobalAveragePooling1D(),
        Dense(32, activation='relu'),
        Dropout(0.5),
        Dense(len(labels), activation="softmax")
    ])

    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy']
    )

    callback = EarlyStopping(patience=1)

    model.fit(
        _trainX,
        _trainY,
        epochs=EPOCHS,
        batch_size=32,
        validation_data=(_testX, _testY),
        callbacks = [callback]
    )

    return model

model = getTrainedModel(trainX, trainY, testX, testY)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [None]:
model.summary()
model.save("model")

# compress model
!zip -r model.zip model

# download model
from google.colab import files
files.download("model.zip")

Test the model on a few samples(Note: the texts shown below may contain inappropriate usage of the English language)

In [64]:
def determineLabel(probabilities):
    labelNum = np.where(probabilities == np.amax(probabilities))[0]

    for key, val in labels.items():
        if val == labelNum:
            return key

evalX = testData["comment_text"][:10]
res = model(evalX).numpy()

targetLabels = list(map(determineLabel, res))

for i in range(len(labels)):
    print(f"Text:\n  {custom_standardization(evalX[i])}\nCategory:\n  {targetLabels[i]}\n\n")

Text:
  b'yo bitch ja rule succesful ever whats hating sad mofuckas bitch slap ur pethedic white faces get kiss ass guys sicken ja rule pride da music man dont diss shit nothin wrong bein like tupac brother fuckin white boys get things right next time '
Category:
  insult


Text:
  b' rfc title fine imo '
Category:
  clean


Text:
  b' sources zawe ashton lapland '
Category:
  clean


Text:
  b' look back source information updated correct form guess source updated shall update information thank message '
Category:
  clean


Text:
  b'i anonymously edit articles '
Category:
  clean


Text:
  b'thank understanding think highly would revert without discussion '
Category:
  clean


Text:
  b'please add nonsense wikipedia edits considered vandalism quickly undone would like experiment please use sandbox instead thank '
Category:
  clean


