In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, GRU
from sklearn.model_selection import train_test_split
import re


In [2]:
# Read the toxic-comments CSV into a DataFrame.
# NOTE(review): the path sits under /content/drive, which looks like a Colab
# Google Drive mount — presumably Drive must be mounted first; confirm.
DATA_PATH = '/content/drive/MyDrive/toxic-comments.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Remove the 'id' column; the later cells only read 'comment_text' and the
# label columns. Rebinding with errors='ignore' (instead of inplace=True)
# keeps this cell safe to re-run: a second execution no longer raises
# KeyError because 'id' is already gone.
df = df.drop(columns='id', errors='ignore')

In [4]:
# Display the column labels of df.
df.columns

Index(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [5]:
# Display the frame's dimensions as (rows, columns).
df.shape

(159571, 7)

In [6]:
# Names of the six label columns used as prediction targets.
toxicities = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [7]:
# Column-wise total of each label column, i.e. how many rows carry each label.
df[toxicities].sum()

Unnamed: 0,0
toxic,15294
severe_toxic,1595
obscene,8449
threat,478
insult,7877
identity_hate,1405


In [8]:
# .all(axis=1):
# Checks whether all values in each row (across the columns specified) are True.

In [9]:
# Boolean mask: True for rows where every label column equals 0.
no_label_mask = df[toxicities].eq(0).all(axis=1)
# Shape of the subset of rows that carry no label at all.
df.loc[no_label_mask].shape

(143346, 7)

In [10]:
# Row-wise total of the numeric columns: one value per row of df.
neutral = df.sum(numeric_only=True, axis='columns')

In [11]:
# Keep only the rows whose total is exactly 0.
neutral = neutral.loc[neutral.eq(0)]

In [12]:
# Number of rows left in `neutral` (non-null entries).
neutral.count()

143346

### Data Cleaning

In [14]:
def clean_text(text):
    """Normalise a raw comment before tokenisation.

    Lower-cases the text and replaces every run of characters other than
    ``a-z`` and ``0-9`` with a single space.

    Args:
        text: The raw comment. ``None`` and float ``NaN`` (how pandas
            represents a missing cell) are treated as an empty comment;
            any other non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text. A leading or trailing run of separators
        becomes a single space; it is not stripped.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    return re.sub(r'[^a-z0-9]+', ' ', text.lower())

In [16]:
# Run clean_text over every comment and store the result back in the column.
df['comment_text'] = df['comment_text'].map(clean_text)

### Separate the input and output variables

In [21]:
# Model input: the cleaned comment text.
X = df.loc[:, 'comment_text']
# Model output: the label columns as a 2-D NumPy array (rows x labels).
y = df[toxicities].to_numpy()

In [22]:
# Display the label array's dimensions as (rows, labels).
y.shape

(159571, 6)

### Data Preparation

In [26]:
# Word-level tokenizer capped at a 50,000-word vocabulary. Indices are
# assigned by frequency: the more frequent a word, the smaller its index.
tokenizer = Tokenizer(num_words=50_000)

# Learn the vocabulary from the comments.
# NOTE(review): this is fit on all of X, before the train/test split made in
# a later cell, so words from the held-out rows also shape the vocabulary.
tokenizer.fit_on_texts(X)

# Convert each comment to a list of word indices ...
sequences = tokenizer.texts_to_sequences(X)
# ... and force every list to exactly 200 entries (padding or truncating).
padded_sequences = pad_sequences(sequences=sequences, maxlen=200)

In [27]:
# Display the padded matrix's dimensions as (rows, sequence length).
padded_sequences.shape

(159571, 200)

### Train/test split

In [28]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    y,
    random_state=0,
    test_size=0.2,
)

In [29]:
# Display the shapes of the four arrays produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((127656, 200), (31915, 200), (127656, 6), (31915, 6))

### Build the model

`input_length=200` is the fixed length of the input sequences expected by the model.
Each input sequence passed to the embedding layer must have exactly 200 tokens (as ensured by `pad_sequences` earlier).

In [30]:
# Take the model dimensions from the preprocessing objects instead of
# repeating the literals 50000 / 200 / 6, so the network cannot drift out of
# sync if the tokenizer cap, padding length or label list changes.
vocab_size = tokenizer.num_words    # 50000: upper bound on token indices
sequence_length = X_train.shape[1]  # 200: width of the padded sequences
n_labels = y_train.shape[1]         # 6: one output unit per label column

model = Sequential()

# Look up a learned 128-dimensional vector for every token index.
model.add(Embedding(vocab_size, 128, input_length = sequence_length))
# Single GRU layer with dropout on both the inputs and the recurrent state.
model.add(GRU(128, dropout = 0.2, recurrent_dropout = 0.2))
# Sigmoid rather than softmax: each label gets its own independent
# probability, since one comment can carry several labels at once.
model.add(Dense(n_labels, activation = 'sigmoid'))



In [31]:
# Build with an explicit input shape: (batch size unknown, tokens per row).
# NOTE(review): on Keras 3 the Embedding `input_length` argument is
# deprecated and ignored, so a bare `model.build()` has no shape to work
# from and leaves the layers unbuilt (summary shows no shapes/parameters).
# Passing the shape is also accepted on Keras 2, where the model is already
# built at this point.
model.build(input_shape=(None, X_train.shape[1]))

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()