In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Show every file mounted under the Kaggle input directory so the dataset
# paths used in later cells can be checked against what is actually there.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, fname) for fname in filenames]
    for full_path in full_paths:
        print(full_path)

/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/glove6b/glove.6B.300d.txt
/kaggle/input/glove6b/glove.6B.100d.txt
/kaggle/input/glove6b/glove.6B.50d.txt
/kaggle/input/glove6b/glove.6B.200d.txt


In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from keras.layers import GlobalAveragePooling1D
from keras.layers import LSTM
from keras.layers import Bidirectional

Using TensorFlow backend.


In [3]:
# --- Tokenizer settings ---
vocab_size = 20000   # passed as Tokenizer(num_words=...) and as the Embedding input size
oov_tok = "<OOV>"    # passed as Tokenizer(oov_token=...)

# --- Sequence shaping ---
max_length = 120         # passed as pad_sequences(maxlen=...) and Embedding(input_length=...)
padding_type = 'post'    # passed as pad_sequences(padding=...)
trunc_type = 'post'      # intended truncation side for pad_sequences

# --- Model settings ---
embedding_dim = 50   # output size of the Embedding layer


In [4]:
# Read the zipped competition CSVs; pandas infers the compression from the
# ".zip" extension, so no manual extraction is needed.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip",
        "/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip",
    )
)

In [5]:
# Per-column count of missing values in the training frame.
train.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [6]:
# Per-column count of missing values in the test frame.
test.isna().sum()

id              0
comment_text    0
dtype: int64

In [7]:
# Target columns, in the order they are read from `train` and later written
# to the submission frame.
label = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

#### Slicing the dataframe

In [8]:
# Target matrix: one column per entry in `label`.
y = train.loc[:, label].values

# Raw comment text as arrays; any missing comment becomes the "_na_" placeholder.
train_sentences = train["comment_text"].fillna("_na_").values
test_list = test["comment_text"].fillna("_na_").values

# Display the training texts as the cell output.
train_sentences

array(["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \n\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of communit

In [9]:
# Display the shape of the target array as (rows, columns); `y` was built from the `label` columns.
y.shape

(159571, 6)

In [10]:
# Build the word index from the training comments only; `vocab_size` caps the
# vocabulary used when converting text and `oov_tok` is the out-of-vocabulary token.
training_texts = list(train_sentences)
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_texts)

In [11]:
# Keep a handle on the fitted tokenizer's word index for inspection.
# NOTE(review): not referenced again in the visible cells.
word_index = tokenizer.word_index

In [12]:
# Convert each comment to a list of integer word ids, then pad/truncate every
# list to exactly `max_length` ids so the data forms a rectangular array.
#
# `truncating` is passed explicitly: pad_sequences defaults to 'pre', which
# would ignore the configured `trunc_type` and drop the *start* of long
# comments while padding at the end.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# The test set goes through the same tokenizer and the same padding/truncation
# settings so that train and test inputs are shaped identically.
test_sequences = tokenizer.texts_to_sequences(test_list)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Sanity check on the first two comments: raw lengths vary, padded lengths
# all equal `max_length`.
print("train sequences: ",len(train_sequences[0]))
print("train padded: ",len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

train sequences:  47
train padded:  120
18
120


In [13]:
# Multi-label text classifier:
#   Embedding -> bidirectional LSTM (per-timestep outputs) -> max over time
#   -> small dense layer -> six independent sigmoid outputs.
keras_layers = tf.keras.layers

model = tf.keras.Sequential()
model.add(keras_layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(keras_layers.Bidirectional(keras_layers.LSTM(50, return_sequences=True)))
model.add(keras_layers.GlobalMaxPooling1D())
model.add(keras_layers.Dense(50, activation='relu'))
model.add(keras_layers.Dense(6, activation='sigmoid'))

# Binary cross-entropy is applied to each sigmoid output separately.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 50)           1000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 120, 100)          40400     
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                5050      
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 306       
Total params: 1,045,756
Trainable params: 1,045,756
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Display the shape of the padded training array; the second dimension should equal `max_length`.
train_padded.shape

(159571, 120)

In [15]:
# Fit on the full padded training set (no validation data is passed);
# the object returned by fit() is kept in `history`.
num_epochs = 5
history = model.fit(
    x=train_padded,
    y=y,
    epochs=num_epochs,
)

Train on 159571 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

In [16]:
# Load the submission template, fill its label columns with the model's
# sigmoid outputs for the test comments, and write the result to disk.
sample_submission = pd.read_csv(
    "/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip"
)
test_pred = model.predict([test_padded], verbose=2)

# Assigned by row position: relies on the template rows being in the same
# order as `test` — NOTE(review): confirm against the data.
sample_submission[label] = test_pred
sample_submission.to_csv('submission.csv', index=False)

153164/153164 - 27s
