In [19]:
%matplotlib inline
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import sys,os
import keras
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import ModelCheckpoint

In [20]:
train = pd.read_csv("jigsaw-toxic-comment-classification-challenge/train.csv")
# test = pd.read_csv("jigsaw-toxic-comment-classification-challenge/test.csv")
train.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [21]:
train.isnull().any()

id               False
comment_text     False
toxic            False
severe_toxic     False
obscene          False
threat           False
insult           False
identity_hate    False
dtype: bool

In [22]:
list_classes = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
y = train[list_classes].values
print(y.shape)

(159571, 6)


In [23]:
X = train["comment_text"]
X.shape

(159571,)

In [24]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
X_train.shape

(127656,)

In [25]:
max_features = 20000
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(X))
list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)
list_tokenized_train[:1]

[[4, 6, 56, 18, 451, 11, 2785]]

In [28]:
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [29]:
embed_size = 128
model = Sequential()

model.add(Embedding(max_features,embed_size,input_length=maxlen))

model.add(LSTM(64,return_sequences=True,use_bias=True,bias_initializer=keras.initializers.Ones()))

model.add(GlobalMaxPool1D())

model.add(Dense(64, activation='relu'))

model.add(Dense(6, activation='sigmoid'))

In [33]:
# sgd = optimizers.SGD(lr=0.01)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [34]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 200, 64)           49408     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 390       
Total params: 2,613,958
Trainable params: 2,613,958
Non-trainable params: 0
_________________________________________________________________


In [35]:
batch_size=64
epochs = 8
checkpoint = ModelCheckpoint('check01',monitor='val_acc',verbose=1, save_best_only=True,mode='max')
callbacks_list=[checkpoint]
text_model = model.fit(X_t,y_train,batch_size=batch_size,epochs=epochs,verbose=1, validation_data=(X_te,y_test),callbacks=callbacks_list)

Train on 127656 samples, validate on 31915 samples
Epoch 1/8

Epoch 00001: val_acc improved from -inf to 0.98266, saving model to check01
Epoch 2/8

Epoch 00002: val_acc improved from 0.98266 to 0.98310, saving model to check01
Epoch 3/8

Epoch 00003: val_acc did not improve from 0.98310
Epoch 4/8

Epoch 00004: val_acc did not improve from 0.98310
Epoch 5/8

Epoch 00005: val_acc did not improve from 0.98310
Epoch 6/8

Epoch 00006: val_acc did not improve from 0.98310
Epoch 7/8

Epoch 00007: val_acc did not improve from 0.98310
Epoch 8/8

Epoch 00008: val_acc did not improve from 0.98310
