In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Bidirectional,LSTM,RepeatVector,TimeDistributed,Activation
from keras.layers import BatchNormalization, Flatten, Conv1D, MaxPooling1D,GlobalMaxPool1D
from keras.layers import Dropout,SpatialDropout1D
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import Model, model_from_json
import tensorflow as tf

Using TensorFlow backend.


In [2]:
# Load the training comments CSV from disk, then report the frame's
# (rows, columns) shape as a quick sanity check.
df_train = pd.read_csv(
    'TrainTestData/ToxicCommentsTrainData.csv'
)
print(df_train.shape)

(159571, 8)


In [3]:
# Normalise the comment text in a single chained pass: substitute the
# placeholder "unknown" for missing comments, then lower-case everything.
df_train['comment_text'] = (
    df_train['comment_text']
    .fillna("unknown")
    .str.lower()
)

In [4]:
# Show the last 10 rows of the training frame (after the text clean-up above).
df_train.tail(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159561,ffd2e85b07b3c7e4,"""\nno he did not, read it again (i would have ...",0,0,0,0,0,0
159562,ffd72e9766c09c97,"""\n auto guides and the motoring press are not...",0,0,0,0,0,0
159563,ffe029a7c79dc7fe,"""\nplease identify what part of blp applies be...",0,0,0,0,0,0
159564,ffe897e7f7182c90,catalan independentism is the social movement ...,0,0,0,0,0,0
159565,ffe8b9316245be30,the numbers in parentheses are the additional ...,0,0,0,0,0,0
159566,ffe987279560d7ff,""":::::and for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself \n\nthat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"spitzer \n\numm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nand ... i really don't think you understand...",0,0,0,0,0,0


In [5]:
# Names of the label columns; used to pull the target matrix out of df_train.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [6]:
# Text to tokenize: the comment column, coerced to str.
docs_train = df_train['comment_text'].astype(str)

# Fit the tokenizer on the comments, then persist the fitted tokenizer to disk.
t = Tokenizer(num_words=200000)
t.fit_on_texts(docs_train)
with open('TrainTestData/SavedTokenizer.pickle', 'wb') as handle:
    pickle.dump(t, handle, pickle.HIGHEST_PROTOCOL)

# One more than the number of known words, so that the largest word index is
# still a valid row index into an embedding matrix of this size.
vocab_size = 1 + len(t.word_index)

# Turn each comment into a sequence of word indices ...
encoded_docs_train = t.texts_to_sequences(docs_train)

# ... and bring every sequence to a uniform length of 150 (post-padding).
df_train_padded = pad_sequences(
    encoded_docs_train,
    maxlen=150,
    padding='post',
)
print(df_train_padded.shape)

(159571, 150)


In [7]:
# Load the GloVe 840B 300-d embeddings into memory (after downloading and
# unzipping the archive), then build the weight matrix for the Embedding layer.

embeddings_index = dict()
# Open the file with a context manager so the handle is always closed, even
# when a malformed line makes the float conversion below raise.
with open('Glove/glove.840B.300d.txt', encoding="utf8") as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ...", separated by single spaces.
        # Split on ' ' explicitly (not on arbitrary whitespace) so the token
        # field is taken exactly as written in the file.
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

# Create the weight matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i. Words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 2196016 word vectors.


In [8]:
# MAIN: build the LSTM model.
# Best performing model - lr=0.0003
#
# Layer stack, in order:
#   1. Embedding initialised from embedding_matrix and kept frozen
#      (trainable=False), over inputs of length 150
#   2. Dropout (0.2)
#   3. Bidirectional LSTM with 50 units, returning the full sequence
#   4. Global max pooling over the sequence
#   5. Dense(70, relu) followed by Dropout (0.3)
#   6. Dense(6, sigmoid) output layer
model = Sequential([
    Embedding(
        vocab_size,
        300,
        weights=[embedding_matrix],
        input_length=150,
        trainable=False,
    ),
    Dropout(0.2),
    Bidirectional(LSTM(50, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(70, activation="relu"),
    Dropout(0.3),
    Dense(6, activation="sigmoid"),
])

In [9]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 300)          63101100  
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 150, 100)          140400    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 70)                7070      
_________________________________________________________________
dropout_2 (Dropout)          (None, 70)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                

In [10]:
# Compile the model with Adam and binary cross-entropy, tracking AUC.
Adam_opt = Adam(lr=0.0003, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.000015)
# Name the metric explicitly. An unnamed tf.keras metric gets an auto-generated
# name, which picks up a numeric suffix ("auc_1", "auc_2", ...) every time this
# cell is re-run in the same kernel. The training cell's checkpoint monitors
# "val_auc", so the name must stay exactly "auc" or the best model is never saved.
model.compile(
    optimizer=Adam_opt,
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')],
)
# Serialize the model architecture to JSON ...
json_model = model.to_json()
# ... and save it to a file (architecture only, no weights).
with open('ModelData/LSTMModel.json', 'w') as json_file:
    json_file.write(json_model)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [11]:
# Define X (padded index sequences) and y (one column per label in list_classes).
X = df_train_padded
y = df_train[list_classes].values
# Split the training data into a training set and a 5% validation set.
# random_state pins the shuffle so the same rows land in the validation set on
# every run; without it the reported validation metrics are not reproducible.
X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, test_size=0.05, shuffle=True, random_state=42
)

In [12]:
# Stop training once the validation loss has failed to improve by at least
# 0.0005 for 4 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.0005,
    patience=4,
)

# Write a checkpoint only when the validation AUC reaches a new maximum.
save_best = ModelCheckpoint(
    'ModelData/LSTMModelWeights.hdf',
    monitor='val_auc',
    mode='max',
    save_best_only=True,
)

# Train for up to 40 epochs in batches of 128, validating on the held-out split.
model.fit(
    X_train,
    y_train,
    validation_data=(X_eval, y_eval),
    batch_size=128,
    epochs=40,
    verbose=1,
    callbacks=[early_stopping, save_best],
)


Train on 151592 samples, validate on 7979 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40


<keras.callbacks.callbacks.History at 0x1d0f7abb808>