In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM,Bidirectional,Conv1D,MaxPool1D,Dense,Input, Flatten,Dropout,SpatialDropout1D,GlobalMaxPooling1D,GlobalAveragePooling1D,concatenate
from keras.models import Sequential, Model
from keras import backend as K
from keras import callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
df = pd.read_csv('train.csv', index_col=0)
X = df.comment_text
Y = df[['toxic', 'severe_toxic','obscene','threat','insult','identity_hate']]
train_X, test_X, train_Y, test_Y = train_test_split(X,Y, test_size = 0.2, random_state = 10)

word_dimensions = 300 #300d glove embedding
max_vocabs = 100000# unique words to use
max_len = 200 # maximum length of words to use in a comment

# convert comments to sequences of indices
tokenizer = Tokenizer(lower = True,filters='!"#$%&()*+,-./\':;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(train_X)
tokenizer.num_words = max_vocabs
train_X_tok = tokenizer.texts_to_sequences(train_X)
test_X_tok  = tokenizer.texts_to_sequences(test_X)
top_vocabs = dict(tokenizer.word_counts)
top_vocabs = set(sorted(top_vocabs, key = top_vocabs.get, reverse=True)[:max_vocabs])
word_index = dict((key, value) for key,value in tokenizer.word_index.items() if key in top_vocabs)

# pad sequences to desired length
train_X_tok = pad_sequences(train_X_tok, maxlen=max_len)
test_X_tok = pad_sequences(test_X_tok, maxlen = max_len)

In [3]:
# load pre-trained embedding
embedding_index = {}
with open('C:/Users/yangt/Desktop/glove.42B.300d.txt',encoding='utf-8') as f:
	for line in f:
		values = line.strip().split()
		word = values[0]
		vector = np.asarray(values[1:], dtype='float32')
		embedding_index[word] = vector

In [4]:
# construct embedding matrix where each row is a word
# word not found in pretrained embedding are vectors of zeros
not_found_list = []
embedding_matrix = np.random.uniform(size=(len(word_index)+1, word_dimensions))
for word,index in word_index.items():
	vec = embedding_index.get(word)
	if vec is not None:
		embedding_matrix[index] = vec  
	else:
		not_found_list.append(word)


print(len(not_found_list)/100000)


0.21501


In [5]:
# early stoping 
early = EarlyStopping(monitor="val_loss", mode="auto", patience=1)

# bilstm construction
bilstm = Sequential()
bilstm.add(Embedding(embedding_matrix.shape[0],word_dimensions ,input_length=max_len, weights = [embedding_matrix]))
bilstm.add(SpatialDropout1D(0.2))
bilstm.add(Bidirectional(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2)))
bilstm.add(Dense(6, activation='sigmoid'))
bilstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(bilstm.summary())

# maxpool bilstm
maxbilstm = Sequential()
maxbilstm.add(Embedding(embedding_matrix.shape[0],word_dimensions ,input_length=max_len, weights = [embedding_matrix]))
maxbilstm.add(SpatialDropout1D(0.2))
maxbilstm.add(Bidirectional(LSTM(128,dropout = 0.2, recurrent_dropout = 0.2, return_sequences =True)))
maxbilstm.add(GlobalMaxPooling1D())
maxbilstm.add(Dense(6, activation='sigmoid'))
maxbilstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(maxbilstm.summary())

# mixed pool bilstm
mixbilstm_input = Input(shape=(max_len,))
mixbilstm = Embedding(embedding_matrix.shape[0],word_dimensions ,input_length=max_len, weights = [embedding_matrix])(mixbilstm_input)
mixbilstm=SpatialDropout1D(0.2)(mixbilstm)
mixbilstm = Bidirectional(LSTM(128,dropout = 0.2, recurrent_dropout = 0.2, return_sequences =True))(mixbilstm)
mpool = GlobalMaxPooling1D()(mixbilstm)
apool = GlobalAveragePooling1D()(mixbilstm)
mixpool = concatenate([mpool, apool])
mixbilstm =Dropout(0.5)(mixpool)
mixbilstm = Dense(6, activation='sigmoid')(mixbilstm)
mixbilstm = Model(inputs = [mixbilstm_input], outputs = mixbilstm)
mixbilstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(mixbilstm.summary())

# mixed pool wtih conv layer
mixconv_input = Input(shape=(max_len,))
mixconv = Embedding(embedding_matrix.shape[0],word_dimensions ,input_length=max_len, weights = [embedding_matrix])(mixconv_input)
mixconv=SpatialDropout1D(0.2)(mixconv)
mixconv = Bidirectional(LSTM(128,dropout = 0.2, recurrent_dropout = 0.2, return_sequences =True))(mixconv)
mixconv =Conv1D(filters =64, kernel_size = 3)(mixconv)
mpool = GlobalMaxPooling1D()(mixconv)
apool = GlobalAveragePooling1D()(mixconv)
mixpool = concatenate([mpool, apool])
mixconv =Dropout(0.5)(mixpool)
mixconv = Dense(6, activation='sigmoid')(mixconv)
mixconv = Model(inputs = [mixconv_input], outputs = mixconv)
mixconv.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(mixconv.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          30000300  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               439296    
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 1542      
Total params: 30,441,138
Trainable params: 30,441,138
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 300)          30000300  
___________________________________________________________

In [6]:
# model fitting
bilstm.fit(train_X_tok, train_Y,batch_size = 256, epochs =10, validation_split = 0.2, callbacks = [early])
maxbilstm.fit(train_X_tok, train_Y,batch_size = 256, epochs =10, validation_split = 0.2, callbacks = [early])
mixbilstm.fit(train_X_tok, train_Y,batch_size = 256, epochs =10, validation_split = 0.2, callbacks = [early])
mixconv.fit(train_X_tok, train_Y,batch_size = 256, epochs =10, validation_split = 0.2, callbacks = [early])

Train on 102124 samples, validate on 25532 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Train on 102124 samples, validate on 25532 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Train on 102124 samples, validate on 25532 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Train on 102124 samples, validate on 25532 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.callbacks.History at 0x2acc6898b00>

In [7]:
# prediction
bilstm_pred = bilstm.predict_proba(test_X_tok)
maxbilstm_pred = maxbilstm.predict_proba(test_X_tok)
mixbilstm_pred = mixbilstm.predict(test_X_tok)
mixconv_pred = mixconv.predict(test_X_tok)

In [10]:
import pandas as pd
from sklearn.metrics import roc_auc_score
result = {'models':['bilstm', 'mxbilstm', 'mixbilstm', 'mixconv']}

for i,c in enumerate(test_Y.columns):
    result[c] = [roc_auc_score(test_Y[c], bilstm_pred[:, i]),roc_auc_score(test_Y[c], maxbilstm_pred[:, i]),roc_auc_score(test_Y[c], mixbilstm_pred[:, i]),roc_auc_score(test_Y[c], mixconv_pred[:, i]) ]

pd.DataFrame(result).to_csv('dl_result.csv')

In [13]:
test_df = pd.read_csv('test.csv',index_col=0)

test_df  = tokenizer.texts_to_sequences(test_df.comment_text)

test_df = pad_sequences(test_df, maxlen = max_len)

test_pred  = maxbilstm.predict(test_df)

submission = pd.DataFrame(data=test_pred, columns=test_Y.columns,index=pd.read_csv('test.csv',index_col=0).index)

submission.to_csv('submission.csv')