In [1]:
import os

import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Base directory holding the competition files. It defaults to the original
# location, and can be redirected without editing the notebook by setting the
# KAGGLE_DATA_DIR environment variable before starting the kernel.
DATA_DIR = os.environ.get('KAGGLE_DATA_DIR', '/home/student/Documents/Kaggle')

train_path = os.path.join(DATA_DIR, 'train.csv')
test_path = os.path.join(DATA_DIR, 'test.csv')
# Pretrained word vectors; the file's dimensionality must agree with embed_size.
embed_path = os.path.join(DATA_DIR, 'Embeddings', 'glove.6B.50d.txt')

In [3]:
# Width of each word vector (the embedding file in use is the 50d one).
embed_size = 50
# Vocabulary cap handed to the tokenizer as num_words.
max_features = 20000
# Length every tokenised comment is padded/truncated to.
maxlen = 100

In [4]:
# Target columns, in the order used for the label matrix and the submission.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Read the training and test CSV files into data frames.
Train_Data = pd.read_csv(train_path)
Test_Data = pd.read_csv(test_path)

# Label matrix: one row per training example, one column per entry of list_classes.
y = Train_Data[list_classes].values

In [5]:
# Raw comment text for each split.
# pandas.read_csv converts cells such as "NA", "null" or an empty field into
# NaN (a float), which the tokenizer cannot process as text. Replace missing
# comments with an empty string so every entry is a str.
list_sentence_train = Train_Data.comment_text.fillna("")
list_sentence_test = Test_Data.comment_text.fillna("")

In [6]:
# Fit the tokenizer on the training comments only; the test comments are
# encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentence_train)

# Training split: text -> integer sequences -> fixed-length array.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentence_train)
X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test split, same pipeline.
# NOTE: the name `X_trest` (sic) is kept because the prediction cell reads it.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentence_test)
X_trest = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [7]:
def get_coefs(word, *arr):
    """Pair a token with its coefficient vector.

    Args:
        word: the token (first field of a split embedding line).
        *arr: the remaining fields; anything NumPy can convert to float32,
            e.g. numeric strings.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array
        with one element per value in ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Build {token: float32 vector} from the embedding text file, where each line
# is a token followed by whitespace-separated coefficients.
# - The `with` block closes the file handle once parsing is done.
# - The encoding is pinned so the result does not depend on the platform default.
# - Blank lines are skipped; they would otherwise call get_coefs() with no token.
with open(embed_path, encoding='utf8') as embed_file:
    embedding_index = dict(
        get_coefs(*line.strip().split())
        for line in embed_file
        if line.strip()
    )

In [8]:
# Stack every pretrained vector into one 2-D array to measure its overall
# mean and standard deviation. np.stack expects a real sequence, and a
# dict_values view is not one (newer NumPy versions warn or raise), so
# materialise it as a list first.
all_embeds = np.stack(list(embedding_index.values()))
emb_mean, emb_std = all_embeds.mean(), all_embeds.std()

In [9]:
word_index = tokenizer.word_index
nb_words = len(word_index)

# Initialise every row with random draws that match the mean/std of the
# pretrained vectors, so tokens without a pretrained vector still get values
# on the same scale.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# Overwrite the rows of tokens that have a pretrained vector; only indices
# below max_features are considered.
# NOTE(review): the matrix has len(word_index) rows although rows at or above
# max_features are never overwritten here. Its row count is also what the
# Embedding layer is sized with, so the two must be changed together.
# NOTE(review): if word_index values run from 1 to len(word_index) (as the
# Keras Tokenizer appears to assign them; confirm), a vocabulary smaller than
# max_features would index one past the last row.
for word, i in word_index.items():
    if i < max_features:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [10]:
# Number of entries in the fitted tokenizer's word index.
len(tokenizer.word_index)

210337

In [34]:
# Display the embedding matrix wrapped in a one-element list, i.e. the same
# form that is passed as `weights=` to the Embedding layer.
[embedding_matrix]

[array([[ 0.65890277,  0.23410571,  0.82085032, ...,  0.12365394,
          0.28637895,  0.86281029],
        [ 0.41800001,  0.24968   , -0.41242   , ..., -0.18411   ,
         -0.11514   , -0.78580999],
        [ 0.68046999, -0.039263  ,  0.30186   , ..., -0.073297  ,
         -0.064699  , -0.26043999],
        ...,
        [ 0.48654174,  1.14485043,  0.54501572, ...,  0.04782041,
         -0.75511799,  0.84744533],
        [-0.21611203, -0.28589162,  0.62905614, ..., -0.78789508,
         -0.24259589,  0.28527396],
        [ 0.62834479,  0.70891561,  0.80176995, ..., -0.49718219,
         -0.79705197,  0.61482074]])]

In [11]:
# Input: one padded sequence of maxlen token indices per example.
inp = Input(shape=(maxlen,))

# Frozen embedding layer initialised from embedding_matrix. The vocabulary
# size is read from the matrix itself so the layer always matches the weights
# it is given, instead of recomputing the size separately.
x = Embedding(
    embedding_matrix.shape[0],
    embed_size,
    weights=[embedding_matrix],
    trainable=False,
)(inp)

# Bidirectional LSTM that returns the full sequence, then max-pooled over time.
x = Bidirectional(LSTM(60, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)

# Small dense head with dropout before and after the hidden layer.
x = Dropout(0.1)(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)

# Six independent sigmoid outputs, one per entry of list_classes.
x = Dense(6, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
# Print the layer-by-layer table (output shapes and parameter counts) of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 50)           10516850  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 120)          53280     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6050      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________

In [13]:
# Train on the padded training sequences against the six-column label matrix;
# validation_split=0.1 holds out a tenth of the data for validation.
model.fit(
    X_train,
    y,
    batch_size=32,
    epochs=4,
    validation_split=0.1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f579f526ba8>

In [14]:
# Predict on the padded test sequences (held in `X_trest`), with a large batch
# size and a progress display.
Y_Test = model.predict(
    [X_trest],
    batch_size=1024,
    verbose=1,
)



In [16]:
# Display the predictions; the submission cell maps column k to list_classes[k].
Y_Test

array([[9.8047763e-01, 1.2285337e-01, 8.9042568e-01, 2.8640604e-02,
        6.4341807e-01, 1.2380260e-01],
       [1.2310317e-03, 2.9519044e-06, 1.8308930e-04, 4.1909943e-06,
        1.1948345e-04, 2.4531590e-05],
       [3.1549134e-03, 2.9887073e-05, 6.2242948e-04, 5.3438915e-05,
        3.4563753e-04, 9.7032280e-05],
       ...,
       [4.6909880e-04, 2.4511138e-07, 3.0577896e-05, 7.9866055e-07,
        1.8276907e-05, 8.3774739e-06],
       [1.2935498e-03, 1.3585394e-06, 4.4940247e-05, 1.2488029e-05,
        2.6538915e-05, 1.5650246e-04],
       [9.6579659e-01, 3.0812507e-02, 8.1698942e-01, 8.1266882e-03,
        5.2297348e-01, 6.4403112e-03]], dtype=float32)

In [33]:
# Assemble the submission: the test ids plus one prediction column per label.
submission = pd.DataFrame.from_dict({'id': Test_Data.id})

# enumerate() ties each label name to its column of Y_Test directly, replacing
# a separately maintained counter that could drift out of step with the loop.
for column, class_name in enumerate(list_classes):
    submission[class_name] = Y_Test[:, column]

submission.to_csv('/home/student/Documents/Kaggle/submission.csv',index=False)