In [1]:
import numpy as np
import pandas as pd

In [2]:
import gzip
import os
import gc

In [3]:
from keras.models import Sequential, load_model
from keras.layers import Conv1D, GlobalMaxPool1D, Dense, Dropout, Activation, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss

In [5]:
from gensim.models import Word2Vec, KeyedVectors



In [6]:
hyperparam = {'sequence_len': 100,
              'embedding_dim': 300, 
              'filters': 200, 
              'kernel_size': 3,
              'dropout' : 0.8,
              'batch_size': 512,
              'epochs': 3,
              'steps_per_epochs': None,
              'early_stopping': False,
              'vocab_size': None,
              'learning_rate' : 0.0005,
              'gradient_clip_value' : None,
              'gradient_clip_norm' : None,
              'validation_split': 0.1,
              'missing_word_vectors': 'normal',
              'conv_activation': 'relu', 
              'dense_activation':'relu',
              'n_class': 6}

In [7]:
if hyperparam['early_stopping']:
    hyperparam['validation_split'] = max(0.1, hyperparam['validation_split'])

In [8]:
name = '_'.join(['CNN_Baseline', 
                 str(hyperparam['sequence_len']), 
                 str(hyperparam['filters']), 
                 str(hyperparam['kernel_size']), 
                 str(int(hyperparam['dropout']*100))])

In [9]:
save_predictions = False
save_model = False
use_best_checkpoint = True

In [10]:
try:
    word_vec
except NameError:
    if os.path.exists('./data/GoogleNews-vectors-negative300.bin'):
        word_vec = KeyedVectors.load_word2vec_format(fname='./data/GoogleNews-vectors-negative300.bin', binary=True)
    elif os.path.exists('./data/GoogleNews-vectors-negative300.bin.gz'):
        google_w2v = gzip.open('./data/GoogleNews-vectors-negative300.bin.gz', 'rb')
        word_vec = KeyedVectors.load_word2vec_format(fname=google_w2v, binary=True)
        del google_w2v
    else:
        print('Embedings not found')

In [11]:
tokenizer = Tokenizer(num_words=hyperparam['vocab_size'], filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'')

In [12]:
train = pd.read_csv('./data/train.csv')

In [13]:
test = pd.read_csv('./data/test.csv')

In [14]:
train_text = train['comment_text'].astype('str').values

In [15]:
test_text = test['comment_text'].astype('str').values

In [16]:
# train.loc[train['toxic']==0, train.columns[2:]].sum()

In [17]:
tokenizer.fit_on_texts(train_text)

In [18]:
train_seq = tokenizer.texts_to_sequences(train_text)

In [19]:
test_seq = tokenizer.texts_to_sequences(test_text)

In [20]:
if not hyperparam['vocab_size']:
    hyperparam['vocab_size'] = len(tokenizer.word_index)
print('Vocab Size:', hyperparam['vocab_size'])

Vocab Size: 193264


In [21]:
if hyperparam['missing_word_vectors']=='normal':
    embed_list = []
    for word, index in tokenizer.word_index.items():
        if index >= hyperparam['vocab_size']: 
            continue
        try:
            embed_list.append(word_vec.wv[word])
        except KeyError:
            pass
    a = np.array(embed_list)
    embedding_matrix = np.array(np.random.normal(a.mean(), a.std(), (hyperparam['vocab_size'], hyperparam['embedding_dim'])), dtype=np.float32)
    del embed_list
    del a
else:
    embedding_matrix = np.zeros((hyperparam['vocab_size'], hyperparam['embedding_dim']), dtype=np.float32)

In [22]:
unknown_count = 0
unknown_freq = {}
for word, index in tokenizer.word_index.items():
    if index >= hyperparam['vocab_size']: 
            continue
    try:
        embedding_matrix[index, :] = word_vec.wv[word]
    except KeyError:
        unknown_freq[word] = tokenizer.word_counts[word]
        unknown_count += 1

In [23]:
print('Unknown words', unknown_count)

Unknown words 127113


In [24]:
print('Unknown Freq', sum(unknown_freq.values()))

Unknown Freq 1450599


In [25]:
X = pad_sequences(train_seq, maxlen=hyperparam['sequence_len'], truncating='post', padding='post')

In [26]:
y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [27]:
def mini_batch_generator(X_train, y_train):
#     global X_train, y_train
    while True:
        yield (X_train[:hyperparam['batch_size'], :], y_train[:hyperparam['batch_size'], :])
        X_train = np.roll(X_train, axis=0, shift=-hyperparam['batch_size'])
        y_train = np.roll(y_train, axis=0, shift=-hyperparam['batch_size'])

In [28]:
def computation_graph():
    model  = Sequential()
    model.add(Embedding(hyperparam['vocab_size'], hyperparam['embedding_dim'], weights=[embedding_matrix], name='Embedding_Layer'))
    model.add(Conv1D(filters=hyperparam['filters'], 
                     kernel_size=hyperparam['kernel_size'],
                     activation = hyperparam['conv_activation'],
                     name= '_'.join(['Convolution_1D', str(hyperparam['filters']), str(hyperparam['kernel_size']), str(hyperparam['conv_activation'])])
                    ))
    model.add(GlobalMaxPool1D(name='Global_Max_Pooling'))
    model.add(Dense(units=hyperparam['filters'], name='Dense_'+str(hyperparam['filters'])))
    model.add(Dropout(rate=hyperparam['dropout'], name = 'Dropout_' + str(hyperparam['dropout'])))
    model.add(Activation(hyperparam['dense_activation'], name='Activation_'+str(hyperparam['dense_activation'])))
    model.add(Dense(units=hyperparam['n_class'], activation='sigmoid', name='Dense_'+str(hyperparam['n_class'])+'_Sigmoid'))
    return model

In [29]:
def cross_validation_fold(X_train, y_train, X_val, y_val):
    model = None
    model = computation_graph()
    validation_data = (X_val, y_val)
    if hyperparam['early_stopping']:
        validation_data = (X_val, y_val)
        callback = [EarlyStopping(verbose=1)]
        if hyperparam['steps_per_epochs']:
            callback = [EarlyStopping(verbose=1, patience=5)]
    else:
        callback = None   
        validation_data = None
    
    if hyperparam['gradient_clip_norm'] is None and hyperparam['gradient_clip_value'] is None:
        model.compile(loss='binary_crossentropy', 
                  optimizer=Adam(hyperparam['learning_rate']),
                  metrics=['accuracy'])
    elif hyperparam['gradient_clip_norm'] is None:
        model.compile(loss='binary_crossentropy', 
                  optimizer=Adam(hyperparam['learning_rate'], 
                  clipvalue=hyperparam['gradient_clip_value']),
                  metrics=['accuracy'])
    elif hyperparam['gradient_clip_value'] is None:
        model.compile(loss='binary_crossentropy', 
                  optimizer=Adam(hyperparam['learning_rate'], 
                  clipnorm = hyperparam['gradient_clip_norm']),
                  metrics=['accuracy'])
    else:
        model.compile(loss='binary_crossentropy', 
                  optimizer=Adam(hyperparam['learning_rate'], 
                  clipvalue=hyperparam['gradient_clip_value'],
                  clipnorm = hyperparam['gradient_clip_norm']),
                  metrics=['accuracy'])
        
    if hyperparam['steps_per_epochs']:
        model = computation_graph()
        history = model.fit_generator(generator=mini_batch_generator(X_train, y_train),
                              epochs=hyperparam['epochs'], 
                              callbacks=callback,
                              validation_data = validation_data, 
                              steps_per_epoch=hyperparam['steps_per_epochs'])
    else:
        history = model.fit(x=X_train, y=y_train,
                          validation_data = validation_data,
                          epochs=hyperparam['epochs'],
                          batch_size=hyperparam['batch_size'], 
                          shuffle=True, 
                          callbacks=callback)
    sc = model.evaluate(x=X_val, y=y_val, batch_size = hyperparam['batch_size'])
    y_score = model.predict(X_val)
    del model
    K.clear_session()
    return sc, history, y_score

In [30]:
model = computation_graph()

In [31]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding_Layer (Embedding)  (None, None, 300)         57979200  
_________________________________________________________________
Convolution_1D_200_3_relu (C (None, None, 200)         180200    
_________________________________________________________________
Global_Max_Pooling (GlobalMa (None, 200)               0         
_________________________________________________________________
Dense_200 (Dense)            (None, 200)               40200     
_________________________________________________________________
Dropout_0.8 (Dropout)        (None, 200)               0         
_________________________________________________________________
Activation_relu (Activation) (None, 200)               0         
_________________________________________________________________
Dense_6_Sigmoid (Dense)      (None, 6)                 1206      
Total para

In [32]:
kfold = KFold(n_splits=10, random_state=22, shuffle=True)
history = []
score = []
y_actual = []
y_predicted = []
k = 1
for train_id, validation_id in kfold.split(X):
    print('-'*20)
    print('\nFold ', k)
    X_train, X_val, y_train, y_val = X[train_id, :], X[validation_id, :], y[train_id, :], y[validation_id, :]
    sc, his, y_score = cross_validation_fold(X_train, y_train,  X_val, y_val)
    print('\nLoss: {:.4f}\t Accuracy: {:.4f}'.format(sc[0], sc[1]))
    score.append(sc)
    history.append(his)
    y_actual.append(y_val)
    y_predicted.append(y_score)

--------------------

Fold  1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Loss: 0.0478	 Accuracy: 0.9818
--------------------

Fold  1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Loss: 0.0489	 Accuracy: 0.9820
--------------------

Fold  1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Loss: 0.0464	 Accuracy: 0.9823
--------------------

Fold  1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Loss: 0.0478	 Accuracy: 0.9820
--------------------

Fold  1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Loss: 0.0490	 Accuracy: 0.9819
--------------------

Fold  1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Loss: 0.0485	 Accuracy: 0.9822
--------------------

Fold  1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Loss: 0.0496	 Accuracy: 0.9815
--------------------

Fold  1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Loss: 0.0477	 Accuracy: 0.9820
--------------------

Fold  1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Loss: 0.0474	 Accuracy: 0.9822
--------------------

Fold  1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Loss: 0.0450	 Accuracy: 0.9830


In [33]:
y_actual_np = np.vstack(y_actual)

In [34]:
y_predicted_np = np.vstack(y_predicted)

In [94]:
y_bin = np.array(y_predicted_np > 0.5, dtype=np.float32)

In [95]:
y_bin = y_bin - 5**-1

In [96]:
y_bin = np.abs(y_bin)

In [97]:
y_bin

array([[0.2, 0.2, 0.2, 0.2, 0.2, 0.2],
       [0.2, 0.2, 0.2, 0.2, 0.2, 0.2],
       [0.2, 0.2, 0.2, 0.2, 0.2, 0.2],
       ...,
       [0.2, 0.2, 0.2, 0.2, 0.2, 0.2],
       [0.8, 0.2, 0.2, 0.2, 0.2, 0.2],
       [0.2, 0.2, 0.2, 0.2, 0.2, 0.2]], dtype=float32)

In [47]:
res = np.hstack((y_actual_np, y_predicted_np))

In [99]:
rd = pd.DataFrame(res)

In [114]:
max(rd[11])

0.5250115394592285

In [116]:
rd.loc[rd[1]==0, [0, 1, 6, 7]].describe()

Unnamed: 0,0,1,6,7
count,157976.0,157976.0,157976.0,157976.0
mean,0.086716,0.0,0.07992415,0.007216651
std,0.281419,0.0,0.2231959,0.04010646
min,0.0,0.0,9.239653e-07,7.650033e-11
25%,0.0,0.0,0.0003100408,1.932321e-07
50%,0.0,0.0,0.001279467,1.208696e-06
75%,0.0,0.0,0.009985795,1.52584e-05
max,1.0,0.0,0.9998904,0.545396


In [98]:
log_loss(y_true=y_actual_np[:], y_pred=y_bin[:])/6

0.05643638829732237

In [44]:
a  = 0 
for i in range(0, 6):
    print(log_loss(y_true=y_actual_np[:, i], y_pred=y_predicted_np[:, i]))

0.09940792493605383
0.023097560937440975
0.05464086626479782
0.01393140877197082
0.06726162203861821
0.028514292935166604
