In [None]:
import numpy as np
import pandas as pd

In [None]:
import gzip
import os
import gc

In [None]:
from keras.models import Sequential, load_model
from keras.layers import Conv1D, GlobalMaxPool1D, Dense, Dropout, Activation, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [None]:
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# Central configuration for the run; the cells below read all tunable settings from this dict.
hyperparam = {'sequence_len': 100,  # maxlen used when padding/truncating token sequences
              'embedding_dim': 300,  # number of columns in embedding_matrix / Embedding output size
              'filters': 200,  # Conv1D filter count; also the width of the hidden Dense layer
              'kernel_size': 3,  # Conv1D kernel size
              'dropout' : 0.8,  # rate passed to the Dropout layer
              'batch_size': 512,  # batch size for fit, predict and the mini-batch generator
              'epochs': 1000,  # upper bound on training epochs
              'steps_per_epochs': None,  # if truthy, training goes through fit_generator with this many steps per epoch
              'early_stopping': True,  # adds an EarlyStopping callback and forces a validation split
              'vocab_size': None,  # Tokenizer num_words; when falsy it is filled in from the fitted tokenizer
              'learning_rate' : 0.0005,  # first argument to the Adam optimizer
              'gradient_clip_value' : None,  # passed to Adam as clipvalue when not None
              'gradient_clip_norm' : None,  # passed to Adam as clipnorm when not None
              'validation_split': 0.1,  # test_size handed to train_test_split
              'missing_word_vectors': 'zero',  # 'normal' -> random-normal init for the matrix; anything else -> zeros
              'conv_activation': 'relu',  # activation of the Conv1D layer
              'dense_activation':'relu',  # activation applied after the hidden Dense + Dropout
              'n_class': 1}  # number of units in the final sigmoid Dense layer

In [None]:
# Early stopping is run against held-out validation data further down,
# so make sure at least 10% of the training set is reserved when it is enabled.
if hyperparam['early_stopping']:
    hyperparam['validation_split'] = max(0.1, hyperparam['validation_split'])

In [None]:
# Run identifier, also used as the output directory name:
# <prefix>_<sequence_len>_<filters>_<kernel_size>_<dropout as integer percent>
name_parts = ['CNN_Baseline_1_Class_Test']
name_parts.extend(str(hyperparam[key]) for key in ('sequence_len', 'filters', 'kernel_size'))
name_parts.append(str(int(hyperparam['dropout']*100)))
name = '_'.join(name_parts)

In [None]:
# Output switches read by the final cells of the notebook.
save_predictions = False  # write submission.csv from the in-memory model
save_model = False  # keep the model on disk; when False the best-checkpoint file is deleted at the end
use_best_checkpoint = True  # reload model_best.h5 and write submission_checkpoint.csv from it

In [None]:
# Load the pretrained GoogleNews word2vec vectors only if `word_vec` is not
# already defined in the kernel, so re-running this cell does not reload the file.
try:
    word_vec
except NameError:
    if os.path.exists('./data/GoogleNews-vectors-negative300.bin'):
        word_vec = KeyedVectors.load_word2vec_format(fname='./data/GoogleNews-vectors-negative300.bin', binary=True)
    elif os.path.exists('./data/GoogleNews-vectors-negative300.bin.gz'):
        # Context manager guarantees the gzip handle is closed even if loading fails.
        with gzip.open('./data/GoogleNews-vectors-negative300.bin.gz', 'rb') as google_w2v:
            word_vec = KeyedVectors.load_word2vec_format(fname=google_w2v, binary=True)
        del google_w2v
    else:
        # Fail here with a clear message instead of printing and letting a later
        # cell die with a NameError on `word_vec`.
        raise FileNotFoundError(
            'Embeddings not found: expected ./data/GoogleNews-vectors-negative300.bin '
            'or ./data/GoogleNews-vectors-negative300.bin.gz'
        )

In [None]:
# Tokenizer limited to hyperparam['vocab_size'] words (no limit when it is None);
# the filter string strips the listed punctuation, tab, newline and the apostrophe.
tokenizer = Tokenizer(num_words=hyperparam['vocab_size'], filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'')

In [None]:
# Training data; the cells below read its 'comment_text' column and the six label columns.
train = pd.read_csv('./data/train.csv')

In [None]:
# Test data; the cells below read its 'comment_text' and 'id' columns.
test = pd.read_csv('./data/test.csv')

In [None]:
# Raw training comments as a NumPy array of str (astype('str') also stringifies any non-string cells).
train_text = train['comment_text'].astype('str').values

In [None]:
# Raw test comments, converted the same way as the training comments.
test_text = test['comment_text'].astype('str').values

In [None]:
# Build the vocabulary from the training text only; the test text is not used here.
tokenizer.fit_on_texts(train_text)

In [None]:
# Convert each training comment to a list of integer word indices.
train_seq = tokenizer.texts_to_sequences(train_text)

In [None]:
# Convert each test comment with the tokenizer fitted on the training text.
test_seq = tokenizer.texts_to_sequences(test_text)

In [None]:
# Derive the vocabulary size from the fitted tokenizer when none was configured.
# Keras Tokenizer word indices start at 1 (0 is reserved for padding), so the
# largest index equals len(word_index). The embedding matrix and the Embedding
# layer need max_index + 1 rows; without the +1 the highest-index word is out of range.
if not hyperparam['vocab_size']:
    hyperparam['vocab_size'] = len(tokenizer.word_index) + 1
print('Vocab Size:', hyperparam['vocab_size'])

In [None]:
# Allocate the embedding matrix (vocab_size x embedding_dim, float32).
# 'normal': rows start as random-normal noise whose mean/std match the pretrained
#           vectors of the in-vocabulary words, so words without a pretrained
#           vector keep a random row after the fill cell runs.
# otherwise: rows start as zeros.
if hyperparam['missing_word_vectors']=='normal':
    known_vectors = []
    for word, index in tokenizer.word_index.items():
        if index >= hyperparam['vocab_size']:
            continue
        try:
            # Index the KeyedVectors object directly: its `.wv` alias is deprecated
            # in gensim 3.x and removed in gensim 4, while direct indexing works in both.
            known_vectors.append(word_vec[word])
        except KeyError:
            # Word has no pretrained vector; it does not contribute to the statistics.
            pass
    known_matrix = np.array(known_vectors)
    embedding_matrix = np.array(np.random.normal(known_matrix.mean(), known_matrix.std(), (hyperparam['vocab_size'], hyperparam['embedding_dim'])), dtype=np.float32)
    # Free the temporary copies of the pretrained vectors.
    del known_vectors
    del known_matrix
else:
    embedding_matrix = np.zeros((hyperparam['vocab_size'], hyperparam['embedding_dim']), dtype=np.float32)

In [None]:
# Copy the pretrained vector of every in-vocabulary word into its row of the
# embedding matrix, and record the corpus frequency of words that have none.
unknown_freq = {}
for word, index in tokenizer.word_index.items():
    if index >= hyperparam['vocab_size']:
        continue
    try:
        # Index the KeyedVectors object directly: its `.wv` alias is deprecated
        # in gensim 3.x and removed in gensim 4, while direct indexing works in both.
        embedding_matrix[index, :] = word_vec[word]
    except KeyError:
        unknown_freq[word] = tokenizer.word_counts[word]
# word_index keys are unique, so the number of unknown words is the dict size.
unknown_count = len(unknown_freq)

In [None]:
# Number of distinct vocabulary words that had no pretrained vector.
print('Unknown words', unknown_count)

In [None]:
# Total occurrences (tokenizer.word_counts) of the words without a pretrained vector.
print('Unknown Freq', sum(unknown_freq.values()))

In [None]:
# Fixed-length training matrix: sequences are cut or padded at the end to 'sequence_len' tokens.
X_train = pad_sequences(train_seq, maxlen=hyperparam['sequence_len'], truncating='post', padding='post')

In [None]:
# Training target: read all six label columns as one array, then keep only the
# first column ('toxic'), which leaves a 1-D label vector.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_train = train[label_columns].values[:, 0]

In [None]:
# Create the run's output directory. exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(f'./{name}', exist_ok=True)

In [None]:
# Record the hyperparameters next to the run outputs: a header line followed by
# one "key, value" line per entry (each entry is written with a leading newline and space).
hyperparam_rows = ['{}, {}'.format('Hyperparameter', 'Value')]
hyperparam_rows.extend('\n {}, {}'.format(str(key), str(value)) for key, value in hyperparam.items())
with open(f'./{name}/Hyperparameters.csv', 'w') as file:
    file.writelines(hyperparam_rows)

In [None]:
# Sanity check: display the matrix shape, which was allocated as (vocab_size, embedding_dim).
embedding_matrix.shape

In [None]:
def mini_batch_generator():
    """Yield consecutive (X, y) mini-batches from the global training arrays, forever.

    Batches of ``hyperparam['batch_size']`` rows are taken in order from the
    module-level ``X_train`` / ``y_train``; a batch that runs past the end wraps
    around to the start. If the batch size exceeds the number of samples, every
    sample is yielded once per batch.

    Unlike a roll-based implementation, the global arrays are never modified or
    copied in full, and ``y_train`` may be 1-D (single label) or 2-D (multi-label).

    Yields:
        tuple: ``(X_batch, y_batch)`` NumPy arrays with matching first dimension.

    Raises:
        ValueError: if ``X_train`` is empty.
    """
    start = 0
    while True:
        batch_size = hyperparam['batch_size']
        n_samples = len(X_train)
        if n_samples == 0:
            raise ValueError('X_train is empty; cannot generate mini-batches')
        # Row indices of the current batch, wrapped modulo the dataset size.
        rows = np.arange(start, start + min(batch_size, n_samples)) % n_samples
        # Indexing along the first axis only works for both 1-D and 2-D targets.
        yield (X_train[rows], y_train[rows])
        start = (start + batch_size) % n_samples

In [None]:
def computation_graph():
    """Assemble the uncompiled CNN text classifier described by ``hyperparam``.

    Layer stack, in order: Embedding (initialised from ``embedding_matrix``),
    Conv1D, global max pooling, Dense, Dropout, Activation, and a final sigmoid
    Dense layer with ``n_class`` units. Layer names embed the hyperparameter values.

    Returns:
        The ``Sequential`` model; compiling it is left to the caller.
    """
    n_filters = hyperparam['filters']
    kernel = hyperparam['kernel_size']
    conv_act = hyperparam['conv_activation']
    dense_act = hyperparam['dense_activation']
    drop_rate = hyperparam['dropout']
    n_out = hyperparam['n_class']

    conv_name = '_'.join(['Convolution_1D', str(n_filters), str(kernel), str(conv_act)])

    layer_stack = [
        Embedding(hyperparam['vocab_size'], hyperparam['embedding_dim'],
                  weights=[embedding_matrix], name='Embedding_Layer'),
        Conv1D(filters=n_filters, kernel_size=kernel, activation=conv_act, name=conv_name),
        GlobalMaxPool1D(name='Global_Max_Pooling'),
        Dense(units=n_filters, name='Dense_' + str(n_filters)),
        # Dropout sits between the hidden Dense layer and its activation.
        Dropout(rate=drop_rate, name='Dropout_' + str(drop_rate)),
        Activation(dense_act, name='Activation_' + str(dense_act)),
        Dense(units=n_out, activation='sigmoid', name='Dense_' + str(n_out) + '_Sigmoid'),
    ]

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)
    return network

In [None]:
# Instantiate the network; it is compiled in a later cell.
model = computation_graph()

In [None]:
# Print the model summary for inspection.
model.summary()

In [None]:
# Compile with binary cross-entropy and Adam. Gradient clipping arguments are
# forwarded to the optimizer only when the corresponding hyperparameter is set.
optimizer_kwargs = {}
if hyperparam['gradient_clip_value'] is not None:
    optimizer_kwargs['clipvalue'] = hyperparam['gradient_clip_value']
if hyperparam['gradient_clip_norm'] is not None:
    optimizer_kwargs['clipnorm'] = hyperparam['gradient_clip_norm']

model.compile(loss='binary_crossentropy',
              optimizer=Adam(hyperparam['learning_rate'], **optimizer_kwargs),
              metrics=['accuracy'])

In [None]:
# Hold out a validation set when one is requested or early stopping needs it.
# NOTE: this rebinds X_train / y_train to the reduced training portion, so
# re-running the cell without re-creating them splits the data again.
# random_state is fixed so the split is the same on every run.
if hyperparam['validation_split'] or hyperparam['early_stopping']:
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=hyperparam['validation_split'], random_state=22)

In [None]:
# Choose the training callbacks and validation data:
#   early stopping   -> EarlyStopping + best-model checkpoint (patience 5 when
#                       training by steps_per_epochs, library default otherwise)
#   validation only  -> best-model checkpoint
#   neither          -> no callbacks and no validation data
if hyperparam['early_stopping']:
    stopper_kwargs = {'verbose': 1}
    if hyperparam['steps_per_epochs']:
        stopper_kwargs['patience'] = 5
    callback = [EarlyStopping(**stopper_kwargs),
                ModelCheckpoint(f'./{name}/model_best.h5', save_best_only=True)]
    validation_data = (X_val, y_val)
elif hyperparam['validation_split']:
    callback = [ModelCheckpoint(f'./{name}/model_best.h5', save_best_only=True)]
    validation_data = (X_val, y_val)
else:
    callback, validation_data = None, None

In [None]:
# Train the model. Arguments shared by both training paths are collected once.
shared_fit_kwargs = dict(epochs=hyperparam['epochs'],
                         callbacks=callback,
                         validation_data=validation_data)

if hyperparam['steps_per_epochs']:
    # Fixed number of generator batches per epoch.
    history = model.fit_generator(generator=mini_batch_generator(),
                                  steps_per_epoch=hyperparam['steps_per_epochs'],
                                  **shared_fit_kwargs)
else:
    # Full passes over the in-memory arrays, shuffled each epoch.
    history = model.fit(x=X_train, y=y_train,
                        batch_size=hyperparam['batch_size'],
                        shuffle=True,
                        **shared_fit_kwargs)

In [None]:
# Write test-set predictions from the in-memory model to submission.csv.
if save_predictions:
    X_test = pad_sequences(test_seq, maxlen=hyperparam['sequence_len'], truncating='post', padding='post')
    y_pred = model.predict(X_test, batch_size=hyperparam['batch_size'])
    # The model emits hyperparam['n_class'] columns (1 in this notebook, trained on
    # 'toxic'), so only create as many label columns as predictions exist.
    # Indexing six fixed columns raises IndexError for a single-output model.
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    submission = pd.DataFrame({'id': test['id']})
    for column_index, label in enumerate(label_columns[:y_pred.shape[1]]):
        submission[label] = y_pred[:, column_index]
    submission.to_csv(f'./{name}/submission.csv', index=False)

In [None]:
# Write test-set predictions from the best checkpoint to submission_checkpoint.csv.
if use_best_checkpoint:
    # Load into a separate name so `model` keeps pointing at the last-epoch
    # network; otherwise a later save to model_last.h5 would store the best checkpoint.
    best_model = load_model(f'./{name}/model_best.h5')
    X_test = pad_sequences(test_seq, maxlen=hyperparam['sequence_len'], truncating='post', padding='post')
    y_pred = best_model.predict(X_test, batch_size=hyperparam['batch_size'])
    # The model emits hyperparam['n_class'] columns (1 in this notebook, trained on
    # 'toxic'), so only create as many label columns as predictions exist.
    # Indexing six fixed columns raises IndexError for a single-output model.
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    submission = pd.DataFrame({'id': test['id']})
    for column_index, label in enumerate(label_columns[:y_pred.shape[1]]):
        submission[label] = y_pred[:, column_index]
    submission.to_csv(f'./{name}/submission_checkpoint.csv', index=False)

In [None]:
# Either keep the model on disk, or remove the checkpoint written during training.
if not save_model:
    try:
        os.remove(f'./{name}/model_best.h5')
    except FileNotFoundError:
        # No checkpoint was written, so there is nothing to clean up.
        pass
else:
    model.save(f'./{name}/model_last.h5')