In [1]:
import numpy as np
import pandas as pd

import gzip
import os
import gc
from tqdm import tqdm

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from gensim.models import Word2Vec, KeyedVectors

In [None]:
from keras.models import Sequential, load_model
from keras.layers import Conv1D, GlobalMaxPool1D, Dense, Dropout, Activation
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Fixed number of token rows per comment: pad_zero()/pad_nan() pad or truncate
# every embedding matrix to this length, and it is the second axis of X_test.
maxlen = 100

In [3]:
# Shared configuration dictionary.
# Only 'embedding_dim' (pad_zero / pad_nan) and 'kernel_size' (pad_nan) are read
# by the cells visible in this notebook; the remaining entries presumably mirror
# the settings the saved model was trained with -- TODO confirm.
hyperparam = {
    # width of one word vector
    'embedding_dim': 300,
    # convolution settings
    'filters': 100,
    'kernel_size': 3,
    'conv_activation': 'relu',
    # classifier head settings
    'dropout': 0.5,
    'dense_activation': 'relu',
    'n_class': 6,
    # batching
    'batch_size': 128,
}

In [4]:
def get_wordvectors(words):
    """Look up embeddings for a list of tokens in the global ``word_vec`` model.

    The whole list is looked up in one call first.  If that raises ``KeyError``
    the tokens are filtered down to those present in ``word_vec.wv.vocab`` and
    the lookup is repeated with the survivors only.

    Parameters
    ----------
    words : sequence of str
        Tokens of one comment.

    Returns
    -------
    The result of ``word_vec.wv[...]`` for the known tokens (presumably one
    row per token -- the padding helpers rely on ``shape[0]``), or ``None``
    when the lookup raises ``ValueError`` (presumably the empty-list case --
    TODO confirm) or when no token is in the vocabulary.
    """
    vectors = word_vec.wv
    try:
        # Fast path: every token is known.
        return vectors[words]
    except ValueError:
        return None
    except KeyError:
        # At least one token is unknown: keep only the ones in the vocabulary.
        known = [token for token in words if token in vectors.vocab]
    return vectors[known] if known else None

In [5]:
def pad_zero(arr):
    """Pad or truncate an embedding matrix to ``maxlen`` rows using zero rows.

    Parameters
    ----------
    arr : 2-D array or None
        Matrix with ``hyperparam['embedding_dim']`` columns, or ``None``.

    Returns
    -------
    A ``(maxlen, embedding_dim)`` array: all zeros for ``None``, the first
    ``maxlen`` rows of ``arr`` when it is long enough, otherwise ``arr``
    followed by zero rows.
    """
    dim = hyperparam['embedding_dim']
    if arr is None:
        return np.zeros((maxlen, dim))

    missing = maxlen - arr.shape[0]
    if missing <= 0:
        # Already long enough: just cut to the fixed length.
        return arr[:maxlen, :]

    filler = np.zeros((missing, dim))
    return np.vstack((arr, filler))

In [6]:
def pad_nan(arr):
    """Pad or truncate an embedding matrix to ``maxlen`` rows using NaN rows.

    A ``None`` input is first replaced by ``hyperparam['kernel_size']`` rows
    of zeros, so the result always starts with some non-NaN rows.

    Parameters
    ----------
    arr : 2-D array or None
        Matrix with ``hyperparam['embedding_dim']`` columns, or ``None``.

    Returns
    -------
    A ``(maxlen, embedding_dim)`` array whose trailing rows are NaN when the
    input had fewer than ``maxlen`` rows.
    """
    dim = hyperparam['embedding_dim']
    if arr is None:
        arr = np.zeros((hyperparam['kernel_size'], dim))

    missing = maxlen - arr.shape[0]
    if missing > 0:
        # float32 NaN block appended below the real rows
        nan_rows = np.full((missing, dim), np.nan, dtype=np.float32)
        arr = np.vstack((arr, nan_rows))
    return arr[:maxlen, :]

In [7]:
# Load the pretrained GoogleNews word2vec vectors into the global `word_vec`,
# preferring the uncompressed .bin file and falling back to the gzipped copy.
if os.path.exists('./data/GoogleNews-vectors-negative300.bin'):
    word_vec = KeyedVectors.load_word2vec_format(fname='./data/GoogleNews-vectors-negative300.bin', binary=True)
elif os.path.exists('./data/GoogleNews-vectors-negative300.bin.gz'):
    # The context manager closes the gzip handle once the vectors are loaded
    # (it was previously left open for the lifetime of the kernel).
    with gzip.open('./data/GoogleNews-vectors-negative300.bin.gz', 'rb') as google_w2v:
        word_vec = KeyedVectors.load_word2vec_format(fname=google_w2v, binary=True)
else:
    # Fail here instead of printing and continuing: without `word_vec` the
    # later get_wordvectors() calls can only die with a NameError.
    raise FileNotFoundError(
        'Embeddings not found: expected ./data/GoogleNews-vectors-negative300.bin '
        'or ./data/GoogleNews-vectors-negative300.bin.gz'
    )

In [8]:
# Vectorizer used only as a tokenizer factory (see build_analyzer() below):
# lower-cases text and drops NLTK's English stop words.
# stop_words is documented as 'english', a list or None, so the NLTK word list
# is passed as the list it already is rather than being wrapped in a set.
TokenVectorizer = CountVectorizer(stop_words=stopwords.words('english'), lowercase=True)

In [9]:
# Callable returned by the vectorizer's build_analyzer(); it is applied to each
# raw comment string below to produce that comment's token list.
tokenizer = TokenVectorizer.build_analyzer()

In [10]:
# Load the test set; later cells read its 'id' and 'comment_text' columns.
test = pd.read_csv('./data/test.csv')

In [11]:
# Sanity check: (rows, columns) of the frame just loaded.
test.shape

(226998, 2)

In [12]:
# Replace every missing value with a single space so the tokenizer always
# receives a string.
test = test.fillna(' ')

In [13]:
# Tokenize every comment with the analyzer built from TokenVectorizer.
test['comment_tokens'] = test['comment_text'].map(tokenizer)

In [14]:
# Turn each token list into its embedding matrix (None when get_wordvectors()
# finds no usable token).
test['comment_wordvec'] = test['comment_tokens'].map(get_wordvectors)

In [15]:
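# Filter left disabled: rows whose comment_wordvec is None are kept, because
# pad_nan() substitutes a placeholder block for them and the submission is built
# from every test id, so no row may be dropped here.
# test = test.loc[~test['comment_wordvec'].isnull(), :]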

In [16]:
# Relabel rows 0..n-1 so that index labels coincide with row positions.
test.index = np.arange(len(test))

In [17]:
# Preallocate the model input: one (maxlen, embedding_dim) float32 matrix per
# comment. The last axis comes from hyperparam so it cannot drift from the width
# pad_nan() produces (it was a hard-coded 300 before).
# NOTE(review): with the 226,998 rows reported by test.shape this is
# 226998 * 100 * 300 * 4 bytes, roughly 27 GB in one block, and the recorded run
# raised MemoryError here -- consider filling and predicting in chunks instead.
X_test = np.empty((test.shape[0], maxlen, hyperparam['embedding_dim']), dtype=np.float32)

MemoryError: 

In [None]:
# Copy each comment's padded embedding matrix into its slot of X_test.
# Iterating the single column positionally avoids iterrows(), which builds a
# full Series per row and ties the write position to the index labels.
for i, wordvec in enumerate(tqdm(test['comment_wordvec'], total=test.shape[0])):
    X_test[i, :, :] = pad_nan(wordvec)

In [18]:
# Remove the per-comment embedding column from the frame (the fill loop copies
# its contents into X_test) and run a garbage-collection pass; gc.collect()
# is the cell's last expression, so its return value is displayed.
del test['comment_wordvec']
gc.collect()

7

In [19]:
# Load the trained Keras model from disk.
# NOTE(review): the file name says "LSTM" while this notebook imports
# Conv1D/GlobalMaxPool1D layers -- confirm this is the intended checkpoint.
model  = load_model('./models/Baseline_LSTM_es.h5')

In [20]:
# Score the whole test tensor in batches of 256.
# NOTE(review): pad_nan() leaves NaN rows in X_test for comments shorter than
# maxlen -- confirm the saved model was trained on, and tolerates, NaN padding.
y_pred = model.predict(X_test, batch_size=256)

In [22]:
# Build the submission frame: 'id' followed by one probability column per label.
# The labels are listed once, in the order of y_pred's columns 0..5, so the
# output column order no longer depends on dict ordering and the column/index
# pairing is stated in one place.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame(
    y_pred[:, :len(label_columns)],  # same columns 0..5 as before
    columns=label_columns,           # raises if y_pred has fewer columns than labels
    index=test.index,
)
submission.insert(0, 'id', test['id'])

In [24]:
# Write the submission file without the index column, formatting every float
# with 15 decimal places.
submission.to_csv('./Submission_LSTM.csv', index=False, float_format='%.15f')