In [13]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [10]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Fixed sequence length: used as the pad/truncate target and the model input width.
maxlen = 100
# Vocabulary size shared by the tokenizer (num_words) and the embedding layer.
max_features = 20000


ImportError: No module named keras.models

In [11]:
# Load the training and test sets from CSV files in the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


In [5]:
# Label columns the model predicts; their order fixes the column order of y.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Target matrix: one row per training comment, one column per entry in list_classes.
y = train[list_classes].values

# Raw comment text as arrays; missing comments are replaced by the
# placeholder string "CVxTz" so the tokenizer always receives a string.
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_sentences_test = test["comment_text"].fillna("CVxTz").values


In [6]:
# Create the Keras tokenizer; num_words=max_features caps how many words
# (ranked by frequency) are kept when texts are converted to sequences.
tokenizer = text.Tokenizer(num_words=max_features)

In [7]:
# Build the word index from the training comments only; the test text is not used for fitting.
tokenizer.fit_on_texts(list(list_sentences_train))

In [8]:
# Convert each training comment into a list of integer word indices.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)

In [9]:
# Convert each test comment into integer word indices using the same fitted tokenizer.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [10]:
# Pad or truncate every training sequence to exactly maxlen entries so they form a 2-D array.
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)

In [11]:
# Pad or truncate every test sequence to the same maxlen used for the training data.
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

#### Define the model architecture (layers and activation functions) and train the model

In [12]:
def get_model(embed_size=128, lstm_units=50, dense_units=50,
              dropout_rate=0.1, num_classes=6):
    """Build and compile the bidirectional-LSTM comment classifier.

    The defaults reproduce the original hard-coded architecture, so calling
    ``get_model()`` with no arguments is unchanged.

    The input length and vocabulary size are read from the notebook-level
    ``maxlen`` and ``max_features`` at call time, so they stay in sync with
    the tokenizer and padding cells.

    Args:
        embed_size: Width of the learned word-embedding vectors.
        lstm_units: Units per direction in the bidirectional LSTM.
        dense_units: Units in the hidden ReLU layer.
        dropout_rate: Dropout rate applied after pooling and after the
            hidden dense layer.
        num_classes: Number of sigmoid outputs; should match the number of
            label columns in the target matrix.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    # return_sequences=True keeps one output per timestep so the pooling
    # layer below can take the max over the whole sequence.
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # One sigmoid unit per class: each label is scored separately rather
    # than through a softmax over all classes.
    x = Dense(num_classes, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [13]:
# Training hyperparameters consumed by the model.fit cell further down.
batch_size = 32
epochs = 2

# Build the compiled network and print its layer-by-layer summary.
model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 100)          71600     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________

In [None]:
# Checkpoint target; a later cell reloads the weights from this same path.
file_path = "weights_base.best.hdf5"

# Write weights to file_path only on epochs where validation loss hits a new minimum.
checkpoint = ModelCheckpoint(
    file_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)

# NOTE(review): patience=20 is larger than the epochs = 2 configured earlier,
# so with the current settings this callback cannot stop training early.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

callbacks_list = [checkpoint, early]

# Train with 10% of the rows held out for validation (validation_split=0.1).
model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)


Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2

In [None]:
# List the working directory with the standard library instead of the bare
# `ls` automagic, which only resolves inside IPython with automagic enabled
# and raises NameError when this cell is run as plain Python.
sorted(os.listdir("."))

In [None]:
# Restore the weights that ModelCheckpoint saved to file_path (the epoch with the lowest val_loss).
model.load_weights(file_path)

In [None]:
# Run the model on the padded test sequences to get its per-class output scores.
y_test = model.predict(X_te)

In [None]:
# Load the submission template.
# NOTE(review): train.csv and test.csv are read from the working directory,
# but this file is read from ./input/ -- confirm the intended data location.
sample_submission = pd.read_csv("./input/sample_submission.csv")

In [None]:
# Overwrite the template's class columns with the predictions; the column order follows list_classes.
sample_submission[list_classes] = y_test

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
sample_submission.to_csv("baseline.csv", index=False)