In [2]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D

In [3]:
# Read the labelled training comments and the unlabelled test comments, then
# the separate file holding the test-set labels. All paths are relative to the
# notebook's working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))
y_test = pd.read_csv("test_labels.csv")

In [4]:
# Hyperparameters shared by the tokenizer, the padding step and the model.
# Vocabulary cap: used as Tokenizer(num_words=...) and as the Embedding input size.
MAX_FEATURES = 20000
# Every tokenized comment is padded/truncated to this many tokens.
MAX_LEN = 512
# Output dimension of the Embedding layer.
EMBEDDING_SIZE = 128

In [5]:
# Preview the first rows of the training frame: id, comment_text and the six label columns.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Preview the first rows of the test frame: only id and comment_text, no labels.
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [7]:
# The six binary target columns; their order fixes the column order of the
# label matrices below.
classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Label matrices as plain NumPy arrays, one column per entry in `classes`.
# NOTE: `y_test` is rebound from a DataFrame to an array here, so it no longer
# has an "id" column after this cell and the cell cannot be re-run as-is.
y = train[classes].values
y_test = y_test[classes].values

In [8]:
# Raw comment text for each split, kept as pandas Series.
list_sentences_train, list_sentences_test = train["comment_text"], test["comment_text"]

In [9]:
# Fit the vocabulary on the training comments only; the test split is encoded
# with the same tokenizer, and words it does not know map to the OOV token.
tokenizer = Tokenizer(oov_token="<oov", num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list_sentences_train.tolist())

# Encode both splits as lists of integer token ids (train first, then test).
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

In [10]:
# Both splits are padded and truncated identically: fixed length MAX_LEN, with
# padding added and excess tokens dropped at the end of each sequence.
pad_options = {"maxlen": MAX_LEN, "padding": "post", "truncating": "post"}
X_train_padded = pad_sequences(list_tokenized_train, **pad_options)
X_test_padded = pad_sequences(list_tokenized_test, **pad_options)

In [11]:
# Functional-API graph: token ids -> embeddings -> LSTM over the whole
# sequence -> max over time -> small dense head -> six sigmoid outputs.
inp = Input(shape=(MAX_LEN,))
embedded = Embedding(input_dim=MAX_FEATURES, output_dim=EMBEDDING_SIZE)(inp)
# return_sequences=True keeps one output per timestep so the pooling layer
# below has a time axis to reduce over.
lstm_out = LSTM(units=60, return_sequences=True, name="lstm-layer")(embedded)
pooled = GlobalMaxPool1D()(lstm_out)
pooled = Dropout(rate=0.1)(pooled)
hidden = Dense(units=50, activation="relu")(pooled)
hidden = Dropout(rate=0.1)(hidden)
# One independent sigmoid per label (multi-label output); `x` is the output
# tensor consumed when the Model is built.
x = Dense(units=6, activation="sigmoid")(hidden)

In [12]:
# Wrap the graph into a trainable model and configure it for multi-label
# classification: per-output binary cross-entropy, optimized with Adam.
model = Model(inputs=inp, outputs=x)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [13]:
# Training settings; 10% of the training rows are reserved for validation.
batch_size, epochs = 64, 2
model.fit(
    X_train_padded,
    y,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f141414f080>

In [14]:
# Evaluate only on test rows that carry real labels. In the Jigsaw
# toxic-comment release, test_labels.csv marks rows that were not used for
# scoring with -1 in every label column; passing -1 targets to binary
# cross-entropy/accuracy produces meaningless numbers. If no -1 rows are
# present the mask is all-True and this evaluates on the full test set.
scored_rows = (y_test != -1).all(axis=1)
test_loss, test_acc = model.evaluate(X_test_padded[scored_rows], y_test[scored_rows])



In [15]:
# Predicted outputs for every padded test comment; the model's final layer
# has six sigmoid units, so each row holds six scores.
preds = model.predict(X_test_padded)

In [22]:
# Build the submission frame: one row per test comment, with its id followed
# by the six predicted scores in `classes` order.
# The ids come from `test`, the frame the predictions were computed from.
# `y_test` was rebound to a NumPy label array earlier in the notebook, so
# `y_test["id"]` fails on a fresh Restart & Run All.
# `.values` drops the index so the ids line up positionally with `preds`.
submid = pd.DataFrame({"id": test["id"].values})
submission = pd.concat([submid, pd.DataFrame(preds, columns=classes)], axis=1)

In [23]:
# Write the submission to the working directory; index=False omits the
# DataFrame's row index from the file.
submission.to_csv('submission.csv', index=False)