In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from keras import layers, models
from keras.models import Sequential

from keras.preprocessing.text import Tokenizer
from keras import preprocessing

import matplotlib.pyplot as plt

In [2]:
import os

# Ask TensorFlow's native logger to print only its most severe messages.
# NOTE(review): tensorflow is already imported in the first cell; the variable
# is usually expected to be set before that import -- confirm it takes effect.
os.environ.update(TF_CPP_MIN_LOG_LEVEL='3')

In [3]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up
# front. Memory growth is enabled on every GPU that TensorFlow reports, not
# just the first one. When no GPU is present the loop body never runs, so the
# notebook also works on a CPU-only machine (indexing physical_devices[0]
# would raise IndexError there).
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

In [4]:
# Read the training and test tables from the working directory, in that order.
df_train, df_test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [5]:
# Tunable sizes shared by the preprocessing and model cells.
NUM_WORDS = 2_000   # passed to Tokenizer(num_words=...) and used to size the Embedding
MAX_LEN = 50        # length every token sequence is padded/truncated to
NUM_FEATURES = 6    # NOTE(review): not referenced by the visible cells -- presumably the label count

In [6]:
# Build the vocabulary from the training comments only (capped through
# num_words=NUM_WORDS), then encode both splits with that same vocabulary.
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(df_train['comment_text'])

sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(frame['comment_text'])
    for frame in (df_train, df_test)
)

In [7]:
# Bring every encoded comment to exactly MAX_LEN token ids.
x_train, x_test = (
    preprocessing.sequence.pad_sequences(encoded, maxlen=MAX_LEN)
    for encoded in (sequences_train, sequences_test)
)

# Everything from the third column onwards is treated as a label column.
# NOTE(review): assumes the first two columns are the id and the comment text -- confirm.
ans_columns = df_train.columns[2:]

In [8]:
# Shared trunk: token ids -> 32-d embeddings -> bidirectional GRU summary vector.
# NOTE(review): mask_zero is not enabled on the Embedding, so the zero padding
# is processed by the GRU like any other token -- confirm this is intended.
input_layer = keras.Input(shape=(None,))
x = layers.Embedding(
    NUM_WORDS + 1,
    32,
    input_length=MAX_LEN,
)(input_layer)
x = layers.Bidirectional(
    layers.GRU(32, dropout=0.2)
)(x)

In [9]:
# One independent sigmoid head per label column, all fed by the shared trunk `x`.
outputs_layers = [
    layers.Dense(1, activation='sigmoid')(x)
    for _ in ans_columns
]
# Targets and losses are lists aligned with the heads: entry i belongs to
# the i-th label column.
y_train = [df_train[column].values for column in ans_columns]
losses = ['binary_crossentropy'] * len(ans_columns)

In [10]:
# Assemble the multi-output model: one input, one output tensor per label head.
model = keras.Model(
    inputs=[input_layer],
    outputs=outputs_layers,
)

In [11]:
# `losses` supplies one loss per output head; accuracy is tracked as the metric.
model.compile(
    optimizer='rmsprop',
    loss=losses,
    metrics=['acc'],
)

In [17]:
# Sanity check: display the padded training matrix (zeros fill the left side
# of short sequences, as the output below shows).
# NOTE(review): this cell's execution count (17) is out of sequence with its
# neighbours -- verify the notebook with Restart Kernel -> Run All.
x_train

array([[   0,    0,    0, ...,   73,   89,  985],
       [   0,    0,    0, ...,  992,  589,  182],
       [   0,    0,    0, ...,    1,  737,  468],
       ...,
       [   0,    0,    0, ...,  737,   23,   12],
       [   0,    0,    0, ...,  151,   34,   11],
       [   0,    0,    0, ...,    3, 1627,   88]], dtype=int32)

In [12]:
# Train all heads together; y_train carries one label array per output head.
# NOTE(review): the log saved under this cell reads "Epoch N/20" while the code
# requests 25 epochs -- the stored output appears to come from a different run;
# confirm the intended epoch count and re-run before relying on the results.
history = model.fit(
    x=[x_train],
    y=y_train,
    batch_size=64,
    epochs=25,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
# Score the test comments in batches of 32 samples.
result = model.predict([x_test], batch_size=32)

In [14]:
# Turn the raw predictions into one (n_samples, n_labels) matrix.
#
# The cell above leaves `result` as one (n_samples, 1) array per output head,
# which stacks to (n_labels, n_samples, 1). The sizes are taken from the data
# instead of being hard-coded, so a different test set or label count works.
# The ndim guard makes the cell safe to re-run: an already converted 2-D
# matrix is left untouched instead of being reshaped into scrambled values.
result = np.asarray(result)
if result.ndim == 3:
    # Drop the trailing singleton axis (squeeze raises if it is not size 1),
    # then put samples on the rows and labels on the columns.
    result = result.squeeze(axis=-1).T

# Fail loudly if the matrix does not line up with the test rows / label columns.
assert result.shape == (len(df_test), len(ans_columns)), (
    f"unexpected prediction shape {result.shape}; "
    f"expected {(len(df_test), len(ans_columns))}"
)

In [15]:
# Submission table: one row per test comment (indexed by its id), one
# probability column per label.
df_ans = pd.DataFrame(
    result,
    index=df_test['id'],
    columns=ans_columns,
)
df_ans

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.964374,0.155389,0.919356,8.093838e-03,0.854456,0.078526
0000247867823ef7,0.003424,0.000032,0.000836,8.670717e-07,0.000456,0.000035
00013b17ad220c46,0.002997,0.000095,0.000705,1.520311e-05,0.000477,0.000166
00017563c3f7919a,0.001874,0.000023,0.000611,4.940818e-06,0.000404,0.000071
00017695ad8997eb,0.033319,0.000455,0.007053,3.400265e-04,0.003800,0.000341
...,...,...,...,...,...,...
fffcd0960ee309b5,0.675277,0.002947,0.265462,2.438026e-04,0.085664,0.005211
fffd7a9a6eb32c16,0.028768,0.000283,0.006166,6.299827e-05,0.002732,0.000308
fffda9e8d6fafa9e,0.001542,0.000018,0.000567,1.651945e-06,0.000291,0.000034
fffe8f1340a79fc2,0.004853,0.000330,0.001881,7.181635e-05,0.001018,0.001234


In [16]:
# Write the predictions to disk; the index (the test ids) is written as well,
# since to_csv includes the index by default.
df_ans.to_csv(path_or_buf='ans.csv')