In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the names of the entries found directly under that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import chainer
from chainer import datasets
from chainer import functions as F
from chainer import links as L
from chainer import Variable
from chainer.backends import cuda
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import nltk
import re
import gensim
from tqdm import tqdm
import gc

In [None]:
# Lemmatizer applied to every token during the normalisation pass.
wnl = nltk.stem.WordNetLemmatizer()

# Token-shape patterns used by the normalisation pass:
#   head_check - a single uppercase character followed only by non-uppercase characters
#   URL_slash  - token beginning with "//" and then letters, slashes or dots
#   URL_www    - token beginning with "www" and then letters, slashes or dots
head_check, URL_slash, URL_www = (
    re.compile(pattern)
    for pattern in ('^[A-Z][^A-Z]+$', '^//[A-Za-z/.]+', '^www[A-Za-z/.]+')
)

# Array module used to build the arrays fed to the model (CuPy, as exposed by chainer).
xp = cuda.cupy

In [None]:
# --- Embedding settings ---
# Not referenced in the visible cells; presumably the min_count the word2vec
# model was trained with - TODO confirm.
MIN_COUNT = 10
# Length of the all-zero vector substituted for out-of-vocabulary tokens and padding.
VECTOR_SIZE = 300

# --- Batch construction / data split ---
# Mini-batch size, used both for balanced sampling and for the training iterator.
BATCH_SIZE = 256
# Success probability of the binomial draw that decides how many positive
# examples go into each sampled batch.
P = 0.4
# train_size argument passed to train_test_split.
TRAIN_SIZE = 0.75

# --- Training schedule ---
# Training stops once the iterator has completed this many epochs.
MAX_EPOCH = 10
# length of truncated BPTT: number of mini-batches whose losses are summed
# before each backward pass / optimizer update.
BPROP_LEN = 20

In [None]:
# Read the competition train and test tables.
train_df = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
test_df = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")

# One series holding every comment: all training comments first, then all
# test comments. Later cells rely on this order to split them apart again.
all_df = pd.concat([frame['comment_text'] for frame in (train_df, test_df)])

# Binary label as int32: 1 where target >= 0.5, otherwise 0.
y = (train_df['target'].values >= 0.5).astype(np.int32)
# Alternative (disabled): keep the raw float target instead of a binary label.
# y = np.array(train_df['target'], dtype=np.float32)


In [None]:
# Tokenise every comment (train followed by test) with NLTK; tqdm shows progress.
t_all = [nltk.word_tokenize(s) for s in tqdm(all_df.values)]

# The raw frames are no longer needed once the token lists exist.
del all_df, train_df, test_df
gc.collect()

In [None]:
# Normalise tokens sentence by sentence:
#   * a token matching head_check gets its first character lower-cased,
#   * a token matching either URL pattern is replaced by the literal "URL_text",
#   * everything else is kept unchanged,
# and the result is lemmatised with the WordNet lemmatizer.
s_all = []

for sentence in t_all:
    vector = []
    for token in sentence:
        if head_check.match(token) is not None:
            token = token[0].lower() + token[1:]
        elif URL_slash.match(token) is not None or URL_www.match(token):
            token = "URL_text"
        vector.append(wnl.lemmatize(token))
    s_all.append(vector)

# Release the raw token lists; head_check is also dropped, so this cell cannot
# be re-run without first re-running the cell that compiles the patterns.
del t_all, head_check
gc.collect()

In [None]:
# Restore the previously trained word2vec model from the attached dataset.
# NOTE(review): gensim's load() unpickles the file - only load model files you trust.
word2vec = gensim.models.Word2Vec.load('../input/word2vec-model/word2vec.model')

In [None]:
# s_all holds the training comments first and the test comments after them;
# len(y) is the number of training rows, so it marks the split point.
t_train, t_pre = s_all[:len(y)], s_all[len(y):]

del s_all
gc.collect()

In [None]:
# Vocabulary of the embedding model as an index -> word list, and the
# reverse word -> index lookup built from it.
i2w = word2vec.wv.index2word
words = dict(zip(i2w, range(len(i2w))))

In [None]:
# Drop training comments that contain no in-vocabulary token, together with
# their labels.
#
# Two defects of the previous version are fixed here:
#   * Out-of-vocabulary tokens were encoded as 0 and a comment was dropped when
#     its encoded sum was 0, but 0 is also the index of a real vocabulary word,
#     so comments made only of that word were discarded as well. Membership in
#     `words` is now tested directly.
#   * np.delete() was applied to the Python list of token lists. It converts
#     its input to an array first; with no axis given it flattens a 2-D result
#     (all comments of equal length), and ragged input is not reliably
#     convertible. Plain list filtering has neither problem.
keep_mask = np.fromiter(
    (any(token in words for token in sentence) for sentence in t_train),
    dtype=bool,
    count=len(t_train),
)

# Positions of the removed comments, kept under the original name.
zero_index = np.flatnonzero(~keep_mask).tolist()

# t_train stays a list of token lists; y stays an int32 array aligned with it.
t_train = [sentence for sentence, keep in zip(t_train, keep_mask) if keep]
y = y[keep_mask]

In [None]:
# Re-sample the training set into consecutive blocks of BATCH_SIZE examples.
# Each block gets n_pos ~ Binomial(BATCH_SIZE, P) positive examples and
# BATCH_SIZE - n_pos negative ones, shuffled within the block. Examples are
# consumed in index order without replacement; sampling stops as soon as
# either class cannot fill its share of the next block.
#
# NOTE(review): np.random is not seeded in the visible cells, so the sampled
# blocks differ from run to run.
y0_index = np.where(y < 0.5)[0]
y1_index = np.where(y >= 0.5)[0]
y_list = []
x_list = []
while True:
    n_pos = np.random.binomial(BATCH_SIZE, P)
    n_neg = BATCH_SIZE - n_pos
    if len(y1_index) < n_pos or len(y0_index) < n_neg:
        break

    index = list(y1_index[:n_pos]) + list(y0_index[:n_neg])
    np.random.shuffle(index)

    x_list.extend(t_train[i] for i in index)
    y_list.extend(y[i] for i in index)

    # Drop the consumed indices with one slice per class. The previous version
    # called np.delete() once per consumed element, copying the whole index
    # array each time (quadratic in the number of examples).
    y1_index = y1_index[n_pos:]
    y0_index = y0_index[n_neg:]

del t_train

In [None]:
# Turn every sampled comment into a list of embedding vectors.
# Tokens missing from the embedding vocabulary are represented by `zero`.
zero = np.zeros((VECTOR_SIZE,), dtype=np.float32)

# Look vectors up through the model's KeyedVectors (`.wv`), as the vocabulary
# cell already does. Indexing the Word2Vec model object directly is the
# deprecated path.
keyed_vectors = word2vec.wv

X = []
for sentence in x_list:
    vector = []
    for s in sentence:
        try:
            vector.append(keyed_vectors[s])
        except KeyError:
            vector.append(zero)
    X.append(vector)

# Labels in the same (re-sampled) order as X.
y = np.array(y_list, dtype=np.int32)

del x_list, y_list

In [None]:
# Longest sequence across the embedded training comments and the tokenised
# test comments; every sequence is padded up to this length.
max_len = max(max(len(seq) for seq in X), max(len(seq) for seq in t_pre))

# Right-pad each training sequence in place with the shared zero vector.
for seq in X:
    seq.extend([zero] * (max_len - len(seq)))


In [None]:
# Hold out part of the padded, embedded comments for validation; TRAIN_SIZE is
# passed as train_size. No random_state is given, so the split changes from
# run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE)

In [None]:
class jigsaw_dataset(chainer.dataset.DatasetMixin):
    """Chainer dataset pairing each input sequence with its label.

    Args:
        X: indexable collection of input examples.
        y: indexable collection of labels, the same length as ``X``.

    Attributes:
        X_train: the inputs as passed in.
        y_train: the labels as passed in.
        n_train: number of examples (``len(y)``).
    """

    def __init__(self, X, y):
        self.X_train = X
        self.y_train = y
        self.n_train = len(self.y_train)
        # Build the pairing once. The previous version constructed a new
        # TupleDataset inside get_example, i.e. once per fetched example.
        self._pairs = datasets.tuple_dataset.TupleDataset(self.X_train, self.y_train)

    def __len__(self):
        """Return the number of examples."""
        return self.n_train

    def get_example(self, i):
        """Return the ``(input, label)`` pair stored at index ``i``."""
        return self._pairs[i]

In [None]:
class RNN(chainer.Chain):
    """Linear -> LSTM -> LSTM -> Linear network with two output classes.

    Args:
        n_units: output size of the first linear layer and of both LSTM layers.

    NOTE(review): with its default arguments L.Linear flattens all non-batch
    axes, so a (batch, max_len, dim) input is consumed as a single step rather
    than token by token - confirm this is intended.
    """

    def __init__(self, n_units=300):
        super(RNN, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(None, n_units)
            self.l2 = L.LSTM(None, n_units)
            self.l3 = L.LSTM(None, n_units)
            self.l4 = L.Linear(None, 2)

    def reset_state(self):
        """Clear the recurrent state of both LSTM layers."""
        self.l2.reset_state()
        self.l3.reset_state()

    def forward(self, x):
        """Run one step of the network.

        Returns:
            In train mode (``chainer.config.train`` is true): the raw class
            scores (logits). In inference mode: softmax class probabilities.

        The model is wrapped in ``L.Classifier`` with its default loss,
        ``softmax_cross_entropy``, which applies softmax itself. Returning
        probabilities during training therefore applied softmax twice and
        flattened the gradients. Inference callers that run under
        ``using_config('train', False)`` still receive probabilities.
        """
        h1 = F.sigmoid(self.l1(x))
        h2 = self.l2(F.dropout(h1))
        h3 = self.l3(F.dropout(h2))
        logits = self.l4(h3)
        if chainer.config.train:
            return logits
        return F.softmax(logits)


# Wrap the network in a Classifier link; no loss function is passed, so the
# Classifier's default is used.
model = L.Classifier(RNN(n_units=300))

# GPU device to use; the model is moved to it only when the id is non-negative.
gpu_id = 0
if gpu_id >= 0:
    model.to_gpu(gpu_id)
    
# Adam with default hyper-parameters, plus a gradient-clipping hook with
# threshold 5.
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)
optimizer.add_hook(chainer.optimizer.GradientClipping(5))

In [None]:
def evaluate(model, X_test, y_test):
    """Compute the ROC AUC of ``model`` on a held-out set.

    A copy of the model is used and its recurrent state is reset before the
    first example. Examples are then fed one at a time (batch size 1) in
    inference mode with backprop disabled.

    Args:
        model: Classifier-style link exposing ``predictor``.
        X_test: sequence of input examples.
        y_test: binary labels aligned with ``X_test``.

    Returns:
        The value returned by ``roc_auc_score`` for the class-1 scores.
    """
    evaluator = model.copy()
    evaluator.predictor.reset_state()
    positive_scores = []
    with chainer.using_config('train', False), \
            chainer.using_config('enable_backprop', False):
        for sample in X_test:
            batch = xp.array([sample])
            prediction = evaluator.predictor(batch)
            p_cpu = cuda.to_cpu(prediction.array)
            # Score every example with the probability of class 1. The
            # previous version indexed by the true label, so positives were
            # scored with p(1) and negatives with p(0), leaking the labels
            # into the ranking and making the AUC meaningless.
            positive_scores.append(p_cpu[0][1])

    return roc_auc_score(y_test, np.array(positive_scores))

In [None]:
# Wrap the training split in the dataset class and iterate over it in
# mini-batches of BATCH_SIZE.
train = jigsaw_dataset(X_train, y_train)
train_iter = chainer.iterators.SerialIterator(dataset=train, batch_size=BATCH_SIZE)

# The dataset object keeps its own references; drop the notebook-level names.
del X_train, y_train
gc.collect()

In [None]:
# Reset the iterator before training (presumably so that re-running the
# training cell starts again from the beginning - confirm).
train_iter.reset()

In [None]:
%%time
# Training loop. Each outer iteration sums the classifier loss over up to
# BPROP_LEN consecutive mini-batches (stopping early at an epoch boundary),
# back-propagates once through that sum and applies one optimizer update.
# At the end of every epoch the held-out ROC AUC is printed.
sum_perp = 0    # running total of the summed losses
iteration = 0   # number of optimizer updates performed so far
while train_iter.epoch < MAX_EPOCH:
    loss = 0
    iteration += 1
    for _ in range(BPROP_LEN):
        train_batch = next(train_iter)
        sentence_train, target_train = chainer.dataset.convert.concat_examples(train_batch, gpu_id)
        loss += optimizer.target(sentence_train, target_train)
        if train_iter.is_new_epoch:
            break
    sum_perp += loss.array
    optimizer.target.cleargrads()
    loss.backward()
    # Cut the graph so that history is not kept across updates.
    loss.unchain_backward()
    optimizer.update()
    if train_iter.is_new_epoch:
        print('epoch:{}'.format(train_iter.epoch))
        # evaluate() returns a ROC AUC; the label used to say "perplexity".
        print('test AUC:{0:10f}'.format(evaluate(model, X_test, y_test)))
# del train_iter

In [None]:
# Embed and pad the test comments the same way as the training comments:
# one vector per token, `zero` for out-of-vocabulary tokens, then right-padding
# with `zero` up to max_len.
#
# Vectors are looked up through the model's KeyedVectors (`.wv`), as the
# vocabulary cell already does. Indexing the Word2Vec model object directly is
# the deprecated path.
keyed_vectors = word2vec.wv

X_pre = []
for sentence in t_pre:
    vector = []
    for s in sentence:
        try:
            vector.append(keyed_vectors[s])
        except KeyError:
            vector.append(zero)
    vector.extend([zero] * (max_len - len(vector)))
    X_pre.append(vector)

# del t_pre


In [None]:
def predictor(model, X_pre):
    """Run the model on every example of ``X_pre`` and collect the outputs.

    As in ``evaluate``, the work is done on a copy of the model whose
    recurrent state is reset first. The previous version called the live model
    directly, so the first predictions were conditioned on whatever LSTM state
    training had left behind, and the model's own state was modified.

    Args:
        model: Classifier-style link exposing ``predictor``.
        X_pre: sequence of input examples.

    Returns:
        A list with one CPU array per example, each being the network output
        for a batch of size 1.
    """
    runner = model.copy()
    runner.predictor.reset_state()
    prediction_cpu = []
    with chainer.using_config('train', False), \
            chainer.using_config('enable_backprop', False):
        for sample in X_pre:
            batch = xp.array([sample])
            prediction = runner.predictor(batch)
            prediction_cpu.append(cuda.to_cpu(prediction.array))
    return prediction_cpu

In [None]:
# Load the sample submission; its 'prediction' column is overwritten below.
pre_df = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv")
# Show the first rows of the template.
pre_df.head()

In [None]:
# Show the last rows of the template.
pre_df.tail()

In [None]:
prediction_cpu = predictor(model, X_pre)

# Each entry is the output for a batch of one example; column 1 is the score
# of class 1, which is what gets submitted.
toxic_scores = np.array([p[0][1] for p in prediction_cpu], dtype=np.float64)

if len(toxic_scores) != len(pre_df):
    raise ValueError(
        'got {} predictions for {} submission rows'.format(len(toxic_scores), len(pre_df))
    )

# Assign the whole column at once instead of writing one cell per row via .loc.
pre_df['prediction'] = toxic_scores

In [None]:
%matplotlib inline
# Histogram of the values written to the 'prediction' column.
pre_df['prediction'].plot.hist()

In [None]:
# Write the submission file to the working directory, without the index column.
pre_df.to_csv('submission.csv', index=False)

In [None]:
# Show the first rows of the frame that was written out.
pre_df.head()

In [None]:
# Show the last rows of the frame that was written out.
pre_df.tail()