In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the entries of the input directory (os.listdir returns them in arbitrary order).
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import chainer
from chainer import datasets
from chainer import functions as F
from chainer import links as L
from chainer import Variable
from chainer.backends import cuda
from sklearn.model_selection import train_test_split

In [None]:
class jigsaw_dataset(chainer.dataset.DatasetMixin):
    """Padded word-id sequences and binary labels read from ``../input/train.csv``.

    Every entry of the ``comment_text`` column is split on whitespace and each
    distinct token receives an integer id in order of first appearance.
    Sequences are right-padded with ``-1`` up to the length of the longest
    comment. The label is 1 where ``target`` > 0.5 and 0 otherwise. The rows
    are split with ``train_test_split`` (``random_state=1``) and the instance
    serves either the training part or the test part.

    Args:
        train (bool): Serve the training part when true, the test part otherwise.
        min_count (int): Currently unused; kept so existing callers still work.
        train_size: Forwarded to ``train_test_split`` as ``train_size``.

    Attributes:
        X_train, X_test: int32 id arrays of shape (rows, max_len).
        y_train, y_test: int32 label arrays.
        n_train, n_test: Number of rows in each part.
        n_vocab: Number of distinct tokens; ids run from 0 to ``n_vocab - 1``.
    """

    def __init__(self, train=True, min_count=10, train_size=0.6):
        # Store the configuration first: the split below reads self.train_size.
        self.train = train
        self.train_size = train_size

        train_df = pd.read_csv("../input/train.csv")

        # Whitespace tokenisation of every comment.
        tokens = [s.split() for s in train_df['comment_text']]

        # Token -> id, numbered in order of first appearance.
        words = {}
        for sentence in tokens:
            for word in sentence:
                if word not in words:
                    words[word] = len(words)
        self.n_vocab = len(words)

        # Allocate the padded matrix once (pre-filled with the padding id -1)
        # instead of growing every row one element at a time.
        max_len = max(map(len, tokens), default=0)
        x = np.full((len(tokens), max_len), -1, dtype=np.int32)
        for row, sentence in enumerate(tokens):
            if sentence:
                x[row, :len(sentence)] = [words[word] for word in sentence]

        y = np.where(train_df['target'] > 0.5, 1, 0).astype(np.int32)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            x, y, train_size=self.train_size, random_state=1)
        self.n_train = len(self.y_train)
        self.n_test = len(self.y_test)

    def __len__(self):
        """Return the number of examples in the part selected by ``self.train``."""
        if self.train:
            return self.n_train
        else:
            return self.n_test

    def get_example(self, i):
        """Return the ``(ids, label)`` pair at index ``i`` of the selected part."""
        # Index the arrays directly; no per-access wrapper dataset is needed.
        if self.train:
            return self.X_train[i], self.y_train[i]
        else:
            return self.X_test[i], self.y_test[i]
        

In [None]:
class RNN(chainer.Chain):
    """Two stacked LSTM layers over word embeddings with a 2-way linear output.

    Args:
        n_vocab (int): Number of rows in the embedding table.
        n_units (int): Size of the embeddings and of both LSTM layers.
    """

    def __init__(self, n_vocab=1670966, n_units=100):
        super(RNN, self).__init__()
        with self.init_scope():
            # ignore_label=-1: ids equal to -1 (the padding value) are embedded
            # as zero vectors instead of being used as table indices.
            self.embed = L.EmbedID(n_vocab, n_units, ignore_label=-1)
            self.l1 = L.LSTM(n_units, n_units)
            self.l2 = L.LSTM(n_units, n_units)
            self.l3 = L.Linear(n_units, 2)

    def reset_state(self):
        """Clear the recurrent state of both LSTM layers."""
        self.l1.reset_state()
        self.l2.reset_state()

    def forward(self, x):
        """Compute unnormalised class scores (logits).

        Args:
            x: Integer ids. A 1-D array ``(batch,)`` is treated as a single
                time step and continues from the current LSTM state. A 2-D
                array ``(batch, time)`` is treated as whole sequences: the
                state is reset and the steps are fed one after another.

        Returns:
            Variable of shape ``(batch, 2)`` computed from the LSTM output of
            the last time step. No softmax is applied, because
            ``F.softmax_cross_entropy`` normalises its input itself; call
            ``F.softmax`` on the result when probabilities are needed.

        Raises:
            ValueError: If a 2-D input has no time steps.
        """
        h0 = self.embed(x)
        if h0.ndim == 2:
            # Single time step: (batch, units).
            steps = (h0,)
        else:
            if h0.shape[1] == 0:
                raise ValueError('input sequences must contain at least one time step')
            # L.LSTM consumes one (batch, units) slice per call, so split the
            # (batch, time, units) embeddings along the time axis.
            self.reset_state()
            steps = F.separate(h0, axis=1)

        # NOTE: padded positions are still fed to the LSTMs (as zero vectors).
        h2 = None
        for step in steps:
            h1 = self.l1(F.dropout(step))
            h2 = self.l2(F.dropout(h1))
        return self.l3(h2)


# Build the network with its default vocabulary size and hidden width.
model = RNN()

# Use GPU 0 when Chainer was able to import CuPy; otherwise fall back to the
# CPU (-1) instead of failing inside to_gpu(). gpu_id is also passed to
# concat_examples later on, where a negative id means "keep data on the CPU".
gpu_id = 0 if cuda.available else -1
if gpu_id >= 0:
    model.to_gpu(gpu_id)

# Adam with its default hyper-parameters, attached to the model's parameters.
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)

In [None]:
# Number of examples per mini-batch; used for both the train and test iterators.
BATCH_SIZE = 5
# The training loop runs until the training iterator has completed this many epochs.
MAX_EPOCH = 5

In [None]:
%%time
# Training batches: repeats indefinitely (SerialIterator's default), drawn
# from the training part of a dataset built with train_size=0.1.
train_iter = chainer.iterators.SerialIterator(
    dataset=jigsaw_dataset(train_size=0.1),
    batch_size=BATCH_SIZE,
)

# Evaluation batches: one ordered pass over the test part of a second dataset
# instance built with train_size=0.9.
test_iter = chainer.iterators.SerialIterator(
    dataset=jigsaw_dataset(train=False, train_size=0.9),
    batch_size=BATCH_SIZE,
    repeat=False,
    shuffle=False,
)

In [None]:
# Train until MAX_EPOCH passes over the training data are complete; after
# every pass, evaluate once on the whole test iterator and print a summary.
while train_iter.epoch < MAX_EPOCH:
    sentence_train, target_train = chainer.dataset.concat_examples(train_iter.next(), gpu_id)

    # The LSTM links keep state between calls; start each mini-batch from a
    # clean state so unrelated batches are not chained together.
    model.reset_state()
    loss = F.softmax_cross_entropy(model(sentence_train), target_train)

    model.cleargrads()
    loss.backward()
    optimizer.update()

    if train_iter.is_new_epoch:
        # train_loss is the loss of the last mini-batch of the epoch.
        print('epoch:{:02d} train_loss:{:.04f} '.format(
            train_iter.epoch, float(cuda.to_cpu(loss.array))), end='')

        test_losses = []
        test_accuracies = []
        # Evaluation: disable dropout (train=False) and skip graph construction.
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            while True:
                sentence_test, target_test = chainer.dataset.concat_examples(test_iter.next(), gpu_id)

                model.reset_state()
                prediction_test = model(sentence_test)

                # Same loss as in training. mean_squared_error needs two float
                # arrays of equal shape, which (batch, 2) scores and integer
                # labels are not.
                loss_test = F.softmax_cross_entropy(prediction_test, target_test)
                test_losses.append(float(cuda.to_cpu(loss_test.array)))

                accuracy = F.accuracy(prediction_test, target_test)
                test_accuracies.append(float(cuda.to_cpu(accuracy.array)))

                if test_iter.is_new_epoch:
                    # Rewind the non-repeating iterator for the next evaluation.
                    test_iter.reset()
                    break

        print('val_loss:{:.04f} val_accuracy:{:.04f}'.format(np.mean(test_losses), np.mean(test_accuracies)))