In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['train.csv', 'sample_submission.csv', 'test.csv']


In [2]:
import chainer
from chainer import datasets
from chainer import functions as F
from chainer import links as L
from chainer import Variable
from chainer.backends import cuda
from sklearn.model_selection import train_test_split
xp = cuda.cupy

In [3]:
%%time
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
all_df = pd.concat([train_df['comment_text'], test_df['comment_text']])

t_all = []
for s in all_df:
    t_all.append(s.split())
del all_df

words = {}
for j in range(len(t_all)):
    for word in t_all[j]:
        if word not in words:
            words[word] = len(words)
del t_all

t_train = []
for s in train_df['comment_text']:
    t_train.append(s.split())

X = []
for j in range(len(t_train)):
    vector = []
    for word in t_train[j]:
        vector.append(words[word])
    vector = xp.array(vector, dtype=xp.int32)
    X.append(vector)
del t_train

t_test = []
for s in test_df['comment_text']:
    t_test.append(s.split())

X_pre = []
for j in range(len(t_test)):
    vector = []
    for word in t_test[j]:
        vector.append(words[word])
    vector = xp.array(vector, dtype=xp.int32)
    X_pre.append(vector)
del t_test
del test_df

del words
del vector

CPU times: user 5min 14s, sys: 8.62 s, total: 5min 23s
Wall time: 5min 23s


In [4]:
%%time
max_len_train = max(list(map(len, X)))
max_len_test = max(list(map(len, X_pre)))
max_len = max_len_train if max_len_train>max_len_test else max_len_test
del max_len_train
del max_len_test

for i in range(len(X)):
    add = xp.zeros((max_len-len(X[i])), dtype=xp.int32)
    X[i] = xp.hstack((X[i], add))

for i in range(len(X_pre)):
    add = xp.zeros((max_len-len(X_pre[i])), dtype=xp.int32)
    X_pre[i] = xp.hstack((X_pre[i], add))

del add

CPU times: user 3min 28s, sys: 1.84 s, total: 3min 29s
Wall time: 3min 30s


In [5]:
y= np.where(train_df['target']>0.5, 1, 0)
y = xp.array(y, dtype=xp.int32)
del train_df

In [6]:
class jigsaw_dataset(chainer.dataset.DatasetMixin):
    def __init__(self, X, y, train=True, min_count=10, train_size=0.6):
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, train_size=train_size, random_state=1)
        self.n_train = len(self.y_train)
        self.n_test = len(self.y_test)
        self.train = train
        self.train_size=train_size
    
    def __len__(self):
        if self.train:
            return self.n_train
        else:
            return self.n_test
    
    def get_example(self, i):
        if self.train:
            train = datasets.tuple_dataset.TupleDataset(self.X_train, self.y_train)
            return train[i]
        else:
            test = datasets.tuple_dataset.TupleDataset(self.X_test, self.y_test)
            return test[i]


In [7]:
class RNN(chainer.Chain):
    def __init__(self, n_vocab=1670966, n_units=100):
        super(RNN, self).__init__(
            embed = L.EmbedID(n_vocab, n_units),
            l1 = L.LSTM(None, n_units),
            l2 = L.LSTM(None, n_units),
            l3 = L.Linear(None, 2)
        )
    
    def reset_state(self):
        self.l1.reset_state()
        self.l2.reset_state()

    def forward(self, x):
        h0 = self.embed(x)
        h1 = self.l1(F.dropout(h0))
        h2 = self.l2(F.dropout(h1))
        y = F.softmax(self.l3(h2))
        return y


model = L.Classifier(RNN())

gpu_id = 0
if gpu_id >= 0:
    model.to_gpu(gpu_id)
    
optimizer = chainer.optimizers.Adam()
optimizer.setup(model)
optimizer.add_hook(chainer.optimizer.GradientClipping(5))

In [8]:
BATCH_SIZE = 128
MAX_EPOCH = 5
# length of truncated BPTT
BPROP_LEN = 30

In [9]:
train = jigsaw_dataset(X, y)
test = jigsaw_dataset(X, y, train_size=0.9, train=False)
del X
del y
train_iter = chainer.iterators.SerialIterator(train, BATCH_SIZE)
test_iter = chainer.iterators.SerialIterator(test, BATCH_SIZE, repeat=False, shuffle=False)
del train
del test



In [10]:
%%time
sum_perp = 0
count = 0
iteration = 0
while train_iter.epoch < MAX_EPOCH:
        loss = 0
        iteration += 1
        for i in range(BPROP_LEN):
            train_batch = train_iter.__next__()
            sentence_train, target_train = chainer.dataset.convert.concat_examples(train_batch, gpu_id)
            loss += optimizer.target(sentence_train, target_train)
            if train_iter.is_new_epoch:
                break
        count += 1
        sum_perp += loss.array
        optimizer.target.cleargrads()
        loss.backward()
        loss.unchain_backward()
        optimizer.update()
        # 1082924
        if count%100==0:
            print(str(count)+"\t"+str(train_iter.epoch))
del train_iter

100	0
200	0
300	1
400	1
500	1
600	2
700	2
800	2
900	3
1000	3
1100	3
1200	4
1300	4
1400	4
CPU times: user 9min 52s, sys: 3min 15s, total: 13min 7s
Wall time: 13min 9s


In [11]:
cuda.memory_pool.free_all_free()

  """Entry point for launching an IPython kernel.


In [12]:
test_loss = 0
for test_batch in test_iter:
    sentence_test, target_test = chainer.dataset.convert.concat_examples(train_batch, gpu_id)
    test_loss += optimizer.target(sentence_test, target_test)
print(test_loss)

OutOfMemoryError: out of memory to allocate 17152000 bytes (total 15642199040 bytes)

In [13]:
sub_df = pd.read_csv("../input/sample_submission.csv")
sub_df.head()