# CFT2018 contest classification part

In [2]:
import pandas as pd
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
from keras import backend as K
from keras.callbacks import ModelCheckpoint

from utils import f1

Using TensorFlow backend.


Загрузим данные.

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
train.shape, test.shape

((1991104, 5), (2767639, 3))

### LSTM classifier

In [8]:
%%time

max_features = 20000
maxlen = 92

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(train.fullname.apply(lambda x: np.array([i for i in x])).values, maxlen=maxlen, dtype='object')
x_test = sequence.pad_sequences(test.fullname.apply(lambda x: np.array([i for i in x])).values, maxlen=maxlen, dtype='object')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (1991104, 92)
x_test shape: (2767639, 92)
CPU times: user 1min 9s, sys: 19.1 s, total: 1min 28s
Wall time: 1min 28s


#### Vectorisation

In [9]:
%%time
chars = []
for ar in [x_test, x_train]:
    for row in ar:
        for char in row:
            chars.append(char)

CPU times: user 1min, sys: 2.91 s, total: 1min 3s
Wall time: 1min 3s


In [10]:
%%time
chars = sorted(list(set([i for i in chars if i != 0])))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 86
CPU times: user 29.1 s, sys: 1.05 s, total: 30.1 s
Wall time: 30.1 s


In [11]:
%%time
print('Vectorization...train')

for i, sentence in enumerate(x_train):
    for t, char in enumerate(sentence):
        if x_train[i, t] == 0:
            x_train[i, t] = 99
        else:
            x_train[i, t] = char_indices[x_train[i, t]]

Vectorization...train
CPU times: user 1min 27s, sys: 449 ms, total: 1min 28s
Wall time: 1min 28s


In [12]:
%%time
print('Vectorization...test')

for i, sentence in enumerate(x_test):
    for t, char in enumerate(sentence):
        if x_test[i, t] == 0:
            x_test[i, t] = 99
        else:
            x_test[i, t] = char_indices[x_test[i, t]]

Vectorization...test
CPU times: user 2min 1s, sys: 932 ms, total: 2min 2s
Wall time: 2min 2s


In [13]:
from keras.utils.np_utils import to_categorical
y_train = to_categorical(train['target'].values)

#### Modelling

In [21]:
batch_size = 32

model = Sequential()
model.add(Embedding(max_features, 64, input_length=maxlen))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(128))
model.add(Dense(3, activation='softmax'))

model.compile(loss='binary_crossentropy',
          optimizer= "adam",
          metrics=[f1])

In [22]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 92, 64)            1280000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 92, 128)           66048     
_________________________________________________________________
dropout_1 (Dropout)          (None, 92, 128)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                41216     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 387       
Total para

In [57]:
checkpoint = ModelCheckpoint('models/best_lstm_classifier.h5', 
                             monitor='val_acc', verbose=1, 
                             save_best_only=False, mode='max')

model.fit(x_train[:], y_train[:], 
          callbacks=[checkpoint],
          batch_size=batch_size*32,
          epochs=20)

Epoch 1/20

Epoch 00001: saving model to models/best_lstm_classifier.h5
Epoch 2/20

Epoch 00002: saving model to models/best_lstm_classifier.h5
Epoch 3/20

Epoch 00003: saving model to models/best_lstm_classifier.h5
Epoch 4/20

Epoch 00004: saving model to models/best_lstm_classifier.h5
Epoch 5/20

Epoch 00005: saving model to models/best_lstm_classifier.h5
Epoch 6/20

Epoch 00006: saving model to models/best_lstm_classifier.h5
Epoch 7/20

Epoch 00007: saving model to models/best_lstm_classifier.h5
Epoch 8/20

Epoch 00008: saving model to models/best_lstm_classifier.h5
Epoch 9/20
 268288/1991104 [===>..........................] - ETA: 1:41:54 - loss: 0.1194 - f1: 0.9361

KeyboardInterrupt: 

Upload of pre-trained model:

In [54]:
%%time

model = load_model('models/lstm_17_e.h5', custom_objects={'f1': f1})

CPU times: user 10 s, sys: 0 ns, total: 10 s
Wall time: 9.46 s


In [24]:
%%time
y_predict = model.predict_classes(x_test)

CPU times: user 3h 50min 49s, sys: 27min 54s, total: 4h 18min 44s
Wall time: 1h 15min 4s


In [25]:
test['target'] = y_predict

In [26]:
test.to_csv('test_classified.csv', index=False)