# CFT2018 contest baseline

In [1]:
import pandas as pd
import numpy as np

Загрузим данные.

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
train.shape, test.shape

((1991104, 5), (2767639, 3))

In [5]:
train.head()

Unnamed: 0,id,fullname,country,target,fullname_true
0,0,AKHMEDOV YGURIY,РОССИЯ,1,AKHMEDOV YURIY
1,1,ФОЗИЛОВ РАМИЛЬ ГУЛЛОВИЧ,РОССИЯ,1,ФОЗИЛОВ РАМИЛЬ ГУЛОВИЧ
2,2,ГОИБОВ АХЛИДДИН ШАМСУДИНОВИЧ,РОССИЯ,0,
3,3,ХУСНЕУТДИНОВА МАРГАРИТА ФАХИМОВНА,РОССИЯ,1,ХУСНУТДИНОВА МАРГАРИТА ФАХИМОВНА
4,4,НОВОКШОНОВА ИННА ВЛАДИМИРОВНА,РОССИЯ,0,


In [6]:
test.head()

Unnamed: 0,id,fullname,country
0,0,ХУДАШКУРОВА ГУЛЗХОДА БЕРДИЕВНА,УЗБЕКИСТАН
1,1,СВЕЖЕТЬФЛОГИСТОН АРСЕН,РОССИЯ
2,2,ГУЛОМОВА СОЖИДА САНАЕВНА,УЗБЕКИСТАН
3,3,КАМПЫШЕВА ГУЛЯИМ БЕЙСЕМБАЕВНА,КАЗАХСТАН
4,4,OROSUMEBTOV MIRLAN,РОССИЯ


---

### Keras try

In [7]:
%%time
from __future__ import print_function
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb

from keras.callbacks import ModelCheckpoint


max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 92

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(train.fullname.apply(lambda x: np.array([i for i in x])).values, maxlen=maxlen, dtype='object')
x_test = sequence.pad_sequences(test.fullname.apply(lambda x: np.array([i for i in x])).values, maxlen=maxlen, dtype='object')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Using TensorFlow backend.


Pad sequences (samples x time)
x_train shape: (1991104, 92)
x_test shape: (2767639, 92)
CPU times: user 1min 8s, sys: 16.6 s, total: 1min 24s
Wall time: 1min 38s


#### Vectorisation

In [11]:
%%time
chars = []
for ar in [x_test, x_train]:
    for row in ar:
        for char in row:
            chars.append(char)

CPU times: user 1min 5s, sys: 2.72 s, total: 1min 8s
Wall time: 1min 8s


In [12]:
%%time
chars = sorted(list(set([i for i in chars if i != 0])))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 86
CPU times: user 28.4 s, sys: 1.01 s, total: 29.4 s
Wall time: 29.4 s


---

In [15]:
import pickle

In [16]:
with open('char_dicts/char_indices.pickle', 'wb') as handle:
    pickle.dump(char_indices, handle)

In [17]:
with open('char_dicts/indices_char.pickle', 'wb') as handle:
    pickle.dump(indices_char, handle, protocol=pickle.HIGHEST_PROTOCOL)

---

In [18]:
%%time
print('Vectorization...train')

for i, sentence in enumerate(x_train):
    for t, char in enumerate(sentence):
        if x_train[i, t] == 0:
            x_train[i, t] = 99
        else:
            x_train[i, t] = char_indices[x_train[i, t]]

Vectorization...train
CPU times: user 1min 29s, sys: 380 ms, total: 1min 30s
Wall time: 1min 30s


In [19]:
%%time
print('Vectorization...test')

for i, sentence in enumerate(x_test):
    for t, char in enumerate(sentence):
        if x_test[i, t] == 0:
            x_test[i, t] = 99
        else:
            x_test[i, t] = char_indices[x_test[i, t]]

Vectorization...test
CPU times: user 2min 8s, sys: 1.01 s, total: 2min 9s
Wall time: 2min 9s


In [20]:
from keras.utils.np_utils import to_categorical
y_train = to_categorical(train['target'].values)

#### Modelling

In [21]:
# first lstm model
batch_size = 32

model = Sequential()
model.add(Embedding(max_features, 64, input_length=maxlen))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(128))
model.add(Dense(3, activation='softmax'))

from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


model.compile(loss='binary_crossentropy',
          optimizer= "adam",
          metrics=[f1])

In [22]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 92, 64)            1280000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 92, 128)           66048     
_________________________________________________________________
dropout_1 (Dropout)          (None, 92, 128)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                41216     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 387       
Total para

In [54]:
%%time
from keras.models import load_model
model = load_model('models/lstm_17_e.h5', custom_objects={'f1': f1})

CPU times: user 10 s, sys: 0 ns, total: 10 s
Wall time: 9.46 s


In [56]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint('models/best_lstm_classifier.h5', 
                             monitor='val_acc', verbose=1, 
                             save_best_only=False, mode='max')

In [57]:
_idx = np.random.choice(range(len(x_train)), 200000)
model.fit(x_train[:], y_train[:], 
          callbacks=[checkpoint],
          batch_size=batch_size*32,
          epochs=20)

Epoch 1/20

Epoch 00001: saving model to models/best_lstm_classifier.h5
Epoch 2/20

Epoch 00002: saving model to models/best_lstm_classifier.h5
Epoch 3/20

Epoch 00003: saving model to models/best_lstm_classifier.h5
Epoch 4/20

Epoch 00004: saving model to models/best_lstm_classifier.h5
Epoch 5/20

Epoch 00005: saving model to models/best_lstm_classifier.h5
Epoch 6/20

Epoch 00006: saving model to models/best_lstm_classifier.h5
Epoch 7/20

Epoch 00007: saving model to models/best_lstm_classifier.h5
Epoch 8/20

Epoch 00008: saving model to models/best_lstm_classifier.h5
Epoch 9/20
 268288/1991104 [===>..........................] - ETA: 1:41:54 - loss: 0.1194 - f1: 0.9361

KeyboardInterrupt: 

In [24]:
%%time
y_predict = model.predict_classes(x_test)

CPU times: user 3h 50min 49s, sys: 27min 54s, total: 4h 18min 44s
Wall time: 1h 15min 4s


In [25]:
test['target'] = y_predict

In [26]:
test.to_csv('after_nn3.csv', index=False)

### Исправление опечаток

Для коррекции опечаток воспользуемся open-source библиотекой ([github](https://github.com/mammothb/symspellpy)). Можно установить через pip. 

In [27]:
import symspellpy
symspell = symspellpy.SymSpell()

Подготовим обучающую выборку для корректора. Добавим туда весь корректный train. На выходе нам нужно отдать файл с частотами слов.

In [28]:
%%time
train.loc[train.target != 1, 'fullname_true'] = train.loc[train.target != 1, 'fullname']
test_plus = test[test.target == 0]

CPU times: user 2.36 s, sys: 313 ms, total: 2.68 s
Wall time: 2.42 s


In [35]:
to_dict = pd.concat([train.fullname_true, test_plus.fullname], axis=0)

In [37]:
from collections import Counter
dicts = [name for person in to_dict for name in person.split(' ')]
name_freq = Counter(dicts)

In [39]:
with open('dictionary_with_test.txt', 'w') as f:
    for name, freq in name_freq.items():
        f.write('{} {}\n'.format(name, freq))

Загрузим словарь в модель.

In [43]:
symspell.load_dictionary('dictionary_with_test.txt', term_index=0, count_index=1)

True

Будем проводить коррекцию по словам.

In [44]:
def correct(s):
    def correct_word(w):
        tmp = symspell.lookup(w, symspellpy.Verbosity.CLOSEST)
        if len(tmp):
            return tmp[0].term.upper()
        else:
            return w

    return ' '.join([correct_word(word) for word in s.split(' ')])

In [50]:
%%time
correct('КАуРАБОЗ ЛАТИФ АЛИМАМАДОВИЧ')

CPU times: user 8.11 ms, sys: 0 ns, total: 8.11 ms
Wall time: 7.52 ms


'КАРАКОЗ ЛАТИФ АЛИМАМАДОВИЧ'

Посчитаем качество на train-выборке (переобученное!)

In [21]:
%%time
train_1 = train.loc[train.target == 1].copy()
train_1['fullname_corrected'] = train_1.fullname.apply(correct)

In [22]:
np.mean(train_1.fullname_true == train_1.fullname_corrected)

0.8272597104120801

Скорректируем тестовую выборку.

In [52]:
%%time
test['fullname_true'] = None

test.loc[test.target == 1, 'fullname_true'] = test.loc[test.target == 1, 'fullname'].apply(correct)

CPU times: user 26min 25s, sys: 0 ns, total: 26min 25s
Wall time: 26min 26s


Сохраним итоговый файл.

In [53]:
test[['id', 'target', 'fullname_true']].to_csv('subs/submission_lstm17_plus_te_corr.csv', index=False)

In [48]:
test2 = test.copy()

In [49]:
test2['fullname_true'] = 'lalalala'

In [50]:
test2[['id', 'target', 'fullname_true']].to_csv('subs/submission_lstm12_wo_correction.csv', index=False)

In [51]:
0.466884 * 2

0.933768