In [8]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from keras.models import load_model
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences

In [9]:
import os
import numpy as np
from keras import backend as K
from sklearn.model_selection import train_test_split


# GPU selection: enumerate devices in PCI bus order (see issue #152) and make
# only the device with index 2 visible to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2",
})

# Location of the review corpus, relative to the notebook's working directory.
BASE_DIR = '../'
TEXT_DATA_DIR = '{}data/'.format(BASE_DIR)
TEXT_DATA_FILE = "ukrainian_reviews_corpus.csv"
HEADER = True  # load_data() skips the first line of the file when this is set

# Seed and hold-out fraction used for the train/validation split.
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.1

np.random.seed(RANDOM_SEED)

def f1(y_true, y_pred):
    """Batch-wise F1 score, usable as a Keras metric.

    Precision and recall are computed from counts obtained by clipping the
    tensors to [0, 1] and rounding them, and are then combined into their
    harmonic mean. Every denominator is offset by ``K.epsilon()``, so a batch
    without any true positive yields 0 instead of NaN.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores, same shape as ``y_true``.

    Returns:
        Scalar tensor holding the F1 score of the batch.
    """
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        return true_positives / (possible_positives + K.epsilon())

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        return true_positives / (predicted_positives + K.epsilon())

    precision_value = precision(y_true, y_pred)
    recall_value = recall(y_true, y_pred)
    # Without the epsilon this is 0 / 0 == NaN whenever precision and recall
    # are both zero, which would poison the monitored `val_f1` value.
    return 2 * ((precision_value * recall_value) /
                (precision_value + recall_value + K.epsilon()))


def load_data(path=None, header=None):
    """Read the review corpus into parallel lists of texts and labels.

    A record starts on a line of the form ``<label>|<text>``, where the label
    is exactly one character. Any line whose second character is not ``|``
    is treated as a continuation of the previous review and is appended to it
    without a separator. Apostrophes are removed from all review text.

    Continuation (or blank) lines that appear before the first record have no
    review to attach to and are skipped.

    Args:
        path: file to read. Defaults to TEXT_DATA_DIR joined with
            TEXT_DATA_FILE.
        header: whether the first line is a header to skip. Defaults to HEADER.

    Returns:
        Tuple ``(x, y)``: list of review texts and list of label strings,
        both of the same length.
    """
    if path is None:
        path = os.path.join(TEXT_DATA_DIR, TEXT_DATA_FILE)
    if header is None:
        header = HEADER

    x = []
    y = []
    with open(path, "r", encoding="utf-8") as f:
        if header:
            # The default keeps an empty file from raising StopIteration.
            next(f, None)
        for line in f:
            content = line.rstrip("\n")
            if len(content) >= 2 and content[1] == '|':
                temp_y, temp_x = content.split("|", 1)
                x.append(temp_x.replace("'", ""))
                y.append(temp_y)
            elif x:
                x[-1] += content.replace("'", "")
            # else: nothing has been read yet, so there is no review to extend.
    return x, y


# In[99]:

# Read the corpus; the labels come back as strings and are stored as int8.
data, labels = load_data()
labels = np.asarray(labels, dtype='int8')

# Split the original data into train and validation sets, keeping the class
# proportions equal in both parts.
split_options = dict(test_size=VALIDATION_SPLIT, random_state=RANDOM_SEED, stratify=labels)
data_train, data_val, labels_train, labels_val = train_test_split(data, labels, **split_options)

# Length (in characters) that every encoded review is padded or truncated to.
MAX_SEQUENCE_LENGTH = 400

# In[102]:

import string
from keras.preprocessing.sequence import pad_sequences

# In[103]:

# Lowercase Ukrainian letters. The position of a letter determines its
# vocabulary index, so the order (including 'и' right after 'у') is kept as is.
ukr_alphabet = list('абвгґдеєжзіїй'
                    'клмнопрстуи'
                    'фхцчшщьюя')


def create_vocab_set():
    """Build the character vocabulary used to encode reviews.

    The vocabulary consists of the Ukrainian letters followed by ASCII digits,
    punctuation and whitespace characters. Indices start at 1, which leaves 0
    free for padding.

    Returns:
        Tuple ``(vocab, vocab_size)``: dict mapping each character to its
        1-based index, and the number of characters in the vocabulary.
    """
    symbols = ukr_alphabet + list(string.digits + string.punctuation + string.whitespace)
    char_to_index = {symbol: position for position, symbol in enumerate(symbols, start=1)}
    return char_to_index, len(symbols)


def text2sequence(text, vocab):
    """Encode every review as the list of vocabulary indices of its characters.

    Characters that are missing from ``vocab`` (or mapped to 0) are dropped,
    so an encoded review may be shorter than the original string.

    Args:
        text: iterable of review strings.
        vocab: dict mapping a character to its integer index.

    Returns:
        List with one list of integer indices per review.
    """
    encoded = []
    for review in text:
        codes = (vocab.get(symbol, 0) for symbol in review)
        encoded.append([code for code in codes if code != 0])
    return encoded


vocab, vocab_size = create_vocab_set()

# Turn each review into character indices, then pad or truncate every sequence
# to MAX_SEQUENCE_LENGTH using 0 as the filler value.
X_train, X_val = (
    pad_sequences(text2sequence(reviews, vocab), maxlen=MAX_SEQUENCE_LENGTH, value=0)
    for reviews in (data_train, data_val)
)

import tensorflow as tf
# ohe function
def ohe(x, sz):
    """One-hot encode integer indices along a new last axis.

    Args:
        x: tensor of integer indices.
        sz: depth of the one-hot axis.

    Returns:
        float32 tensor with one extra trailing axis of length ``sz``.
    """
    # Asking one_hot for float32 directly replaces the deprecated tf.to_float
    # cast (removed in TensorFlow 2) and yields the same 1.0 / 0.0 values.
    return tf.one_hot(x, sz, dtype=tf.float32, axis=-1)

from keras.models import Model
from keras.layers import Input, Lambda, MaxPooling1D, Dense, Conv1D, LSTM
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, LearningRateScheduler

from keras import optimizers
NAME = "ohe_cnn_ukrainian"
# Input: one row of MAX_SEQUENCE_LENGTH integer character indices per review.
in_sentence = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')
# Lambda layer that expands the indices to one-hot vectors inside the graph.
embedded = Lambda(ohe, output_shape=lambda x: (x[0], x[1], vocab_size), arguments={"sz": vocab_size})(in_sentence)
block = embedded
# Three convolutions; only the first one is followed by max pooling.
for i in range(3):
    block = Conv1D(activation="relu", filters=100, kernel_size=4, padding="valid", trainable=True)(block)
    if i == 0:
        block = MaxPooling1D(pool_size=5)(block)
# Recurrent layer over the convolutional features, then the classifier head.
block = LSTM(128, dropout=0.1, recurrent_dropout=0.1, trainable=True)(block)
block = Dense(100, activation='relu', trainable=True)(block)
block = Dense(1, activation='sigmoid', trainable=True)(block)

# Callbacks.
# Learning curves for TensorBoard.
callback_1 = TensorBoard(log_dir='../logs/logs_{}'.format(NAME), histogram_freq=0,
                         write_graph=False, write_images=False)
# Stop training when validation F1 has not improved for five epochs.
callback_2 = EarlyStopping(monitor='val_f1', min_delta=0, patience=5, verbose=0, mode='max')
# Keep only the best model by validation F1. The path sits next to '../logs'
# and matches the '../models/model_<NAME>.hdf5' location the weights are
# loaded from, instead of a 'models/' folder inside the working directory.
callback_3 = ModelCheckpoint("../models/model_{}.hdf5".format(NAME), monitor='val_f1',
                             save_best_only=True, verbose=0, mode='max')

# Assemble and compile the model.
model = Model(inputs=in_sentence, outputs=block)

opt = optimizers.Adam(lr=0.001)
model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=[f1])

In [10]:
# Restore the previously trained weights into the freshly built model.
weights_path = '../models/model_ohe_cnn_ukrainian.hdf5'
model.load_weights(weights_path)

In [13]:
# Threshold the validation scores at 0.5 to obtain 0/1 class predictions.
val_scores = model.predict(X_val)[:, 0]
(val_scores > 0.5).astype(int)

array([0, 1, 0, ..., 0, 1, 0])

In [14]:
# Validation F1 of the model with predictions thresholded at 0.5.
val_predictions = (model.predict(X_val)[:, 0] > 0.5).astype(int)
f1_score(labels_val, val_predictions)

0.88937093275488066

In [18]:
# Location of the review corpus, relative to the notebook's working directory.
BASE_DIR = '../'
TEXT_DATA_DIR = '{}data/'.format(BASE_DIR)
TEXT_DATA_FILE = "ukrainian_reviews_corpus.csv"
HEADER = True  # load_data() skips the first line of the file when this is set

# Seed and hold-out fraction used for the train/validation split.
RANDOM_SEED = 42
VALIDATION_SPLIT = 0.1

np.random.seed(RANDOM_SEED)

def f1(y_true, y_pred):
    """Batch-wise F1 score, usable as a Keras metric.

    Precision and recall are computed from counts obtained by clipping the
    tensors to [0, 1] and rounding them, and are then combined into their
    harmonic mean. Every denominator is offset by ``K.epsilon()``, so a batch
    without any true positive yields 0 instead of NaN.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores, same shape as ``y_true``.

    Returns:
        Scalar tensor holding the F1 score of the batch.
    """
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        return true_positives / (possible_positives + K.epsilon())

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        return true_positives / (predicted_positives + K.epsilon())

    precision_value = precision(y_true, y_pred)
    recall_value = recall(y_true, y_pred)
    # Without the epsilon this is 0 / 0 == NaN whenever precision and recall
    # are both zero, which would poison the monitored `val_f1` value.
    return 2 * ((precision_value * recall_value) /
                (precision_value + recall_value + K.epsilon()))


def load_data(path=None, header=None):
    """Read the review corpus, mapping Ukrainian letters onto Russian ones.

    Before parsing, every line has these characters substituted in a single
    pass: 'и' -> 'ы', 'і' -> 'и', 'є' -> 'е', 'ї' -> 'и'.

    A record starts on a line of the form ``<label>|<text>``, where the label
    is exactly one character. Any line whose second character is not ``|``
    is treated as a continuation of the previous review and is appended to it
    without a separator. Apostrophes are removed from all review text.

    Continuation (or blank) lines that appear before the first record have no
    review to attach to and are skipped.

    Args:
        path: file to read. Defaults to TEXT_DATA_DIR joined with
            TEXT_DATA_FILE.
        header: whether the first line is a header to skip. Defaults to HEADER.

    Returns:
        Tuple ``(x, y)``: list of review texts and list of label strings,
        both of the same length.
    """
    if path is None:
        path = os.path.join(TEXT_DATA_DIR, TEXT_DATA_FILE)
    if header is None:
        header = HEADER

    # One simultaneous translation; equivalent to replacing 'и' first and the
    # other three letters afterwards, because no replacement output is itself
    # replaced by a later step.
    letter_map = str.maketrans({'и': 'ы', 'і': 'и', 'є': 'е', 'ї': 'и'})

    x = []
    y = []
    with open(path, "r", encoding="utf-8") as f:
        if header:
            # The default keeps an empty file from raising StopIteration.
            next(f, None)
        for line in f:
            content = line.translate(letter_map).rstrip("\n")
            if len(content) >= 2 and content[1] == '|':
                temp_y, temp_x = content.split("|", 1)
                x.append(temp_x.replace("'", ""))
                y.append(temp_y)
            elif x:
                x[-1] += content.replace("'", "")
            # else: nothing has been read yet, so there is no review to extend.
    return x, y


# Read the corpus; the labels come back as strings and are stored as int8.
data, labels = load_data()
labels = np.asarray(labels, dtype='int8')

# Split the original data into train and validation sets, keeping the class
# proportions equal in both parts.
split_options = dict(test_size=VALIDATION_SPLIT, random_state=RANDOM_SEED, stratify=labels)
data_train, data_val, labels_train, labels_val = train_test_split(data, labels, **split_options)

# Dictionary size and the length (in characters) every encoded review is
# padded or truncated to.
MAX_NB_WORDS = 81
MAX_SEQUENCE_LENGTH = 400

import string
from keras.preprocessing.sequence import pad_sequences

# Lowercase Russian letters; a letter's position determines its vocabulary index.
rus_alphabet = list('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

# Full character set: letters, then ASCII digits, punctuation and whitespace.
alphabet = rus_alphabet + list(string.digits + string.punctuation + string.whitespace)
vocab_size = len(alphabet)


def create_vocab_set():
    """Build the character vocabulary used to encode reviews.

    The vocabulary consists of the Russian letters followed by ASCII digits,
    punctuation and whitespace characters. Indices start at 1, which leaves 0
    free for padding.

    Returns:
        Tuple ``(vocab, vocab_size)``: dict mapping each character to its
        1-based index, and the number of characters in the vocabulary.
    """
    symbols = list(rus_alphabet) + list(string.digits + string.punctuation + string.whitespace)
    char_to_index = {symbol: position for position, symbol in enumerate(symbols, start=1)}
    return char_to_index, len(symbols)


def text2sequence(text, vocab):
    """Encode every review as the list of vocabulary indices of its characters.

    Characters that are missing from ``vocab`` (or mapped to 0) are dropped,
    so an encoded review may be shorter than the original string.

    Args:
        text: iterable of review strings.
        vocab: dict mapping a character to its integer index.

    Returns:
        List with one list of integer indices per review.
    """
    encoded = []
    for review in text:
        codes = (vocab.get(symbol, 0) for symbol in review)
        encoded.append([code for code in codes if code != 0])
    return encoded


vocab, vocab_size = create_vocab_set()

# Turn each review into character indices, then pad or truncate every sequence
# to MAX_SEQUENCE_LENGTH using 0 as the filler value.
X_train, X_val = (
    pad_sequences(text2sequence(reviews, vocab), maxlen=MAX_SEQUENCE_LENGTH, value=0)
    for reviews in (data_train, data_val)
)

import tensorflow as tf
# ohe function
def ohe(x, sz):
    """One-hot encode integer indices along a new last axis.

    Args:
        x: tensor of integer indices.
        sz: depth of the one-hot axis.

    Returns:
        float32 tensor with one extra trailing axis of length ``sz``.
    """
    # Asking one_hot for float32 directly replaces the deprecated tf.to_float
    # cast (removed in TensorFlow 2) and yields the same 1.0 / 0.0 values.
    return tf.one_hot(x, sz, dtype=tf.float32, axis=-1)

from keras.models import Model
from keras.layers import Input, Lambda, MaxPooling1D, Dense, Conv1D, LSTM
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint, LearningRateScheduler

from keras import optimizers
NAME = "transfer_learning_from_russian_to_ukrainian"
# Input: one row of MAX_SEQUENCE_LENGTH integer character indices per review.
in_sentence = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int64')
# Lambda layer that expands the indices to one-hot vectors inside the graph.
embedded = Lambda(ohe, output_shape=lambda x: (x[0], x[1], vocab_size), arguments={"sz": vocab_size})(in_sentence)
block = embedded
# Three convolutions; only the first one is followed by max pooling.
for i in range(3):
    block = Conv1D(activation="relu", filters=100, kernel_size=4, padding="valid", trainable=True)(block)
    if i == 0:
        block = MaxPooling1D(pool_size=5)(block)
# Recurrent layer over the convolutional features, then the classifier head.
block = LSTM(128, dropout=0.1, recurrent_dropout=0.1, trainable=True)(block)
block = Dense(100, activation='relu', trainable=True)(block)
block = Dense(1, activation='sigmoid', trainable=True)(block)

# Callbacks.
# Learning curves for TensorBoard.
callback_1 = TensorBoard(log_dir='../logs/logs_{}'.format(NAME), histogram_freq=0,
                         write_graph=False, write_images=False)
# Stop training when validation F1 has not improved for five epochs.
callback_2 = EarlyStopping(monitor='val_f1', min_delta=0, patience=5, verbose=0, mode='max')
# Keep only the best model by validation F1. The path sits next to '../logs'
# and matches the '../models/model_<NAME>.hdf5' location the weights are
# loaded from, instead of a 'models/' folder inside the working directory.
callback_3 = ModelCheckpoint("../models/model_{}.hdf5".format(NAME), monitor='val_f1',
                             save_best_only=True, verbose=0, mode='max')


def update_func(i):
    """Step-decay learning-rate schedule with a lower bound.

    The rate starts at 0.001, is divided by 10 after every four steps of
    ``i``, and never drops below 0.00001.

    Args:
        i: integer step index (the value LearningRateScheduler passes in).

    Returns:
        Learning rate to use for step ``i``.
    """
    floor = 0.00001
    decades = i // 4 + 1
    return max(0.01 / (10 ** decades), floor)

# Learning-rate schedule callback driven by update_func.
callback_4 = LearningRateScheduler(update_func)

# Assemble the model and restore the weights saved for the transfer-learning run.
model_transfer = Model(inputs=in_sentence, outputs=block)
model_transfer.load_weights('../models/model_transfer_learning_from_russian_to_ukrainian.hdf5')

# Binary classification objective, tracked with the batch-wise F1 metric.
opt = optimizers.Adam(lr=0.001)
model_transfer.compile(optimizer=opt, loss='binary_crossentropy', metrics=[f1])

In [19]:
# Validation F1 of the transfer model with predictions thresholded at 0.5.
val_predictions_transfer = (model_transfer.predict(X_val)[:, 0] > 0.5).astype(int)
f1_score(labels_val, val_predictions_transfer)

0.9137931034482758

In [80]:
import pickle
from sklearn.metrics import f1_score
# Load the pickled linear model, closing the file handle once it has been read.
# NOTE(security): unpickling executes code stored in the file, so this must
# only ever be pointed at a trusted artifact.
# NOTE(review): the file is named "spanish_linear_model" although the corpus
# here is Ukrainian; confirm this is the intended model.
with open("../models/spanish_linear_model.pkl", 'rb') as model_file:
    model_linear = pickle.load(model_file)


In [90]:
# Sweep the weight given to the network in a linear blend with the linear
# model and print the validation F1 for each weight.
# NOTE(review): preds_nn_val and preds_lr_val are not assigned in any visible
# cell; indexing column 1 assumes preds_lr_val has two columns and that the
# single-column preds_nn_val broadcasts against it (TODO confirm).
for nn_weight in np.linspace(0, 1, 100):
    blended = preds_nn_val * nn_weight + preds_lr_val * (1 - nn_weight)
    hard_labels = (blended[:, 1] > 0.5).astype('int')
    print(nn_weight, ":", f1_score(labels_val, hard_labels))

0.0 : 0.850833333333
0.010101010101 : 0.850478967097
0.020202020202 : 0.85012489592
0.030303030303 : 0.851435705368
0.040404040404 : 0.851435705368
0.0505050505051 : 0.851435705368
0.0606060606061 : 0.850957535387
0.0707070707071 : 0.850478967097
0.0808080808081 : 0.851435705368
0.0909090909091 : 0.851435705368
0.10101010101 : 0.851081530782
0.111111111111 : 0.851913477537
0.121212121212 : 0.851913477537
0.131313131313 : 0.852036575229
0.141414141414 : 0.853344412131
0.151515151515 : 0.854175321978
0.161616161616 : 0.853699085619
0.171717171717 : 0.854175321978
0.181818181818 : 0.85536159601
0.191919191919 : 0.854651162791
0.20202020202 : 0.855006231824
0.212121212121 : 0.85536159601
0.222222222222 : 0.85536159601
0.232323232323 : 0.854885654886
0.242424242424 : 0.854530340815
0.252525252525 : 0.853699085619
0.262626262626 : 0.854530340815
0.272727272727 : 0.854409317804
0.282828282828 : 0.855006231824
0.292929292929 : 0.85595682856
0.30303030303 : 0.856787048568
0.313131313131 : 0.856

In [92]:
# Same sweep as for the first network, now blending the transfer-learning
# network with the linear model.
# NOTE(review): preds_tr_nn_val and preds_lr_val are not assigned in any
# visible cell; indexing column 1 assumes preds_lr_val has two columns and
# that preds_tr_nn_val broadcasts against it (TODO confirm).
for transfer_weight in np.linspace(0, 1, 100):
    blended = preds_tr_nn_val * transfer_weight + preds_lr_val * (1 - transfer_weight)
    hard_labels = (blended[:, 1] > 0.5).astype('int')
    print(transfer_weight, ":", f1_score(labels_val, hard_labels))

0.0 : 0.850833333333
0.010101010101 : 0.851666666667
0.020202020202 : 0.85154295246
0.030303030303 : 0.852977925864
0.040404040404 : 0.853333333333
0.0505050505051 : 0.852732582395
0.0606060606061 : 0.853566958698
0.0707070707071 : 0.853566958698
0.0808080808081 : 0.85260960334
0.0909090909091 : 0.853923205342
0.10101010101 : 0.854879065888
0.111111111111 : 0.855
0.121212121212 : 0.855356398499
0.131313131313 : 0.855833333333
0.141414141414 : 0.856666666667
0.151515151515 : 0.856427378965
0.161616161616 : 0.856784968685
0.171717171717 : 0.856784968685
0.181818181818 : 0.855949895616
0.191919191919 : 0.855949895616
0.20202020202 : 0.856307435255
0.212121212121 : 0.855708908407
0.222222222222 : 0.85690376569
0.232323232323 : 0.857859531773
0.242424242424 : 0.857859531773
0.252525252525 : 0.857381848599
0.262626262626 : 0.857740585774
0.272727272727 : 0.856783919598
0.282828282828 : 0.856783919598
0.292929292929 : 0.856783919598
0.30303030303 : 0.857621440536
0.313131313131 : 0.8571428571

In [87]:
# Pairwise correlation of the three models' validation scores
# (rows: first network, transfer network, linear model).
val_score_rows = [preds_nn_val[:, 0], preds_tr_nn_val[:, 0], preds_lr_val[:, 1]]
np.corrcoef(val_score_rows)

array([[ 1.        ,  0.85948058,  0.87535394],
       [ 0.85948058,  1.        ,  0.87642146],
       [ 0.87535394,  0.87642146,  1.        ]])

In [85]:
# Display the stored validation scores of the first network.
# NOTE(review): preds_nn_val is not assigned in any visible cell; presumably
# it holds model.predict(X_val) — confirm before relying on a fresh run.
preds_nn_val

array([[ 0.99341279],
       [ 0.9970293 ],
       [ 0.90376759],
       ..., 
       [ 0.0013071 ],
       [ 0.95976818],
       [ 0.04299338]], dtype=float32)

In [96]:
# Grid search over a three-way blend: the first network gets nn_weight, the
# transfer network gets transfer_weight and the linear model the remainder.
# Only combinations whose two network weights sum to less than 1 are scored.
# NOTE(review): the preds_* arrays are not assigned in any visible cell;
# indexing column 1 assumes preds_lr_val has two columns (TODO confirm).
weight_grid = np.linspace(0, 1, 20)
for nn_weight in weight_grid:
    for transfer_weight in weight_grid:
        if not nn_weight + transfer_weight < 1:
            continue
        blended = (preds_tr_nn_val * transfer_weight + preds_nn_val * nn_weight
                   + preds_lr_val * (1 - nn_weight - transfer_weight))
        hard_labels = (blended[:, 1] > 0.5).astype('int')
        print(nn_weight, transfer_weight, f1_score(labels_val, hard_labels))

0.0 0.0 0.850833333333
0.0 0.0526315789474 0.852732582395
0.0 0.105263157895 0.854522717799
0.0 0.157894736842 0.856784968685
0.0 0.210526315789 0.85618729097
0.0 0.263157894737 0.857740585774
0.0 0.315789473684 0.85666387259
0.0 0.368421052632 0.857859531773
0.0 0.421052631579 0.856195607128
0.0 0.473684210526 0.852892561983
0.0 0.526315789474 0.850801479655
0.0 0.578947368421 0.845681539091
0.0 0.631578947368 0.838420838421
0.0 0.684210526316 0.836245428688
0.0 0.736842105263 0.834888438134
0.0 0.789473684211 0.832388663968
0.0 0.842105263158 0.830906148867
0.0 0.894736842105 0.829090909091
0.0 0.947368421053 0.826139572408
0.0526315789474 0.0 0.851435705368
0.0526315789474 0.0526315789474 0.854288093256
0.0526315789474 0.105263157895 0.856786011657
0.0526315789474 0.157894736842 0.857262103506
0.0526315789474 0.210526315789 0.857501044714
0.0526315789474 0.263157894737 0.860484544695
0.0526315789474 0.315789473684 0.859531772575
0.0526315789474 0.368421052632 0.85643153527
0.0526315