In [2]:
import numpy as np
from sklearn.base import TransformerMixin
from pymystem3 import Mystem
from gensim.models import KeyedVectors
import re
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import logging
import sys

In [3]:
import os
import string
import tempfile
import tensorflow as tf
import numpy as np

import tf_metrics
from tensorflow.python.keras.preprocessing import sequence
from tensorboard import summary as summary_lib

# tf.logging.set_verbosity(tf.logging.INFO)
# print(tf.__version__)

  return f(*args, **kwds)


In [4]:
import pandas as pd
import os
import numpy as np
import json
from sklearn.model_selection import train_test_split

from vectorizer import Vectorizer

### Работа с данными (1 балл)

Загрузите датасет, с которым вы работали во время соревнования на kaggle. Преобразуйте его в формат, удобный для обучения модели. В качетсве фичей используйте эмбединги слов.

In [5]:
def get_data(to_subm=False, data_dir='../hw1/data/'):
    train_data = pd.read_csv(os.path.join(data_dir, 'train.csv'), index_col='id')
    test_data = pd.read_csv(os.path.join(data_dir, 'test.csv'), index_col='id')
    y = train_data.values[:, 1].astype(int)
    x = train_data.values[:, 0]
    x_test = test_data.values[:, 0]

    x_tokens = np.array(json.load(open(os.path.join(data_dir, 'train_tokens.json'), 'rb')))
    x_tokens_test = np.array(json.load(open(os.path.join(data_dir, 'test_tokens.json'), 'rb')))

    train_idx, val_idx = train_test_split(np.arange(len(x)), train_size=0.8, random_state=0)
    if to_subm:
        train_idx = np.arange(len(x))  # for final submit

    return (x[train_idx], x[val_idx], x_test), \
           (x_tokens[train_idx], x_tokens[val_idx], x_tokens_test), \
           (y[train_idx], y[val_idx])

In [26]:
vocab_size = 50000
embedding_size = 1000
vect = Vectorizer('../hw1/data/word2vec.bin', vocab_size=vocab_size, max_len=30)

In [27]:
(x_train, x_val, x_test), (x_tokens_train, x_tokens_val, x_tokens_test), (y_train, y_val) = get_data(0)
x_train, x_len_train = vect.fit_transform(tqdm(x_train))
x_val, x_len_val = vect.transform(tqdm(x_val))




HBox(children=(IntProgress(value=0, max=89973), HTML(value='')))




HBox(children=(IntProgress(value=0, max=22494), HTML(value='')))




In [8]:
def parser(x, length, y):
    features = {"x": x, "len": length}
    return features, y

def train_input_fn(params):
    dataset = tf.data.Dataset.from_tensor_slices((x_train, x_len_train, y_train))
    dataset = dataset.shuffle(buffer_size=len(x_train))
    dataset = dataset.batch(params['batch_size'])
    dataset = dataset.map(parser)
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

def eval_input_fn(params):
    dataset = tf.data.Dataset.from_tensor_slices((x_val, x_len_val, y_val))
    dataset = dataset.batch(params['batch_size'])
    dataset = dataset.map(parser)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

In [9]:
def model_fn(features, labels, mode, params):   
    # print ('mode: {}'.format(mode))
    logits = params['forward'](features)
    predicted_classes = tf.argmax(logits, 1)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class_ids': predicted_classes[:, tf.newaxis],
            'probabilities': tf.nn.softmax(logits),
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    
    accuracy = tf.metrics.accuracy(labels=labels,
                               predictions=predicted_classes,
                               name='acc_op')
    f1 = tf_metrics.f1(labels, predicted_classes, 3, average='macro')
    metrics = {
        'accuracy': accuracy,
        'f1': f1
    }

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics)
    
    tf.summary.scalar('accuracy', accuracy[1])
    tf.summary.scalar('f1', f1[1])
    
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.3)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, train_op=train_op, eval_metric_ops=metrics)
 
    

In [10]:
models_dir = './models/'
logs_dir = './logs/'

def train_model(forward_fn, name, epochs=10):
    params = {
        'batch_size': 1000,
        'train_size': x_train.shape[0],
        'val_size': x_test.shape[0],
        'num_epochs': epochs,
        'forward': forward_fn
    }
    model_dir = os.path.join(models_dir, name)
    log_dir = os.path.join(logs_dir, name)
    logger = logging.getLogger('tensorflow')
    logger.handlers = []

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    log_file = os.path.join(logs_dir, '{}.log'.format(name))
    print ('log output: {}'.format(log_file))
    
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    sh = logging.StreamHandler(sys.stdout)
    sh.setLevel(logging.CRITICAL)
    logger.addHandler(sh)
    
    config = tf.estimator.RunConfig(
        save_checkpoints_steps = int(params['train_size'] / params['batch_size']),
        model_dir = model_dir
    )
    classifier = tf.estimator.Estimator(
        model_fn = model_fn,
        params = params,
        config = config,
    )
    
    train_spec = tf.estimator.TrainSpec(
        input_fn = lambda: train_input_fn(params), 
        max_steps=int(params['train_size'] / params['batch_size'] * params['num_epochs'])
    )

    eval_spec = tf.estimator.EvalSpec(
        input_fn = lambda: eval_input_fn(params),
        steps = int(params['val_size'] / params['batch_size']),
        throttle_secs = 0, 
    )

    val_scores = tf.estimator.train_and_evaluate(
        classifier, 
        train_spec, 
        eval_spec
    )
    val_scores = val_scores[0]
    if val_scores:
        print('Val metrics: Accuracy: {:.3}, f1: {:.3}, Loss: {:.3}'.format(
            val_scores['accuracy'],
            val_scores['f1'],
            val_scores['loss']
        ))

### CNN model (1 балл)

Реализуйте модель со следующими слоями, идущими в указанном порядке:
- Conv1D, activation='relu'
- MaxPooling1D
- Dense

Параметры модели подберите сами.

In [10]:
def forward_cnn(features):
    inputs = tf.contrib.layers.embed_sequence(
        features['x'], vocab_size, embedding_size,
        #initializer=tf.random_uniform_initializer(-1.0, 1.0)
        initializer=tf.constant_initializer(vect.get_embeddings()), trainable=False # pretrained embeddings
    )
    out = tf.layers.conv1d(
        inputs=inputs,
        filters = 32,
        kernel_size = 3,
        padding="same",
        activation=tf.nn.relu
    )

    # Global Max Pooling
    out = tf.reduce_max(input_tensor=out, axis=1)
    #out = tf.expand_dims(out, 1)
    
    logits = tf.layers.dense(inputs=out, units=3)
    return logits

In [11]:
%%time
train_model(forward_cnn, 'cnn'.format(embedding_size), epochs=20)

log output: ./logs/cnn.log
Val metrics: Accuracy: 0.854, f1: 0.81, Loss: 0.334
CPU times: user 1min 18s, sys: 12.3 s, total: 1min 30s
Wall time: 1min 25s


### RNN model (1 балл)

Реализуйте модель со следующими слоями, идущими в указанном порядке:
- RNN
- Dense

Параметры модели подберите сами.

In [12]:
def forward_rnn(features):
    inputs = tf.contrib.layers.embed_sequence(
        features['x'], vocab_size, embedding_size,
        #initializer=tf.random_uniform_initializer(-1.0, 1.0)
        initializer=tf.constant_initializer(vect.get_embeddings()), trainable=False # pretrained embeddings
    )
    
    rnn_cell = tf.nn.rnn_cell.BasicRNNCell(10)

    _, final_states = tf.nn.dynamic_rnn(
        rnn_cell, inputs, sequence_length=features['len'], dtype=tf.float32)
    
    logits = tf.layers.dense(inputs=final_states, units=3)
    return logits

In [13]:
%%time
train_model(forward_rnn, 'rnn'.format(embedding_size), epochs=20)

log output: ./logs/rnn.log
Val metrics: Accuracy: 0.798, f1: 0.678, Loss: 0.449
CPU times: user 2min 48s, sys: 17.7 s, total: 3min 5s
Wall time: 1min 52s


### LSTM model (1 балл)

Реализуйте модель со следующими слоями, идущими в указанном порядке:
- LSTM
- Dense

Параметры модели подберите сами.

In [14]:
def forward_lstm(features):
    inputs = tf.contrib.layers.embed_sequence(
        features['x'], vocab_size, embedding_size,
        #initializer=tf.random_uniform_initializer(-1.0, 1.0)
        initializer=tf.constant_initializer(vect.get_embeddings()), trainable=False # pretrained embeddings
    )
    
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(10)

    _, final_states = tf.nn.dynamic_rnn(
        lstm_cell, inputs, sequence_length=features['len'], dtype=tf.float32)
    out = final_states.h
    
    logits = tf.layers.dense(inputs=out, units=3)
    return logits

In [15]:
%%time
train_model(forward_lstm, 'lstm'.format(embedding_size), epochs=20)

log output: ./logs/lstm.log
Val metrics: Accuracy: 0.836, f1: 0.774, Loss: 0.372
CPU times: user 3min 29s, sys: 21.5 s, total: 3min 50s
Wall time: 2min 9s


Сравните все три реализованные модели по времени и по качеству классификации. Какая лучше? Как думаете, почему?

<b>Вывод:</b>

Время обучения:
- cnn: 1min 25s
- rnn: 1min 52s
- lstm: 2min 9s

Рекурентные сети работают медленнее из-за большого количества последовательных операций (плохо параллелить). lstm работает лучше rnn из-за более сложной архитектуры. lstm и cnn работают примерно одинаково, следовательно имеет смысл использовать cnn.

### Baseline model (2 балла)

Реализуйте модель со следующими слоями, идущими в указанном порядке:
- Conv1D, activation='relu'
- MaxPooling1D
- RNN
- Dense

Параметры для Conv1D, MaxPooling1D и RNN подберите сами.

In [16]:
def forward_baseline(features):
    inputs = tf.contrib.layers.embed_sequence(
        features['x'], vocab_size, embedding_size,
        #initializer=tf.random_uniform_initializer(-1.0, 1.0)
        initializer=tf.constant_initializer(vect.get_embeddings()), trainable=False # pretrained embeddings
    )
    out = tf.layers.conv1d(
        inputs=inputs,
        filters = 32,
        kernel_size = 3,
        padding="same",
        activation=tf.nn.relu
    )
    # Global Max Pooling
    out = tf.reduce_max(input_tensor=inputs, axis=1)
    out = tf.expand_dims(out, 1)
    
    rnn_cell = tf.nn.rnn_cell.BasicRNNCell(10)

    _, final_states = tf.nn.dynamic_rnn(
        rnn_cell, inputs, sequence_length=features['len'], dtype=tf.float32)
    out = final_states
    
    logits = tf.layers.dense(inputs=out, units=3)
    return logits

In [17]:
train_model(forward_baseline, 'baseline'.format(embedding_size), epochs=20)

log output: ./logs/baseline.log
Val metrics: Accuracy: 0.795, f1: 0.678, Loss: 0.445


### Dropout (1 балл)

Добавьте dropout к baseline моделе. Пример модефицированной модели:
- Dropout
- Conv1D, activation='relu'
- MaxPooling1D
- RNN
- Dense

Подберите параметры.

In [18]:
def forward_dropout(features):
    inputs = tf.contrib.layers.embed_sequence(
        features['x'], vocab_size, embedding_size,
        #initializer=tf.random_uniform_initializer(-1.0, 1.0)
        initializer=tf.constant_initializer(vect.get_embeddings()), trainable=False # pretrained embeddings
    )
    out = tf.layers.dropout(inputs)
    out = tf.layers.conv1d(
        inputs=out,
        filters = 32,
        kernel_size = 3,
        padding="same",
        activation=tf.nn.relu
    )
    # Global Max Pooling
    out = tf.reduce_max(input_tensor=inputs, axis=1)
    out = tf.expand_dims(out, 1)
    
    rnn_cell = tf.nn.rnn_cell.BasicRNNCell(10)

    _, final_states = tf.nn.dynamic_rnn(
        rnn_cell, inputs, sequence_length=features['len'], dtype=tf.float32)
    out = final_states
    
    logits = tf.layers.dense(inputs=out, units=3)
    return logits

In [19]:
train_model(forward_dropout, 'dropout'.format(embedding_size), epochs=20)

log output: ./logs/dropout.log
Val metrics: Accuracy: 0.807, f1: 0.722, Loss: 0.435


Улучшило ли результат классификации использование Dropout? Как думаете, почему?

<b>Вывод:</b>
    
Использование dropout улучшило результат, так как это хорошая регуляризация при обучении модели (помогает бороться с переобучением)

### Bidirectional (1 балл)

Вместо RNN в первой модели используйте biRNN. Пример модефецированной модели:
- Conv1D, activation='relu'
- MaxPooling1D
- biRNN
- Dense

Подберите параметры.

In [20]:
def forward_birnn(features):
    inputs = tf.contrib.layers.embed_sequence(
        features['x'], vocab_size, embedding_size,
        #initializer=tf.random_uniform_initializer(-1.0, 1.0)
        initializer=tf.constant_initializer(vect.get_embeddings()), trainable=False # pretrained embeddings
    )
    
    out = tf.layers.conv1d(
        inputs=inputs,
        filters = 32,
        kernel_size = 3,
        padding="same",
        activation=tf.nn.relu
    )
    
    # Global Max Pooling
    out = tf.reduce_max(input_tensor=inputs, axis=1)
    out = tf.expand_dims(out, 1)
        
    rnn_cell_fw = tf.nn.rnn_cell.BasicRNNCell(10)
    rnn_cell_bw = tf.nn.rnn_cell.BasicRNNCell(10)
    
    _, final_states = tf.nn.bidirectional_dynamic_rnn(rnn_cell_fw, rnn_cell_bw, out, 
                                                      sequence_length=features['len'], dtype=tf.float32)
    out = tf.concat(final_states, 1)
    logits = tf.layers.dense(inputs=out, units=3)
    return logits

In [21]:
train_model(forward_birnn, 'birnn'.format(embedding_size), epochs=20)

log output: ./logs/birnn.log
Val metrics: Accuracy: 0.827, f1: 0.773, Loss: 0.39


Дало ли буст использовании biRNN вместо RNN? Сильный? Как думаете, почему?

<b>Вывод:</b>
    
Использование biRNN дало хороший прирост, так как это помогает "видеть" слова и смысл с обеих сторон (как и свертка)

### Custom model (2 балла)

Подберите архитектуру сети, используя сверточные слои и слои макспулинга и превзойдите качество полносвязной сети.

Подберите архитектуру сети, используя следующие типы слоев:
- Conv1D
- MaxPooling1D
- RNN
- LSTM
- GRU
- Dropout
- Dense

Можно использовать любые из описанных слоев в любом количестве и в любом порядке. Не обязательно использовать все описанные слои. 

Настройте параметры. Превзойдите лучшее качество, полученное раннее. 

In [31]:
def forward_custom(features):
    inputs = tf.contrib.layers.embed_sequence(
        features['x'], vocab_size, embedding_size,
        #initializer=tf.random_uniform_initializer(-1.0, 1.0)
        initializer=tf.constant_initializer(vect.get_embeddings()), trainable=False # pretrained embeddings
    )
    out = tf.layers.conv1d(
        inputs=inputs,
        filters = 32,
        kernel_size = 3,
        padding="same",
        activation=tf.nn.relu
    )
    out = tf.layers.conv1d(
        inputs=out,
        filters = 32,
        kernel_size = 3,
        padding="same",
        activation=tf.nn.relu
    )
    # Global Max Pooling
    out = tf.reduce_max(input_tensor=out, axis=1)
    #out = tf.expand_dims(out, 1)
    out = tf.layers.dropout(out, rate=0.3)
    logits = tf.layers.dense(inputs=out, units=3)
    return logits

In [32]:
train_model(forward_custom, 'custom_2'.format(embedding_size), epochs=20)

log output: ./logs/custom_2.log
Val metrics: Accuracy: 0.846, f1: 0.799, Loss: 0.35
