In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import os, sys
import logging
import tensorflow as tf
import tf_metrics
import faiss
from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize

import matplotlib.pyplot as plt
from collections import defaultdict

In [3]:
from vectorizer import Vectorizer
from triplet_loss import batch_all_triplet_loss, batch_hard_triplet_loss

# prepare data

In [4]:
def get_data(norm=True, to_subm=False):
    """Load the train/test CSVs, vectorize the texts and split off a validation set.

    Each row of ``./data/train.csv`` goes to the validation split when a
    uniform random draw (global NumPy RNG seeded with 42) is <= 0.15,
    otherwise to the training split.

    Args:
        norm: when True, every returned feature matrix is passed through
            ``sklearn.preprocessing.normalize``.
        to_subm: when True, the returned training set is the whole of
            train.csv (validation rows included) instead of the training
            split only. The validation set is the same either way.

    Returns:
        ``((x_train, y_train), (x_val, y_val), x_test)``.
    """
    vect = Vectorizer()

    data_full = pd.read_csv('./data/train.csv', index_col='index')

    # Seeding the *global* RNG is kept on purpose: it also fixes the state
    # that later np.random calls in the notebook start from.
    np.random.seed(42)
    msk = np.random.random(len(data_full)) > 0.15
    data_val = data_full[~msk]

    # Vectorize only the training rows that are actually returned. Previously
    # the full set was always vectorized (and discarded unless to_subm) and
    # the split was always vectorized (and discarded if to_subm).
    # NOTE(review): assumes Vectorizer.transform is stateless between calls.
    data_train = data_full if to_subm else data_full[msk]

    data_test = pd.read_csv('./data/test.csv', engine='python', index_col='index')

    x_train = vect.transform(tqdm(data_train.text))
    y_train = data_train.labels.values

    x_val = vect.transform(tqdm(data_val.text))
    y_val = data_val.labels.values

    x_test = vect.transform(data_test.text)

    if norm:
        x_train = normalize(x_train)
        x_val = normalize(x_val)
        x_test = normalize(x_test)

    return (x_train, y_train), (x_val, y_val), x_test

In [5]:
# Build the notebook-wide arrays used by every input function below.
# NOTE(review): with to_subm=True the training arrays cover the whole of
# train.csv, validation rows included, so evaluate() scores the validation
# rows against a training set that already contains them.
(x_train, y_train), (x_val, y_val), x_test = get_data(norm=True, to_subm=True)

HBox(children=(IntProgress(value=0, max=150832), HTML(value='')))




HBox(children=(IntProgress(value=0, max=128212), HTML(value='')))




HBox(children=(IntProgress(value=0, max=22620), HTML(value='')))




In [6]:
def parser(x, y):
    """Wrap the features in the ``{'x': ...}`` dict the model reads; labels pass through."""
    return dict(x=x), y

def train_input_fn(params):
    """Training input function: an endless stream of class-grouped batches.

    Every batch contains *all* rows of ``batch_size // 3`` randomly chosen
    classes and is topped up to ``batch_size`` with random rows from the rest
    of the training set.

    Args:
        params: dict with at least ``'batch_size'``.

    Returns:
        ``({'x': float32 [None, 300]}, int64 [None])`` tensors from a one-shot
        iterator over the repeated generator dataset.
    """
    def train_get():
        # np.unique sorts, so [1:] skips the smallest label.
        # NOTE(review): presumably that is the -1 "no class" label - confirm.
        train_classes = np.unique(y_train)[1:]
        len_train = len(y_train)
        batch_size = params['batch_size']
        batch_n = int(len_train / batch_size)
        # Sampling without replacement fails if more classes are requested
        # than exist.
        n_batch_classes = min(batch_size // 3, len(train_classes))

        # label -> row indices, built once (a per-batch scan is far too slow)
        cl_ids = defaultdict(list)
        for j, cl in enumerate(y_train):
            cl_ids[cl].append(j)

        # Loop-invariant: build the full index set once, not on every batch.
        all_idx = set(range(len_train))

        for n in range(batch_n):
            batch_classes = np.random.choice(train_classes, size=n_batch_classes, replace=False)

            class_idx = []
            for cl in batch_classes:
                class_idx.extend(cl_ids[cl])
            # Large classes can overshoot the batch; without this cap the
            # fill size below went negative and np.random.choice raised.
            class_idx = class_idx[:batch_size]

            other_idx = list(all_idx - set(class_idx))
            n_fill = min(batch_size - len(class_idx), len(other_idx))
            if other_idx:
                add_idx = list(np.random.choice(other_idx, size=n_fill, replace=False))
            else:
                add_idx = []
            batch_idx = class_idx + add_idx
            yield ({'x': x_train[batch_idx]}, y_train[batch_idx])

    dataset = tf.data.Dataset.from_generator(train_get, ({'x': tf.float32}, tf.int64), output_shapes=({'x': [None, 300]}, [None]))
    dataset = dataset.repeat()
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

def eval_input_fn(params):
    """Input function over the validation arrays, batched by ``params['batch_size']``.

    Returns the ``({'x': features}, labels)`` tensors of a one-shot iterator.
    """
    batches = (
        tf.data.Dataset.from_tensor_slices((x_val, y_val))
        .batch(params['batch_size'])
        .map(parser)
    )
    return batches.make_one_shot_iterator().get_next()

# training model

## evaluation

In [7]:
def train_inf_input_fn(params):
    """Input function over the training arrays in stored order, for inference.

    Unlike ``train_input_fn`` there is no sampling or repetition: one pass,
    batched by ``params['batch_size']``.
    """
    batches = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train))
        .batch(params['batch_size'])
        .map(parser)
    )
    return batches.make_one_shot_iterator().get_next()

In [8]:
def evaluate(estimator):
    """Score 1-nearest-neighbour classification of the validation set.

    Validation and training rows are embedded with ``estimator``, both sets
    of embeddings are row-normalized, and every validation row takes the
    label of its nearest training embedding when the faiss distance is below
    a threshold, otherwise -1. The threshold is swept over 100 values in
    [0, 1].

    Args:
        estimator: Estimator whose predictions expose an ``'emb'`` entry.

    Returns:
        ``(best_macro_f1, best_threshold)``.
    """
    val_embs = np.array([x['emb'] for x in estimator.predict(eval_input_fn)])
    train_embs = np.array([x['emb'] for x in estimator.predict(train_inf_input_fn)])

    emb_size = train_embs.shape[1]
    index = faiss.IndexFlat(emb_size)

    index.verbose = True  # to see progress
    index.add(normalize(train_embs))

    # NOTE(review): D is presumably squared L2 (faiss's default metric for
    # IndexFlat) - confirm before interpreting the threshold.
    D, I = index.search(normalize(val_embs), k=1)
    I = I.ravel()
    D = D.ravel()

    # Validation labels that never occur in training cannot be predicted, so
    # their ground truth is the "unknown" label -1. np.isin replaces a
    # per-element `y in y_train` scan (O(N_val * N_train)).
    y_val_gt = np.where(np.isin(y_val, y_train), y_val, -1)
    nn_labels = y_train[I]  # label of each validation row's nearest neighbour

    thrs = np.linspace(0, 1, 100)
    f = [
        f1_score(y_val_gt, np.where(D < thr, nn_labels, -1), average='macro')
        for thr in tqdm(thrs)
    ]
    return max(f), thrs[np.argmax(f)]

In [9]:
class ValidationHook(tf.train.SessionRunHook):
    """Session hook that periodically runs ``evaluate`` and logs the F1 score.

    The hook owns a separate Estimator built over ``checkpoint_dir`` (so it
    presumably scores the latest checkpoint saved there) and writes an ``f1``
    scalar summary into the same directory.

    Args:
        model_fn: model function for the evaluation Estimator.
        params: params dict for the evaluation Estimator.
        checkpoint_dir: model directory to read from and write summaries to.
        every_n_secs, every_n_steps: trigger period, forwarded to
            ``tf.train.SecondOrStepTimer``.
    """

    def __init__(self, model_fn, params, checkpoint_dir,
                 every_n_secs=None, every_n_steps=None):
        self._iter_count = 0
        self._estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            params=params,
            model_dir=checkpoint_dir
        )
        self._timer = tf.train.SecondOrStepTimer(every_n_secs, every_n_steps)
        self._should_trigger = False
        self.summary_writer = tf.summary.FileWriter(checkpoint_dir)

    def begin(self):
        # Restart the trigger bookkeeping for each new training graph.
        self._timer.reset()
        self._iter_count = 0

    def before_run(self, run_context):
        self._should_trigger = self._timer.should_trigger_for_step(self._iter_count)

    def after_run(self, run_context, run_values):
        if self._should_trigger:
            # evaluate() returns (best_f1, best_threshold). The summary's
            # simple_value must be a scalar, so the tuple is unpacked first;
            # passing it whole made this hook fail on its first trigger.
            f1, thr = evaluate(self._estimator)
            print ((f1, thr))
            summary = tf.Summary()
            summary.value.add(tag='f1', simple_value = float(f1))
            self.summary_writer.add_summary(summary, self._estimator.get_variable_value('global_step'))
            # Make the point visible in TensorBoard without waiting for the
            # writer's own flush period.
            self.summary_writer.flush()

            self._timer.update_last_triggered_step(self._iter_count)
        self._iter_count += 1

# training model

In [10]:
# Root directories used by train_model(): per-model checkpoints go under
# models_dir/<name>, the TensorFlow log file to logs_dir/<name>.log.
models_dir = './models/'
logs_dir = './logs/'

def train_model(forward_fn, name, epochs=10, batch_size=100):
    """Train an Estimator around ``forward_fn`` and return it.

    TensorFlow's logger is redirected to ``logs_dir/<name>.log`` (only
    CRITICAL records still reach stdout), checkpoints are written to
    ``models_dir/<name>``, and training alternates with evaluation through
    ``tf.estimator.train_and_evaluate``. Ctrl-C stops training early and the
    partially trained estimator is still returned.

    Args:
        forward_fn: callable mapping the features dict to an embedding tensor;
            passed to the notebook-level ``model_fn`` as ``params['forward']``.
        name: run name, used for the model directory and the log file.
        epochs: number of passes used to derive ``max_steps``.
        batch_size: rows per batch for every input function.

    Returns:
        The ``tf.estimator.Estimator`` that was trained.
    """
    params = {
        'batch_size': batch_size,
        'train_size': x_train.shape[0],
        # Size of the set eval_input_fn actually iterates; this used to be
        # taken from x_test, which is unrelated to evaluation.
        'val_size': x_val.shape[0],
        'num_epochs': epochs,
        'forward': forward_fn,
    }
    model_dir = os.path.join(models_dir, name)

    logger = logging.getLogger('tensorflow')
    # Detach *and close* the previous handlers; resetting the list alone
    # leaked one open log file per call.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    log_file = os.path.join(logs_dir, '{}.log'.format(name))
    print ('log output: {}'.format(log_file))

    # FileHandler does not create missing parent directories.
    if not os.path.isdir(logs_dir):
        os.makedirs(logs_dir)
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    sh = logging.StreamHandler(sys.stdout)
    sh.setLevel(logging.CRITICAL)
    logger.addHandler(sh)

    config = tf.estimator.RunConfig(
        # one checkpoint (and therefore one evaluation) per epoch
        save_checkpoints_steps = int(params['train_size'] / params['batch_size']),
        model_dir = model_dir
    )
    classifier = tf.estimator.Estimator(
        model_fn = model_fn,
        params = params,
        config = config,
    )

    train_spec = tf.estimator.TrainSpec(
        input_fn = lambda: train_input_fn(params),
        max_steps=int(params['train_size'] / params['batch_size'] * params['num_epochs'])
    )

    # Ceiling division: keeps the last partial batch and never yields 0 steps
    # (which EvalSpec rejects) when the validation set is smaller than a batch.
    eval_steps = -(-params['val_size'] // params['batch_size'])
    eval_spec = tf.estimator.EvalSpec(
        input_fn = lambda: eval_input_fn(params),
        steps = eval_steps,
        throttle_secs = 0,
        # A ValidationHook(model_fn=model_fn, params=params,
        # checkpoint_dir=model_dir, every_n_steps=...) can be attached here
        # through `hooks=[...]` to log F1 during training.
    )
    try:
        tf.estimator.train_and_evaluate(
            classifier,
            train_spec,
            eval_spec
        )
    except KeyboardInterrupt:
        print('Interrupted')

    return classifier

In [11]:
def model_fn(features, labels, mode, params):
    """Estimator model function: embedding network trained with batch-hard triplet loss.

    ``params['forward']`` maps the features to the embedding. PREDICT returns
    it under the ``'emb'`` key, EVAL reports the loss only, and TRAIN
    minimises the loss with Adagrad (learning rate 0.3).
    """
    modes = tf.estimator.ModeKeys
    emb = params['forward'](features)

    if mode == modes.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions={'emb': emb})

    # NOTE(review): the positional 1 and 'l2' are presumably the margin and
    # the distance type - confirm against triplet_loss.py.
    loss = batch_hard_triplet_loss(labels, emb, 1, 'l2')
    if type(loss) is tuple:
        # some loss variants return (loss, extras); only the scalar is needed
        loss = loss[0]

    if mode == modes.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss)

    train_op = tf.train.AdagradOptimizer(learning_rate=0.3).minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
 

In [12]:
def forward_00(features):
    """Embedding network: dense(512, tanh) -> dropout -> batch norm -> dense(512).

    Reads ``features['x']`` and returns the final 512-unit linear layer.
    Prints ``*`` once per call, i.e. every time a graph is built.

    NOTE(review): neither ``dropout`` nor ``batch_normalization`` is given a
    ``training`` argument, so both run with their default (inference)
    behaviour in every mode - confirm this is intended.
    """
    net = features['x']
    print('*')
    net = tf.layers.dense(net, 512, activation=tf.nn.tanh)
    net = tf.layers.dropout(net)
    net = tf.layers.batch_normalization(net)
    return tf.layers.dense(net, units=512)

In [22]:
# Train forward_00 for 50 epochs with batches of 1000; checkpoints are
# written under ./models/l2_normed and the TF log to ./logs/l2_normed.log.
estimator = train_model(forward_00, 'l2_normed', epochs=50, batch_size=1000)

log output: ./logs/l2_normed.log
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*


### evaluation

In [26]:
# Displays (best macro-F1, distance threshold that achieved it).
evaluate(estimator)

*
*


HBox(children=(IntProgress(value=0), HTML(value='')))




(0.5757690006395229, 0.27272727272727276)

# make submission

In [15]:
def train_full_inf_input_fn(params):
    """Inference input function over the training arrays.

    Same pipeline as ``train_inf_input_fn`` (slice ``(x_train, y_train)``,
    batch by ``params['batch_size']``, map through ``parser``), so it simply
    delegates to it.
    """
    return train_inf_input_fn(params)

def test_inf_input_fn(params):
    """Inference input function over the test features.

    The test set has no labels, so a placeholder label is supplied only to
    keep the ``(features, labels)`` structure ``parser`` expects.

    Args:
        params: dict with at least ``'batch_size'``.

    Returns:
        ``({'x': features}, dummy_labels)`` tensors from a one-shot iterator.
    """
    # One zero per row is enough: model_fn returns in PREDICT mode before the
    # labels are touched. The previous zeros_like(x_test) embedded a second
    # full copy of the feature matrix in the graph.
    dummy_labels = np.zeros(x_test.shape[0], dtype=np.int64)
    dataset = tf.data.Dataset.from_tensor_slices((x_test, dummy_labels))
    dataset = dataset.batch(params['batch_size'])
    dataset = dataset.map(parser)
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

In [16]:
# Embed the test rows and the training rows with the trained estimator.
test_embs = np.array(
    [pred['emb'] for pred in estimator.predict(test_inf_input_fn)]
)
train_FULL_embs = np.array(
    [pred['emb'] for pred in estimator.predict(train_full_inf_input_fn)]
)

*
*


In [17]:
import faiss

# Exhaustive (flat) index over the row-normalized training embeddings.
emb_size = train_FULL_embs.shape[1]
index = faiss.IndexFlat(emb_size)
index.verbose = True  # to see progress
index.add(normalize(train_FULL_embs))

# For every test embedding: distance to (D) and row number of (I) its single
# nearest training embedding, flattened from shape (n, 1) to (n,).
print('Calculating KNN')
D, I = (found.ravel() for found in index.search(normalize(test_embs), k=1))

Calculating KNN


In [18]:
# Threshold copied (rounded) from the evaluate() output above.
best_thr = 0.273
# Keep the nearest neighbour's label only when it is close enough; -1 otherwise.
y_subm = [
    y_train[nn] if dist < best_thr else -1
    for nn, dist in zip(I, D)
]

In [19]:
# Write the submission as "index,labels" rows.
# NOTE(review): the index column is the running row number, not the `index`
# column of test.csv - confirm the two coincide.
with open('./subm/1.csv', 'w') as f:
    f.write('index,labels\n')
    f.writelines(
        '{},{}\n'.format(row, label) for row, label in enumerate(y_subm)
    )