# More about embeddings

Outline:  
1. Char CNN embedding
1. FastText
1. BPE
1. Elmo
1. Siamese Networks
1. Triplet Loss
1. Hard Negative Mining
1. KNN
1. LSH

In [78]:
# some routines
import pandas as pd
import numpy as np
import gensim
from tqdm import tqdm_notebook

from sklearn import metrics
from sklearn.model_selection import train_test_split

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset

from torchtext import data


SEED = 42
np.random.seed(SEED)



def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    :param model: module called as ``model(batch)`` to produce predictions
    :param iterator: sized iterable of batches; each batch exposes ``.label``
    :param optimizer: optimizer stepped once per batch
    :param criterion: callable ``criterion(pred, label)`` returning a scalar loss tensor
    :param curr_epoch: epoch number, used only for the progress-bar caption
    :return: running mean of the per-batch losses (``0`` if there were no batches)
    """
    model.train()

    progress = tqdm_notebook(
        iterator,
        total=len(iterator),
        desc='epoch %d' % (curr_epoch),
        leave=True,
    )

    mean_loss = 0
    for step, batch in enumerate(progress):
        optimizer.zero_grad()

        batch_loss = criterion(model(batch), batch.label)
        batch_loss.backward()
        optimizer.step()

        batch_value = batch_loss.data.cpu().detach().item()

        # Incremental mean: after step+1 batches the history weighs step / (step + 1).
        keep = step / (step + 1)
        mean_loss = keep * mean_loss + (1 - keep) * batch_value

        progress.set_postfix(loss='%.5f' % mean_loss)

    return mean_loss

def _test_epoch(model, iterator, criterion):
    """Evaluate ``model`` on every batch of ``iterator`` and return the mean loss.

    :param model: module called as ``model(batch)``; switched to eval mode
    :param iterator: sized iterable of batches; each batch exposes ``.label``
    :param criterion: callable ``criterion(pred, label)`` returning a scalar loss tensor
    :return: sum of the batch losses divided by the number of batches
    """
    model.eval()

    batch_count = len(iterator)
    # Gradients are not needed for evaluation.
    with tt.no_grad():
        total = sum(
            criterion(model(batch), batch.label).data.item()
            for batch in iterator
        )

    return total / batch_count


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
             scheduler=None, early_stopping=0):
    """Train ``model`` for up to ``n_epochs`` epochs, optionally stopping early.

    After every epoch the validation loss is printed. With ``early_stopping > 0``
    training stops once the validation loss has been above the best value seen
    so far for ``early_stopping`` consecutive epochs.

    :param model: module passed to ``_train_epoch`` / ``_test_epoch``
    :param train_iterator: batches used for optimisation
    :param valid_iterator: batches used for validation
    :param criterion: loss callable forwarded to the epoch helpers
    :param optimizer: optimizer forwarded to ``_train_epoch``
    :param n_epochs: maximum number of epochs
    :param scheduler: accepted for interface compatibility; not used here
    :param early_stopping: patience in epochs; ``0`` disables early stopping
    :return: None
    """
    best_loss = float('inf')
    best_epoch = None
    es_epochs = 0

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion)

        print('validation loss %.5f' % valid_loss)

        # Plain dict per epoch: DataFrame.append no longer exists in pandas >= 2.0.
        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}

        if early_stopping > 0:
            if valid_loss > best_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            # A strict comparison keeps the earliest epoch among equal minima.
            if valid_loss < best_loss:
                best_loss = valid_loss
                best_epoch = record

# 1 Char CNN Embeddings

Treat token as a sequence of symbols.  
Stack 1-dim Conv layer + pooling layer to produce fixed size embedding of the token.  

Use `torchtext.data.NestedField` for char embeddings

<img src=images/char.png height=400/>

In [21]:
# Inner field: a token is turned into the list of its characters.
nesting_field = data.Field(
    tokenize=list,
    unk_token="<cunk>",
    pad_token="<cpad>",
    init_token="<w>",
    eos_token="</w>",
)
# Outer field: a sentence is a sequence of such character lists.
CHARS = data.NestedField(
    nesting_field,
    init_token="<s>",
    eos_token="</s>",
)

In [31]:
# Label string -> integer class id.
classes = {'negative': 0, 'neutral': 1, 'positive': 2}

# Inner field: every token becomes a list of characters.
TEXT = data.Field(
    tokenize=list,
    unk_token="<cunk>",
    pad_token="<cpad>",
    init_token="<w>",
    eos_token="</w>",
)

# Outer field: nests TEXT and also returns the lengths.
CHAR = data.NestedField(
    TEXT,
    init_token="<s>",
    eos_token="</s>",
    fix_length=30,  # max token length
    dtype=tt.long,
    include_lengths=True,
)

LABEL = data.LabelField(
    dtype=tt.int64,
    use_vocab=True,
    preprocessing=lambda label: classes[label],
)

# Columns mapped to (None, None) are skipped; only 'label' and 'char' are loaded.
dataset = data.TabularDataset(
    '../seminar_10/Tweets.csv',
    format='csv',
    fields=[
        (None, None),
        ('label', LABEL),
        (None, None),
        (None, None),
        ('char', CHAR),
    ],
    skip_header=True,
)

CHAR.build_vocab(dataset, min_freq=5)
LABEL.build_vocab(dataset)

train, valid = dataset.split(0.7, stratified=True)

print(len(CHAR.vocab.itos))

154


In [26]:
CHARS.vocab.itos

['<cunk>', '<cpad>', '<s>', '</s>', '<w>', '</w>', 'a', 'b']

In [28]:
dataset.examples[0].char

[['@', 'V', 'i', 'r', 'g', 'i', 'n', 'A', 'm', 'e', 'r', 'i', 'c', 'a'],
 ['W', 'h', 'a', 't'],
 ['@', 'd', 'h', 'e', 'p', 'b', 'u', 'r', 'n'],
 ['s', 'a', 'i', 'd', '.']]

In [30]:
CHAR.pad([dataset.examples[0].char])

([[['<w>',
    '<s>',
    '</w>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>'],
   ['<w>',
    '@',
    'V',
    'i',
    'r',
    'g',
    'i',
    'n',
    'A',
    'm',
    'e',
    'r',
    'i',
    'c',
    'a',
    '</w>'],
   ['<w>',
    'W',
    'h',
    'a',
    't',
    '</w>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>'],
   ['<w>',
    '@',
    'd',
    'h',
    'e',
    'p',
    'b',
    'u',
    'r',
    'n',
    '</w>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>'],
   ['<w>',
    's',
    'a',
    'i',
    'd',
    '.',
    '</w>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>',
    '<cpad>'],
   ['<w>',
    '</s>',
    '</w>',
    '<cpad>',
    '<cpad>',


In [32]:
_, seq_len, words_len = CHAR.pad([dataset.examples[0].char])
seq_len, words_len

([6],
 [[3,
   16,
   6,
   11,
   7,
   3,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]])

In [35]:
class ConvCharEmbedding(nn.Module):
    """Character-level CNN that turns each token (a sequence of char ids) into one vector.

    Characters are embedded, passed through several 1-d convolutions with
    different kernel sizes, max-pooled over the character axis, concatenated
    and projected through two linear layers.
    """

    def __init__(self, char_vocab_size, char_dim, embed_dim, hidden_size, filters=None):
        """
        :param char_vocab_size: number of entries in the character embedding table
        :param char_dim: size of one character embedding
        :param embed_dim: size of the produced token embedding
        :param hidden_size: width of the hidden linear layer
        :param filters: sequence of ``[kernel_size, n_channels]`` pairs;
            defaults to kernel sizes 1..5 with 32/32/64/128/256 channels
        """
        super(ConvCharEmbedding, self).__init__()

        self.char_embed = nn.Embedding(char_vocab_size, char_dim)

        if filters is None:
            filters = [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256]]
        self.convs = nn.ModuleList([
            nn.Conv1d(char_dim, out_channels=n_channels, kernel_size=k, padding=(k - 1) // 2)
            for k, n_channels in filters
        ])

        # Built-in sum: np.sum over a generator is deprecated in NumPy.
        input_size = sum(n_channels for _, n_channels in filters)

        self.hidden = nn.Linear(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, embed_dim)

    def forward(self, x):
        """
        :param x: char ids, shape (B, W, L) = (batch, words, chars per word)
        :return: token embeddings, shape (B, W, embed_dim)
        """

        x = self.char_embed(x)

        batch_size, n_words, length, embed_size = x.size()

        # Fold words into the batch axis and put channels first for Conv1d:
        # (B, W, L, E) -> (B * W, E, L)
        x = x.transpose(2, 3).contiguous().view(-1, embed_size, length)

        out = []
        for conv in self.convs:
            z = F.relu(conv(x))
            # Max over the character axis. Squeeze only that axis: a bare
            # squeeze() would also drop a size-1 batch or channel dimension.
            z = F.max_pool1d(z, z.size(2)).squeeze(2)
            out.append(z)

        x = tt.cat(out, -1)
        x = self.hidden(x)
        x = F.relu(x)
        x = self.fc(x)

        x = x.contiguous().view(batch_size, n_words, -1)

        return x


In [39]:
class MyModel(nn.Module):
    """Classifier: char-CNN token embeddings -> bidirectional LSTM -> 3-way linear layer."""

    def __init__(self, char_vocab_size, char_dim, embed_size, hidden_size):
        """
        :param char_vocab_size: size of the character vocabulary
        :param char_dim: size of one character embedding
        :param embed_size: size of a token embedding (also the LSTM input size)
        :param hidden_size: LSTM hidden size per direction
        """
        super(MyModel, self).__init__()

        self.embedding = ConvCharEmbedding(
            char_vocab_size,
            char_dim,
            embed_size,
            hidden_size=embed_size*2,
        )
        self.rnn = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            bidirectional=True,
            batch_first=True,
        )
        # Two directions times two states (hidden and cell) feed the classifier.
        self.fc = nn.Linear(hidden_size * 2 *2, 3)

    def forward(self, batch):
        """
        :param batch: object whose ``char`` attribute is a triple; the first
            item holds the char ids and the second the per-sample sequence lengths
        :return: output of the final linear layer (3 values per sample)
        """
        char_ids, n_tokens, _ = batch.char

        embedded = self.embedding(char_ids)

        if n_tokens is not None:
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded,
                n_tokens.view(-1).tolist(),
                batch_first=True,
            )

        _, (h_n, c_n) = self.rnn(embedded)

        # Move the batch axis first, then flatten the remaining axes per sample.
        flat_states = []
        for state in (h_n, c_n):
            state = state.transpose(0,1)
            flat_states.append(state.contiguous().view(state.size(0),-1))

        features = tt.cat(flat_states, dim=1).squeeze(1)
        return self.fc(features)

# tt.cuda.empty_cache()

batch_size = 32

model = MyModel(len(CHAR.vocab.itos), char_dim=25, embed_size=100, hidden_size=128)

# Examples are sorted by their number of tokens inside each batch.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train, valid),
    batch_sizes=(batch_size, batch_size),
    shuffle=True,
    sort_key=lambda example: len(example.char),
    sort_within_batch=True,
)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

nn_train(
    model,
    train_iterator,
    valid_iterator,
    criterion,
    optimizer,
    n_epochs=10,
    early_stopping=2,
)

  


HBox(children=(IntProgress(value=0, description='epoch 0', max=321, style=ProgressStyle(description_width='ini…

validation loss 0.91809


HBox(children=(IntProgress(value=0, description='epoch 1', max=321, style=ProgressStyle(description_width='ini…

validation loss 0.74027


HBox(children=(IntProgress(value=0, description='epoch 2', max=321, style=ProgressStyle(description_width='ini…

validation loss 0.63047


HBox(children=(IntProgress(value=0, description='epoch 3', max=321, style=ProgressStyle(description_width='ini…

validation loss 0.71646


HBox(children=(IntProgress(value=0, description='epoch 4', max=321, style=ProgressStyle(description_width='ini…

validation loss 0.59166


HBox(children=(IntProgress(value=0, description='epoch 5', max=321, style=ProgressStyle(description_width='ini…

validation loss 0.58378


HBox(children=(IntProgress(value=0, description='epoch 6', max=321, style=ProgressStyle(description_width='ini…

validation loss 0.56827


HBox(children=(IntProgress(value=0, description='epoch 7', max=321, style=ProgressStyle(description_width='ini…

validation loss 0.56845


HBox(children=(IntProgress(value=0, description='epoch 8', max=321, style=ProgressStyle(description_width='ini…

validation loss 0.59877
Early stopping! best epoch: 6 val 0.56827



# 2 FastText

Word is divided into subword n-grams with word boundaries:  
`<where> -> <wh, whe, her, ere, re>`  

Let  
$w$ - word,  
$v_c$ - vector of context word $c$,  
$G_w \subset \{ 1, ..., G \}$ - set of n-grams appearing in word $w$,  
$z_g$ - vector embedding of n-gram $g$

Assumption: represent word embedding as a sum of n-grams' embeddings  

Then, scoring function:  
$$ s(w,c) = \sum_{g \in G_w} z_g^T v_c $$

Probability of context word:  
$$ p(w_c | w_t) = \frac { \exp(s(w_t, w_c)) } { \sum_{j=1}^W \exp(s(w_t, j)) }$$

Loss function:  

$$ \sum_{t=1}^T \left[ \sum_{c \in C_t} \log (1 + \exp(- s(w_t, w_c))) + \sum_{n \in N_{t,c}} \log (1 + \exp(s(w_t, n))) \right]  \rightarrow \min_{z, v}$$


A rather good implementation of FastText can be found in gensim: `gensim.models.fasttext.FastText`  
Interface is similar to Word2Vec  

In [None]:
# gensim has no top-level `Fast`; the FastText model lives in gensim.models.fasttext.
from gensim.models.fasttext import FastText

# 3 Byte Pair Encoding (BPE)

In NLP, BPE is a simple form of sequence compression when the most common pair of consecutive tokens (chars) is replaced with a token that does not occur within that data.

`a,a,b,a,a,c,a ->  aa, b, aa, c, a`

`https://github.com/bheinzerling/bpemb`

In [40]:
import bpemb

# Load a BPE model for English.
# NOTE(review): `vs` is presumably the subword vocabulary size — confirm in the bpemb docs.
bpe_en = bpemb.BPEmb(lang='en', vs=1000)
# Split the text into subword pieces (shown in the cell output).
bpe_en.encode('simple resolution')

['▁sim', 'ple', '▁res', 'ol', 'ution']

# 4 ELMO

https://arxiv.org/pdf/1802.05365
    
ELMO is an example of context embeddings: embedding depends not only on the word, but on whole sentence.

ELMO uses char CNN for token embeddings on 0 layer.  

You can find pretrained models in `allennlp.modules.elmo.Elmo` and `https://github.com/HIT-SCIR/ELMoForManyLangs`  

<img src=images/elmo1.png height=300/>


In [19]:
from allennlp.modules.elmo import Elmo, batch_to_ids

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# Compute two different representations for each token.
# Each representation is a linear weighted combination of the
# 3 layers in ELMo (i.e., the char CNN and the outputs of the two BiLSTMs).
elmo = Elmo(options_file, weight_file, 2, dropout=0)

# Use batch_to_ids to convert tokenized sentences (lists of tokens) to character ids.
sentences = [['First', 'sentence', '.', 'custom', 'service'], ['Another', '.']]
character_ids = batch_to_ids(sentences)

# The result is indexed by 'elmo_representations' in the next cell.
embeddings = elmo(character_ids)

  index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))


In [21]:
embeddings['elmo_representations'][0].shape

torch.Size([2, 5, 1024])

# 5 Siamese Networks

<img src=images/siam1.jpeg height=300/>

In [2]:
# Sample of the Quora duplicate-question data: each row holds a question pair
# (question1, question2) and the binary is_duplicate target.

df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,199530,301049,301050,"As a Canadian student, is it wiser to complete...",How much will it cost to Indian student to stu...,0
1,387099,29541,519407,What is your favorite Indian sweet dish?,What's your favorite Indian dish? Why?,0
2,337316,464776,464777,Is there proof of Jon being Rhaegar and Lyanna...,Where does GRRM imply that Jon Snow is Rhaegar...,0
3,164415,255489,255490,Knowing how Prithviraj's last 3 films were flo...,Which is the Best Comedy scene in Malayalam ci...,0
4,382707,514592,514593,What causes damage to the somatosensory cortex...,What causes damage to the somatosensory cortex...,1


In [5]:
from allennlp.modules.elmo import Elmo, batch_to_ids

options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# Compute two different representations for each token.
# Each representation is a linear weighted combination of the
# 3 layers in ELMo (i.e., the char CNN and the outputs of the two BiLSTMs).
elmo = Elmo(options_file, weight_file, 2, dropout=0)

02/28/2019 12:48:59 - INFO - allennlp.modules.elmo -   Initializing ELMo


In [102]:
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)


def _questions_to_ids(questions):
    """Whitespace-tokenize raw question strings and convert them to ELMo character ids."""
    # batch_to_ids expects a list of token lists. Iterating over a raw string
    # yields single characters, so every character would be treated as a word.
    # str() also covers missing values that pandas loads as float NaN.
    return batch_to_ids([str(question).split() for question in questions])


xq1_train = _questions_to_ids(df_train.question1.values)
xq2_train = _questions_to_ids(df_train.question2.values)
y_train = tt.from_numpy(df_train.is_duplicate.values).float()

xq1_val = _questions_to_ids(df_val.question1.values)
xq2_val = _questions_to_ids(df_val.question2.values)
y_val = tt.from_numpy(df_val.is_duplicate.values).float()

In [103]:
# Mini-batch size shared by both loaders.
batch_size = 32
# Each dataset item is a (question1 ids, question2 ids, label) triple.
# NOTE(review): no shuffle argument is passed for the training loader — confirm this is intended.
train_loader = DataLoader(TensorDataset(xq1_train, xq2_train, y_train), batch_size=batch_size)
val_loader = DataLoader(TensorDataset(xq1_val, xq2_val, y_val), batch_size=batch_size)


In [104]:
def _train_epoch(model, iterator, optimizer, curr_epoch):
    """Run one optimisation pass; the model itself returns the loss for a batch.

    :param model: module called as ``model(batch)`` that returns a scalar loss tensor
    :param iterator: sized iterable of batches
    :param optimizer: optimizer stepped once per batch
    :param curr_epoch: epoch number, used only for the progress-bar caption
    :return: running mean of the per-batch losses (``0`` if there were no batches)
    """
    model.train()

    bar = tqdm_notebook(
        iterator,
        total=len(iterator),
        desc='epoch %d' % (curr_epoch),
        leave=True,
    )

    avg_loss = 0
    for step, batch in enumerate(bar):
        optimizer.zero_grad()

        batch_loss = model(batch)
        batch_loss.backward()
        optimizer.step()

        batch_value = batch_loss.data.cpu().detach().item()

        # Incremental mean: after step+1 batches the history weighs step / (step + 1).
        history_weight = step / (step + 1)
        avg_loss = history_weight * avg_loss + (1 - history_weight) * batch_value

        bar.set_postfix(loss='%.5f' % avg_loss)

    return avg_loss

def _test_epoch(model, iterator):
    """Evaluate a model that returns its own loss and average it over the batches.

    :param model: module called as ``model(batch)`` that returns a scalar loss tensor
    :param iterator: sized iterable of batches
    :return: sum of the batch losses divided by the number of batches
    """
    model.eval()

    batch_count = len(iterator)
    # Gradients are not needed for evaluation.
    with tt.no_grad():
        total = sum(model(batch).data.item() for batch in iterator)

    return total / batch_count


def nn_train(model, train_iterator, valid_iterator, optimizer, n_epochs=100,
             scheduler=None, early_stopping=0):
    """Train a model that returns its own loss, optionally stopping early.

    After every epoch the validation loss is printed. With ``early_stopping > 0``
    training stops once the validation loss has been above the best value seen
    so far for ``early_stopping`` consecutive epochs.

    :param model: module passed to ``_train_epoch`` / ``_test_epoch``
    :param train_iterator: batches used for optimisation
    :param valid_iterator: batches used for validation
    :param optimizer: optimizer forwarded to ``_train_epoch``
    :param n_epochs: maximum number of epochs
    :param scheduler: accepted for interface compatibility; not used here
    :param early_stopping: patience in epochs; ``0`` disables early stopping
    :return: None
    """
    best_loss = float('inf')
    best_epoch = None
    es_epochs = 0

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, epoch)
        valid_loss = _test_epoch(model, valid_iterator)

        print('validation loss %.5f' % valid_loss)

        # Plain dict per epoch: DataFrame.append no longer exists in pandas >= 2.0.
        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}

        if early_stopping > 0:
            if valid_loss > best_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            # A strict comparison keeps the earliest epoch among equal minima.
            if valid_loss < best_loss:
                best_loss = valid_loss
                best_epoch = record

In [None]:
class MyModel(nn.Module):
    """Siamese scorer for question pairs on top of ELMo representations.

    Both questions go through the same ``branch``; the two embeddings are
    combined with symmetric functions and mapped to one logit. ``forward``
    returns the loss computed by ``criterion``.
    """

    def __init__(self, elmo, criterion):
        """
        :param elmo: callable returning a dict with 'elmo_representations'
            (list of two tensors with 1024 features each, matching the
            ``1024*2`` input of ``fc``) and 'mask' (batch, tokens)
        :param criterion: loss called as ``criterion(logits, targets)``
        """
        super(MyModel, self).__init__()
        self.elmo = elmo
        self.criterion = criterion

        self.fc = nn.Linear(1024*2, 128)

        self.out = nn.Linear(128*3, 1)

    def branch(self, x):
        """Embed one batch of questions into 128-dimensional vectors."""
        elmo_out = self.elmo(x)
        reps = tt.cat(elmo_out['elmo_representations'], dim=-1)

        # Average over real tokens only. A plain mean over the time axis also
        # counts padded positions, so the result would depend on padding length.
        mask = elmo_out['mask'].unsqueeze(-1).to(reps.dtype)
        n_tokens = mask.sum(dim=1).clamp(min=1)  # guard against empty sequences
        pooled = (reps * mask).sum(dim=1) / n_tokens

        return self.fc(pooled)

    def forward(self, batch):
        """
        :param batch: triple ``(q1, q2, y)`` of two question batches and targets
        :return: loss value produced by ``self.criterion``
        """
        q1, q2, y = batch

        q1 = self.branch(q1)
        q2 = self.branch(q2)

        # symmetric functions: the result does not depend on the order of q1 and q2
        x = tt.cat([tt.abs(q1-q2), q1*q2, q1+q2], dim=-1)

        x = self.out(x).squeeze(1)
        loss = self.criterion(x,y)

        return loss



# The criterion is handed to the model, whose forward() returns the loss.
model = MyModel(elmo, nn.BCEWithLogitsLoss())

optimizer = optim.Adam(model.parameters())

# This nn_train variant takes no criterion argument.
nn_train(model, train_loader, val_loader, optimizer, n_epochs=2)

# 6 Triplet loss

Distance to samples from the same class should be less than to samples from other classes

Euclidean distance: 
<img src=images/triplet.png height=200/>

Cosine distance: 
<img src=images/triplet2.png height=400/>

In [106]:
def triplet_loss(anchor_embed, pos_embed, neg_embed, margin=None):
    """Cosine-similarity triplet loss, one value per sample.

    :param anchor_embed: anchor embeddings
    :param pos_embed: embeddings of samples from the anchor's class
    :param neg_embed: embeddings of samples from other classes
    :param margin: if ``None`` (default) return the raw difference
        ``cos(anchor, neg) - cos(anchor, pos)``; otherwise return the hinge
        ``max(0, difference + margin)`` so that triplets already separated
        by the margin contribute zero
    :return: tensor of per-sample loss values (no reduction is applied)
    """
    gap = F.cosine_similarity(anchor_embed, neg_embed) - F.cosine_similarity(anchor_embed, pos_embed)
    if margin is None:
        return gap
    return F.relu(gap + margin)
    
    
class Tripletnet(nn.Module):
    """Skeleton of a triplet network: one shared branch embeds anchor, positive and negative."""

    def __init__(self):
        super(Tripletnet, self).__init__()
        # Placeholder: define the layers used by ``branch`` here.
        ...

    def branch(self, x):
        """Embed one input; has to be implemented before the network can be used."""
        # Fail loudly instead of silently returning None from an empty body.
        raise NotImplementedError('Tripletnet.branch is a placeholder and must be implemented')

    def forward(self, anchor, pos, neg):
        """Embed the three inputs with the shared branch and return ``triplet_loss``."""
        anchor = self.branch(anchor)
        pos = self.branch(pos)
        neg = self.branch(neg)

        return triplet_loss(anchor, pos, neg)

# 7 Hard Negative Mining


Sometimes, if you use random samples as negative examples, classification may be too easy for your model.  
You can consider taking samples from previous epoch, where your model made mistakes, as negative examples.  

# 8 K-Nearest Neighbors (KNN)

Training complexity: O(1) - just remember all train set  
Inference complexity: O(n) - have to compare each test sample with all train samples  

<img src=images/knn.png height=400/>

# 9 Locality-Sensitive Hashing (LSH)

Good implementation can be found here `https://github.com/spotify/annoy`

Definition:  
LSH family $F$ is a family of hash functions that maps metric space $M$ to set of buckets $S$.  
$$ h: M \rightarrow S $$

Let  
$p,q \in M $ - points in space,  
$d$ - the metric in $M$,  
$R > 0$ - a distance threshold,  
$c$ - some scalar, $c > 1$,  
$p_1 > p_2$ - probabilities.  
Then for $h \in F$:  

* if $d(p,q) \leq R$ then $P[h(p) = h(q)] \geq p_1$  
* if $d(p,q) \geq cR$ then $P[h(p) = h(q)] \leq p_2$  

And family $F$ is called $(R, cR, p_1, p_2)$ - sensitive


Assumption: uniform distribution
    
<img src=images/lsh1.jpeg height=400/>

Amplification:  

1. AND construction

Define a new family of hash functions $G = \{g\}$, where each $g$ consists of $k$ hash functions from $F$ chosen at random: $g = (h_1, ..., h_k)$.

$g(p) = g(q)$ iff $h_i(p) = h_i(q)$ **for all** $i$  

Then, family $G$ is $(R, cR, p_1^k, p_2^k)$ - sensitive

2. OR construction

Define a new family of hash functions $G = \{g\}$, where each $g$ consists of $k$ hash functions from $F$ chosen at random: $g = (h_1, ..., h_k)$.

$g(p) = g(q)$ iff $h_i(p) = h_i(q)$ **at least for one** $i$  

Then, family $G$ is $(R, cR, 1 - (1 - p_1)^k, 1 - (1- p_2)^k)$ - sensitive

LSH maps:  
<img src=images/lsh2.png height=400/>