In [0]:
!pip3 -qq install torch==0.4.1
!pip -qq install torchtext==0.3.1
!pip -qq install spacy==2.0.16
!pip install -qq gensim==3.6.0
!python -m spacy download en
!wget -O squad.zip -qq --no-check-certificate "https://drive.google.com/uc?export=download&id=1h8dplcVzRkbrSYaTAbXYEAjcbApMxYQL"
!unzip squad.zip
!wget -O opensubs.zip -qq --no-check-certificate "https://drive.google.com/uc?export=download&id=1x1mNHweP95IeGFbDJPAI7zffgxrbqb7b"
!unzip opensubs.zip

In [0]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


import random  # local import: the batch sampler draws answers with `random.choice`

# Choose the tensor constructors and the device once, so the rest of the
# notebook can build tensors without caring whether a GPU is present.
if torch.cuda.is_available():
    from torch.cuda import FloatTensor, LongTensor
    DEVICE = torch.device('cuda')
else:
    from torch import FloatTensor, LongTensor
    DEVICE = torch.device('cpu')

# Seed every random number generator the notebook uses, not only NumPy:
# NumPy shuffles the batches, `random` samples the answers, torch initialises
# the layers.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

# General Conversation

Сегодня разбираем, как устроена болталка.

![](https://meduza.io/image/attachments/images/002/547/612/large/RLnxN4VdUmWFcBp8GjxUmA.jpg =x200)  
*From [«Алиса, за мной следит ФСБ?»: в соцсетях продолжают издеваться над голосовым помощником «Яндекса»](https://meduza.io/shapito/2017/10/11/alisa-za-mnoy-sledit-fsb-v-sotssetyah-prodolzhayut-izdevatsya-nad-golosovym-pomoschnikom-yandeksa)*

Вообще, мы уже обсудили Seq2Seq модели, которые могут быть использованы для реализации болталки. Однако у них есть недостаток: высока вероятность сгенерировать что-то неграмматичное. Ну, как те пирожки.

Поэтому почти всегда идут другим путем - вместо генерации применяют ранжирование. Нужно заранее составить большую базу ответов и просто выбирать наиболее подходящий к контексту каждый раз.

## DSSM

Для этого используют DSSM (Deep Structured Semantic Models):

![](https://qph.fs.quoracdn.net/main-qimg-b90431ff9b4c60c5d69069d7bc048ff0)  
*From [What are Siamese neural networks, what applications are they good for, and why?](https://www.quora.com/What-are-Siamese-neural-networks-what-applications-are-they-good-for-and-why)*

Эта сеть состоит из (обычно) пары башен: левая кодирует запрос, правая - ответ. Задача - научиться считать близость между запросом и ответом.

Дальше набирают большой корпус из пар запрос-ответ (запрос может быть как одним вопросом, так и контекстом - несколькими последними вопросами/ответами). 

Для ответов предпосчитывают их векторы, каждый новый запрос кодируют с помощью левой башни и находят среди предпосчитанных векторов ближайший.

## Данные

Будем использовать для начала [Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/). Вообще, там задача - найти в тексте ответ на вопрос. Но мы будем просто выбирать среди предложений текста наиболее близкое к вопросу.

*Эта часть ноутбука сильно основана на [шадовском ноутбуке](https://github.com/yandexdataschool/nlp_course/blob/master/week10_dialogue/seminar.ipynb)*.

In [0]:
import pandas as pd

train_data = pd.read_json('train.json')
test_data = pd.read_json('test.json')

In [4]:
row = train_data.iloc[40]
print('QUESTION:', row.question, '\n')
for i, cand in enumerate(row.options):
    print('[ ]' if i not in row.correct_indices else '[v]', cand)

QUESTION: In 1874 the telegraph was known as the what of commerce? 

[v] In 1874, telegraph message traffic was rapidly expanding and in the words of Western Union President William Orton, had become "the nervous system of commerce".
[ ] Orton had contracted with inventors Thomas Edison and Elisha Gray to find a way to send multiple telegraph messages on each telegraph line to avoid the great cost of constructing new lines.
[ ] When Bell mentioned to Gardiner Hubbard and Thomas Sanders that he was working on a method of sending multiple tones on a telegraph wire using a multi-reed device, the two wealthy patrons began to financially support Bell's experiments.
[ ] Patent matters would be handled by Hubbard's patent attorney, Anthony Pollok.


Токенизируем предложения:

In [0]:
import spacy

# Keep the loaded pipeline under its own name: the original rebound `spacy`,
# so after the first execution the name no longer referred to the module.
nlp = spacy.load('en')


def tokenize(text):
    """Return the lower-cased spaCy tokens of `text` as a list of strings."""
    return [tok.text.lower() for tok in nlp.tokenizer(text)]


def tokenize_options(options):
    """Tokenize every candidate answer in `options` with `tokenize`."""
    return [tokenize(text) for text in options]


# NOTE: the columns are overwritten in place with token lists, so reload the
# data frames before executing this cell again.
train_data.question = train_data.question.apply(tokenize)
train_data.options = train_data.options.apply(tokenize_options)

test_data.question = test_data.question.apply(tokenize)
test_data.options = test_data.options.apply(tokenize_options)

У нас не так-то много данных, чтобы учить всё с нуля, поэтому будем сразу использовать предобученные эмбеддинги:

In [8]:
import gensim.downloader as api

w2v_model = api.load('glove-wiki-gigaword-100')



**Задание** Постройте матрицу предобученных эмбеддингов для самых частотных слов в выборке.

In [0]:
from collections import Counter


def build_word_embeddings(data, w2v_model, min_freq=5):
    """Build a vocabulary and the matching matrix of pretrained vectors.

    Word frequencies are counted over every tokenized question in
    `data.question` and every tokenized candidate in `data.options`.
    A word is kept when it occurs at least `min_freq` times and is present in
    `w2v_model.vocab`.

    Returns:
        (word2ind, embeddings): `word2ind` maps a word to its row in
        `embeddings`. Rows 0 (`<pad>`) and 1 (`<unk>`) are all zeros; the
        remaining rows follow descending frequency order.
    """
    frequencies = Counter()
    for question in data.question:
        frequencies.update(question)
    for candidates in data.options:
        for candidate in candidates:
            frequencies.update(candidate)

    dim = w2v_model.vectors.shape[1]
    word2ind = {'<pad>': 0, '<unk>': 1}
    vectors = [np.zeros(dim), np.zeros(dim)]

    for word, count in frequencies.most_common():
        # most_common() is sorted by count, so everything after the first
        # rare word is rare as well.
        if count < min_freq:
            break
        if word in w2v_model.vocab:
            word2ind[word] = len(word2ind)
            vectors.append(w2v_model.get_vector(word))

    return word2ind, np.array(vectors)

In [10]:
word2ind, embeddings = build_word_embeddings(train_data, w2v_model, min_freq=8)
print('Vocab size =', len(word2ind))

Vocab size = 35350


Для генерации батчей будем использовать такой класс:

In [0]:
import random
import math


def to_matrix(lines, word2ind):
    """Convert tokenized sentences into a right-padded LongTensor of indices.

    Args:
        lines: sequence of token sequences (one per sentence).
        word2ind: mapping from token to integer index.

    Returns:
        LongTensor of shape (len(lines), longest sentence length). Positions
        past the end of a sentence hold 0 and tokens missing from `word2ind`
        are mapped to 1. An empty `lines` gives a tensor of shape (0, 0).
    """
    # default=0 keeps an empty batch from raising inside max().
    max_sent_len = max((len(line) for line in lines), default=0)
    # Allocate integer storage directly instead of converting float64 later.
    matrix = np.zeros((len(lines), max_sent_len), dtype=np.int64)

    for batch_ind, line in enumerate(lines):
        matrix[batch_ind, :len(line)] = [word2ind.get(word, 1) for word in line]

    return LongTensor(matrix)


class BatchIterator():
    """Iterate over a data frame in batches of (question, correct, wrong) triples.

    `data` is indexed with `.iloc` and must provide the columns `question`,
    `options`, `correct_indices` and `wrong_indices`. For every row one
    correct and one wrong option are drawn with `random.choice`.

    Each batch is a dict with the keys `questions`, `correct_answers` and
    `wrong_answers`, whose values are produced by `to_matrix`.
    """

    def __init__(self, data, batch_size, word2ind, shuffle=True):
        self._data = data
        self._num_samples = len(data)
        self._batch_size = batch_size
        self._word2ind = word2ind
        self._shuffle = shuffle
        self._batches_count = int(math.ceil(len(data) / batch_size))

    def __len__(self):
        """Number of batches produced by one pass over the data."""
        return self._batches_count

    def __iter__(self):
        return self._iterate_batches()

    def _iterate_batches(self):
        """Yield the batches of a single pass (reshuffled when `shuffle` is set)."""
        indices = np.arange(self._num_samples)
        if self._shuffle:
            np.random.shuffle(indices)

        for start in range(0, self._num_samples, self._batch_size):
            end = min(start + self._batch_size, self._num_samples)
            batch = self._data.iloc[indices[start: end]]

            # Keep the token lists in plain Python lists. Wrapping ragged
            # lists in np.array is rejected by recent NumPy versions and
            # turns into a 2-D string array when all lengths coincide.
            questions = list(batch['question'])
            options = list(batch['options'])

            # All correct answers are sampled before the wrong ones, matching
            # the order in which the random draws were made originally.
            correct_answers = [
                row_options[random.choice(row_correct)]
                for row_options, row_correct in zip(options, batch['correct_indices'])
            ]
            wrong_answers = [
                row_options[random.choice(row_wrong)]
                for row_options, row_wrong in zip(options, batch['wrong_indices'])
            ]

            yield {
                'questions': to_matrix(questions, self._word2ind),
                'correct_answers': to_matrix(correct_answers, self._word2ind),
                'wrong_answers': to_matrix(wrong_answers, self._word2ind)
            }

In [0]:
train_iter = BatchIterator(train_data, 64, word2ind)
test_iter = BatchIterator(test_data, 128, word2ind)

Он просто сэмплирует последовательности из вопросов, правильных и неправильных ответов на них:

In [13]:
batch = next(iter(train_iter))

batch

{'correct_answers': tensor([[ 748,   22, 2084,  ...,    0,    0,    0],
         [4499,   36,  464,  ...,    0,    0,    0],
         [  29,   38, 2609,  ...,    0,    0,    0],
         ...,
         [  18,    2,   69,  ...,    0,    0,    0],
         [  65, 7685,    2,  ...,    0,    0,    0],
         [   7,  343, 2127,  ...,    0,    0,    0]], device='cuda:0'),
 'questions': tensor([[  23,   25,  951,  ...,    0,    0,    0],
         [  25, 2358,   54,  ...,    0,    0,    0],
         [1941, 2609,    8,  ...,    0,    0,    0],
         ...,
         [  25,   86,   11,  ...,    0,    0,    0],
         [  25, 1102,    4,  ...,    0,    0,    0],
         [   7, 2127,    9,  ...,    0,    0,    0]], device='cuda:0'),
 'wrong_answers': tensor([[  52, 4346, 1850,  ...,    0,    0,    0],
         [ 134, 2229,   26,  ...,    0,    0,    0],
         [   2,  955,  332,  ...,    0,    0,    0],
         ...,
         [ 845,   58,    3,  ...,    0,    0,    0],
         [   7,    2,  

## Модель

**Задание** Реализуйте модель энкодера для текстов - башни DSSM модели.

*Это не обязательно должна быть сложная модель, вполне сойдет сверточная, которая будет учиться гораздо быстрее.*

In [0]:
class Encoder(nn.Module):
    """Convolutional sentence encoder: one tower of the DSSM model.

    Pipeline: pretrained embeddings -> Conv1d + ReLU -> max over time -> Linear.

    Args:
        embeddings: 2-D matrix of pretrained word vectors (vocab x dim).
        hidden_dim: number of convolution filters.
        output_dim: size of the produced sentence vector.
        kernel_size: width of the convolution window in tokens.
    """

    def __init__(self, embeddings, hidden_dim=128, output_dim=128, kernel_size=3):
        super().__init__()

        self._kernel_size = kernel_size
        self._embs = nn.Embedding.from_pretrained(FloatTensor(embeddings))
        self._conv = nn.Sequential(
            nn.Conv1d(embeddings.shape[1], hidden_dim, kernel_size=kernel_size),
            nn.ReLU(inplace=True)
        )
        self._out = nn.Linear(hidden_dim, output_dim)

    def forward(self, inputs):
        """Encode a (batch, seq_len) tensor of word indices into (batch, output_dim)."""
        embs = self._embs(inputs)
        # Conv1d expects (batch, channels, seq_len).
        embs = embs.permute(0, 2, 1)

        # The unpadded convolution fails on inputs shorter than its window, so
        # extend such inputs on the right with zero vectors. This matches
        # extra `<pad>` tokens when row 0 of the embedding matrix is all zeros.
        missing = self._kernel_size - embs.size(-1)
        if missing > 0:
            embs = F.pad(embs, (0, missing))

        outputs = self._conv(embs)
        # Max-over-time pooling; torch.max returns (values, indices).
        outputs = torch.max(outputs, -1)[0]

        return self._out(outputs)

### Triplet Loss

Мы хотим не просто научить энкодер строить эмбеддинги для предложений. Мы хотим притягивать векторы правильных ответов к вопросам и отталкивать векторы неправильных. Для этого используют, например, *Triplet Loss*:

$$ L = \frac{1}{N} \sum_{q, a^+, a^-} \max(0, \space \delta - sim[V_q(q), V_a(a^+)] + sim[V_q(q), V_a(a^-)] ),$$

где
* $sim[a, b]$ функция похожести (например, dot product или cosine similarity)
* $\delta$ - гиперпараметр модели. Если $sim[a, b]$ линейно по $b$, то все $\delta > 0$ эквивалентны.

![img](https://raw.githubusercontent.com/yandexdataschool/nlp_course/master/resources/margin.png)

**Задание** Реализуйте triplet loss, а также подсчет recall - процента случаев, когда правильный ответ был ближе неправильного.

In [0]:
class DSSM(nn.Module):
    """Two-tower model: one encoder for questions, one for answers."""

    def __init__(self, question_encoder, answer_encoder):
        super().__init__()
        self.question_encoder = question_encoder
        self.answer_encoder = answer_encoder

    def forward(self, questions, correct_answers, wrong_answers):
        """Encode the question and both answers; returns the three embeddings."""
        encoded_questions = self.question_encoder(questions)
        encoded_correct = self.answer_encoder(correct_answers)
        encoded_wrong = self.answer_encoder(wrong_answers)

        return encoded_questions, encoded_correct, encoded_wrong

    def calc_triplet_loss(self, question_embeddings, correct_answer_embeddings, wrong_answer_embeddings, delta=1.0):
        """Mean of max(0, delta - sim(q, a+) + sim(q, a-)) over the batch."""
        positive = self.similarity(question_embeddings, correct_answer_embeddings)
        negative = self.similarity(question_embeddings, wrong_answer_embeddings)

        margin_violations = F.relu(delta - positive + negative)
        return margin_violations.mean()

    def calc_recall_at_1(self, question_embeddings, correct_answer_embeddings, wrong_answer_embeddings):
        """Count the rows where the correct answer scores strictly higher than the wrong one."""
        positive = self.similarity(question_embeddings, correct_answer_embeddings)
        negative = self.similarity(question_embeddings, wrong_answer_embeddings)

        hits = positive > negative
        return hits.float().sum()

    @staticmethod
    def similarity(question_embeddings, answer_embeddings):
        """Dot product of the two embeddings along the last dimension."""
        return torch.sum(question_embeddings * answer_embeddings, dim=-1)

In [0]:
class ModelTrainer():
    """Runs single batches through a model and keeps per-epoch metrics.

    The model must be callable on (questions, correct_answers, wrong_answers)
    and provide `calc_triplet_loss` and `calc_recall_at_1` taking the three
    values it returns.
    """

    _REPORT = '{:>5s} Loss = {:.5f}, Recall@1 = {:.2%}'

    def __init__(self, model, optimizer):
        self._model = model
        self._optimizer = optimizer

    def on_epoch_begin(self, is_train, name, batches_count):
        """
        Initializes metrics and switches the model to train/eval mode
        """
        self._epoch_loss = 0.0
        self._correct_count, self._total_count = 0.0, 0
        self._is_train = is_train
        # The report uses a string format spec, which rejects None.
        self._name = '' if name is None else name
        self._batches_count = batches_count

        self._model.train(is_train)

    def on_epoch_end(self):
        """
        Outputs final metrics: mean batch loss and recall over all samples seen
        """
        # max(..., 1) keeps an epoch without batches from dividing by zero.
        return self._REPORT.format(
            self._name,
            self._epoch_loss / max(self._batches_count, 1),
            self._correct_count / max(self._total_count, 1)
        )

    def on_batch(self, batch):
        """
        Performs forward and (if is_train) backward pass with optimization, updates metrics

        `batch` is a dict with the keys 'questions', 'correct_answers' and
        'wrong_answers'. Returns the formatted metrics of this batch.
        """

        question_embs, correct_answer_embs, wrong_answer_embs = self._model(
            batch['questions'], batch['correct_answers'], batch['wrong_answers']
        )
        loss = self._model.calc_triplet_loss(question_embs, correct_answer_embs, wrong_answer_embs)
        # Store plain Python numbers so the running metrics do not hold on to
        # (possibly GPU) tensors.
        correct_count = float(
            self._model.calc_recall_at_1(question_embs, correct_answer_embs, wrong_answer_embs)
        )
        total_count = len(batch['questions'])
        loss_value = loss.item()

        self._correct_count += correct_count
        self._total_count += total_count
        self._epoch_loss += loss_value

        if self._is_train:
            self._optimizer.zero_grad()
            loss.backward()
            # Clip the global gradient norm to 1 before the update.
            nn.utils.clip_grad_norm_(self._model.parameters(), 1.)
            self._optimizer.step()

        return self._REPORT.format(
            self._name, loss_value, correct_count / max(total_count, 1)
        )

In [0]:
import math
from tqdm import tqdm
tqdm.get_lock().locks = []


def do_epoch(trainer, data_iter, is_train, name=None):
    """Run one full pass over `data_iter`, reporting metrics on a tqdm bar.

    Gradient tracking is switched on only when `is_train` is true. After each
    batch the bar shows what `trainer.on_batch` returned; once the pass is
    over it shows the summary from `trainer.on_epoch_end`.
    """
    batches_count = len(data_iter)
    trainer.on_epoch_begin(is_train, name, batches_count=batches_count)

    with torch.autograd.set_grad_enabled(is_train), tqdm(total=batches_count) as bar:
        for batch in data_iter:
            batch_report = trainer.on_batch(batch)
            bar.update()
            bar.set_description(batch_report)

        bar.set_description(trainer.on_epoch_end())
        bar.refresh()

            
def fit(trainer, train_iter, epochs_count=1, val_iter=None):
    """Train for `epochs_count` epochs, validating after each one if `val_iter` is given.

    Args:
        trainer: object passed on to `do_epoch` for every pass.
        train_iter: iterable of training batches.
        epochs_count: number of passes over `train_iter`.
        val_iter: optional iterable of validation batches.
    """
    for epoch in range(epochs_count):
        name_prefix = '[{} / {}] '.format(epoch + 1, epochs_count)
        do_epoch(trainer, train_iter, is_train=True, name=name_prefix + 'Train:')

        if val_iter is not None:
            do_epoch(trainer, val_iter, is_train=False, name=name_prefix + '  Val:')

Запустим, наконец, учиться модель:

In [18]:
# Convert the embedding matrix to a tensor once; both encoders receive it.
embeddings = FloatTensor(embeddings)

# Two separate Encoder instances: one tower for questions, one for answers.
model = DSSM(
    Encoder(embeddings),
    Encoder(embeddings)
).to(DEVICE)

# Adam with its default hyperparameters.
optimizer = optim.Adam(model.parameters())

trainer = ModelTrainer(model, optimizer)

# The test split doubles as the validation set here.
fit(trainer, train_iter, epochs_count=5, val_iter=test_iter)

[1 / 5] Train: Loss = 0.67149, Recall@1 = 70.14%: 100%|██████████| 909/909 [00:33<00:00, 27.53it/s]
[1 / 5]   Val: Loss = 0.57129, Recall@1 = 75.62%: 100%|██████████| 211/211 [00:09<00:00, 23.04it/s]
[2 / 5] Train: Loss = 0.51111, Recall@1 = 78.38%: 100%|██████████| 909/909 [00:33<00:00, 28.18it/s]
[2 / 5]   Val: Loss = 0.54672, Recall@1 = 77.06%: 100%|██████████| 211/211 [00:09<00:00, 23.27it/s]
[3 / 5] Train: Loss = 0.45427, Recall@1 = 81.28%: 100%|██████████| 909/909 [00:33<00:00, 28.24it/s]
[3 / 5]   Val: Loss = 0.55122, Recall@1 = 77.53%: 100%|██████████| 211/211 [00:08<00:00, 23.52it/s]
[4 / 5] Train: Loss = 0.42037, Recall@1 = 82.79%: 100%|██████████| 909/909 [00:33<00:00, 27.95it/s]
[4 / 5]   Val: Loss = 0.54098, Recall@1 = 78.44%: 100%|██████████| 211/211 [00:09<00:00, 22.84it/s]
[5 / 5] Train: Loss = 0.39392, Recall@1 = 84.08%: 100%|██████████| 909/909 [00:33<00:00, 27.57it/s]
[5 / 5]   Val: Loss = 0.54709, Recall@1 = 78.24%: 100%|██████████| 211/211 [00:09<00:00, 23.34it/s]


### Точность предсказаний

Оценим, насколько хорошо модель предсказывает правильный ответ.

**Задание** Для каждого вопроса найдите индекс ответа, генерируемого сетью:

In [0]:
from tqdm import tqdm

In [20]:
# Rank the candidate answers of every test question and keep the index of the
# best one. This is inference only, so autograd is switched off; the indices
# are stored as plain ints rather than 0-dim tensors.
predictions = []
with torch.no_grad():
    for _, row in tqdm(test_data.iterrows(), total=len(test_data)):
        question_embeddings = model.question_encoder(to_matrix([row.question], word2ind))
        answer_embeddings = model.answer_encoder(to_matrix(row.options, word2ind))

        # The single question row broadcasts against all candidate rows.
        scores = model.similarity(question_embeddings, answer_embeddings)
        predictions.append(scores.argmax().item())

# A prediction counts as correct when it is any of the reference indices.
accuracy = np.mean([
    answer in correct_ind
    for answer, correct_ind in zip(predictions, test_data['correct_indices'].values)
])
print("Accuracy: %0.5f" % accuracy)

100%|██████████| 26970/26970 [00:57<00:00, 469.66it/s]


Accuracy: 0.53211


In [0]:
def draw_results(question, possible_answers, predicted_index, correct_indices):
    """Print a question, its candidate answers and whether the prediction was right.

    The candidate at `predicted_index` is marked with `[*]`. The verdict is
    CORRECT when `predicted_index` is contained in `correct_indices`.
    """
    print("Q:", ' '.join(question), end='\n\n')

    for position, tokens in enumerate(possible_answers):
        if position == predicted_index:
            marker = '[*]'
        else:
            marker = '[ ]'
        print("#%i: %s %s" % (position, marker, ' '.join(tokens)))

    if predicted_index in correct_indices:
        verdict = "CORRECT"
    else:
        verdict = "INCORRECT"
    print("\nVerdict:", verdict, "(ref: %s)" % correct_indices, end='\n' * 3)

In [22]:
for i in [1, 100, 1000, 2000, 3000, 4000, 5000]:
    draw_results(test_data.iloc[i].question, test_data.iloc[i].options,
                 predictions[i], test_data.iloc[i].correct_indices)

Q: what did bell call his special spot in the back of the property ?

#0: [*] at the homestead , bell set up his own workshop in the converted carriage house near to what he called his " dreaming place " , a large hollow nestled in trees at the back of the property above the river .
#1: [ ] despite his frail condition upon arriving in canada , bell found the climate and environs to his liking , and rapidly improved .
#2: [ ] [ n 10 ] he continued his interest in the study of the human voice and when he discovered the six nations reserve across the river at onondaga , he learned the mohawk language and translated its unwritten vocabulary into visible speech symbols .
#3: [ ] for his work , bell was awarded the title of honorary chief and participated in a ceremony where he donned a mohawk headdress and danced traditional dances .
#4: [ ] [ n 11 ]

Verdict: CORRECT (ref: [0])


Q: how much more gin than beer was made in england in 1740 ?

#0: [ ] the 18th century saw a huge growth in the

## Hard-negatives mining

На самом деле, в большинстве случаев у нас нет отрицательных примеров.

Например, есть база диалогов - и где брать отрицательные примеры к ответам?

Для этого используют *hard-negatives mining*. Берут в качестве отрицательного примера самый близкий из неправильных примеров в батче:
$$a^-_{hard} = \underset {a^-} {argmax} \space sim[V_q(q), V_a(a^-)]$$

Неправильные в данном случае - все, кроме правильного :)

Реализуется это как-то так:
* Батч состоит из правильных пар вопрос-ответ.
* Для всех вопросов и всех ответов считают эмбеддинги.
* Положительные примеры у нас есть - осталось найти для каждого вопроса наиболее похожие на него ответы, которые предназначались другим вопросам.

**Задание** Обновите `DSSM`, чтобы делать hard-negatives mining внутри него.

*Может понадобиться нормализовывать векторы с помощью `F.normalize` перед подсчетом `similarity`*

In [0]:
class DSSM(nn.Module):
    def __init__(self, question_encoder, answer_encoder):
        super().__init__()
        self.question_encoder = question_encoder
        self.answer_encoder = answer_encoder
        
    def forward(self, questions, correct_answers, wrong_answers):
        """Ignore wrong_answers, they are here just for compatibility sake"""
        <perform forward pass>

    def calc_triplet_loss(self, question_embeddings, answer_embeddings, delta=1.0):
        """Returns the triplet loss based on the equation above"""
        <calc triple loss with hard-negatives>
        
    def calc_recall_at_1(self, question_embeddings, answer_embeddings):
        """Returns the number of cases when the correct answer were more similar than incorrect one"""
        <calc recall>
        
    @staticmethod
    def similarity(question_embeddings, answer_embeddings):
        <calc it>

In [0]:
model = DSSM(
    question_encoder=Encoder(embeddings),
    answer_encoder=Encoder(embeddings)
).to(DEVICE)

optimizer = optim.Adam(model.parameters())

trainer = ModelTrainer(model, optimizer)

fit(trainer, train_iter, epochs_count=30, val_iter=test_iter)

**Задание** Есть также вариант с semi-hard negatives - когда в качестве отрицательного примера берется наилучший среди тех, чья similarity меньше similarity вопроса с положительным примером. Попробуйте реализовать его.

# Болталка

Чтобы реализовать болталку, нужен нормальный корпус с диалогами. Например, OpenSubtitles.

In [0]:
!head train.txt

Ну, примерно нормальный.

Считаем датасет.

In [0]:
from nltk import wordpunct_tokenize

def read_dataset(path):
    """Read a tab-separated dialogue file into (query_tokens, response_tokens) pairs.

    Every non-blank line must contain exactly two tab-separated fields, the
    query and the response. Both are tokenized with `wordpunct_tokenize`.

    Args:
        path: path to the text file (read as UTF-8; assumed to be the corpus
            encoding, confirm if the data comes from elsewhere).

    Returns:
        list of (query_tokens, response_tokens) tuples in file order.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    data = []
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, encoding='utf-8') as f:
        for line_number, line in enumerate(tqdm(f), start=1):
            line = line.strip()
            if not line:
                # Blank lines carry no dialogue pair.
                continue

            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    '{}:{}: expected 2 tab-separated fields, got {}'.format(
                        path, line_number, len(fields)
                    )
                )

            query, response = fields
            data.append((
                wordpunct_tokenize(query.strip()),
                wordpunct_tokenize(response.strip())
            ))
    return data

train_data = read_dataset('train.txt')
val_data = read_dataset('valid.txt')
test_data = read_dataset('test.txt')

In [0]:
from torchtext.data import Field, Example, Dataset, BucketIterator

# Separate fields, and therefore separate vocabularies, for queries and responses.
query_field = Field(lower=True)
response_field = Field(lower=True)

fields = [('query', query_field), ('response', response_field)]

# Each example is a (query_tokens, response_tokens) pair, matched to `fields` by position.
train_dataset = Dataset([Example.fromlist(example, fields) for example in train_data], fields)
val_dataset = Dataset([Example.fromlist(example, fields) for example in val_data], fields)
test_dataset = Dataset([Example.fromlist(example, fields) for example in test_data], fields)

# Vocabularies are built from the training split only.
query_field.build_vocab(train_dataset, min_freq=5)
response_field.build_vocab(train_dataset, min_freq=5)

print('Query vocab size =', len(query_field.vocab))
print('Response vocab size =', len(response_field.vocab))

# One iterator per split; the batch sizes are given in the same order as the datasets.
train_iter, val_iter, test_iter = BucketIterator.splits(
    datasets=(train_dataset, val_dataset, test_dataset), batch_sizes=(512, 1024, 1024), 
    shuffle=True, device=DEVICE, sort=False
)

**Задание** Реализовать болталку по аналогии с тем, что уже написали.

# Дополнительные материалы

## Статьи
Learning Deep Structured Semantic Models for Web Search using Clickthrough Data, 2013 [[pdf]](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/cikm2013_DSSM_fullversion.pdf)  
Deep Learning and Continuous Representations for Natural Language Processing, Microsoft tutorial [[pdf]](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/NAACL-HLT-2015_tutorial.pdf)

## Блоги
[Neural conversational models: как научить нейронную сеть светской беседе](https://habr.com/company/yandex/blog/333912/)  
[Искусственный интеллект в поиске. Как Яндекс научился применять нейронные сети, чтобы искать по смыслу, а не по словам](https://habr.com/company/yandex/blog/314222/)  
[Triplet loss, Olivier Moindrot](https://omoindrot.github.io/triplet-loss)

# Сдача

[Форма для сдачи](https://goo.gl/forms/bf2auPe8FL5C0jzp2)  
[Feedback](https://goo.gl/forms/9aizSzOUrx7EvGlG3)