<a href="https://colab.research.google.com/github/respect5716/deep-learning-paper-implementation/blob/main/03_NLP/DPR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DPR

## 0. Info

### Paper
* Title: Dense Passage Retrieval for Open-Domain Question Answering
* Author: Vladimir Karpukhin et al.
* URL: https://arxiv.org/abs/2004.04906

### Features
* Dataset: KLUE MRC + KorQuAD 1.0


### Reference
* https://github.com/facebookresearch/DPR

## 1. Setup

In [1]:
!pip install -q transformers datasets

[K     |████████████████████████████████| 3.4 MB 14.7 MB/s 
[K     |████████████████████████████████| 311 kB 78.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 66.0 MB/s 
[K     |████████████████████████████████| 596 kB 71.9 MB/s 
[K     |████████████████████████████████| 895 kB 61.1 MB/s 
[K     |████████████████████████████████| 67 kB 5.7 MB/s 
[K     |████████████████████████████████| 133 kB 77.9 MB/s 
[K     |████████████████████████████████| 243 kB 66.1 MB/s 
[K     |████████████████████████████████| 1.1 MB 61.4 MB/s 
[K     |████████████████████████████████| 271 kB 77.4 MB/s 
[K     |████████████████████████████████| 144 kB 66.0 MB/s 
[K     |████████████████████████████████| 94 kB 3.4 MB/s 
[?25h

In [1]:
import easydict
from tqdm.auto import tqdm

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

In [2]:
# All tunable settings live in one attribute-style dict that later cells only read.
config = easydict.EasyDict(
    model_name_or_path = 'klue/roberta-small',  # checkpoint both encoders start from

    batch_size = 16,
    qry_max_length = 256,   # questions are padded/truncated to this many tokens
    doc_max_length = 512,   # passages are padded/truncated to this many tokens

    lr = 1e-5,
    num_epochs = 3,

    seed = 42,              # single seed shared by every RNG seeded below
)

# Seed the RNGs this notebook draws from so a fresh "Restart & Run All" repeats the
# same question sampling (NumPy) and the same shuffling/dropout (PyTorch).
# GPU kernels may still introduce small run-to-run differences.
np.random.seed(config.seed)
torch.manual_seed(config.seed)

## 2. Data

In [3]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that yields one (question, passage) pair per row.

    ``data`` is indexed positionally through ``.iloc``. Each row must provide a
    ``'context'`` entry (the passage) and a ``'question'`` entry holding the
    candidate questions for that passage.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        passage = row['context']
        # One question is drawn at random on every access, so the question
        # paired with a given passage may change from one epoch to the next.
        return np.random.choice(row['question']), passage
    
def collate_fn(batch, tokenizer, config):
    """Tokenize a list of (question, passage) pairs into two fixed-length batches.

    Args:
        batch: Iterable of ``(question, passage)`` string pairs.
        tokenizer: Callable tokenizer; invoked once for the questions and once
            for the passages.
        config: Object exposing ``qry_max_length`` and ``doc_max_length``.

    Returns:
        ``(qry, doc)``: the tokenizer outputs for questions and passages, each
        padded/truncated to its respective maximum length.
    """
    questions, passages = map(list, zip(*batch))
    # Options common to both calls; only the maximum length differs per side.
    shared = dict(padding='max_length', truncation=True, return_tensors='pt')
    qry = tokenizer(questions, max_length=config.qry_max_length, **shared)
    doc = tokenizer(passages, max_length=config.doc_max_length, **shared)
    return qry, doc

In [4]:
# Load the tokenizer that belongs to the checkpoint named in the config.
tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path)

In [5]:
# Pull the training splits of both corpora as DataFrames.
klue = load_dataset('klue', 'mrc')['train'].to_pandas()
korquad = load_dataset('squad_kor_v1')['train'].to_pandas()

# Stack the (context, question) columns of both corpora, then collapse to one
# row per distinct passage whose 'question' cell lists every question asked about it.
qa_frames = [frame[['context', 'question']] for frame in (klue, korquad)]
train_data = (
    pd.concat(qa_frames, ignore_index=True)
    .groupby('context')['question']
    .apply(list)
    .reset_index()
)
train_data.shape

Reusing dataset klue (/root/.cache/huggingface/datasets/klue/mrc/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset squad_kor_v1 (/root/.cache/huggingface/datasets/squad_kor_v1/squad_kor_v1/1.0.0/18d4f44736b8ee85671f63cb84965bfb583fa0a4ff2df3c2e10eee9693796725)


  0%|          | 0/2 [00:00<?, ?it/s]

(22661, 2)

In [6]:
# Wrap the grouped frame in the Dataset and batch it with shuffling.
# The lambda binds `tokenizer` and `config` so DataLoader can call collate_fn with one argument.
train_dataset = Dataset(train_data)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=lambda x: collate_fn(x, tokenizer, config))

# Sanity check: fetch a single batch and show the shapes of the token-id tensors.
qry, doc = next(iter(train_loader))
qry.input_ids.size(), doc.input_ids.size()

(torch.Size([16, 256]), torch.Size([16, 512]))

## 3. Model

In [7]:
class DPR(nn.Module):
    """Bi-encoder with separate question and passage encoders.

    Both encoders are loaded from ``config.model_name_or_path`` as independent
    modules, so their weights are not shared.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        checkpoint = config.model_name_or_path
        self.qry_encoder = AutoModel.from_pretrained(checkpoint)
        self.doc_encoder = AutoModel.from_pretrained(checkpoint)

    def forward(self, qry, doc):
        """Encode a batch of questions and a batch of passages.

        Args:
            qry: Keyword arguments forwarded to the question encoder.
            doc: Keyword arguments forwarded to the passage encoder.

        Returns:
            ``(qry_emb, doc_emb)``: for each side, the slice at index 0 of the
            second axis of the encoder's ``last_hidden_state``.
        """
        qry_hidden = self.qry_encoder(**qry).last_hidden_state
        doc_hidden = self.doc_encoder(**doc).last_hidden_state
        # Index 0 on the second axis is used as the pooled representation of each input.
        return qry_hidden[:, 0], doc_hidden[:, 0]

In [8]:
# Build the bi-encoder and a single Adam optimizer covering the parameters of both encoders.
model = DPR(config)
optim = torch.optim.Adam(model.parameters(), lr=config.lr)

# Move the model to the GPU and enable training mode; assigning the result to `_`
# keeps the cell from displaying the module.
_ = model.cuda().train()

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

## 4. Train

In [9]:
# Contrastive training with in-batch negatives: every question is scored against
# every passage of the same batch, and its own passage is the target class.
for ep in range(config.num_epochs):
    losses = 0.  # running sum of batch losses for this epoch
    pbar = tqdm(train_loader)
    for qry, doc in pbar:
        qry, doc = qry.to('cuda'), doc.to('cuda')
        qry_embed, doc_embed = model(qry, doc)
        # logits[i, j] = dot product between question i and passage j of the batch.
        logits = torch.matmul(qry_embed, doc_embed.transpose(-1, -2))
        # Question i belongs with passage i, so the correct class is the diagonal index.
        labels = torch.arange(qry_embed.size(0)).to('cuda')
        loss = F.cross_entropy(logits, labels)
        
        optim.zero_grad()
        loss.backward()
        optim.step()
        
        losses += loss.item()
        pbar.set_postfix({'loss': loss.item()})
    
    # Report the mean batch loss of the epoch.
    losses /= len(train_loader)
    print(f'ep {ep:02d} | loss {losses:.3f}')

  0%|          | 0/1417 [00:00<?, ?it/s]

ep 00 | loss 0.151


  0%|          | 0/1417 [00:00<?, ?it/s]

ep 01 | loss 0.099


  0%|          | 0/1417 [00:00<?, ?it/s]

ep 02 | loss 0.076


In [10]:
# Persist the tokenizer and each trained encoder under the local 'transformers' directory,
# with one sub-directory per encoder.
tokenizer.save_pretrained('transformers')
model.doc_encoder.save_pretrained('transformers/doc_encoder')
model.qry_encoder.save_pretrained('transformers/qry_encoder')

## 5. Eval

In [3]:
def get_embeddings(texts, model, tokenizer, max_length, batch_size=32):
    """Encode ``texts`` in mini-batches and return one embedding per text.

    The encoder is switched to eval mode for the duration of the call (so dropout
    cannot perturb the embeddings) and its previous train/eval mode is restored
    afterwards, even if encoding fails.

    Args:
        texts: Iterable of strings to embed (list, pandas Series, array, ...).
        model: ``nn.Module`` encoder exposing ``.device`` whose output has a
            ``last_hidden_state`` attribute.
        tokenizer: Callable tokenizer matching ``model``; its output must
            support ``.to(device)`` and ``**`` unpacking.
        max_length: Every text is padded/truncated to this many tokens.
        batch_size: Number of texts per forward pass.

    Returns:
        A CPU tensor with ``len(texts)`` rows; row ``i`` is the slice at index 0
        of the second axis of ``last_hidden_state`` for ``texts[i]``. For empty
        input an empty tensor with 0 rows is returned (its width is
        ``model.config.hidden_size`` when available, otherwise 0).
    """
    # The tokenizer is given plain list slices; materialising also accepts non-list iterables.
    texts = list(texts)
    if not texts:
        # torch.cat raises on an empty list, so short-circuit with an empty result.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return torch.empty((0, hidden_size))

    was_training = model.training
    model.eval()
    embeddings = []
    try:
        with torch.no_grad():
            for start in tqdm(range(0, len(texts), batch_size)):
                batch = texts[start:start + batch_size]
                inputs = tokenizer(batch, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
                inputs = inputs.to(model.device)
                out = model(**inputs)
                # Move each batch off the accelerator right away to keep device memory flat.
                embeddings.append(out.last_hidden_state[:, 0].cpu())
    finally:
        # Leave the encoder in whatever mode the caller had it in.
        model.train(was_training)
    return torch.cat(embeddings, dim=0)

In [4]:
# Reload the tokenizer and both encoders from the local 'transformers' directory
# and place the encoders on the GPU.
tokenizer = AutoTokenizer.from_pretrained('transformers')
doc_encoder = AutoModel.from_pretrained('transformers/doc_encoder').cuda()
qry_encoder = AutoModel.from_pretrained('transformers/qry_encoder').cuda()

In [5]:
# Pull the validation splits of both corpora as DataFrames.
klue = load_dataset('klue', 'mrc')['validation'].to_pandas()
korquad = load_dataset('squad_kor_v1')['validation'].to_pandas()

# One row per (context, question) pair, with both corpora stacked under a fresh index.
eval_data = pd.concat(
    [frame.loc[:, ['context', 'question']] for frame in (klue, korquad)],
    ignore_index=True,
)
eval_data.shape

Reusing dataset klue (/root/.cache/huggingface/datasets/klue/mrc/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)


  0%|          | 0/2 [00:00<?, ?it/s]

Reusing dataset squad_kor_v1 (/root/.cache/huggingface/datasets/squad_kor_v1/squad_kor_v1/1.0.0/18d4f44736b8ee85671f63cb84965bfb583fa0a4ff2df3c2e10eee9693796725)


  0%|          | 0/2 [00:00<?, ?it/s]

(11615, 2)

In [6]:
# Every question is a query; passages are de-duplicated (first occurrence order)
# so that each distinct passage receives exactly one integer id.
qry_set = eval_data['question'].tolist()
doc_set = pd.unique(eval_data['context']).tolist()
doc2id = dict(zip(doc_set, range(len(doc_set))))

In [7]:
# Embed every evaluation question with the question encoder.
qry_embeddings = get_embeddings(qry_set, qry_encoder, tokenizer, config.qry_max_length)

  0%|          | 0/363 [00:00<?, ?it/s]

In [8]:
# Embed every distinct evaluation passage with the passage encoder.
doc_embeddings = get_embeddings(doc_set, doc_encoder, tokenizer, config.doc_max_length)

  0%|          | 0/196 [00:00<?, ?it/s]

In [9]:
# Gold passage id for each evaluation row (same row order as the question list).
labels = eval_data['context'].map(doc2id).tolist()
# score[i, j] is the dot product between question i and passage j.
score = torch.matmul(qry_embeddings, doc_embeddings.transpose(-1, -2))
# Keep only the indices of the 5 highest-scoring passages per question.
_, topk = torch.topk(score, k=5, dim=-1)
topk = topk.tolist()

In [10]:
# Hit rate @k: fraction of questions whose gold passage id appears among the
# first k retrieved passage ids.
num_queries = len(labels)
hr1 = sum(true in pred[:1] for true, pred in zip(labels, topk)) / num_queries
hr5 = sum(true in pred[:5] for true, pred in zip(labels, topk)) / num_queries

print(f'hr1 {hr1:.3f} | hr5 {hr5:.3f}')

hr1 0.539 | hr5 0.783
