In [1]:
# Reload edited project modules automatically so changes under `src/` are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Turn off jedi in the IPython tab completer.
%config Completer.use_jedi = False
# Move to the parent directory so the relative `./data` and `src` paths used below resolve.
# NOTE(review): this is a relative `cd`; re-running the cell moves up one more level each time.
%cd ../

/workspace


# Import Modules

In [15]:
import json
import os
from collections import OrderedDict, defaultdict
from functools import partial

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from joblib import Parallel, delayed
from pyserini.search.lucene import LuceneSearcher
from sklearn.model_selection import train_test_split
from torch.utils.data import ConcatDataset, DataLoader, Subset
from tqdm.auto import tqdm
from transformers import AutoConfig, AutoModel, AutoTokenizer

from src.datasets import Dataset, bert_collate_fn
from src.metrics import get_mrr
from src.monobert.models import MonoBERT

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load Data

In [3]:
# Locations of the raw train/test JSON dumps.
train_data_path = "./data/train.json"
test_data_path = "./data/test_data.json"

# Open both dumps as UTF-8 explicitly: without `encoding=` Python falls back to the
# locale's preferred encoding, which is not guaranteed to be UTF-8 on every machine.
# This also matches the `encoding="utf8"` used for every other file in this notebook.
with open(train_data_path, "r", encoding="utf8") as f1, open(
    test_data_path, "r", encoding="utf8"
) as f2:
    train_data = json.load(f1)
    test_data = json.load(f2)

# Test questions and the submission template are shipped as CSV files.
test_question = pd.read_csv("./data/test_questions.csv", encoding="utf8")
sample = pd.read_csv("./data/sample_submission.csv", encoding="utf8")

# Number of top-level entries in each dump's "data" list.
print(f"# of train data: {len(train_data['data']):,}")
print(f"# of test data: {len(test_data['data']):,}")

# of train data: 1,862
# of test data: 827


In [17]:
def preprocess_data(
    data: Dict[str, Any],
    return_query_to_docs: bool = True,
) -> Tuple[Dict[str, Any], ...]:
    """Flatten a nested ``data -> paragraphs -> qas`` dump into lookup tables.

    Args:
        data: Mapping with a ``"data"`` list. Each entry has a ``"paragraphs"``
            list whose items carry ``"paragraph_id"`` and ``"context"`` and,
            when ``return_query_to_docs`` is true, a ``"qas"`` list of records
            with ``"question_id"`` and ``"question"``.
        return_query_to_docs: When true, questions are collected and the
            question -> paragraphs mapping is appended to the result. When
            false, ``"qas"`` is never read, so dumps without it are accepted.

    Returns:
        ``(queries, docs)`` when ``return_query_to_docs`` is false, otherwise
        ``(queries, docs, query_to_docs)``:

        * ``queries``: ``OrderedDict`` of question id -> question text
          (left empty when ``return_query_to_docs`` is false).
        * ``docs``: ``OrderedDict`` of paragraph id -> context text.
        * ``query_to_docs``: ``defaultdict(list)`` of question id -> list of
          the paragraph ids the question appears under.
    """
    queries = OrderedDict()
    docs = OrderedDict()
    query_to_docs = defaultdict(list)

    for d in data["data"]:
        for paragraph in d["paragraphs"]:
            if return_query_to_docs:
                for q in paragraph["qas"]:
                    queries[q["question_id"]] = q["question"]
                    query_to_docs[q["question_id"]].append(paragraph["paragraph_id"])
            docs[paragraph["paragraph_id"]] = paragraph["context"]

    # The tuple length depends on the flag, so callers unpack two or three values.
    ret = (queries, docs)
    if return_query_to_docs:
        ret += (query_to_docs,)
    return ret


# Training split: question texts, paragraph texts and the question -> paragraph links.
train_queries, train_docs, train_query_to_docs = preprocess_data(
    train_data, return_query_to_docs=True
)
# Test split: only the paragraphs come from the JSON dump; the questions come from the CSV.
_, test_docs = preprocess_data(test_data, return_query_to_docs=False)
test_queries = {
    question_id: question_text
    for question_id, question_text in zip(
        test_question["question_id"], test_question["question_text"]
    )
}

print(f"# of queries: {len(train_queries):,}")
print(f"# of documents: {len(train_docs):,}")

# of queries: 233,121
# of documents: 137,335


In [5]:
# Character counts (not token counts) of every training question and paragraph.
query_lengths = np.array([len(text) for text in train_queries.values()])
doc_lengths = np.array([len(text) for text in train_docs.values()])

# Same four summary lines for both collections, separated by a rule.
for position, (label, lengths) in enumerate(
    (("query", query_lengths), ("document", doc_lengths))
):
    if position:
        print("-" * 40)
    print(f"Max length of {label}: {lengths.max():,}")
    print(f"Min length of {label}: {lengths.min():,}")
    print(f"Avg. length of {label}: {lengths.mean():.2f}")
    print(f"Std. length of {label}: {lengths.std():.2f}")

Max length of query: 176
Min length of query: 6
Avg. length of query: 41.20
Std. length of query: 12.99
----------------------------------------
Max length of document: 676
Min length of document: 353
Avg. length of document: 508.54
Std. length of document: 72.12


---

# BM25 Baseline

In [33]:
def get_contents(data):
    """Collect every paragraph of ``data`` as an ``{"id", "contents"}`` record.

    Walks ``data["data"][*]["paragraphs"][*]`` in order and returns a list with
    one dict per paragraph, where ``"id"`` is the paragraph's ``"paragraph_id"``
    and ``"contents"`` is its ``"context"``.
    """
    return [
        {"id": paragraph["paragraph_id"], "contents": paragraph["context"]}
        for entry in data["data"]
        for paragraph in entry["paragraphs"]
    ]


# (raw dump, output JSON path) pairs: one input document file per split. The
# directories written here are the `--input` folders of the indexing commands below.
data_list = [
    (train_data, "./data/index/train/doc.json"),
    (test_data, "./data/index/test/doc.json"),
]

for data, index_path in data_list:
    # One {"id", "contents"} record per paragraph.
    contents = get_contents(data)
    # Create the target folder on first run; a no-op when it already exists.
    os.makedirs(os.path.dirname(index_path), exist_ok=True)
    with open(index_path, "w", encoding="utf8") as f:
        json.dump(contents, f)

In [None]:
# Build the Lucene index for the training paragraphs: reads the JSON documents under
# data/index/train/ and writes the index to data/train.index (`--language ko`, 16 threads).
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input data/index/train/ \
  --language ko \
  --index data/train.index \
  --generator DefaultLuceneDocumentGenerator \
  --threads 16 \
  --storePositions --storeDocvectors --storeRaw

In [None]:
# Build the Lucene index for the test paragraphs: reads the JSON documents under
# data/index/test/ and writes the index to data/test.index (`--language ko`, 16 threads).
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input data/index/test/ \
  --language ko \
  --index data/test.index \
  --generator DefaultLuceneDocumentGenerator \
  --threads 16 \
  --storePositions --storeDocvectors --storeRaw

In [35]:
# Open the test-paragraph index built above and use the same language setting
# (`ko`) that was passed to the indexer.
searcher = LuceneSearcher("./data/test.index")
searcher.set_language("ko")

In [54]:
# For every test question, keep the ids of the 10 best hits returned by `searcher`,
# comma-joined into a single string per row.
answer = [
    ",".join(hit.docid for hit in searcher.search(question, k=10))
    for question in tqdm(test_question["question_text"])
]
# Fill the submission template in place and write it out.
sample["paragraph_id"] = answer
sample.to_csv("submission.csv", index=False)

  0%|          | 0/11434 [00:00<?, ?it/s]

---

# monoBERT

## First Stage Retrieval

In [23]:
def make_shard(
    queries: Dict[str, str],
    tsv_root_path: str,
    tsv_filename: str,
    shard_size: int = 10000,
) -> None:
    """Write the query texts to numbered, header-less, tab-separated shard files.

    Each row holds ``q_id`` and the query text, where ``q_id`` is the 0-based
    position of the query in ``queries`` (numbering continues across shards;
    the original dictionary keys are not written).

    Args:
        queries: Mapping whose values are the query strings, in output order.
        tsv_root_path: Directory that receives the shard files.
        tsv_filename: File name stem; shard ``n`` is saved as
            ``{tsv_filename}_{n:02d}.tsv``.
        shard_size: Maximum number of rows per shard file.
    """
    texts = list(queries.values())
    table = pd.DataFrame({"q_id": np.arange(len(texts)), "query": texts})
    # Ceiling division: the last shard may be shorter than the others.
    shard_count = (len(table) + shard_size - 1) // shard_size

    for shard_no in range(shard_count):
        out_path = os.path.join(tsv_root_path, f"{tsv_filename}_{shard_no:02d}.tsv")
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        start = shard_no * shard_size
        rows = table.iloc[start : start + shard_size]
        rows.to_csv(out_path, index=False, header=None, sep="\t")

In [27]:
# Integer row id -> original question id, in the same order `make_shard` numbers the rows.
train_query_id_i2s = dict(enumerate(train_queries.keys()))
make_shard(train_queries, "./data/top1000", "train_queries")

# The test questions are written as one single shard (shard size == number of questions).
test_query_id_i2s = dict(enumerate(test_queries.keys()))
make_shard(test_queries, "./data/top1000", "test_queries", len(test_queries))

In [None]:
# Run the project's first-stage retrieval script.
# NOTE(review): presumably this reads the query shards written above and produces the
# ./data/top1000/*_top1000_NN.txt files loaded below — confirm against the script.
!scripts/gen_top1000.sh

In [9]:
%%time
tsv_path = "./data/top1000/train_top1000_00.txt"
df = pd.read_csv(tsv_path, sep=" ", header=None)
df[0] = df[0].map(lambda x: query_id_i2s[x])
candidates0 = df.groupby(0)[2].apply(list).to_dict()

CPU times: user 19.8 s, sys: 1.35 s, total: 21.1 s
Wall time: 21.1 s


In [10]:
%%time
tsv_path = "./data/top1000/train_top1000_01.txt"
df = pd.read_csv(tsv_path, sep=" ", header=None)
df[0] = df[0].map(lambda x: query_id_i2s[x])
candidates1 = df.groupby(0)[2].apply(list).to_dict()

CPU times: user 19.6 s, sys: 1.35 s, total: 21 s
Wall time: 20.9 s


## Pretrained Language Model (PLM)

In [11]:
# Hugging Face checkpoint name shared by the re-ranker and its tokenizer.
pretrained_model_name = 'monologg/koelectra-base-v3-discriminator'
bert = MonoBERT(
    pretrained_model_name=pretrained_model_name,
    use_layernorm=True,
)

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

# Move the model to the GPU; binding the return value keeps the cell from echoing it.
_ = bert.cuda()

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## DataLoader

In [70]:
# Must match the shard size `make_shard` used for the training queries (its default).
shard_size = 10000

# Merge the candidate lists of the two shards that were loaded above.
candidates = {}
candidates.update(candidates0)
candidates.update(candidates1)

# Ordered question ids/texts. These were previously referenced without being defined at
# notebook level (they only existed as locals inside `make_shard`).
query_ids = list(train_queries.keys())
query_str = list(train_queries.values())

train_query_ids = []
train_query_str = []

# Keep only the queries belonging to the shards for which candidates were loaded.
for shard_idx in [0, 1]:
    shard = slice(shard_idx * shard_size, (shard_idx + 1) * shard_size)
    train_query_ids.extend(query_ids[shard])
    train_query_str.extend(query_str[shard])

train_query_ids = np.array(train_query_ids)
train_query_str = np.array(train_query_str)

dataset = Dataset(
    train_query_ids,
    train_query_str,
    train_docs,
    train_query_to_docs,
    candidates,
    num_neg=1,
    topk=50,
    is_training=True,
)

# Hold out 200 examples for validation. The seed is fixed so the split (and therefore
# the validation score) is reproducible across kernel restarts.
train_idx, valid_idx = train_test_split(
    np.arange(len(dataset)), test_size=200, random_state=42
)
train_dataset = Subset(dataset, train_idx)
valid_dataset = Subset(dataset, valid_idx)

In [73]:
# Both loaders batch with the same tokenizer settings, so build the collate callable once.
collate = partial(bert_collate_fn, tokenizer=tokenizer, max_length=512)

train_dataloader = DataLoader(
    train_dataset, batch_size=16, collate_fn=collate, num_workers=0
)

# Validation yields one dataset item per batch.
val_dataloader = DataLoader(
    valid_dataset, batch_size=1, collate_fn=collate, num_workers=8
)

In [74]:
# Switch the wrapped dataset to evaluation mode. `train_dataset` wraps the same
# `dataset` object, so it sees this flag change as well.
valid_dataset.dataset.is_training = False
# Number of ranked results kept per query.
topk = 10
# Size of the candidate prefix used when mapping indices back to paragraph ids;
# same value as the `topk=50` passed to `Dataset`.
topk_candidates = 50

bert.eval()
# One tensor of selected indices per validation item, in dataloader order.
indices_list = []
# `total` equals the number of batches because the validation loader uses batch_size=1.
for batch_x, batch_y in tqdm(val_dataloader, total=len(valid_dataset)):
    batch_x = {k: v.cuda() for k, v in batch_x.items()}
    with torch.no_grad():
        # NOTE(review): presumably one relevance score per candidate of this query —
        # confirm the output shape against MonoBERT.forward.
        outputs = bert(batch_x)
        scores, indices = torch.topk(outputs, k=topk)
        # `scores` is computed here but not used later in this cell.
        scores = scores.sigmoid().float().cpu()
        indices_list.append(indices.cpu())

  0%|          | 0/200 [00:01<?, ?it/s]



In [114]:
# Question ids of the validation items, in the order they were scored.
valid_query_ids = train_query_ids[valid_idx]

# Map each selected index back to a paragraph id within the query's candidate prefix.
predicted = {
    query_id: [candidates[query_id][:topk_candidates][idx] for idx in rank_indices]
    for query_id, rank_indices in zip(valid_query_ids, indices_list)
}
# Ground-truth paragraph ids for the same validation questions.
val_y_true = {query_id: train_query_to_docs[query_id] for query_id in valid_query_ids}

In [116]:
# Score the validation predictions against the ground truth with the project's `get_mrr`.
# NOTE(review): no training step appears before this cell in the visible notebook, so this
# presumably measures the not-yet-fine-tuned model — confirm.
mrr = get_mrr(val_y_true, predicted)
mrr

0.0345436507936508

## Inference