In [2]:
# # !mkdir data
# # !mkdir data/clariq
# !cd data/clariq; wget https://github.com/aliannejadi/ClariQ/raw/master/data/dev.tsv
# !cd data/clariq; wget https://github.com/aliannejadi/ClariQ/raw/master/data/train.tsv
# !cd data/clariq; wget https://github.com/aliannejadi/ClariQ/raw/master/data/question_bank.tsv
# !mv data/clariq/train.tsv data/clariq/train_original.tsv

In [3]:
import pandas as pd
data_path = "./data/"

train = pd.read_csv(data_path+"clariq/train_original.tsv", sep="\t")
valid = pd.read_csv(data_path+"clariq/dev.tsv", sep="\t")

train = train[["initial_request", "question"]]
train.columns = ["query", "clarifying_question"]
train = train[~train["clarifying_question"].isnull()]

valid = valid[["initial_request", "question"]]
valid.columns = ["query", "clarifying_question"]
valid = valid[~valid["clarifying_question"].isnull()]

train.to_csv(data_path+"clariq/train.tsv", sep="\t", index=False)
valid.to_csv(data_path+"clariq/valid.tsv", sep="\t", index=False)

In [4]:
# For transformer-rankers we only need a pandas DF with query (here the initial request) 
# and relevant documents (here the clarifying questions).
train.head()

Unnamed: 0,query,clarifying_question
0,Tell me about Obama family tree.,are you interested in seeing barack obamas family
1,Tell me about Obama family tree.,would you like to know barack obamas geneology
2,Tell me about Obama family tree.,would you like to know about obamas ancestors
3,Tell me about Obama family tree.,would you like to know who is currently alive ...
4,Tell me about Obama family tree.,are you looking for biological information on ...


In [5]:
# We will sample negative samples for training using the question bank
question_bank = pd.read_csv(data_path+"clariq/question_bank.tsv", sep="\t")
question_bank.head()

Unnamed: 0,question_id,question
0,Q00001,
1,Q00002,a total cholesterol of 180 to 200 mgdl 10 to 1...
2,Q00003,about how many years experience do you want th...
3,Q00004,according to anima the bible or what other source
4,Q00005,ae you looking for examples of septic system d...


In [9]:
import wandb
wandb.init()

2020-12-15 12:13:59,616 [ERROR] Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable


[34m[1mwandb[0m: Wandb version 0.10.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


2020-12-15 12:13:59,738 [INFO] system metrics and metadata threads started
2020-12-15 12:13:59,741 [INFO] checking resume status, waiting at most 10 seconds
2020-12-15 12:13:59,903 [INFO] resuming run from id: UnVuOnYxOjEweW9iMXN1OnRyYW5zZm9ybWVyX3JhbmtlcnMtbm90ZWJvb2tzOnZlbmRp
2020-12-15 12:13:59,916 [INFO] upserting run before process can begin, waiting at most 10 seconds
2020-12-15 12:14:00,103 [INFO] saving pip packages
2020-12-15 12:14:00,110 [INFO] initializing streaming files api
2020-12-15 12:14:00,111 [INFO] unblocking file change observer, beginning sync with W&B servers


W&B Run: https://app.wandb.ai/vendi/transformer_rankers-notebooks/runs/10yob1su

2020-12-15 12:14:00,120 [INFO] shutting down system stats and metadata service
2020-12-15 12:14:00,250 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/config.yaml
2020-12-15 12:14:00,427 [INFO] file/dir created: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl
2020-12-15 12:14:00,437 [INFO] file/dir created: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json
2020-12-15 12:14:00,439 [INFO] file/dir created: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-summary.json
2020-12-15 12:14:00,443 [INFO] file/dir created: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-history.jsonl
2020-12-15 12:14:00,450 [INFO] file/dir created: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/requirements.txt
2020-12-15 12:14:00,743 [INFO] stopping strea

In [10]:
from transformer_rankers.trainers import transformer_trainer
from transformer_rankers.datasets import dataset
from transformer_rankers.negative_samplers import negative_sampling
from transformer_rankers.eval import results_analyses_tools

from transformers import BertTokenizer, BertForSequenceClassification

import logging
import os
import sys
import torch
import random
import numpy as np

np.random.seed(42)
random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

logging.basicConfig(
  level=logging.INFO,
  format="%(asctime)s [%(levelname)s] %(message)s",
  handlers=[
      logging.StreamHandler(sys.stdout)
  ]
)

#The combination of query and question are not that big.
max_seq_len = 50

#Lets use an almost balanced amount of positive and negative samples during training.
average_relevant_per_query = train.groupby("query").count().mean().values[0]

#Instantiate BM25 negative sampler.
ns_train = negative_sampling.BM25NegativeSamplerPyserini(list(question_bank["question"].values[1:]), int(average_relevant_per_query) , 
                    "./data/clariq/anserini_train/", -1, "./anserini/")
ns_val = negative_sampling.BM25NegativeSamplerPyserini(list(question_bank["question"].values[1:]), int(average_relevant_per_query), 
                    "./data/clariq/anserini_train/", -1, "./anserini/")

# We could also use random sampling which does not require Anserini.
# ns_train = negative_sampling.RandomNegativeSampler(list(question_bank["question"].values[1:]), int(average_relevant_per_query))
# ns_val = negative_sampling.RandomNegativeSampler(list(question_bank["question"].values[1:]), int(average_relevant_per_query))

#Create the loaders for the dataset, with the respective negative samplers
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataloader = dataset.QueryDocumentDataLoader(train_df=train,
                    val_df=valid, test_df=valid,
                    tokenizer=tokenizer, negative_sampler_train=ns_train,
                    negative_sampler_val=ns_val, task_type='classification',
                    train_batch_size=12, val_batch_size=12, max_seq_len=max_seq_len,
                    sample_data=-1, cache_path="./data/clariq/")

train_loader, val_loader, test_loader = dataloader.\
  get_pytorch_dataloaders()

#Use BERT (any model that has SequenceClassification class from HuggingFace would work here)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

#Instantiate trainer that handles fitting.
trainer = transformer_trainer.TransformerTrainer(model=model,
  train_loader=train_loader,
  val_loader=val_loader, test_loader=test_loader,
  num_ns_eval=int(average_relevant_per_query), task_type="classification", tokenizer=tokenizer,
  validate_every_epochs=1, num_validation_batches=-1,
  num_epochs=1, lr=5e-7, sacred_ex=None)

#Train (our validation eval uses the NS sampling procedure)
trainer.fit()

2020-12-15 12:14:03,527 [INFO] Train instances per batch 48
2020-12-15 12:14:03,562 [INFO] Loading instances from ./data/clariq//pointwise_set_train_n_cand_docs_45_ns_sampler_BM25NS_seq_max_l_50_sample_-1_for_classification_using_BertTokenizer
2020-12-15 12:14:03,894 [INFO] Total of 16981 instances were cached.
2020-12-15 12:14:03,906 [INFO] Loading instances from ./data/clariq//pointwise_set_val_n_cand_docs_45_ns_sampler_BM25NS_seq_max_l_50_sample_-1_for_classification_using_BertTokenizer
2020-12-15 12:14:04,060 [INFO] Total of 2025 instances were cached.
2020-12-15 12:14:04,072 [INFO] Loading instances from ./data/clariq//pointwise_set_test_n_cand_docs_45_ns_sampler_BM25NS_seq_max_l_50_sample_-1_for_classification_using_BertTokenizer
2020-12-15 12:14:04,086 [INFO] Total of 2025 instances were cached.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

2020-12-15 12:14:08,017 [INFO] Device cuda
2020-12-15 12:14:08,017 [INFO] Num GPU 4
2020-12-15 12:14:08,095 [INFO] Total batches per epoch : 354
2020-12-15 12:14:08,095 [INFO] Validating every 1 epoch.


Epoch 0, steps:   0%|          | 0/354 [00:00<?, ?it/s]

2020-12-15 12:14:12,545 [ERROR] Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable


[34m[1mwandb[0m: Wandb version 0.10.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


2020-12-15 12:14:12,679 [INFO] system metrics and metadata threads started
2020-12-15 12:14:12,680 [INFO] checking resume status, waiting at most 10 seconds
2020-12-15 12:14:12,829 [INFO] resuming run from id: UnVuOnYxOjEweW9iMXN1OnRyYW5zZm9ybWVyX3JhbmtlcnMtbm90ZWJvb2tzOnZlbmRp
2020-12-15 12:14:12,840 [INFO] upserting run before process can begin, waiting at most 10 seconds
2020-12-15 12:14:13,012 [INFO] saving pip packages
2020-12-15 12:14:13,017 [INFO] initializing streaming files api
2020-12-15 12:14:13,019 [INFO] unblocking file change observer, beginning sync with W&B servers




2020-12-15 12:14:13,546 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-summary.json
2020-12-15 12:14:13,550 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/requirements.txt
2020-12-15 12:14:13,552 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/config.yaml
2020-12-15 12:14:13,732 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl
2020-12-15 12:14:13,736 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  11%|█         | 39/354 [00:21<02:11,  2.40it/s]

2020-12-15 12:14:29,571 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  21%|██        | 73/354 [00:35<01:54,  2.45it/s]

2020-12-15 12:14:43,597 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl


Epoch 0, steps:  22%|██▏       | 78/354 [00:37<01:54,  2.42it/s]

2020-12-15 12:14:45,600 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  33%|███▎      | 117/354 [00:53<01:37,  2.43it/s]

2020-12-15 12:15:01,639 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  42%|████▏     | 150/354 [01:07<01:29,  2.28it/s]

2020-12-15 12:15:15,780 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl


Epoch 0, steps:  44%|████▍     | 155/354 [01:09<01:21,  2.43it/s]

2020-12-15 12:15:17,783 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  55%|█████▍    | 194/354 [01:25<01:05,  2.46it/s]

2020-12-15 12:15:33,810 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  64%|██████▍   | 228/354 [01:39<00:51,  2.45it/s]

2020-12-15 12:15:47,833 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl


Epoch 0, steps:  66%|██████▌   | 233/354 [01:41<00:50,  2.42it/s]

2020-12-15 12:15:49,836 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  77%|███████▋  | 271/354 [01:57<00:34,  2.41it/s]

2020-12-15 12:16:05,927 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  86%|████████▌ | 305/354 [02:11<00:20,  2.42it/s]

2020-12-15 12:16:20,046 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl


Epoch 0, steps:  88%|████████▊ | 310/354 [02:13<00:19,  2.26it/s]

2020-12-15 12:16:22,049 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  99%|█████████▊| 349/354 [02:29<00:02,  2.45it/s]

2020-12-15 12:16:38,147 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps: 100%|██████████| 354/354 [02:31<00:00,  2.33it/s]
Predicting:  59%|█████▊    | 99/169 [00:12<00:08,  8.18it/s]

2020-12-15 12:16:52,195 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl


Predicting:  69%|██████▊   | 116/169 [00:14<00:06,  8.23it/s]

2020-12-15 12:16:54,203 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Predicting: 100%|██████████| 169/169 [00:20<00:00,  8.14it/s]

2020-12-15 12:17:00,693 [INFO] shutting down system stats and metadata service





2020-12-15 12:17:01,223 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-summary.json
2020-12-15 12:17:01,227 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-history.jsonl
2020-12-15 12:17:01,230 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl
2020-12-15 12:17:01,241 [INFO] stopping streaming files and file change observer
2020-12-15 12:17:02,226 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


In [11]:
rerank_top_k = 30
# Imports required packages, defines stem & tokenizez function
import pandas as pd
from rank_bm25 import BM25Okapi
import nltk
from nltk.stem.porter import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')

def stem_tokenize(text, remove_stopwords=True):
  stemmer = PorterStemmer()
  tokens = [word for sent in nltk.sent_tokenize(text) \
                                      for word in nltk.word_tokenize(sent)]
  tokens = [word for word in tokens if word not in \
          nltk.corpus.stopwords.words('english')]
  return [stemmer.stem(word) for word in tokens]

# Files paths
request_file_path = './ClariQ-repo/data/dev.tsv'
question_bank_path = './ClariQ-repo/data/question_bank.tsv'
run_file_path = './ClariQ-repo/sample_runs/dev_bm25'

# Reads files and build bm25 corpus (index)
dev = pd.read_csv(request_file_path, sep='\t')
question_bank = pd.read_csv(question_bank_path, sep='\t').fillna('')
question_bank['tokenized_question_list'] = question_bank['question'].map(stem_tokenize)
question_bank['tokenized_question_str'] = question_bank['tokenized_question_list'].map(lambda x: ' '.join(x))
bm25_corpus = question_bank['tokenized_question_list'].tolist()
bm25 = BM25Okapi(bm25_corpus)

# Runs bm25 for every query and stores output in file.
examples = []
all_preds_bm25 = []
with open(run_file_path, 'w') as fo:
  for tid in dev['topic_id'].unique():
    query = dev.loc[dev['topic_id']==tid, 'initial_request'].tolist()[0]
    bm25_ranked_list = bm25.get_top_n(stem_tokenize(query, True), 
                                    bm25_corpus, 
                                    n=rerank_top_k)
    bm25_q_list = [' '.join(sent) for sent in bm25_ranked_list]
    docs = question_bank.set_index('tokenized_question_str').loc[bm25_q_list, 'question'].tolist()
    preds = question_bank.set_index('tokenized_question_str').loc[bm25_q_list, 'question_id'].tolist()
    all_preds_bm25.append(preds)
    for doc in docs[:rerank_top_k]:
      examples.append((query, doc))

[nltk_data] Downloading package punkt to /home/svakule/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/svakule/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
from transformers.data.data_collator import default_data_collator
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset, DataLoader
from transformer_rankers.utils import utils

class SimpleDataset(Dataset):
    def __init__(self, features):
        self.features = features
    def __len__(self):
        return len(self.features)
    def __getitem__(self, index):
        return self.features[index]

batch_encoding = tokenizer.batch_encode_plus(examples, 
                max_length=max_seq_len, pad_to_max_length=True)
features = []
for i in range(len(examples)):
    inputs = {k: batch_encoding[k][i] for k in batch_encoding}
    feature = InputFeatures(**inputs, label=0)
    features.append(feature)

dataset = SimpleDataset(features)
dataloader = DataLoader(dataset, batch_size=16, shuffle=False, collate_fn=default_data_collator)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
logits, _, softmax_output = trainer.predict(dataloader)
softmax_output_by_query = utils.acumulate_list(softmax_output[0], rerank_top_k)

Predicting: 100%|██████████| 94/94 [00:11<00:00,  7.97it/s]


In [14]:
import numpy as np
run_file_path = './ClariQ-repo/sample_runs/dev_BERT-reranker'
with open(run_file_path, 'w') as fo:
  for tid_idx, tid in enumerate(dev['topic_id'].unique()):
    document_scores = np.array(softmax_output_by_query[tid_idx])
    top_k_scores_idx = (-document_scores).argsort()[:rerank_top_k]  
    preds = np.array(all_preds_bm25[tid_idx])[top_k_scores_idx]
    for i, qid in enumerate(preds):
      fo.write('{} 0 {} {} {} BERT-reranker\n'.format(tid, qid, i, len(preds)-i))

In [15]:
# Report question relevance performance
! python ./ClariQ-repo/src/clariq_eval_tool.py  --eval_task question_relevance\
                                                --data_dir ./ClariQ-repo/data/ \
                                                --experiment_type dev \
                                                --run_file {run_file_path} \
                                                --out_file {run_file_path}_question_relevance.eval

Recall5: 0.10574913811678517
Recall10: 0.22855252384354552
Recall20: 0.38955943024983275
Recall30: 0.6912818698329535


In [16]:
! python ./ClariQ-repo/src/clariq_eval_tool.py  --eval_task document_relevance\
                                                --data_dir ./ClariQ-repo/data/ \
                                                --experiment_type dev \
                                                --run_file {run_file_path} \
                                                --out_file {run_file_path}.eval

Traceback (most recent call last):
  File "./ClariQ-repo/src/clariq_eval_tool.py", line 274, in <module>
    main()
  File "./ClariQ-repo/src/clariq_eval_tool.py", line 267, in main
    input_args.out_file, input_args.multi_turn, input_args.leaderboard)
  File "./ClariQ-repo/src/clariq_eval_tool.py", line 101, in evaluate_document_relevance
    return evaluate_document_relevance_single_turn(experiment_type, data_dir, run_file, out_file, leaderboard)
  File "./ClariQ-repo/src/clariq_eval_tool.py", line 45, in evaluate_document_relevance_single_turn
    eval_dict = load_eval_dict(eval_file_path, topic_file_path)
TypeError: load_eval_dict() missing 1 required positional argument: 'multi_turn'
