In [2]:
# One-time setup (kept commented out; uncomment to run): download the ClariQ
# dev/train/question-bank TSV files into data/clariq, then rename the
# downloaded train.tsv to train_original.tsv. The data preparation cell reads
# train_original.tsv and writes its own reduced train.tsv.
# # !mkdir data
# # !mkdir data/clariq
# !cd data/clariq; wget https://github.com/aliannejadi/ClariQ/raw/master/data/dev.tsv
# !cd data/clariq; wget https://github.com/aliannejadi/ClariQ/raw/master/data/train.tsv
# !cd data/clariq; wget https://github.com/aliannejadi/ClariQ/raw/master/data/question_bank.tsv
# !mv data/clariq/train.tsv data/clariq/train_original.tsv

In [3]:
import pandas as pd

data_path = "./data/"


def load_query_question_pairs(tsv_path):
    """Read a ClariQ TSV file and return its (query, clarifying_question) pairs.

    Only the ``initial_request`` and ``question`` columns are kept; they are
    renamed to ``query`` and ``clarifying_question``. Rows whose clarifying
    question is missing are dropped.
    """
    raw = pd.read_csv(tsv_path, sep="\t")
    pairs = raw[["initial_request", "question"]].rename(
        columns={"initial_request": "query", "question": "clarifying_question"}
    )
    return pairs.dropna(subset=["clarifying_question"])


train = load_query_question_pairs(data_path + "clariq/train_original.tsv")
valid = load_query_question_pairs(data_path + "clariq/dev.tsv")

# Persist the reduced two-column frames next to the original files.
train.to_csv(data_path + "clariq/train.tsv", sep="\t", index=False)
valid.to_csv(data_path + "clariq/valid.tsv", sep="\t", index=False)

In [4]:
# transformer-rankers only needs a pandas DataFrame with a query column (here the initial request)
# and a column of relevant documents (here the clarifying questions).
# Preview the first rows of the prepared training pairs.
train.head()

Unnamed: 0,query,clarifying_question
0,Tell me about Obama family tree.,are you interested in seeing barack obamas family
1,Tell me about Obama family tree.,would you like to know barack obamas geneology
2,Tell me about Obama family tree.,would you like to know about obamas ancestors
3,Tell me about Obama family tree.,would you like to know who is currently alive ...
4,Tell me about Obama family tree.,are you looking for biological information on ...


In [5]:
# Negative samples for training are drawn from the ClariQ question bank.
question_bank_path = data_path + "clariq/question_bank.tsv"
question_bank = pd.read_csv(question_bank_path, sep="\t")
# Preview the first rows of the question bank.
question_bank.head()

Unnamed: 0,question_id,question
0,Q00001,
1,Q00002,a total cholesterol of 180 to 200 mgdl 10 to 1...
2,Q00003,about how many years experience do you want th...
3,Q00004,according to anima the bible or what other source
4,Q00005,ae you looking for examples of septic system d...


In [9]:
# Start a Weights & Biases run for experiment tracking (the run URL is printed in the cell output).
# NOTE(review): presumably the trainer created later logs to this run — confirm.
import wandb
wandb.init()

2020-12-15 12:13:59,616 [ERROR] Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable


[34m[1mwandb[0m: Wandb version 0.10.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


2020-12-15 12:13:59,738 [INFO] system metrics and metadata threads started
2020-12-15 12:13:59,741 [INFO] checking resume status, waiting at most 10 seconds
2020-12-15 12:13:59,903 [INFO] resuming run from id: UnVuOnYxOjEweW9iMXN1OnRyYW5zZm9ybWVyX3JhbmtlcnMtbm90ZWJvb2tzOnZlbmRp
2020-12-15 12:13:59,916 [INFO] upserting run before process can begin, waiting at most 10 seconds
2020-12-15 12:14:00,103 [INFO] saving pip packages
2020-12-15 12:14:00,110 [INFO] initializing streaming files api
2020-12-15 12:14:00,111 [INFO] unblocking file change observer, beginning sync with W&B servers


W&B Run: https://app.wandb.ai/vendi/transformer_rankers-notebooks/runs/10yob1su

2020-12-15 12:14:00,120 [INFO] shutting down system stats and metadata service
2020-12-15 12:14:00,250 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/config.yaml
2020-12-15 12:14:00,427 [INFO] file/dir created: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl
2020-12-15 12:14:00,437 [INFO] file/dir created: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json
2020-12-15 12:14:00,439 [INFO] file/dir created: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-summary.json
2020-12-15 12:14:00,443 [INFO] file/dir created: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-history.jsonl
2020-12-15 12:14:00,450 [INFO] file/dir created: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/requirements.txt
2020-12-15 12:14:00,743 [INFO] stopping strea

In [None]:
from transformer_rankers.trainers import transformer_trainer
from transformer_rankers.datasets import dataset
from transformer_rankers.negative_samplers import negative_sampling
from transformer_rankers.eval import results_analyses_tools

from transformers import BertTokenizer, BertForSequenceClassification

import logging
import os
import sys
import torch
import random
import numpy as np

# Seed every random number generator used in this notebook with the same value
# so that repeated runs start from the same state.
SEED = 42
for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(SEED)

# Send INFO-level (and above) log records to stdout, prefixed with a timestamp
# and the level name.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[stdout_handler],
)

# A query combined with a clarifying question is short, so a small maximum
# sequence length is used.
max_seq_len = 50

# Average number of clarifying questions per query in the training set.
# Sampling this many negatives per query keeps positives and negatives
# roughly balanced during training.
questions_per_query = train.groupby("query").count()
average_relevant_per_query = questions_per_query.mean().values[0]
num_negatives = int(average_relevant_per_query)

# Candidate pool for negative sampling: every question in the bank except the
# first entry, which is skipped.
candidate_questions = question_bank["question"].values[1:]

# BM25 negative samplers for training and validation. Each one receives its own
# list of candidates; the remaining positional arguments (index folder, -1,
# Anserini folder) are passed exactly as before.
# NOTE(review): see BM25NegativeSamplerPyserini's signature for the meaning of -1.
ns_train = negative_sampling.BM25NegativeSamplerPyserini(
    list(candidate_questions), num_negatives,
    "./data/clariq/anserini_train/", -1, "./anserini/")
ns_val = negative_sampling.BM25NegativeSamplerPyserini(
    list(candidate_questions), num_negatives,
    "./data/clariq/anserini_train/", -1, "./anserini/")

# Alternative: random negative sampling, which does not require Anserini.
# ns_train = negative_sampling.RandomNegativeSampler(list(question_bank["question"].values[1:]), int(average_relevant_per_query))
# ns_val = negative_sampling.RandomNegativeSampler(list(question_bank["question"].values[1:]), int(average_relevant_per_query))

# Build the train/validation/test loaders, each wired to its negative sampler.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataloader = dataset.QueryDocumentDataLoader(
    train_df=train,
    val_df=valid,
    test_df=valid,  # the validation frame is also used as the test set here
    tokenizer=tokenizer,
    negative_sampler_train=ns_train,
    negative_sampler_val=ns_val,
    task_type='classification',
    train_batch_size=12,
    val_batch_size=12,
    max_seq_len=max_seq_len,
    sample_data=-1,
    cache_path="./data/clariq/",
)

train_loader, val_loader, test_loader = dataloader.get_pytorch_dataloaders()

# BERT is used here; any HuggingFace model exposing a SequenceClassification
# class would work the same way.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# The trainer takes care of fitting and of the periodic validation.
trainer = transformer_trainer.TransformerTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    num_ns_eval=int(average_relevant_per_query),
    task_type="classification",
    tokenizer=tokenizer,
    validate_every_epochs=1,
    num_validation_batches=-1,
    num_epochs=1,
    lr=5e-7,
    sacred_ex=None,
)

# Train; validation evaluation relies on the negative sampling procedure.
trainer.fit()

2020-12-15 12:14:03,527 [INFO] Train instances per batch 48
2020-12-15 12:14:03,562 [INFO] Loading instances from ./data/clariq//pointwise_set_train_n_cand_docs_45_ns_sampler_BM25NS_seq_max_l_50_sample_-1_for_classification_using_BertTokenizer
2020-12-15 12:14:03,894 [INFO] Total of 16981 instances were cached.
2020-12-15 12:14:03,906 [INFO] Loading instances from ./data/clariq//pointwise_set_val_n_cand_docs_45_ns_sampler_BM25NS_seq_max_l_50_sample_-1_for_classification_using_BertTokenizer
2020-12-15 12:14:04,060 [INFO] Total of 2025 instances were cached.
2020-12-15 12:14:04,072 [INFO] Loading instances from ./data/clariq//pointwise_set_test_n_cand_docs_45_ns_sampler_BM25NS_seq_max_l_50_sample_-1_for_classification_using_BertTokenizer
2020-12-15 12:14:04,086 [INFO] Total of 2025 instances were cached.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

2020-12-15 12:14:08,017 [INFO] Device cuda
2020-12-15 12:14:08,017 [INFO] Num GPU 4
2020-12-15 12:14:08,095 [INFO] Total batches per epoch : 354
2020-12-15 12:14:08,095 [INFO] Validating every 1 epoch.


Epoch 0, steps:   0%|          | 0/354 [00:00<?, ?it/s]

2020-12-15 12:14:12,545 [ERROR] Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable


[34m[1mwandb[0m: Wandb version 0.10.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


2020-12-15 12:14:12,679 [INFO] system metrics and metadata threads started
2020-12-15 12:14:12,680 [INFO] checking resume status, waiting at most 10 seconds
2020-12-15 12:14:12,829 [INFO] resuming run from id: UnVuOnYxOjEweW9iMXN1OnRyYW5zZm9ybWVyX3JhbmtlcnMtbm90ZWJvb2tzOnZlbmRp
2020-12-15 12:14:12,840 [INFO] upserting run before process can begin, waiting at most 10 seconds
2020-12-15 12:14:13,012 [INFO] saving pip packages
2020-12-15 12:14:13,017 [INFO] initializing streaming files api
2020-12-15 12:14:13,019 [INFO] unblocking file change observer, beginning sync with W&B servers




2020-12-15 12:14:13,546 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-summary.json
2020-12-15 12:14:13,550 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/requirements.txt
2020-12-15 12:14:13,552 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/config.yaml
2020-12-15 12:14:13,732 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl
2020-12-15 12:14:13,736 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  11%|█         | 39/354 [00:21<02:11,  2.40it/s]

2020-12-15 12:14:29,571 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  21%|██        | 73/354 [00:35<01:54,  2.45it/s]

2020-12-15 12:14:43,597 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-events.jsonl


Epoch 0, steps:  22%|██▏       | 78/354 [00:37<01:54,  2.42it/s]

2020-12-15 12:14:45,600 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json


Epoch 0, steps:  33%|███▎      | 117/354 [00:53<01:37,  2.43it/s]

2020-12-15 12:15:01,639 [INFO] file/dir modified: /home/svakule/transformer_rankers/notebooks/wandb/run-20201215_121358-10yob1su/wandb-metadata.json
