# Clickbait spoiling: Rerank

In [1]:
import pandas as pd
    
def load_dataset(file_name):
    import json
    ret = []
    
    with open('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-clickbait-spoiling/' + file_name) as f:
        for i in f:
            i = json.loads(i)
            query = i['postText']
            spoiler = i['spoiler']
            paragraphs = i['targetParagraphs']
            
            if i['tags'][0] == 'multi':
                continue
                
            assert len(query) == 1
            query = query[0]

            if len(spoiler) != 1:
                spoiler = ' '.join(spoiler)
            else:
                spoiler = spoiler[0]

            ret += [{'query': query, 'uuid': i['uuid'], 'paragraphs': paragraphs, 'spoiler': spoiler}]
            
    return pd.DataFrame(ret)


test_dataset = load_dataset('test.jsonl')
train_dataset = load_dataset('train.jsonl')
validation_dataset = load_dataset('validation.jsonl')
test_pilot_200 = load_dataset('200_test.jsonl')
all_datasets = pd.concat([test_dataset, train_dataset, validation_dataset])

In [10]:
len(test_dataset)

826

# Reranking utilities

In [2]:
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5, MonoBERT
from tqdm import tqdm

monoT5 = MonoT5()
monoBert = MonoBERT()

2021-11-14 19:30:32 [INFO] loader: Loading faiss with AVX2 support.
2021-11-14 19:30:32 [INFO] loader: Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'",)
2021-11-14 19:30:32 [INFO] loader: Loading faiss.
2021-11-14 19:30:32 [INFO] loader: Successfully loaded faiss.


In [3]:
def rerank(datapoint, reranker):
    passages = datapoint['paragraphs']
    passages = zip(range(len(passages)), passages)
    documents = [Text(i[1], {'docid': i[0]}, 0) for i in passages]
    ret = sorted(reranker.rerank(Query(datapoint['query']), documents), key=lambda i: i.score, reverse=True)

    return [{'score': i.score, 'id': i.metadata['docid'], 'text': i.text} for i in ret]


In [29]:
def run_reranking(output_dir, model, dataset, ground_truth_file_name):
    import json
    
    with open(output_dir + 'predictions.json', 'w') as preds, open(output_dir + 'passage-truth.jsonl', 'w') as passage_truth, open(output_dir + 'phrase-truth.jsonl', 'w') as phrase_truth, open('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-clickbait-spoiling/' + ground_truth_file_name) as f:
        for i in f:
            i_parsed = json.loads(i)
            if i_parsed['tags'][0].startswith('passage'):
                passage_truth.write(i.strip() + '\n')
            elif i_parsed['tags'][0].startswith('phrase'):
                phrase_truth.write(i.strip() + '\n')
        
        result = {}
        dataset = [i for _, i in dataset.iterrows()]
        for i in tqdm(dataset):
            result[i['uuid']] = rerank(i, model)

        preds.write(json.dumps(result))

In [6]:
run_reranking('all-passages-retrieval-t5/', monoT5, all_datasets, 'test.jsonl')

100%|██████████| 4124/4124 [03:27<00:00, 19.90it/s]


In [30]:
run_reranking('test-200/retrieval-t5/', monoT5, test_pilot_200, '200_test.jsonl')

100%|██████████| 182/182 [00:10<00:00, 16.86it/s]


In [31]:
run_reranking('test-200/retrieval-bert/', monoBert, test_pilot_200, '200_test.jsonl')

100%|██████████| 182/182 [00:43<00:00,  4.16it/s]


In [75]:
run_reranking('passage-retrieval-t5/', monoT5, test_dataset)

100%|██████████| 826/826 [00:44<00:00, 18.60it/s]


In [76]:
run_reranking('passage-retrieval-bert/', monoBert)

100%|██████████| 826/826 [02:56<00:00,  4.68it/s]


In [35]:
rerank(test_dataset[0], monoT5)

[{'score': -8.611593246459961,
  'id': 11,
  'body': 'The goal, Zaikis said, is to help educate children about hygiene, and to give them the tools to protect themselves -- an uphill climb. The United Nations estimates that some 2,000 children younger than 5 die every day from diarrheal diseases, and the majority of those deaths are tied to water, sanitation and hygiene.'},
 {'score': -10.837021827697754,
  'id': 4,
  'body': "During one such visit, she watched as middle school-age children entered and left a bathroom without washing their hands. With the help of a translator, she asked the children if they had soap, and was startled to learn they didn't know what soap was. The closest shop that sold it was a several-hour walk away. Zaikis made the trek and bought roughly 150 bars of pink and blue soap, which cost her $30 U.S."},
 {'score': -11.322731018066406,
  'id': 6,
  'body': 'Zaikis knew her soap purchase was a temporary fix, and made plans to spend the summer partnering with sch

In [37]:
rerank(test_dataset[0], monoBert)

[{'score': -11.018206596374512,
  'id': 6,
  'body': 'Zaikis knew her soap purchase was a temporary fix, and made plans to spend the summer partnering with schools and local organizations on hygiene projects. But within days, she collapsed and was rushed to the hospital with dengue hemorrhagic fever -- an often deadly infection spread by mosquitos. Her parents took her back to Boston, where she was treated at Massachusetts General Hospital.'},
 {'score': -11.207947731018066,
  'id': 10,
  'body': 'The Thailand soap is scented with lemongrass and pomelo, and a dollar from the sale of each soap supports the Hug Project\'s mission of funding medical professionals who schedule health visits with "street boys" and children at risk of being trafficked, as well as providing them with hygiene kits. Proceeds from the lavender and shea butter soap go to Ghana, where the Unlock Foundation is funding the construction of a sink and well in one school and supplying soap to the area, while proceeds f

# Sentence Retrieval

In [59]:
def load_dataset(file_name):
    from nltk import sent_tokenize
    import json
    ret = []
    
    with open('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-clickbait-spoiling/' + file_name) as f:
        for i in f:
            i = json.loads(i)
            query = i['postText']
            spoiler = i['spoiler']
            paragraphs = i['targetParagraphs']
            
            if i['tags'][0] == 'multi':
                continue
                
            assert len(query) == 1
            query = query[0]
            
            if len(spoiler) != 1:
                spoiler = ' '.join(spoiler)
            else:
                spoiler = spoiler[0]
            
            sentences = []
            for paragraph in paragraphs:
                for sentence in sent_tokenize(paragraph):
                    sentences += [sentence]
            
            ret += [{'query': query, 'uuid': i['uuid'], 'paragraphs': sentences, 'spoiler': spoiler}]
            
    return ret


test_sent_dataset = load_dataset('test.jsonl')
test_sent_dataset = load_dataset('200_test.jsonl')

In [60]:
def run_sent_reranking(output_dir, model, ground_truth_file_name):

    import json
    
    with open(output_dir + 'predictions.json', 'w') as preds, open(output_dir + 'passage-truth.jsonl', 'w') as passage_truth, open(output_dir + 'phrase-truth.jsonl', 'w') as phrase_truth, open('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-clickbait-spoiling/' + ground_truth_file_name) as f:
        for i in f:
            i_parsed = json.loads(i)
            if i_parsed['tags'][0].startswith('passage'):
                passage_truth.write(i.strip() + '\n')
            elif i_parsed['tags'][0].startswith('phrase'):
                phrase_truth.write(i.strip() + '\n')
    
        result = {}
        
        for i in tqdm(test_sent_dataset):
            result[i['uuid']] = rerank(i, model)
        
        preds.write(json.dumps(result))

In [None]:
run_sent_reranking('sentence-retrieval-t5/', monoT5, 'test.jsonl')

In [61]:
run_sent_reranking('test-200/sentence-retrieval-bert/', monoBert, '200_test.jsonl')

100%|██████████| 182/182 [01:26<00:00,  2.09it/s]


In [62]:
run_sent_reranking('test-200/sentence-retrieval-t5/', monoT5, '200_test.jsonl')

100%|██████████| 182/182 [00:18<00:00,  9.82it/s]


In [85]:
run_sent_reranking('sentence-retrieval-bert/', monoBert)

100%|██████████| 826/826 [05:57<00:00,  2.31it/s]


# Reranking with BM25 or QL

In [77]:
def rerank_bm25_passage(i, k1, b, rm3):
    return rerank_bm25(
        query = i['query'],
        index = 'anserini-indexes-200test/paragraphs-' + i['uuid'] + '/index',
        k1 = k1,
        b = b,
        rm3 = rm3,
    )
    
def rerank_bm25_sent(i, k1, b, rm3):
    return rerank_bm25(
        query = i['query'],
        index = 'anserini-indexes-200test/sentences-' + i['uuid'] + '/index',
        k1 = k1,
        b = b,
        rm3 = rm3,
    )

def rerank_bm25(query, index, k1, b):
    from pyserini.search import SimpleSearcher
    import json
    
    searcher = SimpleSearcher(index)
    searcher.set_bm25(k1=k1, b=b)
    hits = searcher.search(query)
    
    ret = []
    for hit in hits:
        ret += [{
            'score': hit.score,
            'id': hit.docid,
            'text': json.loads(searcher.doc(hit.docid).raw())['contents'],
        }]
        
    return ret


def rerank_bm25(query, index, k1=float(0.9), b=float(0.4), rm3=False):
    from pyserini.search import SimpleSearcher
    import json
    
    searcher = SimpleSearcher(index)
    searcher.set_bm25(k1=k1, b=b)
    
    if rm3:
         searcher.set_rm3(fb_terms=20, fb_docs=20, original_query_weight=float(0.2))
    
    hits = searcher.search(query)
    
    ret = []
    for hit in hits:
        ret += [{
            'score': hit.score,
            'id': hit.docid,
            'text': json.loads(searcher.doc(hit.docid).raw())['contents'],
        }]
        
    return ret


In [41]:
rerank_bm25_passage(test_dataset.iloc[0], k1=0.9, b=0.4)

[{'score': 2.2070000171661377,
  'id': '12',
  'text': "Zaikis works on Sundara full time, making all of the soaps and their packaging by hand. The business helps her pay rent on the Manhattan apartment she shares with her sister, but Zaikis frequently takes babysitting and dog-sitting jobs to supplement her income. She used money she'd saved while living in Thailand to buy soap ingredients -- things like spirulina and kelp -- which now crowd her apartment."},
 {'score': 0.949999988079071,
  'id': '1',
  'text': "In high school, Zaikis' mother was diagnosed with breast cancer, and the teen slipped into depression -- trapped in what she described as a cycle of feeling sorry for herself. In college, she took a course in global poverty, which helped put her own struggles in perspective. And at 19, the Boston native traveled to Mumbai, where she spent a summer living in an orphanage that housed around 100 girls, some of whom had been left in trash cans or abandoned in train stations."}]

In [46]:
rerank_bm25_sent(test_dataset.iloc[0], k1=0.9, b=0.4)

[{'score': 3.0964999198913574,
  'id': '35',
  'text': "She used money she'd saved while living in Thailand to buy soap ingredients -- things like spirulina and kelp -- which now crowd her apartment."},
 {'score': 1.3396999835968018,
  'id': '3',
  'text': 'And at 19, the Boston native traveled to Mumbai, where she spent a summer living in an orphanage that housed around 100 girls, some of whom had been left in trash cans or abandoned in train stations.'}]

In [38]:
def run_bm25_reranking(output_dir, ranking_method, ground_truth_file_name):

    import json
    
    !mkdir -p {output_dir}
    
    with open(output_dir + 'predictions.json', 'w') as preds, open(output_dir + 'passage-truth.jsonl', 'w') as passage_truth, open(output_dir + 'phrase-truth.jsonl', 'w') as phrase_truth, open('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-clickbait-spoiling/' + ground_truth_file_name) as f:
        for i in f:
            i_parsed = json.loads(i)
            if i_parsed['tags'][0].startswith('passage'):
                passage_truth.write(i.strip() + '\n')
            elif i_parsed['tags'][0].startswith('phrase'):
                phrase_truth.write(i.strip() + '\n')
    
        result = {}
        
        for _, i in tqdm(test_dataset.iterrows()):
            ret = ranking_method(i)
            
            if len(ret) == 0:
                continue
            
            result[i['uuid']] = ranking_method(i)
        
        preds.write(json.dumps(result))

#for k1 in [0.8, 0.9, 1.0]:
#    for b in [0.3, 0.4, 0.5]:
#        output_dir = 'passage-retrieval-bm25/k1=' + str(k1) + '-b=' + str(b) + '/'
#        run_bm25_reranking(output_dir, lambda i: rerank_bm25_passage(i, k1, b))
#        
#        output_dir = 'sentence-retrieval-bm25/k1=' + str(k1) + '-b=' + str(b) + '/'
#        run_bm25_reranking(output_dir, lambda i: rerank_bm25_sent(i, k1, b))
        

In [50]:
tmp_del = test_dataset
test_dataset = test_pilot_200
run_bm25_reranking('test-200/sentence-retrieval-bm25/', lambda i: rerank_bm25_sent(i, k1=0.9, b=0.4, rm3=False), '200_test.jsonl')

test_dataset = tmp_del

182it [00:01, 153.66it/s]


In [78]:
tmp_del = test_dataset
test_dataset = test_pilot_200
run_bm25_reranking('test-200/sentence-retrieval-bm25-prf/', lambda i: rerank_bm25_sent(i, k1=0.9, b=0.4, rm3=True), '200_test.jsonl')

test_dataset = tmp_del

182it [00:00, 270.53it/s]


In [48]:
!ls anserini-indexes/

paragraphs-00026258-2eb7-4844-ae29-b78e3890c791
paragraphs-001aec0d-8c99-4ad4-bebf-c37c3da51abe
paragraphs-0040b89e-6461-48d9-b5d8-fff40229dbd7
paragraphs-004eca55-d423-4dba-8428-2789a6e06a2a
paragraphs-0059a443-fc99-4e1f-9c28-762974edc4cb
paragraphs-007e8017-7183-42e7-8f9e-9191e75d3f8e
paragraphs-00807e0e-d8e9-429b-aa54-160409acb574
paragraphs-0080ae60-c526-4c72-90b2-def2348f3506
paragraphs-008ac91f-954d-458d-9728-af6e78b6a7e3
paragraphs-008b7b19-0445-4e16-8f9e-075b73f80ca4
paragraphs-008e49fc-6986-479f-adcd-a16f01ba686d
paragraphs-0096ce82-6c79-4af8-8ff7-0d7e377ba173
paragraphs-009895cd-0207-4207-b427-647963131b27
paragraphs-00a40f50-a1a5-486a-8e5a-7010627257cb
paragraphs-00b53907-b1f2-4588-9578-e4aafa09446c
paragraphs-00b85ea7-cfb1-4206-82b4-efac21e631ae
paragraphs-00bbdeef-7932-4a3e-af07-79e7d583d4c0
paragraphs-00c08974-69aa-4cd7-b7fb-69f3a760ca43
paragraphs-0101495f-ebc5-4f7f-9f1c-c17021c9fb29
paragraphs-01087f73-14bb-4acb-9dcd-a892b634e25a
paragraphs-01191bcc-

paragraphs-c744ebfe-320d-4049-a0d0-eb83dba81591
paragraphs-c779824c-2ee2-424b-867d-645dd972c5c7
paragraphs-c78dd854-2537-48e4-a695-2ac32c7b615a
paragraphs-c7998e57-5138-498b-862a-a6e5fd1b03f1
paragraphs-c79ef679-499d-4ce5-ba7b-d47588326d21
paragraphs-c7c196fd-6ae0-451d-9620-153c04f2d723
paragraphs-c7c7e643-f477-43ac-a4d6-12090ee8b6a7
paragraphs-c7d84a93-acda-4b82-a902-8f75c692aa31
paragraphs-c7fcc8bb-dc35-4bbe-904c-80cb7ef7b29a
paragraphs-c836118c-85e3-4407-bac5-24f0690a1658
paragraphs-c8504a15-0181-4d23-be87-d5ebf3363f79
paragraphs-c850c3c7-0453-40ec-a5aa-feeb63f24abc
paragraphs-c852f82b-53e3-4ba6-9787-e6f3911d99e3
paragraphs-c85fc06f-d233-4c8e-a2d0-c452a9c0105e
paragraphs-c8656dd8-fcb1-47a4-b4e5-727f5448e3d2
paragraphs-c8803108-426c-4945-ad17-5d5db905713b
paragraphs-c8935d8a-a637-4fb4-974f-a9b445c1c059
paragraphs-c8a365db-aae9-4258-a84d-f8321f53421d
paragraphs-c8bd0ff0-fa59-412a-984c-996e363b4604
paragraphs-c8cffe57-451e-44a8-b2d5-c6b4ab0a25d3
paragraphs-c902573b-

In [70]:
def rerank_qld_passage(i, mu, rm3):
    return rerank_qld(
        query = i['query'],
        index = 'anserini-indexes-200test/paragraphs-' + i['uuid'] + '/index',
        mu = mu,
        rm3 = rm3,
    )
    
def rerank_qld_sent(i, mu):
    return rerank_qld(
        query = i['query'],
        index = 'anserini-indexes-200test/sentences-' + i['uuid'] + '/index',
        mu = mu,
        rm3 = rm3,
    )

def rerank_qld(query, index, mu, rm3):
    from pyserini.search import SimpleSearcher
    import json
    
    searcher = SimpleSearcher(index)
    searcher.set_qld(mu = mu)
    if rm3:
        searcher.set_rm3(fb_terms=20, fb_docs=20, original_query_weight=float(0.2))
    
    hits = searcher.search(query)
    
    ret = []
    for hit in hits:
        ret += [{
            'score': hit.score,
            'id': hit.docid,
            'text': json.loads(searcher.doc(hit.docid).raw())['contents'],
        }]
        
    return ret


In [66]:
for mu in [float(800), float(850), float(900), float(950), float(1000), float(1050), float(1100), float(1150), float(1200),]:
        output_dir = 'passage-retrieval-qld/mu=' + str(mu) + '/'
        run_bm25_reranking(output_dir, lambda i: rerank_qld_passage(i, mu))
        
        output_dir = 'sentence-retrieval-qld/mu=' + str(mu) + '/'
        run_bm25_reranking(output_dir, lambda i: rerank_qld_sent(i, mu))
     

826it [00:01, 494.57it/s]
826it [00:01, 480.79it/s]
826it [00:01, 600.53it/s]
826it [00:01, 554.15it/s]
826it [00:01, 601.45it/s]
826it [00:01, 549.27it/s]
826it [00:01, 605.98it/s]
826it [00:01, 453.68it/s]
826it [00:01, 602.20it/s]
826it [00:01, 558.68it/s]
826it [00:01, 567.20it/s]
826it [00:01, 570.65it/s]
826it [00:01, 594.51it/s]
826it [00:01, 573.44it/s]
826it [00:01, 585.26it/s]
826it [00:01, 566.38it/s]
826it [00:01, 586.84it/s]
826it [00:01, 571.88it/s]


In [71]:
tmp_del = test_dataset
test_dataset = test_pilot_200
run_bm25_reranking('test-200/sentence-retrieval-ax/', lambda i: rerank_qld_passage(i, float(1000), True), '200_test.jsonl')

test_dataset = tmp_del

182it [00:00, 247.99it/s]


In [56]:
tmp_del = test_dataset
test_dataset = test_pilot_200
run_bm25_reranking('test-200/sentence-retrieval-qld+rm3/', lambda i: rerank_qld_passage(i, float(1000), True), '200_test.jsonl')

test_dataset = tmp_del

182it [00:00, 275.10it/s]


# Create Evaluation Files for BM25/QLD Passage Retrieval with

```
find /mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/sentence-retrieval-bm25/ -name '*k1=*' |grep -v json|xargs -i bash -c 'export INPUT_DIR="{}" && ./eval-in-progress-maik.sh'


find /mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/passage-retrieval-bm25/ -name '*k1=*' |grep -v json|xargs -i bash -c 'export INPUT_DIR="{}" && ./eval-in-progress-maik.sh'

find /mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/sentence-retrieval-qld/ -name '*mu=*' |grep -v json|xargs -i bash -c 'export INPUT_DIR="{}" && ./eval-in-progress-maik.sh'

find /mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/passage-retrieval-qld/ -name '*mu=*'|xargs -i bash -c 'export INPUT_DIR="{}" && ./eval-in-progress-maik.sh'
```

# Create Evaluations on Pilot Experiments

```
INPUT_DIR="/mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/test-200/sentence-retrieval-bert/" ./eval-in-progress-maik.sh

INPUT_DIR="/mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/test-200/sentence-retrieval-t5/" ./eval-in-progress-maik.sh

INPUT_DIR="/mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/test-200/sentence-retrieval-bm25/" ./eval-in-progress-maik.sh

INPUT_DIR="/mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/test-200/sentence-retrieval-bm25-rm3/" ./eval-in-progress-maik.sh

INPUT_DIR="/mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/test-200/sentence-retrieval-bm25-prf/" ./eval-in-progress-maik.sh

INPUT_DIR="/mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/test-200/sentence-retrieval-bm25-ax/" ./eval-in-progress-maik.sh



INPUT_DIR="/mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/test-200/sentence-retrieval-qld/" ./eval-in-progress-maik.sh

INPUT_DIR="/mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/test-200/sentence-retrieval-rm3/" ./eval-in-progress-maik.sh


INPUT_DIR="/mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/test-200/sentence-retrieval-prf/" ./eval-in-progress-maik.sh

INPUT_DIR="/mnt/ceph/storage/data-tmp/2021/kibi9872/ecir22-zero-shot/src/main/jupyter/clickbait-spoiling/test-200/sentence-retrieval-ax/" ./eval-in-progress-maik.sh
```